# Binnary clasification using Transformer. An 'Edible Mushroom Dataset'

I downloaded this dataset from kaggle, just search the title in the web and there will be.

In [3]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from tab_transformer_pytorch import TabTransformer
import tensorflow as tf

In [4]:
# load the dataset
dataset = pd.read_csv('mushroom_cleaned.csv')

In [5]:
dataset.head()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1


The dataset has 2 classes to classify:
- class 1 --> Non edible Mushroom
- class 0 --> Edible Mushroom

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cap-diameter     54035 non-null  int64  
 1   cap-shape        54035 non-null  int64  
 2   gill-attachment  54035 non-null  int64  
 3   gill-color       54035 non-null  int64  
 4   stem-height      54035 non-null  float64
 5   stem-width       54035 non-null  int64  
 6   stem-color       54035 non-null  int64  
 7   season           54035 non-null  float64
 8   class            54035 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 3.7 MB


In [7]:
# We're going to test if we have a balanced dataset
dataset[dataset["class"]==1].shape , dataset[dataset["class"]==0].shape

((29675, 9), (24360, 9))

We have a barely balanced dataset with $29675$ $non$-$edible$ mushrooms and $24360$ $edible$ mushrooms

As we can see in the info, we're going to make our columns into categorical variables

In [8]:
categorical_columns = ['cap-diameter','cap-shape', 'gill-attachment', 'gill-color', 'stem-height', 'stem-width', 'stem-color', 'season' ]

# Label encoding
labelEncoder = LabelEncoder()
for col in categorical_columns:
    dataset[col + '_encoded'] = labelEncoder.fit_transform(dataset[col])

In [9]:
dataset.head()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class,cap-diameter_encoded,cap-shape_encoded,gill-attachment_encoded,gill-color_encoded,stem-height_encoded,stem-width_encoded,stem-color_encoded,season_encoded
0,1372,2,2,10,3.807467,1545,11,1.804273,1,1372,2,2,10,1447,1545,11,3
1,1461,2,2,10,3.807467,1557,11,1.804273,1,1461,2,2,10,1447,1557,11,3
2,1371,2,2,10,3.612496,1566,11,1.804273,1,1371,2,2,10,1399,1566,11,3
3,1261,6,2,10,3.787572,1566,11,1.804273,1,1261,6,2,10,1442,1566,11,3
4,1305,6,2,10,3.711971,1464,11,0.943195,1,1305,6,2,10,1424,1464,11,2


In [10]:
# One-hot encoding
onehotEncoder = OneHotEncoder(sparse_output=False, drop='first')
encodedFeatures = pd.DataFrame(onehotEncoder.fit_transform(dataset[categorical_columns]), 
                                columns=onehotEncoder.get_feature_names_out(categorical_columns))

encodedFeatures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Columns: 6847 entries, cap-diameter_1 to season_1.804272708628173
dtypes: float64(6847)
memory usage: 2.8 GB


Now we have encoded the features of the dataset

In [11]:
featureNames = onehotEncoder.get_feature_names_out(categorical_columns)
featureNames

array(['cap-diameter_1', 'cap-diameter_2', 'cap-diameter_3', ...,
       'season_0.8884502877862838', 'season_0.9431945538974952',
       'season_1.804272708628173'], dtype=object)

In [12]:
encodedDataset = pd.DataFrame(encodedFeatures, columns=featureNames)
encodedDataset.shape, encodedDataset.info(), encodedDataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Columns: 6847 entries, cap-diameter_1 to season_1.804272708628173
dtypes: float64(6847)
memory usage: 2.8 GB


((54035, 6847),
 None,
    cap-diameter_1  cap-diameter_2  cap-diameter_3  cap-diameter_4  \
 0             0.0             0.0             0.0             0.0   
 1             0.0             0.0             0.0             0.0   
 2             0.0             0.0             0.0             0.0   
 3             0.0             0.0             0.0             0.0   
 4             0.0             0.0             0.0             0.0   
 
    cap-diameter_5  cap-diameter_6  cap-diameter_7  cap-diameter_8  \
 0             0.0             0.0             0.0             0.0   
 1             0.0             0.0             0.0             0.0   
 2             0.0             0.0             0.0             0.0   
 3             0.0             0.0             0.0             0.0   
 4             0.0             0.0             0.0             0.0   
 
    cap-diameter_9  cap-diameter_10  ...  stem-color_6  stem-color_7  \
 0             0.0              0.0  ...           0.0       

In [13]:
# We concatenate the original dataset with the new one
encodedDataset = pd.concat([dataset, encodedFeatures], axis=1)
encodedDataset.shape, encodedDataset.info(), encodedDataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Columns: 6864 entries, cap-diameter to season_1.804272708628173
dtypes: float64(6849), int64(15)
memory usage: 2.8 GB


((54035, 6864),
 None,
    cap-diameter  cap-shape  gill-attachment  gill-color  stem-height  \
 0          1372          2                2          10     3.807467   
 1          1461          2                2          10     3.807467   
 2          1371          2                2          10     3.612496   
 3          1261          6                2          10     3.787572   
 4          1305          6                2          10     3.711971   
 
    stem-width  stem-color    season  class  cap-diameter_encoded  ...  \
 0        1545          11  1.804273      1                  1372  ...   
 1        1557          11  1.804273      1                  1461  ...   
 2        1566          11  1.804273      1                  1371  ...   
 3        1566          11  1.804273      1                  1261  ...   
 4        1464          11  0.943195      1                  1305  ...   
 
    stem-color_6  stem-color_7  stem-color_8  stem-color_9  stem-color_10  \
 0           0

In [14]:
encodedDataset.drop(categorical_columns, axis=1, inplace=True)
encodedDataset.shape, encodedDataset.info(), encodedDataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Columns: 6856 entries, class to season_1.804272708628173
dtypes: float64(6847), int64(9)
memory usage: 2.8 GB


((54035, 6856),
 None,
    class  cap-diameter_encoded  cap-shape_encoded  gill-attachment_encoded  \
 0      1                  1372                  2                        2   
 1      1                  1461                  2                        2   
 2      1                  1371                  2                        2   
 3      1                  1261                  6                        2   
 4      1                  1305                  6                        2   
 
    gill-color_encoded  stem-height_encoded  stem-width_encoded  \
 0                  10                 1447                1545   
 1                  10                 1447                1557   
 2                  10                 1399                1566   
 3                  10                 1442                1566   
 4                  10                 1424                1464   
 
    stem-color_encoded  season_encoded  cap-diameter_1  ...  stem-color_6  \
 0                  

We just have our pre-processed dataset so we can continue splitting into traning and testing and try those wit different methods and models

In [15]:
X = encodedDataset.drop('class', axis=1)
Y = encodedDataset['class']

Split into train and test

In [16]:
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, train_size=.8, random_state=123)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, train_size=0.5)
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((43228, 6855), (5403, 6855), (5404, 6855), (43228,), (5403,), (5404,))

In [17]:
trainDataset = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
trainDataset = trainDataset.shuffle(buffer_size=len(X_train)).batch(32)
#
valDataset = tf.data.Dataset.from_tensor_slices((X_val.values, y_val.values))
valDataset = valDataset.shuffle(buffer_size=len(X_val)).batch(32)
# 
testDataset = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values))
testDataset = testDataset.shuffle(buffer_size=len(X_test)).batch(32)

In [31]:
categories = [X_train[col].nunique() for col in X_train.columns]
len(categories)

6855

In [32]:
categories

[1821,
 7,
 7,
 12,
 1442,
 3452,
 13,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 

create the model

In [36]:
model = TabTransformer(
    categories=categories,
    num_continuous=(X_train.shape[1] - len(categorical_columns)),
    dim=32,
    depth=6,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=0.1
)
# Compile model
model.compile(
    optimizer='adam',
    loss='binnary_crossentropy',
    metrics=['accuracy']
)
# Train model
history = model.fit(trainDataset, valDataset, epochs=10)

# Evalue the model
test_loss, test_accuracy = model.evaluate(testDataset)
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')

: 