# Binary classification using a Transformer: the 'Edible Mushroom Dataset'

I downloaded this dataset from Kaggle; just search for the title on the web and you will find it.

In [41]:
# imports
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tab_transformer_pytorch import TabTransformer
from sklearn.model_selection import train_test_split

In [26]:
# Load the dataset from a CSV path relative to the current working directory.
dataset = pd.read_csv('mushroom_cleaned.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
dataset.info()
print(dataset.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cap-diameter     54035 non-null  int64  
 1   cap-shape        54035 non-null  int64  
 2   gill-attachment  54035 non-null  int64  
 3   gill-color       54035 non-null  int64  
 4   stem-height      54035 non-null  float64
 5   stem-width       54035 non-null  int64  
 6   stem-color       54035 non-null  int64  
 7   season           54035 non-null  float64
 8   class            54035 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 3.7 MB
None
   cap-diameter  cap-shape  gill-attachment  gill-color  stem-height  \
0          1372          2                2          10     3.807467   
1          1461          2                2          10     3.807467   
2          1371          2                2          10     3.612496   
3          1261          6                2

Identify the categorical and continuous columns:
- *Categorical* --> Those that have an int64 dtype
- *Continuous* --> Those that have a float64 dtype

In [27]:
# Feature groups, chosen by the dtype reported by dataset.info():
# int64 columns are treated as categorical, float64 columns as continuous.
# NOTE(review): 'cap-diameter' shows values such as 1372 in the preview, which
# looks like a measurement rather than a category — confirm (same question
# for 'stem-width').
categorical_cols = [
    'cap-diameter',
    'cap-shape',
    'gill-attachment',
    'gill-color',
    'stem-width',
    'stem-color',
]
continous_cols = [
    'stem-height',
    'season',
]
# Name of the target column.
label_col = 'class'

In [31]:
# Integer-encode every categorical column in place. One LabelEncoder is kept
# per column (keyed by column name) so each fitted mapping stays available.
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    dataset[name] = encoder.fit_transform(dataset[name])

In [34]:
# The target is the label column; the features are every other column.
y = dataset[label_col]
X = dataset.drop(columns=label_col)
X.shape, y.shape

((54035, 8), (54035,))

In [35]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((43228, 8), (10807, 8), (43228,), (10807,))

In [36]:
# Separate categorical and continuous features of each split into NumPy arrays.
# Training split
X_train_categ = X_train.loc[:, categorical_cols].to_numpy()
X_train_cont = X_train.loc[:, continous_cols].to_numpy()
# Test split
X_test_categ = X_test.loc[:, categorical_cols].to_numpy()
X_test_cont = X_test.loc[:, continous_cols].to_numpy()
tuple(arr.shape for arr in (X_train_categ, X_train_cont, X_test_categ, X_test_cont))

((43228, 6), (43228, 2), (10807, 6), (10807, 2))

In [37]:
# Define the TabTransformer model.
# Seed PyTorch's RNG first: the model weights are randomly initialised and
# dropout is random during training, so without a seed every fresh run of the
# notebook would report different losses and accuracy.
torch.manual_seed(42)

# Number of distinct values in each categorical column, in categorical_cols order.
num_unique_categories = [dataset[col].nunique() for col in categorical_cols]

modelTabTrans = TabTransformer(
    categories=num_unique_categories,    # one cardinality per categorical feature
    num_continuous=len(continous_cols),  # how many continuous features are passed in
    dim=32,
    dim_out=1,                           # one output value per row (used as the binary logit)
    depth=6,
    heads=8,
    attn_dropout=.1,
    ff_dropout=.1
)

In [38]:
# Convert the training arrays to PyTorch tensors: category codes as 64-bit
# integers, continuous features and labels as 32-bit floats.
X_train_categ_tensor = torch.tensor(X_train_categ, dtype=torch.int64)
X_train_cont_tensor = torch.tensor(X_train_cont, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)

# Loss function and optimizer. BCEWithLogitsLoss takes raw logits, so the
# model output is passed to it without a sigmoid.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(modelTabTrans.parameters(), lr=1e-3)

In [39]:
# Training: 10 epochs, each one a single full-batch forward/backward pass,
# i.e. one optimizer step per epoch.
modelTabTrans.train()  # put the model in training mode
for epoch in range(10):  # 10 epoch training
    optimizer.zero_grad()
    y_pred = modelTabTrans(X_train_categ_tensor, X_train_cont_tensor)
    # Drop only the trailing output dimension. A bare squeeze() would also
    # remove the batch dimension if the batch ever held a single row, leaving
    # a 0-d tensor whose shape no longer matches the targets.
    loss = criterion(y_pred.squeeze(-1), y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch}, Loss: {loss.item()}')

Epoch: 0, Loss: 0.6910781264305115
Epoch: 1, Loss: 0.6533674001693726
Epoch: 2, Loss: 0.6340625882148743
Epoch: 3, Loss: 0.5992587208747864
Epoch: 4, Loss: 0.5830015540122986
Epoch: 5, Loss: 0.5523414015769958
Epoch: 6, Loss: 0.5312123894691467
Epoch: 7, Loss: 0.5063426494598389
Epoch: 8, Loss: 0.4847387671470642
Epoch: 9, Loss: 0.46398094296455383


In [42]:
# Evaluation (test loss + accuracy) on the held-out split.
modelTabTrans.eval()  # put the model in evaluation mode
with torch.no_grad():  # no gradients are needed for evaluation
    X_test_categ_tensor = torch.tensor(X_test_categ, dtype=torch.long)
    X_test_cont_tensor = torch.tensor(X_test_cont, dtype=torch.float)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float)
    test_output = modelTabTrans(X_test_categ_tensor, X_test_cont_tensor)
    # Drop only the trailing output dimension. A bare squeeze() would also
    # remove the batch dimension for a single-row test set, and the 0-d
    # result would no longer line up with the labels.
    test_logits = test_output.squeeze(-1)
    test_loss = criterion(test_logits, y_test_tensor)
    print(f'Test Loss: {test_loss.item()}')

    # Accuracy: sigmoid turns logits into probabilities, round() turns them
    # into 0/1 predictions.
    test_preds = torch.sigmoid(test_logits).round()
    accuracy = accuracy_score(y_test_tensor.numpy(), test_preds.numpy())
    print(f'Test Accuracy: {accuracy * 100:.4f}')

Test Loss: 0.44760382175445557
Test Accuracy: 78.3751
