In [30]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Make sure scikit-learn is on version 1.2.2
# due to this error: https://github.com/scikit-learn/scikit-learn/issues/26768
# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.utils import shuffle
np.random.seed(8)
encoder = LabelEncoder()

In [72]:
# Functions adapted from https://www.kaggle.com/code/gargmanish/basic-machine-learning-with-cancer/notebook
    # model: the learning model
    # train_data: data used to train model
    # test_data: data used to test model
def classification_model(model, train_x, train_y, test_x, test_y):
    model.fit(train_x, train_y)
    predictions = model.predict(train_x)
    accuracy = metrics.accuracy_score(predictions, train_y)
    print("Accuracy on training data: %s" % "{0:.3%}".format(accuracy))
    kf = KFold(n_splits=5, shuffle=True)
    kf.get_n_splits(train_x)
    error = []
    for i, (train, test) in enumerate(kf.split(train_x)):
        fold_x = train_x.iloc[train, :]
        fold_y = train_y.iloc[train]
        model.fit(fold_x, fold_y)
        
        fold_test_x = train_x.iloc[test, :]
        fold_test_y = train_y.iloc[test]
        error.append(model.score(fold_test_x, fold_test_y))
        print(f'Fold: {i}')
        print(" Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))
    
    test_score = model.score(test_x, test_y)
    print("Test Set Score: %s" % "{0:.3%}".format(test_score))
        
def classification_model_gridsearchCV(model, param_grid, data_X, data_Y, test_x, test_y):
    clf = GridSearchCV(model, param_grid, cv=10, scoring='accuracy')
    clf.fit(data_X, data_Y)
    print("The best parameters found:")
    print(clf.best_params_)
    print("The best estimator:")
    print(clf.best_estimator_)
    print("The best score on the training data is: {0:.3%}".format(clf.best_score_))
    best_model = clf.best_estimator_
    print("Test set score with the best model: %s" % "{0:.3%}".format(best_model.score(test_X, test_Y)))

# Dataset 1: Wisconsin Breast Cancer Diagnostic Dataset
- Source: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data?resource=download

In [66]:
# Reading data and preprocessing
data = pd.read_csv('data/breast_cancer_data.csv')

# 'id' column doesn't provide useful information
# 'Unnamed: 32' column contains NaN values
data.drop(columns=['id', 'Unnamed: 32'], inplace=True)

# Split the data
train, test = train_test_split(data, test_size=0.3)
train_X = train.drop(columns='diagnosis')
train_Y = encoder.fit_transform(train['diagnosis'])
test_X = test.drop(columns='diagnosis')
test_Y = encoder.fit_transform(test['diagnosis'])
X = data.drop(columns='diagnosis')
Y = data['diagnosis']
# Encode Benign -> 0, Malignant -> 1
encoded_Y = encoder.fit_transform(Y).reshape(-1, 1)

### Dataset 1: Decision Tree Classifier

In [18]:
model = DecisionTreeClassifier()
classification_model(model, train_X, train_Y, test_X, test_Y)

Accuracy on training data: 100.000%
Fold: 0
 Cross-Validation Score : 96.250%
Fold: 1
 Cross-Validation Score : 95.625%
Fold: 2
 Cross-Validation Score : 95.000%
Fold: 3
 Cross-Validation Score : 94.035%
Fold: 4
 Cross-Validation Score : 92.443%
Test Set Score: 92.982%


In [19]:
param_grid = {'max_features': ['sqrt', 'log2'],
              'min_samples_split': [2,3,4,5,6,7,8,9,10], 
              'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
              'criterion': ['gini', 'entropy']}
classification_model_gridsearchCV(model, param_grid, train_X, train_Y, test_X, test_Y)

The best parameters found:
{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 8, 'min_samples_split': 2}
The best estimator:
DecisionTreeClassifier(criterion='entropy', max_features='sqrt',
                       min_samples_leaf=8)
The best score on the training data is: 95.724%
Test set score with the best model: 90.643%


### Dataset 1: Neural Network

In [21]:
model = MLPClassifier(max_iter=500)
classification_model(model, train_X, train_Y, test_X, test_Y)

Accuracy on training data: 95.477%
Fold: 0
 Cross-Validation Score : 91.250%
Fold: 1
 Cross-Validation Score : 93.750%
Fold: 2
 Cross-Validation Score : 94.583%
Fold: 3
 Cross-Validation Score : 93.406%
Fold: 4
 Cross-Validation Score : 93.459%
Test Set Score: 90.643%


In [23]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}
best_model = classification_model_gridsearchCV(model, param_grid, train_X, np.ravel(train_Y), test_X, np.ravel(test_Y))



The best parameters found:
{'alpha': 0.001, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
The best estimator:
MLPClassifier(alpha=0.001, hidden_layer_sizes=(50,), max_iter=500)
The best score on the training data is: 95.224%
Test set score with the best model: 92.398%


### Dataset 1: k-nearest neighbors

In [22]:
model = KNeighborsClassifier()
classification_model(model, train_X, train_Y, test_X, test_Y)

Accuracy on training data: 95.226%
Fold: 0
 Cross-Validation Score : 96.250%
Fold: 1
 Cross-Validation Score : 95.625%
Fold: 2
 Cross-Validation Score : 95.417%
Fold: 3
 Cross-Validation Score : 94.980%
Fold: 4
 Cross-Validation Score : 93.959%
Test Set Score: 90.643%


In [26]:
param_grid = {
    'n_neighbors': [1,2,3,4,5,6,7,8,9,10],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}
classification_model_gridsearchCV(model, param_grid, train_X, train_Y, test_X, test_Y)

The best parameters found:
{'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
The best estimator:
KNeighborsClassifier(n_neighbors=6, p=1, weights='distance')
The best score on the training data is: 95.474%
Test set score with the best model: 91.813%


### Dataset 1: Boosting


In [28]:
model = AdaBoostClassifier()
classification_model(model, train_X, train_Y, test_X, test_Y)

Accuracy on training data: 100.000%
Fold: 0
 Cross-Validation Score : 96.250%
Fold: 1
 Cross-Validation Score : 95.625%
Fold: 2
 Cross-Validation Score : 95.417%
Fold: 3
 Cross-Validation Score : 95.613%
Fold: 4
 Cross-Validation Score : 95.731%
Test Set Score: 95.322%


In [29]:
param_grid = {
    'n_estimators': [10, 50, 250, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7, 1.0],
}
classification_model_gridsearchCV(model, param_grid, train_X, train_Y, test_X, test_Y)

The best parameters found:
{'learning_rate': 1.0, 'n_estimators': 500}
The best estimator:
AdaBoostClassifier(n_estimators=500)
The best score on the training data is: 96.487%
Test set score with the best model: 98.830%


### Dataset 1: SVM

In [32]:
model = SVC()
classification_model(model, train_X, train_Y, test_X, test_Y)

Accuracy on training data: 92.965%
Fold: 0
 Cross-Validation Score : 96.250%
Fold: 1
 Cross-Validation Score : 94.375%
Fold: 2
 Cross-Validation Score : 92.083%
Fold: 3
 Cross-Validation Score : 90.581%
Fold: 4
 Cross-Validation Score : 91.453%
Test Set Score: 90.643%


In [45]:
param_grid = {
    'C': [0.001, 0.1, 1.0, 10.0, 100.0, 1000.0],
    'gamma': ['scale', 'auto'],
}
classification_model_gridsearchCV(model, param_grid, train_X, train_Y, test_X, test_Y)

The best parameters found:
{'C': 1000.0, 'gamma': 'scale'}
The best estimator:
SVC(C=1000.0)
The best score on the training data is: 94.955%
Test set score with the best model: 93.567%


# Dataset 2: Banknote Authentication

In [68]:
data = pd.read_csv('data/banknote_data.csv')
train, test = train_test_split(data, test_size=0.3)
train_X = train.drop(columns='class')
train_Y = train['class']
test_X = test.drop(columns='class')
test_Y = test['class']
train_X

Unnamed: 0,variance,skewness,curtosis,entropy
998,-3.08660,-6.6362,10.54050,-0.891820
133,0.43390,5.5395,2.03300,-0.404320
513,5.93740,6.1664,-2.59050,-0.365530
419,1.95720,-5.1153,8.61270,-1.429700
541,3.66670,4.3020,0.55923,0.337910
...,...,...,...,...
663,3.77980,-3.3109,2.64910,0.066365
643,0.38251,6.8121,1.81280,-0.612510
495,-0.12624,10.3216,-3.71210,-6.118500
235,2.04660,2.0300,2.17610,-0.083634


### Dataset 2: DecisionTreeClassifier

In [74]:
model = DecisionTreeClassifier()
classification_model(model, train_X, train_Y, test_X, test_Y)

Accuracy on training data: 100.000%
Fold: 0
 Cross-Validation Score : 97.917%
Fold: 1
 Cross-Validation Score : 98.177%
Fold: 2
 Cross-Validation Score : 97.917%
Fold: 3
 Cross-Validation Score : 97.656%
Fold: 4
 Cross-Validation Score : 97.917%
Test Set Score: 99.515%


In [75]:
param_grid = {'max_features': ['sqrt', 'log2'],
              'min_samples_split': [2,3,4,5,6,7,8,9,10], 
              'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
              'criterion': ['gini', 'entropy']}
classification_model_gridsearchCV(model, param_grid, train_X, train_Y, test_X, test_Y)

The best parameters found:
{'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 3}
The best estimator:
DecisionTreeClassifier(max_features='sqrt', min_samples_leaf=3,
                       min_samples_split=3)
The best score on the training data is: 98.438%
Test set score with the best model: 98.544%


### Dataset 2: Neural Network