In [164]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.utils import shuffle
np.random.seed(8)
encoder = LabelEncoder()

In [207]:
# Functions adapted from https://www.kaggle.com/code/gargmanish/basic-machine-learning-with-cancer/notebook
    # model: the learning model
    # train_data: data used to train model
    # test_data: data used to test model
def classification_model(model, x, y):
    x, y = shuffle(x, y)
    model.fit(x, y)
    predictions = model.predict(x)
    accuracy = metrics.accuracy_score(predictions, y)
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))
    kf = KFold(n_splits=5)
    kf.get_n_splits(x)
    error = []
    for i, (train, test) in enumerate(kf.split(x)):
        train_x = x.iloc[train, :]
        train_y = y[train]
        model.fit(train_x, train_y)
        
        test_x = x.iloc[test, :]
        test_y = y[test]
        error.append(model.score(test_x, test_y))
        print(f'Fold: {i}')
        print(" Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))
        
def classification_model_gridsearchCV(model, param_grid, data_X, data_Y):
    clf = GridSearchCV(model, param_grid, cv=10, scoring='accuracy')
    clf.fit(data_X, data_Y)
    print("The best parameters found:")
    print(clf.best_params_)
    print("The best estimator:")
    print(clf.best_estimator_)
    print("The best score on the training data is:")
    print('{0:.3%}'.format(clf.best_score_))
    #print("Test Set Score: %s" % "{0:.3%}".format(dt_classifier_score))
    return clf.best_estimator_

# Dataset 1: Wisconsin Breast Cancer Diagnostic Dataset
- Source: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data?resource=download

In [165]:
# Reading data and preprocessing
data = pd.read_csv('data/breast_cancer_data.csv')

# 'id' column doesn't provide useful information
# 'Unnamed: 32' column contains NaN values
data.drop(columns=['id', 'Unnamed: 32'], inplace=True)

# Split the data
train, test = train_test_split(data, test_size=0.3)
train_X = train.drop(columns='diagnosis')
train_Y = encoder.fit_transform(train['diagnosis']).reshape(-1, 1)
test_X = test.drop(columns='diagnosis')
test_Y = encoder.fit_transform(test['diagnosis']).reshape(-1, 1)
X = data.drop(columns='diagnosis')
Y = data['diagnosis']
# Encode Benign -> 0, Malignant -> 1
encoded_Y = encoder.fit_transform(Y).reshape(-1, 1)

### Dataset 1: Decision Tree Classifier

In [191]:
model = DecisionTreeClassifier()
classification_model(model, train_X, train_Y)
dt_classifier_score = model.score(test_X, test_Y)
print("Test Set Score: %s" % "{0:.3%}".format(dt_classifier_score))

Accuracy : 100.000%
Fold: 0
 Cross-Validation Score : 91.250%
Fold: 1
 Cross-Validation Score : 89.375%
Fold: 2
 Cross-Validation Score : 89.583%
Fold: 3
 Cross-Validation Score : 90.605%
Fold: 4
 Cross-Validation Score : 90.712%
Test Set Score: 91.813%


In [209]:
param_grid = {'max_features': ['sqrt', 'log2'],
              'min_samples_split': [2,3,4,5,6,7,8,9,10], 
              'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
              'criterion': ['gini', 'entropy']}
best_model = classification_model_gridsearchCV(model, param_grid, train_X, train_Y)
print("Test Set Score: %s" % "{0:.3%}".format(best_model.score(test_X, test_Y)))

The best parameters found:
{'criterion': 'entropy', 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 9}
The best estimator:
DecisionTreeClassifier(criterion='entropy', max_features='log2',
                       min_samples_leaf=6, min_samples_split=9)
The best score on the training data is:
94.468%
Test Set Score: 95.906%
