In [None]:
COMP5318 Assignment 1: Classification

In [None]:
Group number: 113

In [14]:
import pandas as pd
import numpy as np
# Load the raw dataset from the CSV file
data = pd.read_csv("breast-cancer-wisconsin.csv")

# data = pd.read_csv("test-before.csv")

# Separate the target labels from the features in one step: the right-hand
# side is evaluated against the full frame before either name is rebound.
data, class_column = data.drop(columns='class'), data['class']

from sklearn.impute import SimpleImputer
# Mark every '?' placeholder as a missing value (NaN)
data = data.replace(to_replace='?', value=np.nan)
# Fill each missing entry with the mean of its column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data = imputer.fit_transform(data)

from sklearn.preprocessing import MinMaxScaler
# Min-max normalise the features: learn the scaling first, then apply it
scaler = MinMaxScaler().fit(data)
data = scaler.transform(data)
# Encode the class labels as integers: 'class1' -> 0, 'class2' -> 1.
# An explicit map + astype(int) is used instead of Series.replace, whose
# implicit object -> int downcasting is deprecated from pandas 2.2 onwards.
# This always yields an integer label vector and fails loudly if an
# unexpected label value is present.
class_column = class_column.map({'class1': 0, 'class2': 1}).astype(int)

# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist
def print_data(X, y, n_rows=10):
    """Print the first ``n_rows`` examples as comma-separated rows.

    Every feature is formatted to 4 decimal places and followed by a comma;
    the example's label from ``y`` ends the row. Rows are newline-terminated,
    except when the row printed is the final example of ``X``.

    Args:
        X: Indexable sequence of examples, each an iterable of numbers.
        y: Labels aligned with ``X`` by position (list, array or Series).
        n_rows: Maximum number of rows to print. Values larger than
            ``len(X)`` are clamped, so a short dataset does not raise
            ``IndexError``.
    """
    total = len(X)
    # A pandas Series looks up by label with []; go through .iloc when it is
    # available so the label is taken by position, like the rows of X.
    labels = y.iloc if hasattr(y, "iloc") else y

    for example_num in range(min(n_rows, total)):
        row = "".join("{:.4f},".format(feature) for feature in X[example_num])
        # The very last example of the dataset is printed without a newline.
        line_end = "" if example_num == total - 1 else "\n"
        print(row, labels[example_num], sep="", end=line_end)
            
# Show the first ten pre-processed examples, followed by a blank separator line
preview_rows = 10
print_data(data, class_column, n_rows=preview_rows)
print()

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.model_selection import cross_val_score
# Shared 10-fold stratified splitter (shuffled, fixed seed) used by every
# cross-validation and grid search below
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# Logistic Regression
def logregClassifier(X, y):
    """Return the mean cross-validation score of a default LogisticRegression.

    X (features) and y (labels) are handed to cross_val_score together with
    the shared cvKFold splitter; the per-fold scores are averaged.
    """
    fold_scores = cross_val_score(LogisticRegression(), X, y, cv=cvKFold)
    return fold_scores.mean()

#Naïve Bayes
def nbClassifier(X, y):
    """Return the mean cross-validation score of a default GaussianNB.

    The per-fold scores come from cross_val_score using the shared cvKFold
    splitter and are averaged into a single number.
    """
    fold_scores = cross_val_score(GaussianNB(), X, y, cv=cvKFold)
    return fold_scores.mean()

# Decision Tree
def dtClassifier(X, y):
    """Return the mean cross-validation score of an entropy-based decision tree.

    The tree is seeded with random_state=0 and evaluated with the shared
    cvKFold splitter; the per-fold scores are averaged.
    """
    tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    fold_scores = cross_val_score(tree, X, y, cv=cvKFold)
    return fold_scores.mean()

# Ensembles: Bagging, Ada Boost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    """Return the mean cross-validation score of bagged entropy decision trees.

    n_estimators and max_samples are forwarded to BaggingClassifier;
    max_depth limits each base DecisionTreeClassifier. The ensemble is seeded
    with random_state=0 and evaluated with the shared cvKFold splitter.
    """
    base_tree = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    ensemble = BaggingClassifier(
        base_tree,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=0,
    )
    return cross_val_score(ensemble, X, y, cv=cvKFold).mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    """Return the mean cross-validation score of AdaBoost over entropy decision trees.

    n_estimators and learning_rate are forwarded to AdaBoostClassifier;
    max_depth limits each base DecisionTreeClassifier. The ensemble is seeded
    with random_state=0 and evaluated with the shared cvKFold splitter.
    """
    base_tree = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    ensemble = AdaBoostClassifier(
        base_tree,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    return cross_val_score(ensemble, X, y, cv=cvKFold).mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    """Return the mean cross-validation score of a GradientBoostingClassifier.

    n_estimators and learning_rate are forwarded to the classifier, which is
    seeded with random_state=0 and evaluated with the shared cvKFold splitter.
    """
    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    fold_scores = cross_val_score(booster, X, y, cv=cvKFold)
    return fold_scores.mean()

# Bagging settings (passed to bagDTClassifier)
bag_n_estimators, bag_max_samples, bag_max_depth = 60, 100, 6

# AdaBoost settings (passed to adaDTClassifier)
ada_n_estimators, ada_learning_rate, ada_bag_max_depth = 60, 0.5, 6

# Gradient boosting settings (passed to gbClassifier)
gb_n_estimators, gb_learning_rate = 60, 0.5

# Report every part 1 classifier with its mean score to 4 decimal places.
# Each score is computed inside the loop, right before its line is printed,
# so lines appear one by one as the models finish.
part1_reports = (
    ("LogR average cross-validation accuracy: {:.4f}",
     lambda: logregClassifier(data, class_column)),
    ("NB average cross-validation accuracy: {:.4f}",
     lambda: nbClassifier(data, class_column)),
    ("DT average cross-validation accuracy: {:.4f}",
     lambda: dtClassifier(data, class_column)),
    ("Bagging average cross-validation accuracy: {:.4f}",
     lambda: bagDTClassifier(data, class_column, bag_n_estimators, bag_max_samples, bag_max_depth)),
    ("AdaBoost average cross-validation accuracy: {:.4f}",
     lambda: adaDTClassifier(data, class_column, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)),
    ("GB average cross-validation accuracy: {:.4f}",
     lambda: gbClassifier(data, class_column, gb_n_estimators, gb_learning_rate)),
)
for template, evaluate in part1_reports:
    print(template.format(evaluate()))
print()


# KNN
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# KNN search grid: k feeds 'n_neighbors', p feeds the 'p' parameter
k, p = [1, 3, 5, 7, 9], [1, 2]

def bestKNNClassifier(X, y):
    """Grid-search a KNeighborsClassifier and report how the best model performs.

    The data is split with train_test_split (stratified on y, random_state=0,
    default proportions). Every combination of the module-level k and p lists
    is evaluated on the training part with the shared cvKFold splitter.

    Returns:
        A tuple (best_params, cv_score, test_score): the winning parameter
        dict, its mean cross-validation score on the training part, and the
        score of the refitted search on the held-out part.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, stratify=y, random_state=0
    )

    searcher = GridSearchCV(
        KNeighborsClassifier(),
        {'n_neighbors': k, 'p': p},
        cv=cvKFold,
    )
    searcher.fit(train_X, train_y)

    return (
        searcher.best_params_,
        searcher.best_score_,
        searcher.score(test_X, test_y),
    )

# SVM
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

# SVM search grid: candidate values for the 'C' and 'gamma' parameters
C, gamma = [0.01, 0.1, 1, 5, 15], [0.01, 0.1, 1, 10, 50]

def bestSVMClassifier(X, y):
    """Grid-search an RBF-kernel SVC and report how the best model performs.

    The data is split with train_test_split (stratified on y, random_state=0,
    default proportions). Every combination of the module-level C and gamma
    lists is evaluated on the training part with the shared cvKFold splitter.

    Returns:
        A tuple (best_params, cv_score, test_score): the winning parameter
        dict, its mean cross-validation score on the training part, and the
        score of the refitted search on the held-out part.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, stratify=y, random_state=0
    )

    searcher = GridSearchCV(
        SVC(kernel='rbf'),
        param_grid={'C': C, 'gamma': gamma},
        cv=cvKFold,
    )
    searcher.fit(train_X, train_y)

    return (
        searcher.best_params_,
        searcher.best_score_,
        searcher.score(test_X, test_y),
    )

# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Random forest search grid: candidate 'n_estimators' and 'max_leaf_nodes' values
n_estimators, max_leaf_nodes = [10, 30, 60, 100, 150], [6, 12, 18]

def bestRFClassifier(X, y):
    """Grid-search a RandomForestClassifier and report how the best model performs.

    The forest uses criterion='entropy', max_features='sqrt' and
    random_state=0. The data is split with train_test_split (stratified on y,
    random_state=0, default proportions), and every combination of the
    module-level n_estimators and max_leaf_nodes lists is evaluated on the
    training part with the shared cvKFold splitter.

    Returns:
        A tuple (best_params, cv_score, test_score, macro_f1, weighted_f1):
        the winning parameter dict, its mean cross-validation score on the
        training part, the score on the held-out part, and the macro- and
        weighted-average F1 of the held-out predictions.
    """
    # Stratified hold-out split
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, stratify=y, random_state=0
    )

    # Forest configuration shared by every grid point
    forest = RandomForestClassifier(
        criterion='entropy', max_features='sqrt', random_state=0
    )

    # Exhaustive search over the two tuned parameters
    searcher = GridSearchCV(
        forest,
        param_grid={'n_estimators': n_estimators, 'max_leaf_nodes': max_leaf_nodes},
        cv=cvKFold,
    )
    searcher.fit(train_X, train_y)

    # Evaluate the refitted best model on the held-out part
    held_out_score = searcher.score(test_X, test_y)
    predicted = searcher.predict(test_X)
    macro_f1, weighted_f1 = (
        f1_score(test_y, predicted, average=mode) for mode in ('macro', 'weighted')
    )

    return (
        searcher.best_params_,
        searcher.best_score_,
        held_out_score,
        macro_f1,
        weighted_f1,
    )


# Print the tuned KNN, SVM and random forest results.
# Every score is formatted with "{:.4f}" so exactly four decimals are always
# shown; round(x, 4) drops trailing zeros (0.97 instead of 0.9700). The SVM
# labels also no longer end in a space, which together with print's own
# separator produced a double space after the colon.
best_knn_params, best_knn_train_acc, best_knn_test_acc = bestKNNClassifier(data, class_column)
print("KNN best k:", best_knn_params['n_neighbors'])
print("KNN best p:", best_knn_params['p'])
print("KNN cross-validation accuracy: {:.4f}".format(best_knn_train_acc))
print("KNN test set accuracy: {:.4f}".format(best_knn_test_acc))
print()

best_params, train_acc, test_acc = bestSVMClassifier(data, class_column)
print("SVM best C: {:.4f}".format(best_params['C']))
print("SVM best gamma: {:.4f}".format(best_params['gamma']))
print("SVM cross-validation accuracy: {:.4f}".format(train_acc))
print("SVM test set accuracy: {:.4f}".format(test_acc))
print()

best_params, train_acc, test_acc, macro_f1, weighted_f1 = bestRFClassifier(data, class_column)
print("RF best n_estimators:", best_params['n_estimators'])
print("RF best max_leaf_nodes:", best_params['max_leaf_nodes'])
print("RF cross-validation accuracy: {:.4f}".format(train_acc))
print("RF test set accuracy: {:.4f}".format(test_acc))
print("RF test set macro average F1: {:.4f}".format(macro_f1))
print("RF test set weighted average F1: {:.4f}".format(weighted_f1))

0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.4444,0.3333,0.3333,0.4444,0.6667,1.0000,0.2222,0.1111,0.0000,0
0.2222,0.0000,0.0000,0.0000,0.1111,0.1111,0.2222,0.0000,0.0000,0
0.5556,0.7778,0.7778,0.0000,0.2222,0.3333,0.2222,0.6667,0.0000,0
0.3333,0.0000,0.0000,0.2222,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.7778,1.0000,1.0000,0.7778,0.6667,1.0000,0.8889,0.6667,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,1.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.1111,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.4444,0
0.3333,0.1111,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0

LogR average cross-validation accuracy: 0.9642
NB average cross-validation accuracy: 0.9585
DT average cross-validation accuracy: 0.9385
Bagging average cross-validation accuracy: 0.9571
AdaBoost average cross-validation accuracy: 0.9570
GB average cross-validation accuracy: 0.9613

KNN best k: 3
KNN best p: 1
KNN cross-validation accuracy: 9.695e