# Assignment 2
The objective of this assignment is to solve a classification task using different classifiers and to compare them according to different evaluation metrics. You are asked to use classifiers from the scikit-learn library, and also the “custom” NN class studied during the course.


# Scikit-learn
Scikit-learn is an open source machine learning library that supports supervised and unsupervised learning. It also provides various tools for model fitting, data preprocessing, model selection, model evaluation, and many other utilities. For this assignment, you are going to use classification algorithms provided by scikit-learn, including SVM, LogisticRegression, RandomForest, k-Nearest Neighbors (kNN) and DummyClassifier.

DummyClassifier makes predictions that ignore the input features. This classifier serves as a simple baseline to compare against other more complex classifiers.

The documentation for the scikit-learn library is available at https://scikit-learn.org/stable/index.html


In [None]:
#Import classifiers from scikit-learn and numpy library
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report, roc_curve, auc, RocCurveDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, PrecisionRecallDisplay, precision_recall_curve
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots

import matplotlib.pyplot as plt

from NeuralNetwork import NeuralNetwork as NN


# Part 1: digits dataset

The dataset provided in the 'hand_digits.txt' file contains features extracted from handwritten digits (0 - 9) for 442 samples. The features correspond to the coordinates of 8 equally-spaced points (16 values per sample), taken from each sample after normalization and resampling. The corresponding label is also provided.


![title](digits_sample.png)

In [None]:
# Read the digits file twice: once for the feature columns and once for the
# label column that sits immediately after them.
# Note: features are already normalized
file_path = 'hand_digits.txt'
features = 16  # number of leading feature columns; column index 16 holds the label
data = np.loadtxt(file_path, usecols=tuple(range(features)))
labels = np.loadtxt(file_path, usecols=features)


In [None]:
# Classifiers to compare, keyed by the display name used by the later cells
PredictionModels = {
    'Dummy Classifier': DummyClassifier(strategy='most_frequent'),
    'kNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel="linear", C=1),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=2000),
}

# Standardized copies of the features, used only by Logistic Regression
# (the author saw an lbfgs convergence warning on the unscaled features).
# The scaler is fitted on the training split and then applied to both splits.
# NOTE(review): train_x / train_y / validation_x are expected to come from an
# earlier split cell that is not visible in this part of the notebook.
scaler = preprocessing.StandardScaler()
scaler.fit(train_x)
train_x_scaled = scaler.transform(train_x)
validation_x_scaled = scaler.transform(validation_x)

# Fit every model; only Logistic Regression receives the standardized features
for name, model in PredictionModels.items():
    print('training for classifer: {}'.format(name))
    fit_inputs = train_x_scaled if name == 'Logistic Regression' else train_x
    model.fit(fit_inputs, train_y)

print("done")

In [None]:
# Show the first five validation predictions of every model next to the
# true labels.
#
# Logistic Regression was fitted on the standardized features, so it has to be
# queried with the standardized validation features as well; feeding it the
# raw features would produce predictions from inputs on the wrong scale.
for name, model in PredictionModels.items():
    sample_x = validation_x_scaled if name == 'Logistic Regression' else validation_x

    print('Model: ', name)
    print('Predictions:', model.predict(sample_x[0:5]))
    print('Ground truth:', validation_y[0:5])

In [None]:
# Draw one confusion matrix per classifier on the validation split.
# Logistic Regression is evaluated on the standardized features it was
# trained on; every other model uses the unscaled features.
for name, model in PredictionModels.items():
    eval_x = validation_x_scaled if name == 'Logistic Regression' else validation_x

    y_pred = model.predict(eval_x)
    cm = confusion_matrix(validation_y, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    disp.ax_.set_title(name)

In [None]:
# Score lists, one entry per model; position i in every list belongs to names[i]
accuracy_list, train_accuracy_list = [], []
precision_list, train_precision_list = [], []
f1_list, train_f1_list = [], []
recall_list, train_recall_list = [], []
names = []

# Each row: (scoring function, extra keyword arguments, validation list, training list).
# Precision, recall and f1 are macro-averaged over the classes.
macro = {'average': 'macro'}
score_plan = (
    (accuracy_score, {}, accuracy_list, train_accuracy_list),
    (f1_score, macro, f1_list, train_f1_list),
    (precision_score, macro, precision_list, train_precision_list),
    (recall_score, macro, recall_list, train_recall_list),
)

for name, model in PredictionModels.items():
    # Logistic Regression was trained on standardized features, so it is
    # scored on the standardized splits; all other models use the raw splits
    use_scaled = name == 'Logistic Regression'
    y_pred = model.predict(validation_x_scaled if use_scaled else validation_x)
    y_pred_train = model.predict(train_x_scaled if use_scaled else train_x)

    for scorer, extra, validation_scores, training_scores in score_plan:
        validation_scores.append(scorer(validation_y, y_pred, **extra))
        training_scores.append(scorer(train_y, y_pred_train, **extra))

    names.append(name)




In [None]:
# Grouped bar chart of validation vs training scores per classifier, with a
# drop-down menu that switches which metric is displayed
fig = go.Figure()

# One bar series per data split; both start out showing accuracy
fig.add_traces([
    go.Bar(x=names, y=accuracy_list, name="Validation",
           marker_color='rgb(55, 83, 109)', text=accuracy_list, textposition='auto'),
    go.Bar(x=names, y=train_accuracy_list, name="Training",
           marker_color='rgb(26, 180, 255)', text=train_accuracy_list, textposition='auto'),
])
fig.update_traces(texttemplate='%{y:.3f}')  # label each bar with its value to three decimals

# (button label, validation scores, training scores, y axis title)
metric_choices = (
    ('Accuracy', accuracy_list, train_accuracy_list, "Accuracy"),
    ('Precision', precision_list, train_precision_list, "Precision"),
    ('Recall', recall_list, train_recall_list, "Recall"),
    ('f1', f1_list, train_f1_list, "f1 Score"),
)

# Each button swaps the y data of both bar series and retitles the y axis
updatemenus = [{
    'buttons': [
        {'method': 'update',
         'label': label,
         'args': [{'y': [validation_scores, training_scores]},
                  {"yaxis.title.text": axis_title}]}
        for label, validation_scores, training_scores, axis_title in metric_choices
    ],
    'direction': 'down',
    'bgcolor': 'lightblue',
    'showactive': True,
}]

fig.update_layout(
    yaxis_title='Accuracy',  # initial title; the buttons replace it afterwards
    updatemenus=updatemenus,
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    title_text='Macro Averaged Metric Scores For Selected Classifiers',
    xaxis_title='Classifiers',
    title_x=0.5,
    margin=dict(t=150),  # extra top margin leaves room for the note above the plot
)
fig.update_yaxes(range=[0, 1.0])

# Note placed above the plotting area (y is in paper coordinates), adapted from
# https://stackoverflow.com/questions/60913366/how-to-annotate-a-point-outside-the-plot-itself
fig.add_annotation(
    font=dict(color="black", size=12),
    x=1.82,
    y=1.2,
    showarrow=False,
    text='Note: Larger values of these metrics for the training set indicate overfitting',
    textangle=0,
    xref="x",
    yref="paper",
)
fig.show()

# Part 2: Breast Cancer dataset

In [None]:
# Load the breast cancer table; this rebinds `data`, which held the digit features in Part 1
data = pd.read_csv('bc_data.csv')

NameError: name 'pd' is not defined

In [None]:
# List the column names of the loaded table (features plus the target column)
print(data.columns)

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'malignant'],
      dtype='object')


In [None]:
# Visualise feature correlation for feature selection

# Pairwise correlation between all columns of the table, shown as a heatmap
feature_corr = data.corr()
fig = px.imshow(feature_corr)
fig.show()

# From the correlation matrix, the features picked = Smoothness Error, Mean Fractal Dimension, Symmetry Error, Texture Error,
# For simplicity I will drop Texture Error as it is the least correlated of the 4 and create a new dataset of the first 3
# NOTE(review): in the recorded outputs further down, every model trained on these three features
# predicts class 1 for all displayed samples -- verify that the picked features are the ones most
# (not least) correlated with 'malignant'.

In [None]:
# Working data frame: the three selected features plus the target column.
# Selecting the four columns directly yields the same frame as dropping every
# other column range from `data`.
kept_columns = [
    'mean fractal dimension',
    'smoothness error',
    'symmetry error',
    'malignant',
]
WorkData = data[kept_columns].copy()

In [None]:
# Preview the first rows of the reduced data frame
WorkData.head()

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible
predictor_columns = ['mean fractal dimension', 'smoothness error', 'symmetry error']

train_x, validation_x, train_y, validation_y = train_test_split(
    WorkData[predictor_columns],
    WorkData['malignant'],
    test_size=0.3,
    random_state=123,
)

for caption, split in (('Shape training set:', train_x), ('Shape validation set:', validation_x)):
    print(caption, split.shape)

Shape training set: (394, 3)
Shape validation set: (169, 3)


In [None]:
# Model training for the breast cancer data

NNmodel = NN()

# Models to compare, keyed by display name; the custom network is kept in
# NNmodel as well because later cells call it directly
PredictionModels = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'Random Forest': RandomForestClassifier(),
    'Neural Network': NNmodel,
}

for name, model in PredictionModels.items():
    if name == 'Neural Network':
        print('training for Neural Network: {}'.format(name))
        # the custom network is given the labels as a single-row 2-D array
        model.fit(train_x, train_y.values.reshape(1, -1))
    else:
        print('training for classifer: {}'.format(name))
        model.fit(train_x, train_y)

print("done")



training for classifer: Logistic Regression
training for classifer: Random Forest
training for Neural Network: Neural Network
done


In [None]:
# Report training and validation accuracy for every model.
# scikit-learn models expose score(); the custom network exposes evaluate(),
# whose result is unpacked here as (accuracy, log loss).
for name, model in PredictionModels.items():
    print(name)

    if name == 'Neural Network':
        NNaccuracy, NNlogloss = model.evaluate(train_x, train_y)
        print('Accuracy on training set: %.3f' % NNaccuracy)
        NNaccuracy, NNlogloss = model.evaluate(validation_x, validation_y)
        print('Accuracy on validation set: %.3f' % NNaccuracy)
    else:
        print('Accuracy on training set: %.3f' % model.score(train_x, train_y))
        print('Accuracy on validation set: %.3f' % model.score(validation_x, validation_y))


Logistic Regression
Accuracy on training set: 0.617
Accuracy on validation set: 0.663
Random Forest
Accuracy on training set: 1.000
Accuracy on validation set: 0.657
Neural Network
Accuracy on training set: 0.622
Accuracy on validation set: 0.663


In [None]:
# Compare the first ten validation predictions of each model with the true labels

first_rows = validation_x[0:10]
first_truth = validation_y[0:10].values.reshape(1, -1)

# scikit-learn models first; the custom network is handled separately below
# because its predict() has a different call signature
for name, model in PredictionModels.items():
    if name == 'Neural Network':
        continue
    print('Model: ', name)
    print('Predictions:', model.predict(first_rows))
    print('Ground truth:', first_truth)

print('Model: ', 'Neural Network')
Pred, proba = NNmodel.predict(first_rows, return_proba=True)
print('Predictions:', Pred)
print('Ground truth:', first_truth)

Model:  Logistic Regression
Predictions: [1 1 1 1 1 1 1 1 1 1]
Ground truth: [[1 1 1 0 1 0 1 1 0 1]]
Model:  Random Forest
Predictions: [1 1 1 1 1 1 1 1 1 1]
Ground truth: [[1 1 1 0 1 0 1 1 0 1]]
Model:  Neural Network
Predictions: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Ground truth: [[1 1 1 0 1 0 1 1 0 1]]


In [None]:
# ROC curves of all models on the validation split, drawn in one figure

fig = go.Figure()

# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference
fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)

y_true = validation_y
for name, model in PredictionModels.items():
    # Positive-class scores: column 1 of predict_proba for scikit-learn
    # models, the second value returned by predict() for the custom network
    if name == 'Neural Network':
        pred, y_scores = NNmodel.predict(validation_x, return_proba=True)
    else:
        y_scores = model.predict_proba(validation_x)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    auc_score = roc_auc_score(y_true, y_scores)

    # Legend entry carries the AUC rounded to two decimals
    name = f"({name} AUC={auc_score:.2f})"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

legend_position = dict(yanchor="top", y=0.99, xanchor="left", x=0.2)
fig.update_layout(
    title_text='ROC Curves For Selected Models',
    title_x=0.5,
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleanchor="x", scaleratio=1),  # equal axis scaling keeps the plot square
    xaxis=dict(constrain='domain'),
    legend=legend_position,
    width=1000,
    height=750,
)
fig.show()

In [None]:
def _threshold_metrics(y_true, positive_scores, cutoffs):
    """Score the hard predictions obtained by cutting the scores at each cutoff.

    A sample is predicted as class 1 when its score is strictly greater than
    the cutoff, otherwise as class 0.

    Args:
        y_true: true binary labels.
        positive_scores: positive-class score of every sample, aligned with
            `y_true`.
        cutoffs: iterable of decision thresholds to evaluate.

    Returns:
        pandas.DataFrame with one row per cutoff and the columns 'Accuracy',
        'recall', 'precision', 'f1 Score' and 'Threshold'.
    """
    rows = []
    for cutoff in cutoffs:
        preds = np.where(positive_scores > cutoff, 1, 0)
        rows.append({
            'Accuracy': accuracy_score(y_true, preds),
            'recall': recall_score(y_true, preds),
            # zero_division=0 keeps the value scikit-learn already returns when
            # nothing is predicted positive, without a warning per cutoff
            'precision': precision_score(y_true, preds, zero_division=0),
            'f1 Score': f1_score(y_true, preds, zero_division=0),
            'Threshold': cutoff,
        })
    return pd.DataFrame(rows)


# Decision thresholds from 0.05 to 0.95 in steps of 0.05
thresholds = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50,
              0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95]

threshold_tables = {}
for name, model in PredictionModels.items():
    # Each model is refitted before the sweep.
    # NOTE(review): Random Forest is created without a random_state, so this
    # refit is not guaranteed to reproduce the model trained earlier.
    if name == 'Neural Network':
        model.fit(X=train_x, y=train_y.values.reshape(1, -1))
        _, positive_scores = model.predict(validation_x, return_proba=True)
    elif name in ('Logistic Regression', 'Random Forest'):
        model.fit(X=train_x, y=train_y)
        positive_scores = model.predict_proba(validation_x)[:, 1]
    else:
        continue

    # The scores do not depend on the threshold, so they are computed once
    # per model and reused for every cutoff
    threshold_tables[name] = _threshold_metrics(validation_y, positive_scores, thresholds)
    print(name)
    print(threshold_tables[name])

# Names consumed by the plotting cell; a missing model leaves an empty list
Log_ThresholdDF = threshold_tables.get('Logistic Regression', [])
Ran_ThresholdDF = threshold_tables.get('Random Forest', [])
Neural_ThresholdDF = threshold_tables.get('Neural Network', [])



In [None]:
# Metric-vs-threshold curves, one subplot per model.
fig1 = make_subplots(rows=1, cols=3, subplot_titles=('Logistic Regression',  'Random Forest', 'Neural Network'))

# (data frame column, legend label, legend group) for every plotted metric.
# The f1 curve gets its own label; it was previously shown as "Precision".
metric_series = (
    ('Accuracy', 'Accuracy', 'group1'),
    ('precision', 'Precision', 'group2'),
    ('recall', 'Recall', 'group3'),
    ('f1 Score', 'f1 Score', 'group4'),
)

# Fixed colour per metric: the legend is shared by all three panels, so a
# metric must look the same in every panel instead of following the default
# per-trace colour cycle
metric_colours = px.colors.qualitative.Plotly

panels = (Log_ThresholdDF, Ran_ThresholdDF, Neural_ThresholdDF)
for col, frame in enumerate(panels, start=1):
    for colour, (column, label, group) in zip(metric_colours, metric_series):
        fig1.add_trace(
            go.Scatter(
                x=frame['Threshold'],
                y=frame[column],
                name=label,
                legendgroup=group,
                showlegend=(col == 1),  # one legend entry per metric, taken from the first panel
                line=dict(color=colour),
                marker=dict(color=colour),
            ),
            row=1,
            col=col,
        )

# The axis carries four different metrics, so it gets a generic title
fig1.update_layout(yaxis_title='Score')
fig1.update_layout(title_text = 'Metric Plots for Decision Thresholds from 0.05 to 0.95, at increments of 0.05', title_x = 0.5)

fig1.show()

In [None]:
# Precision-Recall curve for each model on the validation split

fig = go.Figure()

y_true = validation_y

# No-skill reference for a precision-recall plot: a horizontal line at the
# share of positive samples (the diagonal is the chance line of a ROC plot,
# not of a PR plot).
# Assumes the labels are 0/1 with 1 as the positive class -- TODO confirm.
positive_rate = float(np.mean(y_true))
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=positive_rate, y1=positive_rate
)

# Plot P-R Curves
for name, model in PredictionModels.items():
    # Positive-class scores: column 1 of predict_proba for scikit-learn
    # models, the second value returned by predict() for the custom network
    if name == 'Neural Network':
        pred, y_scores = NNmodel.predict(validation_x, return_proba=True)
    else:
        y_scores = model.predict_proba(validation_x)[:, 1]

    prec, recall, _ = precision_recall_curve(y_true, y_scores)

    # Separate label variable so the loop variable is never overwritten
    trace_label = f"({name})"
    fig.add_trace(go.Scatter(x=recall, y=prec, name=trace_label, mode='lines'))

fig.update_layout(
    title_text = 'Precision-Recall Curves For Selected Models',
    title_x = 0.5,
    xaxis_title='Recall',
    yaxis_title='Precision',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.5
    ),
    width=1000, height=750
)
fig.show()
