import necessary modules 

In [None]:
import pandas  as pd
import matplotlib.pyplot as plt
%matplotlib auto
# Switch off plot output until show() is issued
plt.interactive(False)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

load the data set

In [None]:
# Read the raw CSV into a DataFrame and preview its first rows
data_path = 'data/BEED_Data.csv'
df = pd.read_csv(data_path)
print(df.head())

# EDA Pass 1

print info about columns in the dataframe

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
df.info()

Now look at the distribution of each column.
Note that the features all have the same scaling/range so there is no need to scale them

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for each numeric column
summary_stats = df.describe()
print(summary_stats)

check the target
as you can see there are 2000 of each class (so it is balanced)

In [None]:
# Number of rows per value of the target column 'y'
class_counts = df['y'].value_counts()
print(class_counts)

Derive the features X and target y, just for y=0 (no epilepsy) and y = 1 (epilepsy)

In [None]:
nrows, ncols = df.shape
# Rows whose target is 0 or 1; computed once and reused for both X and y
is_binary_class = df['y'].isin((0, 1))
# Features: every column except the last one
X = df.iloc[:, :ncols - 1].loc[is_binary_class]
# Target: the 'y' column for the same rows
y = df.loc[is_binary_class, 'y']
print(X.shape)

# Undersample the cases with epilepsy

In [None]:
# Build the class masks from y itself, so the mask index is exactly the index of X and y
no_epilepsy = (y == 0)
with_epilepsy = (y == 1)
X0 = X.loc[no_epilepsy]
X1 = X.loc[with_epilepsy]
y0 = y.loc[no_epilepsy]
y1 = y.loc[with_epilepsy]
# Select 2% of the X1,y1 (persons with Epilepsy)
y1p = y1.sample(frac=0.02, random_state=42)
# y1p.index holds index *labels*, so the matching feature rows must be selected
# by label (.loc). Positional selection (.iloc) would pick the wrong rows, or raise
# IndexError, whenever the labels differ from the row positions within X.
X1p = X1.loc[y1p.index]
assert X1p.index.equals(y1p.index), "sampled features and targets are misaligned"
# Concatenate the full no-Epilepsy cases with the sampled with-epilepsy cases
Xp = pd.concat((X0, X1p))
yp = pd.concat((y0, y1p))

# Split full data into 75:25 ratio

In [None]:
# Hold out 25% of the full data for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)
print(X_train.shape)

# Split partial data into 75:25 ratio

In [None]:
# Hold out 25% of the undersampled data for testing, stratified on the target
X_trainP, X_testP, y_trainP, y_testP = train_test_split(
    Xp,
    yp,
    test_size=0.25,
    random_state=42,
    stratify=yp,
)
print(X_trainP.shape)

# Create python dictionaries, to store results by classifier type

In [None]:
# Containers for the full-data results, keyed by classifier name
model = {}
prob_pred = {}
report = {}

In [None]:
# Containers for the partial-data results, keyed by classifier name
modelP = {}
prob_predP = {}
reportP = {}

# Train a full logistic regression model

In [None]:
# Logistic regression on the full training split (iteration cap raised to 500)
logreg_full = LogisticRegression(max_iter=500)
model['LogisticRegression'] = logreg_full
logreg_full.fit(X_train, y_train)

# Train a full KNN model

In [None]:
# 5-nearest-neighbours classifier on the full training split
knn_full = KNeighborsClassifier(n_neighbors=5)
model['KNN'] = knn_full
knn_full.fit(X_train, y_train)

# Train a partial logistic regression model

In [None]:
# Logistic regression on the undersampled training split (iteration cap raised to 500)
logreg_part = LogisticRegression(max_iter=500)
modelP['LogisticRegression'] = logreg_part
logreg_part.fit(X_trainP, y_trainP)

# Train a partial KNN model

In [None]:
# 5-nearest-neighbours classifier on the undersampled training split
knn_part = KNeighborsClassifier(n_neighbors=5)
modelP['KNN'] = knn_part
knn_part.fit(X_trainP, y_trainP)

# Define the function to compute the predicted targets and probabilities for the test set, and return the classification report for that model

In [None]:
def getMetrics(X_test, y_test, model):
  """Score a fitted classifier on a test set.

  Args:
    X_test: feature rows passed to the model's predict_proba and predict.
    y_test: true targets that the predicted targets are compared against.
    model: fitted estimator exposing predict_proba and predict.

  Returns:
    A (report, prob_pred) pair: the result of classification_report for the
    predicted targets, and the array returned by predict_proba.
  """
  # Per-class probabilities for every test row; echo the first five rows as a quick check
  probabilities = model.predict_proba(X_test)
  print(probabilities[0:5,:])

  # Predicted targets for the same rows, summarised by classification_report
  predicted_targets = model.predict(X_test)
  return classification_report(y_test, predicted_targets), probabilities

# Create the function to derive the ROC curve and associated AUC, and to plot it.

In [None]:
def plotROCcurve(y_test, prob_pred, dataAmount, classifierType, ax):
  """Draw a ROC curve, labelled with its AUC, on the supplied axes.

  Args:
    y_test: true targets handed to roc_curve.
    prob_pred: scores handed to roc_curve.
    dataAmount: text inserted into the plot title.
    classifierType: text inserted into the plot title.
    ax: axes object the curve is drawn on.

  Returns:
    (ax, thresholds): the axes that was passed in, and the thresholds
    returned by roc_curve.
  """
  # ROC points and the area under them
  fpr, tpr, thresholds = roc_curve(y_test, prob_pred)
  roc_auc = auc(fpr, tpr)

  # The curve itself, then the diagonal reference line
  ax.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.2f})')
  ax.plot([0, 1], [0, 1], 'k--', label='Random Guessing')

  # Square plot with a small margin around the unit interval on both axes
  padded_unit_range = (-0.05, 1.05)
  ax.set_xlim(padded_unit_range)
  ax.set_ylim(padded_unit_range)
  ax.set_aspect('equal', adjustable='box')

  # Small fonts so that four panels fit in one figure
  text_size = 8
  small_text_size = 7
  ax.set_xlabel('False Positive Rate (FPR)', fontsize=text_size)
  ax.set_ylabel('True Positive Rate (TPR)', fontsize=text_size)
  ax.set_title(f'ROC for {dataAmount} Epilepsy, with {classifierType}', fontsize=text_size)
  ax.tick_params(axis='both', which='major', labelsize=small_text_size)
  ax.legend(loc='lower right', fontsize=small_text_size)
  return ax, thresholds

# Create the function to collect the classification report and ROC curve plots

In [None]:
def getResults(X_test, y_test, model, dataAmount, classifierType, ax):
  """Compute the classification report and draw the ROC curve for one classifier.

  Args:
    X_test: feature rows of the test set.
    y_test: true targets of the test set.
    model: dictionary of fitted classifiers, indexed by classifierType.
    dataAmount: text inserted into the plot title.
    classifierType: key into model; also inserted into the plot title.
    ax: axes object the ROC curve is drawn on.

  Returns:
    (report, prob_pred, ax): the classification report, the array of predicted
    probabilities, and the axes object the curve was drawn on.
  """
  report, prob_pred = getMetrics(X_test, y_test, model[classifierType])
  # plotROCcurve returns (ax, thresholds). Unpack it so that the axes object itself,
  # not the whole tuple, is returned and written back into the caller's axes grid.
  # Column 1 of prob_pred is the score passed to the ROC computation.
  ax, _thresholds = plotROCcurve(y_test, prob_pred[:,1], dataAmount, classifierType, ax)
  return report, prob_pred, ax

# Split the plotting space into subplots

In [None]:
# One figure holding a 2x2 grid of axes
fig, ax = plt.subplots(nrows=2, ncols=2)
# Optional, currently disabled: enlarge the figures
#plt.rcParams["figure.figsize"] = (15,15)

# Get the results for the LogisticRegression model

In [None]:
# Full data, logistic regression -> top-left panel
classifierType = 'LogisticRegression'
dataAmount = 'full'
(report[classifierType],
 prob_pred[classifierType],
 ax[0, 0]) = getResults(X_test, y_test, model, dataAmount, classifierType, ax[0, 0])
print(report[classifierType])

# Get the results for the KNN model

In [None]:
# Full data, KNN -> top-right panel
classifierType = 'KNN'
dataAmount = 'full'
(report[classifierType],
 prob_pred[classifierType],
 ax[0, 1]) = getResults(X_test, y_test, model, dataAmount, classifierType, ax[0, 1])
print(report[classifierType])

# Get the results for the partial LogisticRegression model

In [None]:
# Partial data, logistic regression -> bottom-left panel
classifierType = 'LogisticRegression'
dataAmount = 'part'
(reportP[classifierType],
 prob_predP[classifierType],
 ax[1, 0]) = getResults(X_testP, y_testP, modelP, dataAmount, classifierType, ax[1, 0])
print(reportP[classifierType])

# Get the results for the partial KNN model

In [None]:
# Partial data, KNN -> bottom-right panel
classifierType = 'KNN'
dataAmount = 'part'
(reportP[classifierType],
 prob_predP[classifierType],
 ax[1, 1]) = getResults(X_testP, y_testP, modelP, dataAmount, classifierType, ax[1, 1])
print(reportP[classifierType])

# Display the plots

In [None]:
# Layout approach taken from https://stackoverflow.com/a/41717533/1988855
# Adjust the subplot spacing first, then render the figure
fig.tight_layout()
plt.show()
# Put the figure-size setting back to matplotlib's default value
default_figsize = plt.rcParamsDefault["figure.figsize"]
plt.rcParams["figure.figsize"] = default_figsize

# Some exercises for you to try

1. How would you interpret the results (classification reports and ROC curves)?
2a. How might you use Oversampling on the partial data, to restore balance in the target?
   You might find the following code useful:

In [None]:
# SMOTE comes from the separate imbalanced-learn package (imported as imblearn)
from imblearn.over_sampling import SMOTE
# Over-sampler with a fixed seed so that the result is reproducible.
# NOTE(review): it is presumably applied through its fit_resample method - confirm against the imbalanced-learn docs.
sm = SMOTE(random_state = 42)

   If using miniconda, you might need to install the package first: `conda install conda-forge::imbalanced-learn`
2b. How do the results compare with those where all the data was used?