# In [None]:
# -*- coding: utf-8 -*-
"""HW2_part2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1GSxKIWEzBpY3zzcA2C5rfsccdW7tERHo
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Default font sizes applied to every figure drawn below
plt.rcParams.update({
    'axes.labelsize': 14,   # x- and y-axis labels
    'xtick.labelsize': 12,  # x tick labels
    'ytick.labelsize': 12,  # y tick labels
})

# Sklearn imports
from sklearn import linear_model
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score,precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
#Scipy imports
from scipy import stats

#import itertools
import itertools

# Suppress warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category is silenced for the rest of the run -- including any warnings the
# libraries used below may raise. Consider narrowing it once the notebook is stable.
import warnings
warnings.simplefilter("ignore")

"""# 1. Read data from this URL http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"""

# Download the WDBC file; header=None tells pandas the file has no header
# row, so the columns get integer labels until they are renamed below.
data = pd.read_csv(
    filepath_or_buffer='http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)
data  # notebook-style preview of the raw frame

"""# 2. Defining and mapping features to data

#### Define the column names based on the data description
"""

# Identifier and target columns
column_names = ['ID', 'Diagnosis']

# The ten measurement names; each one is used three times below, prefixed
# with Mean_, SE_ (standard error) and Worst_.
_measurements = ['Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
                 'Compactness', 'Concavity', 'Concave_Points', 'Symmetry',
                 'Fractal_Dimension']

# Mean real-valued features
mean_features = ['Mean_' + measurement for measurement in _measurements]

# Standard error features
se_features = ['SE_' + measurement for measurement in _measurements]

# Worst features
worst_features = ['Worst_' + measurement for measurement in _measurements]

# Full header: ID, Diagnosis, then the 30 feature columns in order
all_columns = [*column_names, *mean_features, *se_features, *worst_features]

# Replace the integer column labels with the descriptive names
data.columns = all_columns

data.head()

"""# 3. Exploratory Data Analysis (EDA)"""

# Plot heatmap correlation matrix on a graph Reference: https://medium.com/@szabo.bibor/how-to-create-a-seaborn-correlation-heatmap-in-python-834c0686b88e
plt.figure(figsize=(20,15))

# Correlate numeric feature columns only:
#  - 'Diagnosis' still holds the strings 'M'/'B' at this point, and
#    DataFrame.corr() raises on non-numeric columns in pandas >= 2.0;
#  - 'ID' is an identifier, so its correlation with the features is meaningless.
numeric_features = data.drop(columns=['ID']).select_dtypes(include='number')

heatmap = sns.heatmap(numeric_features.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')

heatmap.set_title('Correlation Matrix Heatmap', fontdict={'fontsize': 18}, pad=12)

"""### Check class imbalace"""

# Bar chart of the share of observations in each diagnosis class
fig = plt.figure(figsize=(5,3))
ax = fig.add_subplot(111)
# normalize=True yields relative frequencies, so the bars match the
# "proportion" wording used in the title and y-axis label below.
data['Diagnosis'].value_counts(normalize=True).plot(kind='bar',
                                                   ax=ax,
                                                   color=['green','salmon'])

# set title and labels
ax.set_title('Proportion of observations of the response variable',
             fontsize=10, loc='left')
ax.set_xlabel('Diagnosis',
              fontsize=7)
ax.set_ylabel('proportion of observations',
              fontsize=7)

"""### Summary statistics"""

# Summary statistics for the whole frame. On a frame with mixed dtypes,
# describe() reports only the numeric columns.
# The column slice skips the first column (the ID); the row slice skips the
# first row of the result (the instance count, identical for every feature: 569).
data.iloc[:, 1:].describe().iloc[1:]

"""# 4. Encoding target variable"""

# Encode the target: malignant ('M') -> 1, benign ('B') -> 0.
# Series.map() turns every value missing from the dict into NaN, so the
# result is validated before it overwrites the column. This also catches an
# accidental second run of this cell on already-encoded data.
encoded_diagnosis = data['Diagnosis'].map({'M': 1, 'B': 0})
if encoded_diagnosis.isna().any():
    raise ValueError("Diagnosis contains values other than 'M'/'B' "
                     "(has the column already been encoded?)")
data['Diagnosis'] = encoded_diagnosis

# Human-readable names for the encoded labels
class_names = {0: 'Benign', 1: 'Malignant'}

data.tail()

"""# 5. Model building"""

# Extract X and y: every column except the identifier and the target is a feature
X = data.drop(columns=['Diagnosis', 'ID'])
y = data['Diagnosis']
X.head()

# 80/20 split, stratified on y so both subsets keep the class ratio;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

"""#### Scaling data for KNN ( Below codes were adopted from class code example) link: https://colab.research.google.com/drive/1Tk3iWD1MgSIUrhbvrEobmcA5PNG7Njkw?usp=sharing#scrollTo=Fe_Mfq1QsP3g"""

# Create the scaler and fit it on the training split only, so the centering
# and scaling statistics come from the training data.
sc = StandardScaler().fit(X_train)

# Standardize both splits with the fitted scaler; the test split is
# transformed with the statistics computed from the training split.
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

X_train_std

# Fit models (k-NN and logistic regression)

# k-NN with 3 neighbours, trained on the standardized features
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_std, y_train)

# Logistic regression, trained on the unscaled features
# NOTE(review): unlike k-NN this model sees the raw X_train (and is later
# evaluated on the raw X_test) -- confirm this is intentional.
log_reg = linear_model.LogisticRegression(max_iter=500, solver='lbfgs').fit(X_train, y_train)

"""### The below function was adopted from class code"""

# Function that prints and plots the confusion matrix.
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heatmap.

    The matrix is drawn on the current matplotlib figure; call
    ``plt.figure()`` first to start a new one.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled as the true classes and columns
        as the predicted classes.
    classes : sequence or dict
        Tick labels, one per row/column of ``cm``, in matrix order. A dict
        mapping label -> display name is also accepted; its names are used,
        ordered by sorted key.
    normalize : bool, default False
        If True, divide every row by its sum so each cell shows the share of
        that row; rows that sum to zero are shown as 0.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    # Iterating a dict yields its keys, so a mapping such as
    # {0: 'Benign', 1: 'Malignant'} would otherwise label the ticks 0 and 1.
    if isinstance(classes, dict):
        classes = [classes[key] for key in sorted(classes)]

    cm = np.asarray(cm)
    if normalize:
        # Divide each row by its total; rows with no observations stay at 0
        # instead of becoming NaN through a division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)   # draw the matrix as an image
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))                 # one tick per class
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for relative values, plain integers for raw counts
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so it stays readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels set above are included
    plt.tight_layout()

# k-NN and Logistic predictions
# Each model is scored on the same representation it was trained on:
# standardized features for k-NN, unscaled features for logistic regression.
k_nn_pred = knn.predict(X_test_std)
log_reg_pred = log_reg.predict(X_test)

# k-NN: confusion matrix, accuracy and per-class precision/recall/F1
knn_cnf_matrix = confusion_matrix(y_test, k_nn_pred)
knn_accuracy = accuracy_score(y_test, k_nn_pred)
knn_report = classification_report(y_test, k_nn_pred)

# Logistic regression: confusion matrix, accuracy and per-class precision/recall/F1
log_reg_cnf_matrix = confusion_matrix(y_test, log_reg_pred)
log_reg_accuracy = accuracy_score(y_test, log_reg_pred)
log_reg_report = classification_report(y_test, log_reg_pred)

# Print k-NN performance
print("K-NN Evaluation:")
print("Confusion Matrix:\n", knn_cnf_matrix)
print("Accuracy:", knn_accuracy)
print("Classification Report:\n", knn_report)

# Print logistic regression performance (printed once)
print("\nLogistic Regression Evaluation:")
print("Confusion Matrix:\n", log_reg_cnf_matrix)
print("Accuracy:", log_reg_accuracy)
print("Classification Report:\n", log_reg_report)

"""# 6. Model Evaluation"""

# def evaluate_model(model, model_type, X_test, X_test_std, y_test):
#     if model_type == 'knn':
#         y_pred = model.predict(X_test_std)
#     if model_type =='log_reg':
#         y_pred = model.predict(X_test)

#     # Compute confusion matrix to evaluate the accuracy of a classification
#     cnf_matrix = confusion_matrix(y_test, y_pred)
#     accuracy = accuracy_score(y_test, y_pred)
#     report = classification_report(y_test, y_pred)

#     return cnf_matrix, accuracy, report

# # Assuming knn and log_reg are your trained models, and X_test and X_test_std are your test datasets
# # Evaluate K-NN model
# knn_cm, knn_accuracy, knn_report = evaluate_model(model=knn, model_type='knn', X_test=X_test, X_test_std=X_test_std, y_test=y_test)

# # Evaluate Logistic model
# log_reg_cm, log_reg_accuracy, log_reg_report = evaluate_model(model=log_reg, model_type='log_reg', X_test=X_test, X_test_std=X_test_std, y_test=y_test)

# print("K-NN Evaluation:")
# print("Confusion Matrix:\n", knn_cm)
# print("Accuracy:", knn_accuracy)
# print("Classification Report:\n", knn_report)

# print("\nLogistic Regression Evaluation:")
# print("Confusion Matrix:\n", log_reg_cm)
# print("Accuracy:", log_reg_accuracy)
# print("Classification Report:\n", log_reg_report)

# Tick labels in label order (0, 1). Passing the class_names dict itself
# would be iterated by key and label the axes 0/1 instead of the names.
class_labels = [class_names[key] for key in sorted(class_names)]

# Plot non-normalized k-NN confusion matrix
plt.figure()
plot_confusion_matrix(knn_cnf_matrix,
                      classes=class_labels,
                      title='k-NN Confusion matrix, without normalization')

# Plot non-normalized logistic regression confusion matrix (drawn once)
plt.figure()
plot_confusion_matrix(log_reg_cnf_matrix,
                      classes=class_labels,
                      title='Logistic Confusion matrix, without normalization')
