In [78]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Week 4: Colab Experiment**

# I. Introduction
In this exercise, we load the Breast cancer wisconsin dataset for classification.

# II. Methods
We train 3 models:
1. logistic regression
2. support vector machine
3. decision tree.

...

In [79]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from collections import Counter
from datetime import datetime
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import zero_one_loss


In [80]:
# Define the dependent and independent variables.
data = load_breast_cancer()
Y = data.target
X = data.data


In [81]:
# Create CV folds
num_folds = 5
kf = KFold(n_splits=num_folds, random_state=0, shuffle=True)
kfold_indices = {}

for i, (train_index, test_index) in enumerate(kf.split(X)):
  kfold_indices[f"fold_{i}"] = {'train': train_index, 'test': test_index}

In [82]:
# Train models and apply them to the test set
Error_rate = {'logreg': [], 'svm': [], 'decision_tree': []}

#create scaler
scaler = StandardScaler()

#create models
logistic = LogisticRegression()
svm = SVC()
decisionTree = DecisionTreeClassifier()

#hyperparameters:
#for logistic regression
#by default logistic uses l2 penalty. cannot use l1 penalty since lbfgs only supports l2 penalty
logistic_param = {
    'C': list(np.arange(0.1, 1, 0.05)) #regularization strength where lower is stronger
}

#for SVM
svm_param = {
    'kernel': ['poly', 'linear', 'rbf', 'sigmoid'], #testing various kernel
    'C': list(np.arange(0.1, 1, 0.05))#regularization strength where lower is stronger
}

#for decision tree
decisionTree_param = {
    'criterion': ['gini', 'entropy'], #some ways to determine which split is the best
    'max_depth': [None, 5, 10, 20, 30, 40, 50, 100], #max depth of tree, default is none (basically no limit)
    # 'min_samples_split': [2,3,4,5]

}

for fold_id in range(num_folds):
  print("fold id ", fold_id)
  X_train = X[kfold_indices[f"fold_{fold_id}"]['train']]
  Y_train = Y[kfold_indices[f"fold_{fold_id}"]['train']]
  X_test = X[kfold_indices[f"fold_{fold_id}"]['test']]
  Y_test = Y[kfold_indices[f"fold_{fold_id}"]['test']]

  # TODO : use standardScaler to normalize the data and run the models

  # just fit using the training data. we dont do fit for the entire data
  # because we dont want the testing data to influence the scaling process
  scaler.fit(X_train)
  X_train_scaled = scaler.transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  # GridSearchCV(estimator, param_grid, scoring='accuracy')

  #grid search to find best hyperparameters, cross validation = 5 fold

  #Logistic regression
  grid_logistic = GridSearchCV(logistic, logistic_param, scoring='accuracy', cv = 5)
  grid_logistic.fit(X_train_scaled, Y_train)
  updatedLogistic = grid_logistic.best_estimator_

  print("logistic hyperparameter:")
  print(grid_logistic.best_params_)

  Error_rate['logreg'].append(zero_one_loss(
      Y_test, updatedLogistic.predict(X_test_scaled)))

  # SVM
  grid_svm = GridSearchCV(svm, svm_param, scoring='accuracy', cv = 5)
  grid_svm.fit(X_train_scaled, Y_train)
  updatedSvm = grid_svm.best_estimator_

  print("svm hyperparameter:")
  print(grid_svm.best_params_)

  Error_rate['svm'].append(zero_one_loss(
      Y_test, updatedSvm.predict(X_test_scaled)))

  # Decision tree
  grid_decisionTree = GridSearchCV(decisionTree, decisionTree_param, scoring='accuracy', cv = 5)
  grid_decisionTree.fit(X_train_scaled, Y_train)
  updatedDecisionTree = grid_decisionTree.best_estimator_

  print("deicision tree hyperparameter:")
  print(grid_decisionTree.best_params_)

  print()
  Error_rate['decision_tree'].append(zero_one_loss(
      Y_test, updatedDecisionTree.predict(X_test_scaled)))



fold id  0
logistic hyperparameter:
{'C': 0.30000000000000004}
svm hyperparameter:
{'C': 0.20000000000000004, 'kernel': 'linear'}
deicision tree hyperparameter:
{'criterion': 'gini', 'max_depth': 5}

fold id  1
logistic hyperparameter:
{'C': 0.20000000000000004}
svm hyperparameter:
{'C': 0.1, 'kernel': 'linear'}
deicision tree hyperparameter:
{'criterion': 'entropy', 'max_depth': 10}

fold id  2
logistic hyperparameter:
{'C': 0.25000000000000006}
svm hyperparameter:
{'C': 0.1, 'kernel': 'linear'}
deicision tree hyperparameter:
{'criterion': 'entropy', 'max_depth': None}

fold id  3
logistic hyperparameter:
{'C': 0.30000000000000004}
svm hyperparameter:
{'C': 0.5000000000000001, 'kernel': 'linear'}
deicision tree hyperparameter:
{'criterion': 'entropy', 'max_depth': 5}

fold id  4
logistic hyperparameter:
{'C': 0.3500000000000001}
svm hyperparameter:
{'C': 0.30000000000000004, 'kernel': 'linear'}
deicision tree hyperparameter:
{'criterion': 'entropy', 'max_depth': 5}



## III. Results

Show the results.

In [83]:
# TODO
print(f"The error rate over 5 folds in CV:")
print(f"Logistic Regression: mean = {round(np.mean(Error_rate['logreg']),4)}, std = {round(np.std(Error_rate['logreg']),4)}")
print(f"SVM: mean = {round(np.mean(Error_rate['svm']),4)}, std = {round(np.std(Error_rate['svm']),4)}")
print(f"Decision Tree: mean = {round(np.mean(Error_rate['decision_tree']),4)}, std = {round(np.std(Error_rate['decision_tree']),4)}")


The error rate over 5 folds in CV:
Logistic Regression: mean = 0.0246, std = 0.0151
SVM: mean = 0.0281, std = 0.017
Decision Tree: mean = 0.0686, std = 0.0205


# IV. Conclusion and Discussion

- Logistic Regression has the lowest mean and std. This mean that logistic is able to produce the most consistent and is considered the best out of the three models. Behind it is SVM with a similar performance, then the last one is decision tree.
- Seeing how the best kernel for SVM in this dataset is linear, and how the best model is logistic regression, it seems that the dataset has linear characteristic.


## Extra things I discover while doing this homework
- Adding more options or range in the grid parameters don't always mean less error.
<br> Example: I tried adding min_samples_split hyperparameter and min_samples_leaf hyperparameters but the mean error for the decision tree increases instead of decreasing. Maybe adding more hyperparameters led the tree to overfit instead.
- Not all the regularisation can be used by default. Example, l1 penalty does not work for default logistic regression. In order to use l1 penalty for logistic, need to change the solver to other type.
