# Lab 9

## TODO:

- [ ] Linear classifier
  - [ ] Use training data, note MSE
  - [ ] Can we say the data is linearly separable?
  - [ ] Note MSE on test data
  - [ ] Does the model generalise?
- [ ] Logistic regression
  - [ ] Use training data, note MSE
  - [ ] Note MSE on test data
  - [ ] Record and explain findings
- [ ] (optional) Experiment with hyperparameters (maybe use random search)
  - [ ] learning rate
  - [ ] epochs
  - [ ] batch size
- [ ] (optional) Table or graph results

## Imports 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
from scipy.stats import randint
from sklearn.preprocessing import MinMaxScaler

## Data setup

In [2]:
np.random.seed(0)
os.environ["OMP_NUM_THREADS"] = "5"

df3 = pd.read_csv("Data/data3.csv")
df3

Unnamed: 0,failures,higher,studytime,Medu,Fedu,Dalc,age,reason_reputation,school,address,internet,G3
0,0,1,2,4,4,1,18,0,0,0,0,6
1,0,1,2,1,1,1,17,0,0,0,1,6
2,3,1,2,1,1,2,15,0,0,0,1,10
3,0,1,3,4,2,1,15,0,0,0,1,15
4,0,1,2,3,3,1,16,0,0,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...
1039,1,1,3,2,3,1,19,0,1,1,1,10
1040,0,1,2,3,1,1,18,0,1,0,1,16
1041,0,1,2,1,1,1,18,0,1,0,0,9
1042,0,1,1,3,1,3,17,0,1,0,1,10


In [7]:
X = df3.copy()
y = X.pop("G3") # y becomes the G3 column and deletes it from df3_copy
y_binary = [0 if y_i > np.mean(y) else 1 for y_i in y] #making the target class into binary (over/under average score)

# display(X)
# print(y_binary)
X

Unnamed: 0,failures,higher,studytime,Medu,Fedu,Dalc,age,reason_reputation,school,address,internet
0,0,1,2,4,4,1,18,0,0,0,0
1,0,1,2,1,1,1,17,0,0,0,1
2,3,1,2,1,1,2,15,0,0,0,1
3,0,1,3,4,2,1,15,0,0,0,1
4,0,1,2,3,3,1,16,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1039,1,1,3,2,3,1,19,0,1,1,1
1040,0,1,2,3,1,1,18,0,1,0,1
1041,0,1,2,1,1,1,18,0,1,0,0
1042,0,1,1,3,1,3,17,0,1,0,1


In [8]:
#scale
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, stratify=y_binary, random_state=42,)
X_train.shape

(835, 11)

## Linear Classifier

In [10]:
#batch size is not supported in SGDClassifier, can be done using partial_fit
parameters = {
    "alpha": [1.0, 0.1, 0.01, 0.001],
    "max_iter": [100, 1000, 10000],
    "penalty" : ["l1", "l2", "none"]

}
linear_model = SGDClassifier(loss="squared_error")
linear_cv = GridSearchCV(linear_model, parameters, cv=10)
linear_cv.fit(X_train,y_train)

In [11]:
print(f"Train accuracy: {linear_cv.best_estimator_.score(X_train, y_train)}")
print(f"Test accuracy: {linear_cv.best_estimator_.score(X_test, y_test)}")

Train accuracy: 0.6634730538922156
Test accuracy: 0.645933014354067


In [12]:
table = pd.DataFrame(linear_cv.cv_results_["params"])
table["classifier"] = "SGDClassifier"
table["test_method"] = "10CV"
table["accuracy"] = linear_cv.cv_results_["mean_test_score"]
table = table.sort_values(by=["accuracy"], ascending=False)

table

Unnamed: 0,alpha,max_iter,penalty,classifier,test_method,accuracy
34,0.001,10000,l2,SGDClassifier,10CV,0.656311
29,0.001,100,none,SGDClassifier,10CV,0.656182
4,1.0,1000,l2,SGDClassifier,10CV,0.647992
5,1.0,1000,none,SGDClassifier,10CV,0.647892
2,1.0,100,none,SGDClassifier,10CV,0.64422
31,0.001,1000,l2,SGDClassifier,10CV,0.639659
7,1.0,10000,l2,SGDClassifier,10CV,0.639644
33,0.001,10000,l1,SGDClassifier,10CV,0.638224
1,1.0,100,l2,SGDClassifier,10CV,0.637235
32,0.001,1000,none,SGDClassifier,10CV,0.632444


## Logistic Regression

In [13]:
#batch size is not supported in SGDClassifier, can be done using partial_fit
parameters = {
    "max_iter": [100, 1000, 10000],
    "penalty" : ["l1", "l2", "none"],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]}

log_regression = LogisticRegression()
logistic_cv = GridSearchCV(log_regression, parameters, cv=10)
logistic_cv.fit(X_train,y_train)

In [14]:
print(f"Train accuracy: {logistic_cv.best_estimator_.score(X_train, y_train)}")
print(f"Test accuracy: {logistic_cv.best_estimator_.score(X_test, y_test)}")

Train accuracy: 0.6898203592814371
Test accuracy: 0.6985645933014354


In [15]:
table = pd.DataFrame(logistic_cv.cv_results_["params"])
table["classifier"] = "LogisticRegression"
table["test_method"] = "10CV"
table["accuracy"] = logistic_cv.cv_results_["mean_test_score"]
table = table.sort_values(by=["accuracy"], ascending=False)

table

Unnamed: 0,max_iter,penalty,solver,classifier,test_method,accuracy
23,1000,l2,sag,LogisticRegression,10CV,0.674312
38,10000,l2,sag,LogisticRegression,10CV,0.674312
35,10000,l2,newton-cg,LogisticRegression,10CV,0.674312
24,1000,l2,saga,LogisticRegression,10CV,0.674312
36,10000,l2,lbfgs,LogisticRegression,10CV,0.674312
21,1000,l2,lbfgs,LogisticRegression,10CV,0.674312
20,1000,l2,newton-cg,LogisticRegression,10CV,0.674312
39,10000,l2,saga,LogisticRegression,10CV,0.674312
9,100,l2,saga,LogisticRegression,10CV,0.674312
8,100,l2,sag,LogisticRegression,10CV,0.674312
