In [None]:
import warnings

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

# Load the Kaggle training and test CSV files from the local data directory.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Target is the "label" column; every other training column is a feature.
X = train.drop(columns="label")
y = train["label"]
# The test frame is used unchanged as the feature matrix for predictions.
X_test = test

# scaler makes the models worse, do not use
# scaler = StandardScaler()

In [None]:
# Bar chart of how many training rows carry each target value.
vc = y.value_counts()
vc.plot.bar()

In [None]:
# function from homeworks
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarise each returned metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Forwarded unchanged to ``cross_validate`` (for example ``cv`` or
        ``return_train_score``).

    Returns
    -------
    pandas Series
        One formatted string per key returned by ``cross_validate``, giving
        the mean and the standard deviation (pandas' default ``std``) across
        folds, both to three decimal places. The Series is indexed by those
        keys.
    """
    # Build the per-fold table once and reuse it for both statistics.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the two statistics by position with zip rather than integer-indexing
    # the Series: they are labelled by metric name, and pandas deprecates
    # treating integer keys as positions on a non-integer index.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
# Baseline k-nearest-neighbours model with k=2, scored with 2-fold
# cross-validation (training scores are reported next to the validation scores).
knn = KNeighborsClassifier(n_neighbors=2)
cv_score = mean_std_cross_val_scores(
    knn,
    X,
    y,
    return_train_score=True,
    cv=2,
)

In [None]:
# Show the formatted "mean (+/- std)" cross-validation summary stored in cv_score.
print(cv_score)

In [None]:
# Manual sweep over n_neighbors = 2..5 for the knn model, each scored with
# 3-fold cross-validation, to see which value performs best.
# 731 seconds to run this
params = list(range(2, 6))
scores = {}
for param in params:
    knn_adjust = KNeighborsClassifier(n_neighbors=param)
    scores[param] = mean_std_cross_val_scores(
        knn_adjust, X, y, return_train_score=True, cv=3
    )

In [None]:
# Render the collected cross-validation summaries as a table, one column per key of `scores`.
display(pd.DataFrame(scores))

In [None]:
# Fit knn with n_neighbors=3 on the full training set, then predict the test set.
# NOTE: this rebinds `test` from the raw test frame to the predictions;
# X_test still refers to the original frame.
knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)
test = knn.predict(X_test)

In [None]:
# Build the submission table: a single "Label" column with a 1-based index
# named "ImageId", then write it to CSV.
df = pd.DataFrame(test, columns=["Label"])
df.index = df.index + 1
df = df.rename_axis("ImageId")
out = df.to_csv('data/sumbmission_knn.csv')

In [None]:
# Fit knn with its default settings on the full training set, then predict
# the test set (`test` is rebound to the predictions).
knn = KNeighborsClassifier().fit(X, y)
test = knn.predict(X_test)

In [None]:
# Shape the predictions for submission: "Label" column, 1-based "ImageId" index.
df = pd.DataFrame(test, columns=["Label"])
df.index = df.index + 1
df = df.rename_axis("ImageId")
out = df.to_csv('data/sumbmission_knn_nohy.csv')

In [None]:
# Decision tree with default hyperparameters (no max_depth limit set here).
# A fixed random_state (the same value the SVC and random forest cells in this
# notebook use) makes the fitted tree, and therefore the submission, repeatable
# across reruns.
dt = DecisionTreeClassifier(random_state=123)
dt.fit(X, y)
# `test` is rebound to the predictions; X_test still holds the test features.
test = dt.predict(X_test)

In [None]:
# Turn the predictions into the submission layout ("Label" column, index
# starting at 1 and named "ImageId") and save them as CSV.
df = pd.DataFrame(test, columns=["Label"])
df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name="ImageId")
out = df.to_csv('data/sumbmission_dt.csv')

In [None]:
# Decision tree with max_depth fixed at 14.
# A fixed random_state (the same value the SVC and random forest cells in this
# notebook use) makes the fitted tree, and therefore the submission, repeatable
# across reruns.
dt = DecisionTreeClassifier(max_depth=14, random_state=123)
dt.fit(X, y)
# `test` is rebound to the predictions; X_test still holds the test features.
test = dt.predict(X_test)

In [None]:
# Turn the predictions into the submission layout ("Label" column, index
# starting at 1 and named "ImageId") and save them as CSV.
df = pd.DataFrame(test, columns=["Label"])
df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name="ImageId")
out = df.to_csv('data/sumbmission_dt_14.csv')

In [None]:
# Logistic regression using the lbfgs solver with an iteration cap of 10000.
LogReg = LogisticRegression(max_iter=10000, solver='lbfgs')
LogReg.fit(X, y)

# Predict the test set and wrap the predictions in a Series named "Label".
Result = pd.Series(LogReg.predict(X_test), name="Label")

# Submission layout: "Label" column with a 1-based "ImageId" index.
df = pd.DataFrame(Result, columns=["Label"])
df.index = df.index + 1
df = df.rename_axis("ImageId")
out = df.to_csv('data/LRSubmission.csv')

In [None]:
# Logistic Regression model with solver = lbfgs and max_iter = 200, trained on
# standardized features.
LogReg = LogisticRegression(solver='lbfgs', max_iter=200)
# The scaler is constructed inline because no module-level `scaler` exists
# (its definition in the data-loading cell is commented out), so referring to
# that name would raise NameError.
lrpipe = make_pipeline(StandardScaler(), LogReg)
# Fit the pipeline, not the bare estimator, so the scaling step is actually
# applied. The pipeline holds the same LogReg object, so LogReg ends up fitted
# as well (on the scaled features).
lrpipe.fit(X, y)

Grid searches over `max_depth` for the decision tree classifier. The search is split across several cells because running it as a single search raised an out-of-memory error. Together the cells are meant to cover depths from 1 to 30.

In [None]:
# Decision tree grid search, chunk 1: max_depth = 1..10 inclusive.
# range() excludes its stop value, so the stop must be 11 for depth 10 to be tried.
param_grid = {
    "max_depth": range(1, 11)
}
# Starting value only: GridSearchCV replaces max_depth with each grid candidate.
best_ps = {"max_depth": 5}
# Fixed seed (same value as the SVC and random forest cells) so reruns agree.
pipe_grid = DecisionTreeClassifier(random_state=123, **best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=5, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

In [None]:
# Decision tree grid search, chunk 2: max_depth = 11..14 inclusive
# (range() excludes its stop value of 15).
param_grid = {
    "max_depth": range(11, 15)
}
# Starting value only: GridSearchCV replaces max_depth with each grid candidate.
best_ps = {"max_depth": 5}
# Fixed seed (same value as the SVC and random forest cells) so reruns agree.
pipe_grid = DecisionTreeClassifier(random_state=123, **best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=5, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

In [None]:
# Decision tree grid search, chunk 3: max_depth = 15..20 inclusive.
# range() excludes its stop value, so the stop must be 21 for depth 20 to be tried.
param_grid = {
    "max_depth": range(15, 21)
}
# Starting value only: GridSearchCV replaces max_depth with each grid candidate.
best_ps = {"max_depth": 5}
# Fixed seed (same value as the SVC and random forest cells) so reruns agree.
pipe_grid = DecisionTreeClassifier(random_state=123, **best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=5, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

In [None]:
# Decision tree grid search, chunk 4: max_depth = 21..25 inclusive.
# range() excludes its stop value, so the stop must be 26 for depth 25 to be tried.
param_grid = {
    "max_depth": range(21, 26)
}
# Starting value only: GridSearchCV replaces max_depth with each grid candidate.
best_ps = {"max_depth": 5}
# Fixed seed (same value as the SVC and random forest cells) so reruns agree.
pipe_grid = DecisionTreeClassifier(random_state=123, **best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=5, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

In [None]:
# Decision tree grid search, chunk 5: max_depth = 26..30 inclusive.
# range() excludes its stop value, so the stop must be 31 for depth 30 to be tried.
param_grid = {
    "max_depth": range(26, 31)
}
# Starting value only: GridSearchCV replaces max_depth with each grid candidate.
best_ps = {"max_depth": 5}
# Fixed seed (same value as the SVC and random forest cells) so reruns agree.
pipe_grid = DecisionTreeClassifier(random_state=123, **best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=5, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

Grid searches over `n_neighbors` for the knn classifier.
The search is split across several cells because running it as a single search raised an out-of-memory error.
Together the cells are meant to cover values from 1 to 20.

In [None]:
# knn grid search, chunk 1: n_neighbors = 1..5 inclusive.
# range() excludes its stop value, so the stop must be 6 for k=5 to be tried.
param_grid = {
    "n_neighbors": range(1, 6)
}
# Starting value only: GridSearchCV replaces n_neighbors with each grid candidate.
best_ps = {"n_neighbors": 5}
# The estimator class is KNeighborsClassifier (imported from sklearn.neighbors);
# any other spelling is an undefined name and raises NameError.
pipe_grid = KNeighborsClassifier(**best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=3, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

In [None]:
# knn grid search, chunk 2: n_neighbors = 6..10 inclusive.
# range() excludes its stop value, so the stop must be 11 for k=10 to be tried.
param_grid = {
    "n_neighbors": range(6, 11)
}
# Starting value only: GridSearchCV replaces n_neighbors with each grid candidate.
best_ps = {"n_neighbors": 5}
pipe_grid = KNeighborsClassifier(**best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=3, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

In [None]:
# knn grid search, chunk 3: n_neighbors = 11..15 inclusive.
# range() excludes its stop value, so the stop must be 16 for k=15 to be tried.
param_grid = {
    "n_neighbors": range(11, 16)
}
# Starting value only: GridSearchCV replaces n_neighbors with each grid candidate.
best_ps = {"n_neighbors": 5}
pipe_grid = KNeighborsClassifier(**best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=3, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

In [None]:
# knn grid search, chunk 4: n_neighbors = 16..20 inclusive.
# range() excludes its stop value, so the stop must be 21 for k=20 to be tried.
param_grid = {
    "n_neighbors": range(16, 21)
}
# Starting value only: GridSearchCV replaces n_neighbors with each grid candidate.
best_ps = {"n_neighbors": 5}
pipe_grid = KNeighborsClassifier(**best_ps)
grid_search = GridSearchCV(pipe_grid, param_grid, cv=3, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"Best score: {grid_search.best_score_} Best Params: {grid_search.best_params_}")

Neural Network Classifier with default hyperparameters

In [None]:
# Baseline of MLPClassifier: default hyperparameters, 5-fold cross-validation.
# MLP training is stochastic, so the seed is fixed (same value as the SVC and
# random forest cells) to make the reported scores repeatable.
MLP = MLPClassifier(random_state=123)
scores = mean_std_cross_val_scores(MLP, X, y, cv=5, return_train_score=True)
print(scores)

In [None]:
# Train the MLP on all training rows and predict the test set.
MLP.fit(X, y)
MLP_output = MLP.predict(X_test)

# Submission layout: "Label" column with a 1-based index named "ImageId".
df = pd.DataFrame(MLP_output, columns=["Label"])
df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name="ImageId")
df.to_csv('data/sumbmission_MLP_Unoptimized.csv')

Grid Search Optimization of the MLPClassifier.
Split into two sections to save time.


In [None]:
# Grid search over the MLP's regularisation strength, solver, activation and
# learning-rate schedule, with features standardised inside the pipeline.
# The seed is fixed so candidates are compared on equal footing and reruns agree.
mlp = MLPClassifier(max_iter=1000000, random_state=123)
mlp_pipe = make_pipeline(StandardScaler(), mlp)

# Settings searched for every solver.
shared_params = {
    'mlpclassifier__alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75, 0.9],
    'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
}

# MLPClassifier only uses `learning_rate` when solver='sgd', so the schedule is
# searched for sgd alone. Crossing it with adam/lbfgs would refit identical
# models three times each (288 candidates instead of 160).
param_dist = [
    {
        **shared_params,
        'mlpclassifier__solver': ['sgd'],
        'mlpclassifier__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
    {
        **shared_params,
        'mlpclassifier__solver': ['adam', 'lbfgs'],
    },
]

grid = GridSearchCV(mlp_pipe, param_dist, n_jobs=-1, cv=2)
grid.fit(X, y)

print(grid.best_score_)
print(grid.best_params_)

In [None]:
# Grid search over hidden-layer layouts for an MLP with relu activation, the
# adam solver and alpha=0.05. The seed is fixed so reruns agree.
mlp = MLPClassifier(max_iter=1000000, activation='relu', solver='adam', alpha=0.05,
                    random_state=123)

# Written as tuples throughout; `(100)` without a trailing comma is just the int 100.
one_layer = [(100,), (300,), (500,)]
two_layer = [(100, 100), (300, 300), (500, 500)]
three_layer = [(100, 100, 100), (300, 300, 300), (500, 500, 500)]

# The estimator is a bare MLPClassifier rather than a pipeline, so the parameter
# is addressed by its own name; a "mlpclassifier__" step prefix is only valid
# when the search wraps a pipeline containing that step.
param_dist = {
    'hidden_layer_sizes': one_layer + two_layer + three_layer,
}

# GridSearchCV evaluates every candidate and accepts no n_iter argument
# (that belongs to RandomizedSearchCV), so none is passed.
new_grid = GridSearchCV(mlp, param_dist, n_jobs=-1, cv=2)
new_grid.fit(X, y)

print(new_grid.best_score_)
print(new_grid.best_params_)

MLPClassifier with Best Hyperparameters

In [None]:
# MLP with the chosen hyperparameters. The seed is fixed (same value as the SVC
# and random forest cells) so the submission is repeatable.
optimized_mlp = MLPClassifier(max_iter=1000000, activation='relu', solver='adam', learning_rate='adaptive',
                        hidden_layer_sizes=(500,500), alpha=0.5, random_state=123)

optimized_mlp.fit(X, y)
# Predict with the model fitted above; `optimized_mlp` is the only name this
# cell defines for it.
optimized_mlp_out = optimized_mlp.predict(X_test)

# Submission layout: "Label" column with a 1-based index named "ImageId".
df = pd.DataFrame(optimized_mlp_out, columns=["Label"])
df.index += 1
df.index.name = "ImageId"
df.to_csv('data/sumbmission_MLP_optimized.csv')

In [None]:
# Support vector classifier with an RBF kernel, C=1.0, gamma='scale' and
# random_state=123, fitted on the full training set and used to label the test set.
svc_classifier = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=123).fit(X, y)
y_pred = svc_classifier.predict(X_test)

In [None]:
# Kaggle submission layout: "Label" column with a 1-based "ImageId" index.
df = pd.DataFrame(y_pred, columns=["Label"])
df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name="ImageId")
out = df.to_csv('data/SVC.csv')

In [None]:
# Random forest with 1000 trees and random_state=123, fitted on the full
# training set.
RF = RandomForestClassifier(random_state=123, n_estimators=1000)
RF.fit(X, y)
# Test-set predictions wrapped in a Series named "Label".
Result = pd.Series(RF.predict(X_test), name="Label")

In [None]:
# Kaggle submission layout: "Label" column with a 1-based "ImageId" index.
df = pd.DataFrame(Result, columns=["Label"])
df.index = df.index + 1
df = df.rename_axis("ImageId")
out = df.to_csv('data/RF.csv')