In [43]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import json

In [44]:
df =  pd.read_csv('train.csv', header = 0)
df = df._get_numeric_data()
numeric_headers = list(df.columns.values)
numeric_headers.pop()
X = df[numeric_headers]
X= X.drop('label', axis=1)
X = X.to_numpy()
y = df['label']
y=y.apply(lambda row: int(row)) 
y=y.to_numpy()

scaler = StandardScaler()
X=scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [45]:
names = [#"Nearest_Neighbors",
        #"SVM",
        "MLP",
         "Adaboost",
        "Random_Forest"]

In [46]:
classifiers = [
    #KNeighborsClassifier(),
    #SVC(),
    MLPClassifier(),
    AdaBoostClassifier(),
    RandomForestClassifier()]

In [51]:
paramsGrid={}
paramsGrid["SVM"]={
    'C':[0.0001,0.1,1,10,100],
    'gamma':[0.001,0.1,1]
}
paramsGrid["Nearest_Neighbors"]={
    'n_neighbors':[1,5,10,50,100,500,1000]
}
paramsGrid["Adaboost"]={
    'n_estimators': [50,100,150,200,250,300]
}
paramsGrid["MLP"]={
    'learning_rate_init':[0.001,0.1,0.01],
    'early_stopping':[True],
    'hidden_layer_sizes':[100,200,500]
}
paramsGrid["Random_Forest"]={
    'min_samples_leaf': [2,10,30,50],
    'min_samples_split': [2,10,30,50],
    'n_estimators': [50,100,150,200,250,300]
}

In [52]:
def evaluate(model, test_features, test_labels):
    ypred = model.predict(test_features)
    errors = abs(ypred - test_labels)
    accuracy=np.sum([pred == true for pred, true in zip(ypred, test_labels)])/len(test_labels)
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
from google.colab import files

In [53]:
results={}
for name, clf in zip(names, classifiers):
  print(f"For classifier {name}")	
  grid = GridSearchCV(estimator = clf, param_grid = paramsGrid[name], 
                            cv = 3, n_jobs = -1, verbose = 2)
  grid.fit(X_train, y_train)
  print("\tBest parameters set found on development set:")
  print()
  print(f"\t{grid.best_params_}")
  print()
  best_grid = grid.best_estimator_
  grid_accuracy = evaluate(best_grid, X_test, y_test)
  print("\tGrid scores on development set:")
  print()
  print(f"\t\t{grid_accuracy}")
  print()
  results[name]={
      "best_params":grid.best_params_,
      "grid_accuracy":grid_accuracy
  }
with open(f"Results_tuning.json", "w+") as f:
    json.dump(results,f)
files.download(f"Results_tuning.json")


For classifier MLP
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  3.3min finished


	Best parameters set found on development set:

	{'early_stopping': True, 'hidden_layer_sizes': 200, 'learning_rate_init': 0.001}

Model Performance
Average Error: 0.0688 degrees.
Accuracy = 0.93%.
	Grid scores on development set:

		0.931212482189989

For classifier Adaboost
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  2.7min finished


	Best parameters set found on development set:

	{'n_estimators': 50}

Model Performance
Average Error: 0.0704 degrees.
Accuracy = 0.93%.
	Grid scores on development set:

		0.929647536963072

For classifier Random_Forest
Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 33.9min
[Parallel(n_jobs=-1)]: Done 288 out of 288 | elapsed: 58.9min finished


	Best parameters set found on development set:

	{'min_samples_leaf': 50, 'min_samples_split': 2, 'n_estimators': 100}

Model Performance
Average Error: 0.0684 degrees.
Accuracy = 0.93%.
	Grid scores on development set:

		0.9316329152360264



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>