In [2]:
import scipy as sp

from sklearn.datasets import load_digits as load

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# This lines are used to remove warning from scikit-learn when QDA is used, to prevent noisy verbosity in the notebook: should not be used in practice!
import warnings
warnings.filterwarnings("ignore")

# Load data
# X, y = load(return_X_y=True) # load_iris(return_X_y=True)
from sklearn.datasets import fetch_covtype
X, y = fetch_covtype(return_X_y=True)

print(X.shape)
print(X.max())

for c in sp.unique(y):
    print("Number of samples for class {}: {}".format(c, len(sp.where(y==c)[0])))

(581012, 54)
7173.0
Number of samples for class 1: 211840
Number of samples for class 2: 283301
Number of samples for class 3: 35754
Number of samples for class 4: 2747
Number of samples for class 5: 9493
Number of samples for class 6: 17367
Number of samples for class 7: 20510


In [4]:
# Standardize data
sc = MinMaxScaler()
X = sc.fit_transform(X) # Scale data between 0 and 1

# The stratification ensures that the proportion of each class from the orginal data is preserved in the train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.01, random_state=0, stratify=y)
print(X_train.shape)
for c in sp.unique(y):
    print("Number of samples for class {}: {}".format(c, len(sp.where(y_train==c)[0])))

(5810, 54)
Number of samples for class 1: 2118
Number of samples for class 2: 2833
Number of samples for class 3: 358
Number of samples for class 4: 27
Number of samples for class 5: 95
Number of samples for class 6: 174
Number of samples for class 7: 205


In [6]:
# Define the classifier list
classifiers = [["SVM", SVC()], 
               ["KNN", KNeighborsClassifier()],
               ["QDA", QuadraticDiscriminantAnalysis()], 
               ["LDA", LinearDiscriminantAnalysis(solver="lsqr")],
               ["RF",  RandomForestClassifier()]]

# Hyperparameters range
gamma = 2.0**sp.arange(-4,2) # Scale of the RBF kernel
C = 10.0**sp.arange(-1,4) # Penality of the optimization problem

# Define the dictionnary of parameters to optimize
param_grids = [dict(kernel=['rbf'], gamma=2.0**sp.arange(-4, 2), C=10.0**sp.arange(0,3)), # Scale and regularization
               dict(n_neighbors = sp.arange(1, 40, 5)), # number of neighbors for KNN
               dict(reg_param = sp.linspace(0, 1, 50)), # Regularization parameter for QDA
               dict(shrinkage = sp.linspace(0, 0.5, 30)), # Regularization parameter for LDA
               dict(n_estimators=sp.arange(50, 150, 10)) # Number of trees
               ]

for classifier, param_grid in zip(classifiers, param_grids):
    grid = GridSearchCV(classifier[1], param_grid=param_grid, cv= 3, n_jobs=-1)
    grid.fit(X_train, y_train)
    
    clf = grid.best_estimator_ 
    clf.fit(X_train,y_train)
    
    y_pred = clf.predict(X_test)
    print("Classification accuracy for {}: {:.2f} (best parameters {})".format(classifier[0],accuracy_score(y_test,y_pred),grid.best_params_))

Classification accuracy for SVM: 0.76 (best parameters {'C': 100.0, 'gamma': 0.25, 'kernel': 'rbf'})
Classification accuracy for KNN: 0.75 (best parameters {'n_neighbors': 1})
Classification accuracy for QDA: 0.64 (best parameters {'reg_param': 0.16326530612244897})
Classification accuracy for LDA: 0.67 (best parameters {'shrinkage': 0.017241379310344827})
Classification accuracy for RF: 0.79 (best parameters {'n_estimators': 120})
