In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### Helper functions for machine learning algorithm usage

In [2]:
def cross_validation(model, training_features, training_labels, metric, kfolds):
    '''
    Score an estimator with k-fold cross-validation and report per-fold and mean results.

    input:
            model (sklearn-compatible estimator): the algorithm to evaluate (SVC, XGBoost, etc..)
            training_features (DataFrame): feature table (X_train); handed over as its `.values` array
            training_labels (DataFrame): label table (y_train); flattened to 1-D before scoring
            metric (String or callable): forwarded as the `scoring` argument of cross_val_score
            kfolds (int): forwarded as the `cv` argument of cross_val_score

    output:
            scores: the value returned by cross_val_score (one score per fold)
            average_score (float): np.mean of scores

    note:
            Runs with verbose=3 (progress is logged) and n_jobs=-1.
    '''
    feature_matrix = training_features.values
    flat_labels = training_labels.values.ravel()
    fold_scores = cross_val_score(
        estimator=model,
        X=feature_matrix,
        y=flat_labels,
        scoring=metric,
        cv=kfolds,
        verbose=3,
        n_jobs=-1,
    )
    return fold_scores, np.mean(fold_scores)

In [3]:
def hyper_tuning(model, training_features, training_labels,metric, hyper_params, cv, n_jobs=None):
    '''
    A function that applies GridSearch (hyper parameter tuning) for a given machine learning algorithm.

    *In order to get the best of this function it is recommended that you'll use it on algorithms with many hyper parameters*

    input:
            model (sklearn machine learning algorithm api): LogisticRegression, XGBoost, etc..
            training_features (DataFrame): (X_train); passed to the search as its `.values` array
            training_labels (DataFrame): (y_train); flattened to 1-D before fitting
            metric (String, sklearn scoring metrics api): forwarded as the `scoring` argument
            hyper_params (dict or list of dicts): hyper parameter names mapped to the lists of values to try;
                                                  forwarded as `param_grid`
            cv (int): cross validation splitting strategy (3-fold, 5-fold). *for faster performance choose 3 fold cv*
            n_jobs (int or None): forwarded to GridSearchCV. The default (None) keeps GridSearchCV's own
                                  default; pass -1 to use all processors, which matters for large grids

    output:
            best_find (machine learning model): the search's `best_estimator_`
            best_score (float): the search's `best_score_`

    '''
    grid_search = GridSearchCV(model, param_grid=hyper_params, scoring=metric, cv=cv, n_jobs=n_jobs)
    grid_search.fit(training_features.values, training_labels.values.ravel())
    return grid_search.best_estimator_, grid_search.best_score_

In [4]:
def model_performance(trained_model, testing_features, testing_labels, metrics=None):
    '''
    A function that applies multiple scoring metrics given by the user on the testing set

    input:
            trained_model (sklearn machine learning algorithm api): An already trained machine learning algorithm
            testing_features (DataFrame): (X_test); passed to `predict` as its `.values` array
            testing_labels (DataFrame): (y_test); flattened to 1-D before scoring
            metrics (list of sklearn metric api): the desired scoring functions, each called as
                                                  metric(y_true, y_pred). None (the default) means no metrics.

    output:
            scores (dictionary): maps the 1-based position of each metric in `metrics` to its score,
                                 e.g. scores[1] is the result of metrics[0]

    '''
    if metrics is None:
        metrics = []

    # Predict and flatten the labels once; every metric reuses the same two arrays.
    predictions = trained_model.predict(testing_features.values)
    true_labels = testing_labels.values.ravel()

    scores = {}
    for position, metric in enumerate(metrics, start=1):
        if metric is f1_score:
            # f1_score is always computed micro-averaged here.
            scores[position] = metric(true_labels, predictions, average='micro')
        else:
            scores[position] = metric(true_labels, predictions)

    return scores

### Importing the datasets

In [5]:
# Read the training features, training labels and test features (paths are relative to the working directory).
T_X, T_Y, TEST = (
    pd.read_csv(csv_name)
    for csv_name in ('New_train.csv', 'New_train_label.csv', 'New_test.csv')
)

#### Train/validation split

In [6]:
# Hold out 10% of the labelled rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    T_X,
    T_Y,
    random_state=1,
    test_size=0.1,
)

## Support Vector Machine (SVM)

In [8]:
# Baseline linear-kernel SVM used for the cross-validation run and as the grid-search starting estimator.
svm = SVC(
    kernel='linear',
    C=3.5,
    random_state=42,
)

In [9]:
# 10-fold cross-validated accuracy of the baseline SVM; displays (per-fold scores, mean).
cross_validation(
    model=svm,
    training_features=X_train,
    training_labels=y_train,
    metric='accuracy',
    kfolds=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  2.1min remaining:  5.0min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  2.2min remaining:   57.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.4min finished


(array([0.93624339, 0.93518519, 0.92910053, 0.92883598, 0.92592593,
        0.92910053, 0.92936508, 0.92460317, 0.92804233, 0.92433862]),
 0.9290740740740742)

In [11]:
# One grid: every combination of the C, kernel and tol values below is a candidate.
svm_params = [
    dict(
        C=[1.0, 2.5, 3.5, 9.0],
        kernel=['linear', 'rbf'],
        tol=[0.5, 0.1, 0.01, 0.001],
    )
]

In [12]:
# Grid-search the SVM with 3-fold cross-validation, scored by accuracy.
best_svm, best_score = hyper_tuning(svm, X_train, y_train, 'accuracy', svm_params, 3)
# The tuned estimator is an SVC, so the label names it as an SVM.
print('Best SVM model is: \n', best_svm)
# best_score is a 0-1 fraction; the '%' format spec scales it by 100 so the number matches the percent sign.
print('Best Score: {:.2%}'.format(best_score))

Best logistic regression model is: 
 SVC(C=9.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.5,
    verbose=False)
Best Score: 0.9818253968253968%


In [13]:
# Score the tuned SVM on the held-out split; results are keyed 1..3 in the order the metrics are listed.
validation_scores = model_performance(
    trained_model=best_svm,
    testing_features=X_val,
    testing_labels=y_val,
    metrics=[confusion_matrix, accuracy_score, f1_score],
)

In [16]:
# Keys 1, 2, 3 are the confusion matrix, accuracy and F1 score, in the order they were requested.
print(
    "CM:\n",
    validation_scores[1],
    "\nAccuracy: {}\nF1_score: {}".format(validation_scores[2], validation_scores[3]),
)

CM:
 [[419   0   1   0   0   0   0   0   0   0]
 [  0 488   0   0   0   0   0   0   0   0]
 [  0   1 411   2   2   0   0   3   2   0]
 [  0   0   3 411   0   4   0   2   2   1]
 [  0   1   2   0 413   0   0   3   0   7]
 [  1   1   0   6   0 381   1   0   0   1]
 [  1   0   0   0   2   1 399   0   0   0]
 [  0   0   4   1   1   0   0 395   0   2]
 [  0   2   0   1   3   1   0   0 403   1]
 [  1   0   0   0   5   3   0   1   0 404]] 
Accuracy: 0.981904761904762
F1_score: 0.981904761904762


### Creating CSV file for Kaggle submission

In [21]:
# Predict a label for every row of the test set with the tuned SVM (TEST is passed as a plain array).
preds = best_svm.predict(TEST.values)

In [28]:
# 1-based ids, one per prediction, so the frame always has exactly as many rows as `preds`.
sub = pd.DataFrame(np.arange(1, len(preds) + 1))

In [29]:
# Attach the predictions and give the id column (named 0 so far) its submission name, then preview.
sub = sub.assign(Label=preds).rename(columns={0: 'ImageId'})
sub.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [30]:
# Write the submission file; index=False keeps the row index out of the CSV.
sub.to_csv('sub.csv', index=False)

### Saving the model

In [32]:
filename = 'SVM_98%_Kaggle.sav'
# The context manager closes the file handle as soon as the dump finishes, even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(best_svm, model_file)

## Random Forest Classifier 

In [33]:
# Baseline random forest used for the cross-validation run and as the grid-search starting estimator.
rfc = RandomForestClassifier(
    random_state=3,
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=20,
)

In [35]:
# 10-fold cross-validated accuracy of the baseline random forest; displays (per-fold scores, mean).
cross_validation(
    model=rfc,
    training_features=X_train,
    training_labels=y_train,
    metric='accuracy',
    kfolds=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  2.5min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  2.5min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.1min finished


(array([0.91455026, 0.91031746, 0.90291005, 0.90238095, 0.90714286,
        0.91190476, 0.90714286, 0.90582011, 0.90608466, 0.90502646]),
 0.9073280423280424)

In [36]:
# One grid: 4 x 4 x 4 x 5 = 320 candidate combinations of the values below.
rfc_params_grid = [
    dict(
        n_estimators=[300, 350, 450, 600],
        min_samples_split=[2, 14, 20, 56],
        min_samples_leaf=[4, 16, 25, 30],
        max_depth=[None, 4, 10, 13, 20],
    )
]

In [None]:
# Grid-search the random forest with 3-fold CV, scored by accuracy.
# Note: this rebinds best_score, which previously held the SVM result.
best_rfc, best_score = hyper_tuning(
    model=rfc,
    training_features=X_train,
    training_labels=y_train,
    metric='accuracy',
    hyper_params=rfc_params_grid,
    cv=3,
)