In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### Functions for machine learning algorithm's useage

In [2]:
def cross_validation(model, training_features, training_labels,metric, kfolds):
    '''
    A function that applies cross_validation on given machine learning algorithm, data and number of data splits
    
    input:
            model (sklearn machine learning algorithm api): LogisticRegression, XGBoost, etc..
            training_features (DataFrame): (X_train)
            training_labels (DataFrame): (y_train)
            metric (String, sklearn scoring metrics api): 
            kfolds (int): number of splits to perform on the datasets
    
    output:
            scores (list): a list with scoring values for each K split
            average_score (float): the mean of scores
            
    '''
    scores = cross_val_score(estimator=model, X=training_features.values, y=training_labels.values.ravel(), scoring=metric, cv=kfolds, verbose=3, n_jobs=-1)
    average_score = np.mean(scores)
    return scores, average_score

In [11]:
def hyper_tuning(model, training_features, training_labels,metric, hyper_params, cv):
    '''
    A function that applies GridSearch (hyper parameter tuning) for a given machine learning algorithm.
    
    *In order to get the best of this function it is recommended that you'll use it on algorithms with many hyper parameters*
    
    input:
            model (sklearn machine learning algorithm api): LogisticRegression, XGBoost, etc..
            training_features (DataFrame): (X_train)
            training_labels (DataFrame): (y_train)
            metric (String, sklearn scoring metrics api): 
            hyper_params (list): A list which contains a dictionary with it's keys as names of a model's hyper parameters and 
                                 values to test on
            cv (int): cross validation splitting strategy (3-fold, 5-fold). *for faster performance choose 3 fold cv*
    
    output: 
            best_find (machine learning model): returns the given machine learning algorithm with the best hyper parameters
            best_score (float): returns the best score achieved by the model with the best hyper parameters
            
    '''
    
    grid_search = GridSearchCV(model, param_grid=hyper_params, scoring=metric, verbose=3, cv=cv)
    grid_search.fit(training_features.values, training_labels.values.ravel())
    best_find = grid_search.best_estimator_
    best_score = grid_search.best_score_
    return best_find, best_score

In [4]:
def model_performance(trained_model, testing_features, testing_labels, metrics = []):
    '''
    A functions that applies multiple scoring metrics given by the user on the testing set
    
    input:
            trained_model (sklearn machine learning algorithm api): An already trained machine learning algorithm
            testing_features (DataFrame): (X_test) 
            testing_label (DataFrame): (y_test)
            metrics (list of sklearn metric api): a list which contains the desired scoring metrics
    
    output:
            scores (dictionary): a dictionary which contains the scoring method as a key and the score as value
            
    '''
    scores = {}
    num_metrics = 1
    predictions = trained_model.predict(testing_features.values)
    for metric in metrics:
        
        if str(metric) == str(f1_score):
            score = f1_score(testing_labels.values.ravel(), predictions, average='micro')
            scores[num_metrics] = score
            num_metrics += 1
        else:
            score = metric(testing_labels.values.ravel(), predictions)
            scores[num_metrics] = score
            num_metrics += 1
        
    return scores

### Importing the datasets

In [5]:
X_train = pd.read_csv('./Engineered_Datasets/Engineered_X_train.csv')
X_test = pd.read_csv('./Engineered_Datasets/Engineered_X_test.csv')
y_train = pd.read_csv('./Engineered_Datasets/X_train_label.csv')
y_test = pd.read_csv('./Engineered_Datasets/X_test_label.csv')

## Logistic Regression (LR)

In [21]:
lr = LogisticRegression(random_state=42, max_iter=500)

In [22]:
cross_validation(lr, X_train, y_train, 'accuracy', 10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.901, total=   5.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s remaining:    0.0s


[CV] .................................... , score=0.892, total=   4.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.9s remaining:    0.0s


[CV] .................................... , score=0.904, total=   4.9s
[CV]  ................................................................
[CV] .................................... , score=0.896, total=   5.4s
[CV]  ................................................................
[CV] .................................... , score=0.905, total=   5.1s
[CV]  ................................................................
[CV] .................................... , score=0.896, total=   5.2s
[CV]  ................................................................
[CV] .................................... , score=0.901, total=   4.6s
[CV]  ................................................................
[CV] .................................... , score=0.901, total=   4.7s
[CV]  ................................................................
[CV] .................................... , score=0.899, total=   4.8s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   49.4s finished


(array([0.90059524, 0.89196429, 0.90446429, 0.89642857, 0.90505952,
        0.89613095, 0.90059524, 0.90119048, 0.89940476, 0.90327381]),
 0.8999107142857143)

In [23]:
lr_param_grid = [    
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 10),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [500, 750, 1000, 1250]
    }
]

best_lr, best_score = hyper_tuning(lr, X_train, y_train, 'accuracy', lr_param_grid, 3)

Fitting 3 folds for each of 800 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 28.3min
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed: 43.3min
[Parallel(n_jobs=-1)]: Done 1536 tasks      | elapsed: 68.5min
[Parallel(n_jobs=-1)]: Done 2016 tasks      | elapsed: 106.2min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed: 145.1min finished


In [30]:
print('Best logistic regression model is: \n', best_lr)
print('Best Score: ' + str(best_score) + '%')

Best logistic regression model is: 
 LogisticRegression(C=1291.5496650148827, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)
Best Score: 0.9060714285714285%


In [27]:
model_performance(best_lr, X_test, y_test, [confusion_matrix, accuracy_score, f1_score])

{1: array([[787,   0,   3,   0,   3,  15,   5,   0,   3,   0],
        [  0, 896,   4,   0,   1,   3,   0,   2,   3,   0],
        [  5,   7, 742,  20,  17,   7,  15,   8,  23,   2],
        [  2,   4,  23, 791,   1,  52,   4,  12,  28,  20],
        [  1,   1,   7,   2, 766,   2,  14,   4,   6,  36],
        [  4,   1,   7,  31,  16, 588,  20,   4,  23,   8],
        [ 10,   2,   6,   1,  12,   6, 736,   2,   9,   1],
        [  0,   1,  15,   5,  14,   6,   0, 812,   4,  36],
        [  4,   7,   9,  25,   4,  24,   8,   4, 743,   7],
        [  1,   2,   4,  16,  27,  11,   0,  34,   4, 739]], dtype=int64),
 2: 0.9047619047619048,
 3: 0.9047619047619048}

## Random Forest Classifier 

In [28]:
rfc = RandomForestClassifier(random_state=23)

In [29]:
cross_validation(rfc, X_train, y_train, 'accuracy', 10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.944, total=  16.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.2s remaining:    0.0s


[CV] .................................... , score=0.939, total=  16.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   32.6s remaining:    0.0s


[CV] .................................... , score=0.950, total=  16.5s
[CV]  ................................................................
[CV] .................................... , score=0.942, total=  16.5s
[CV]  ................................................................
[CV] .................................... , score=0.945, total=  16.5s
[CV]  ................................................................
[CV] .................................... , score=0.946, total=  16.4s
[CV]  ................................................................
[CV] .................................... , score=0.948, total=  16.5s
[CV]  ................................................................
[CV] .................................... , score=0.947, total=  16.8s
[CV]  ................................................................
[CV] .................................... , score=0.948, total=  16.6s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  2.7min finished


(array([0.94434524, 0.93869048, 0.95029762, 0.94166667, 0.94494048,
        0.94613095, 0.94821429, 0.94672619, 0.94791667, 0.95297619]),
 0.9461904761904762)

In [31]:
rfc_params_grid = [
    {'n_estimators': [100,200,300],
     'min_samples_split': [2, 12, 41, 94],
     'min_samples_leaf' : [1, 15, 67, 82],
     'criterion': ['gini', 'entropy']
    }]

In [32]:
best_rfc, best_score = hyper_tuning(rfc, X_train, y_train, 'accuracy', rfc_params_grid, 3)

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 288 out of 288 | elapsed: 16.4min finished


In [33]:
print('Best logistic regression model is: \n', best_rfc)
print('Best Score: ' + str(best_score) + '%')

Best logistic regression model is: 
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=23, verbose=0,
                       warm_start=False)
Best Score: 0.9440476190476191%


In [34]:
model_performance(best_rfc, X_test, y_test, [confusion_matrix, accuracy_score, f1_score])

{1: array([[799,   0,   1,   2,   1,   2,   9,   0,   1,   1],
        [  0, 896,   5,   2,   1,   2,   1,   1,   1,   0],
        [  4,   4, 797,  11,   8,   2,   2,   6,  11,   1],
        [  5,   1,  10, 857,   1,  18,   2,  11,  19,  13],
        [  2,   0,   3,   0, 789,   0,  10,   4,   4,  27],
        [  2,   0,   4,   9,   5, 659,  12,   1,   8,   2],
        [  8,   0,   1,   0,   0,   6, 768,   0,   2,   0],
        [  1,   6,  14,   0,   8,   0,   0, 841,   3,  20],
        [  1,   2,   4,  21,   5,  18,   5,   3, 770,   6],
        [  1,   3,   3,  20,  13,   5,   0,  16,   5, 772]], dtype=int64),
 2: 0.9461904761904761,
 3: 0.9461904761904761}

## XGBoost Classifier

In [6]:
xgb = XGBClassifier(objective='multi:softprob')

In [7]:
cross_validation(xgb, X_train, y_train, 'accuracy', 10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  1.0min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  1.1min remaining:   27.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.1min finished


(array([0.90684524, 0.89613095, 0.90119048, 0.90357143, 0.90595238,
        0.91130952, 0.91339286, 0.91309524, 0.90565476, 0.9125    ]),
 0.9069642857142857)

In [9]:
xgb_params = [
    {
        'max_depth': [3, 6, 9], 
        'learning_rate': [0.1, 0.05, 1.0], 
        'n_estimators': [100, 150, 200],
        'booster': ['gbtree', 'gblinear'], 
        'min_child_weight': [1, 4, 7]
    }
]

In [None]:
best_xgb, best_score = hyper_tuning(xgb, X_train, y_train, 'accuracy', xgb_params, 3)

Fitting 3 folds for each of 768 candidates, totalling 2304 fits
[CV] booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50, score=0.866, total=  19.3s
[CV] booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.2s remaining:    0.0s


[CV]  booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50, score=0.865, total=  18.6s
[CV] booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   37.8s remaining:    0.0s


[CV]  booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50, score=0.872, total=  18.6s
[CV] booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100 
[CV]  booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, score=0.904, total=  36.8s
[CV] booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100 
[CV]  booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, score=0.907, total=  37.1s
[CV] booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100 
[CV]  booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, score=0.910, total=  36.6s
[CV] booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=150 
[CV]  booster=gbtree, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=150, score=0.920, total=  53.6s
[CV] booster=gbtree, learning_rate=0.1, ma