In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [2]:
# Load the feature tables and their matching label files from ./Engineered_csv
# (paths are relative to the notebook's working directory).
X_train = pd.read_csv('./Engineered_csv/Engineered_X_train.csv')
X_test = pd.read_csv('./Engineered_csv/Engineered_X_test.csv')
# The label files are read as DataFrames too; the helper functions below flatten
# them with .values.ravel() before handing them to scikit-learn.
y_train = pd.read_csv('./Engineered_csv/X_train_label.csv')
y_test = pd.read_csv('./Engineered_csv/X_test_label.csv')

In [3]:
sets = [X_train, X_test, y_train, y_test]

# Sanity check: print the (rows, columns) shape of every loaded table.
for frame in sets:
    print(frame.shape)

(37800, 9)
(4200, 9)
(37800, 1)
(4200, 1)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier

### Helper functions for applying the machine learning algorithms

In [5]:
def cross_validation(model, training_features, training_labels, metric, kfolds):
    '''
    Run k-fold cross validation for an estimator and summarise the per-fold scores.

    input:
            model (sklearn machine learning algorithm api): estimator handed to cross_val_score
            training_features (DataFrame): (X_train), forwarded as its underlying .values array
            training_labels (DataFrame): (y_train), flattened to 1-D with .values.ravel()
            metric (String, sklearn scoring metrics api): forwarded as the `scoring` argument, e.g. 'accuracy'
            kfolds (int): forwarded as the `cv` argument (number of splits)

    output:
            scores: the value returned by cross_val_score (one score per split)
            average_score (float): np.mean of scores

    '''
    feature_matrix = training_features.values
    label_vector = training_labels.values.ravel()

    fold_scores = cross_val_score(
        estimator=model,
        X=feature_matrix,
        y=label_vector,
        scoring=metric,
        cv=kfolds,
    )
    return fold_scores, np.mean(fold_scores)

In [6]:
def hyper_tuning(model, training_features, training_labels, metric, hyper_params, cv):
    '''
    Grid-search the hyper parameters of a machine learning algorithm and report the winner.

    *Most useful for algorithms that expose many hyper parameters.*

    input:
            model (sklearn machine learning algorithm api): estimator to tune (LogisticRegression, XGBoost, etc..)
            training_features (DataFrame): (X_train), forwarded as its underlying .values array
            training_labels (DataFrame): (y_train), flattened to 1-D with .values.ravel()
            metric (String, sklearn scoring metrics api): forwarded as GridSearchCV's `scoring` argument
            hyper_params (list): a list holding a dictionary that maps hyper parameter names to the
                                 candidate values to try (forwarded as `param_grid`)
            cv (int): cross validation splitting strategy (3-fold, 5-fold); fewer folds run faster

    output:
            best_find (machine learning model): the search's best_estimator_
            best_score (float): the search's best_score_

    The search is created with n_jobs=-1.
    '''
    searcher = GridSearchCV(
        model,
        param_grid=hyper_params,
        scoring=metric,
        cv=cv,
        n_jobs=-1,
    )
    searcher.fit(training_features.values, training_labels.values.ravel())
    return searcher.best_estimator_, searcher.best_score_

In [19]:
def model_performance(trained_model, testing_features, testing_labels, metrics=None, f1_average='micro'):
    '''
    A function that applies multiple scoring metrics given by the user on the testing set

    input:
            trained_model (sklearn machine learning algorithm api): An already trained machine learning algorithm
                                                                    (only its predict() method is used)
            testing_features (DataFrame): (X_test), forwarded to predict() as its underlying .values array
            testing_labels (DataFrame): (y_test), flattened to 1-D with .values.ravel() before scoring
            metrics (list of sklearn metric api): the desired scoring metrics, each called as
                                                  metric(true_labels, predictions). None (default) means no metrics.
            f1_average (String): value passed as `average` when the metric is f1_score. Defaults to 'micro'.
                                 Note: for single-label multiclass targets micro-averaged F1 equals accuracy
                                 (see the scikit-learn f1_score documentation); pass 'macro' or 'weighted'
                                 to get a figure that differs from accuracy.

    output:
            scores (dictionary): maps the 1-based position of each metric in `metrics` to its score,
                                 e.g. metrics=[confusion_matrix, accuracy_score] -> {1: <matrix>, 2: <accuracy>}

    '''
    # None instead of a mutable [] default, so one list object is never shared between calls.
    if metrics is None:
        metrics = []

    predictions = trained_model.predict(testing_features.values)
    true_labels = testing_labels.values.ravel()

    scores = {}
    for position, metric in enumerate(metrics, start=1):
        # f1_score needs an explicit `average`; its default targets binary problems.
        # Compare by identity rather than by the string repr of the function object.
        if metric is f1_score:
            scores[position] = metric(true_labels, predictions, average=f1_average)
        else:
            scores[position] = metric(true_labels, predictions)

    return scores

### Logistic Regression

In [8]:
# Logistic-regression classifier used throughout this section (seed fixed via random_state).
lr = LogisticRegression(
    random_state=1,
    max_iter=500,
)

In [9]:
# 10-fold cross validation of the untuned logistic regression, scored by accuracy.
cross_validation(
    model=lr,
    training_features=X_train,
    training_labels=y_train,
    metric='accuracy',
    kfolds=10,
)

(array([0.8989418 , 0.89550265, 0.89338624, 0.9031746 , 0.90291005,
        0.89761905, 0.89867725, 0.9021164 , 0.8978836 , 0.9005291 ]),
 0.899074074074074)

In [10]:
# Solver / tolerance combinations explored for logistic regression (3-fold CV, accuracy).
lr_params = [
    {
        'solver': ('newton-cg', 'lbfgs', 'sag'),
        'tol': (0.1, 0.01, 0.0005, 0.0001),
    }
]
lr_best_estimator, lr_best_score = hyper_tuning(
    model=lr,
    training_features=X_train,
    training_labels=y_train,
    metric='accuracy',
    hyper_params=lr_params,
    cv=3,
)

In [11]:
# Report the estimator picked by the grid search and the best score the search returned.
print('best estimator: ' , lr_best_estimator)
print('the score of the best estimator: ', lr_best_score)

best estimator:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='newton-cg', tol=0.1, verbose=0,
                   warm_start=False)
the score of the best estimator:  0.8888359788359788


In [20]:
# Score the tuned logistic regression on the held-out test set.
lr_scores = model_performance(
    trained_model=lr_best_estimator,
    testing_features=X_test,
    testing_labels=y_test,
    metrics=[confusion_matrix, accuracy_score, f1_score],
)

In [26]:
# model_performance keys its results by the 1-based position of each metric, so with
# [confusion_matrix, accuracy_score, f1_score]: 1 -> confusion matrix, 2 -> accuracy, 3 -> F1.
# NOTE(review): F1 is computed with average='micro' inside model_performance, which is
# presumably why it prints the same value as accuracy — confirm whether that is intended.
print('Confusion Matrix: \n', lr_scores[1])
print('Accuracy: ', lr_scores[2])
print('F1-Score: ', lr_scores[3])

Confusion Matrix: 
 [[382   0   6   0   0   0   2   0  18   0]
 [  0 460   0   0   7   0   1   3   0   0]
 [  7   0 374   0   0   0   9   0  20  10]
 [  1   3   5 430   0  33   7   3   4  20]
 [  0   2   0   1 377   0   5   9   0   3]
 [  0   5   5  26   1 265  22   0   0  15]
 [  2   2  16   3   0  18 360   1   0   0]
 [  0  12   0   1  17   8   1 395   0   4]
 [  2   0  22   0   0   1   1   0 371   6]
 [  1   1   4  15   3  11   2  20   0 359]]
Accuracy:  0.8983333333333333
F1-Score:  0.8983333333333333


### KNN Classifier

In [27]:
# KNN classifier with n_neighbors=10; no other constructor arguments are set here.
kn = KNeighborsClassifier(n_neighbors=10)

In [28]:
# applying cross validation with 10 fold split
cross_validation(
    model=kn,
    training_features=X_train,
    training_labels=y_train,
    metric='accuracy',
    kfolds=10,
)

(array([0.87380952, 0.87671958, 0.86957672, 0.87354497, 0.87724868,
        0.87936508, 0.87962963, 0.87460317, 0.87513228, 0.87592593]),
 0.8755555555555556)

In [29]:
# Hyper parameter tuning for the KNN classifier (3-fold CV, accuracy).
# The grid is named kn_params so it no longer overwrites the logistic-regression
# grid stored in lr_params.
# NOTE(review): scikit-learn documents leaf_size as a speed/memory setting of the
# tree index rather than something that changes predictions — consider searching
# n_neighbors instead; confirm before changing the grid.
kn_params = [{'weights':('uniform', 'distance'), 'leaf_size':(100,300,500)}]
kn_best_estimator, kn_best_score = hyper_tuning(kn, X_train, y_train, 'accuracy', kn_params, 3)

In [30]:
# Report the KNN estimator picked by the grid search and the best score the search returned.
print('best estimator: ' , kn_best_estimator)
print('the score of the best estimator: ', kn_best_score)

best estimator:  KNeighborsClassifier(algorithm='auto', leaf_size=100, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='distance')
the score of the best estimator:  0.8719576719576719


In [31]:
# Score the tuned KNN classifier on the held-out test set.
kn_scores = model_performance(
    trained_model=kn_best_estimator,
    testing_features=X_test,
    testing_labels=y_test,
    metrics=[confusion_matrix, accuracy_score, f1_score],
)

In [32]:
# model_performance keys its results by the 1-based position of each metric, so with
# [confusion_matrix, accuracy_score, f1_score]: 1 -> confusion matrix, 2 -> accuracy, 3 -> F1.
# NOTE(review): F1 is computed with average='micro' inside model_performance, which is
# presumably why it prints the same value as accuracy — confirm whether that is intended.
print('Confusion Matrix: \n', kn_scores[1])
print('Accuracy: ', kn_scores[2])
print('F1-Score: ', kn_scores[3])

Confusion Matrix: 
 [[381   0   4   1   1   7   7   0   7   0]
 [  0 465   2   1   0   0   2   1   0   0]
 [  3   0 382   6   3   2  10   6   8   0]
 [  2   6   8 367   2  23   1   5  80  12]
 [  0   0   3   0 310   3   5   3   4  69]
 [  7   0   1  13   7 290   8   2   9   2]
 [ 10   1   3   0   1   4 380   0   1   2]
 [  0   6   3   1   4   3   0 396   6  19]
 [  2   1   8  20   3   7   3   3 354   2]
 [  3   1   2  10  44   4   4  14   6 328]]
Accuracy:  0.8697619047619047
F1-Score:  0.8697619047619047


### AdaBoost Classifier