# Import statements

In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import Perceptron
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import skopt
from hyperopt import hp
import pickle

import warnings
warnings.filterwarnings("ignore")

# Loading the Dataset

In [2]:
# Load Dataset
df = pd.read_csv('final_dataset.csv')
df.shape

(5856, 95)

# Train Test Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(labels=['Label','Image_ID'], axis=1),
    df['Label'],
    test_size=0.2,
    random_state=23)

X_train.shape, X_test.shape

((4684, 93), (1172, 93))

# Standard Scaler

In [4]:
from sklearn.preprocessing import StandardScaler
def scale_data(dataset):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(dataset)
    return scaled_data

In [5]:
scaled_train_set = scale_data(X_train)

scaled_test_set = scale_data(X_test)

# Cross Validation

In [6]:
def cross_validation(model, _X, _y, _cv=5):
    _scoring = ['f1']
    results = cross_validate(estimator=model,
                             X=_X,
                             y=_y,
                             cv=_cv,
                             scoring=_scoring,
                             return_train_score=True)
      
    return {
              "Mean Validation F1 Score": results['test_f1'].mean()
              }

# Logistic Regression

In [7]:
LogisticReg = LogisticRegression(max_iter = 3000000)
LogisticReg.fit(scaled_train_set,y_train)

# Base Model Score

In [8]:
LogisticReg.score(scaled_test_set,y_test)

0.8907849829351536

# Parameters for Logistic Regression Model

In [9]:
LogisticReg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 3000000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

# HyperParameter Tuning for Logistic Regression Model

In [10]:
LRgrid = {"C":[100, 10, 1.0, 0.1, 0.01],
          "class_weight":[None,'balanced'],
          "penalty":['l2','l1',None,'elasticnet'],
          "solver":['newton-cg', 'newton-cholesky','lbfgs','liblinear','sag','saga']
          }
RCV_LRReg = RandomizedSearchCV(estimator = LogisticReg,
                              param_distributions = LRgrid,
                              n_iter = 20,
                              scoring = 'f1',
                              n_jobs = -1,
                              cv = 5)
RCV_LRReg.fit(scaled_train_set,y_train)

In [12]:
RCV_LRReg.best_params_

{'solver': 'liblinear', 'penalty': 'l2', 'class_weight': None, 'C': 10}

In [13]:
RCV_LRReg.score(scaled_test_set,y_test)

0.9264018691588786

In [14]:
LRgrid_ = {'C':[1.0,10.0],
           'penalty':['l1','l2','elasticnet',None],
           'class_weight':[None,'balanced'],
           'solver':['newton-cg','lbfgs','liblinear']
           }
GCV_LRReg = GridSearchCV(estimator = LogisticReg,
                         param_grid = LRgrid_,
                         scoring = 'f1',
                         n_jobs = -1,
                         cv = 5)
GCV_LRReg.fit(scaled_train_set,y_train)

In [15]:
GCV_LRReg.best_params_

{'C': 10.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'newton-cg'}

#  Model score after Hyperparameter tuning

In [16]:
GCV_LRReg.score(scaled_test_set,y_test)

0.9264018691588786

# Decision Trees

In [17]:
DTClr = DecisionTreeClassifier()
DTClr.fit(scaled_train_set,y_train)

In [18]:
DTClr.score(scaled_test_set,y_test)

0.8139931740614335

In [19]:
DTClr.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [20]:
DTClrgrid = {"ccp_alpha":[0.0,1.0,3.5],
             "class_weight":[None,'balanced'],
             "criterion":['gini','entropy','log_loss'],
             "min_impurity_decrease":[0.0,2.1,1.1],
             "max_depth":[3,2,4,5,None],
             "max_features":[None,'auto','sqrt','log2'],
             "min_samples_split":[1,2,4,5],
             "min_samples_leaf":[1,2,4]
            }
RCV_DTClr = RandomizedSearchCV(estimator = DTClr,
                              param_distributions = DTClrgrid,
                              n_iter = 15,
                              scoring = 'f1',
                              n_jobs = -1,
                              cv = 5)
RCV_DTClr.fit(scaled_train_set,y_train)

In [21]:
RCV_DTClr.best_params_

{'min_samples_split': 4,
 'min_samples_leaf': 4,
 'min_impurity_decrease': 0.0,
 'max_features': 'sqrt',
 'max_depth': None,
 'criterion': 'entropy',
 'class_weight': 'balanced',
 'ccp_alpha': 0.0}

In [22]:
RCV_DTClr.score(scaled_test_set,y_test)

0.8854599406528189

In [23]:
DTClrgrid_ = {"ccp_alpha":[0.0,1.0],
             "class_weight":['balanced',None],
             "criterion":['log_loss','entropy'],
             "min_impurity_decrease":[0.0],
             "max_depth":[2,4,None],
             "max_features":['sqrt','log2'],
             "min_samples_split":[2,4],
             "min_samples_leaf":[1,3]
            }
GCV_DTClr = GridSearchCV(estimator = DTClr,
                         param_grid = DTClrgrid_,
                         scoring = 'f1',
                         cv = 5,
                         )
GCV_DTClr.fit(scaled_train_set,y_train)

In [24]:
GCV_DTClr.best_params_

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 4}

In [25]:
GCV_DTClr.score(scaled_test_set,y_test)

0.9123006833712984

# Random Forest

In [35]:
RFClr = RandomForestClassifier()
RFClr.fit(scaled_train_set,y_train)

In [36]:
RFClr.score(scaled_test_set,y_test)

0.8950511945392492

In [37]:
RFClr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [38]:
RFClrgrid = {"n_estimators": [10,100,150,200],
             "min_weight_fraction_leaf":[0.0,0.5,2.1],
             "max_leaf_nodes":[None,2,5,10],
             "criterion":['gini','entropy','log_loss'],
             "max_depth":[None,5,10],
             "max_features":['sqrt','log2',None],
             "min_samples_split":[1,2,4,7],
             "min_samples_leaf":[1,2,3]}
RCV_RFClr = RandomizedSearchCV(estimator = RFClr,
                              param_distributions = RFClrgrid,
                              n_iter = 20,
                              n_jobs = -1,
                              scoring = 'f1',
                              cv = 5)
RCV_RFClr.fit(scaled_train_set,y_train)

In [39]:
RCV_RFClr.best_params_

{'n_estimators': 10,
 'min_weight_fraction_leaf': 0.0,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_leaf_nodes': 10,
 'max_features': None,
 'max_depth': None,
 'criterion': 'log_loss'}

In [40]:
RCV_RFClr.score(scaled_test_set,y_test)

0.9205069124423964

In [41]:
RFClrgrid_ = {"n_estimators": [10,200],
             "min_weight_fraction_leaf":[0.0],
             "max_leaf_nodes":[None],
             "criterion":['gini','entropy'],
             "max_depth":[None,10],
             "max_features":['log2',None],
             "min_samples_split":[1,4],
             "min_samples_leaf":[1,3]}

GCV_RFClr = GridSearchCV(estimator = RFClr,
                         param_grid = RFClrgrid_,
                         cv = 5,
                         n_jobs = -1,
                         scoring = 'f1')
GCV_RFClr.fit(scaled_train_set,y_train)

In [42]:
GCV_RFClr.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200}

In [43]:
GCV_RFClr.score(scaled_test_set,y_test)

0.9256484149855908

# Support Vector Machines (SVM)

In [44]:
SVM = SVC().fit(scaled_train_set,y_train)

In [45]:
SVM.score(scaled_test_set,y_test)

0.8916382252559727

In [46]:
#get parameters
SVM.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [47]:
#SVM
from skopt import Optimizer
from skopt import BayesSearchCV 
from skopt.space import Real, Categorical, Integer
rf_params = {
    'C': Real(0.01,50),
    "kernel":['linear','poly','rbf','sigmoid'],
}
clf = SVC(gamma='scale')
Bayes = BayesSearchCV(clf, rf_params,cv=3,n_iter=20, n_jobs=-1,scoring='accuracy')
Bayes.fit(scaled_train_set,y_train)

In [48]:
Bayes.best_params_

OrderedDict([('C', 4.713956795429039), ('kernel', 'rbf')])

In [49]:
SVM_opt = SVC(kernel = 'rbf', C = 12.140873261631244)
SVM_opt.fit(scaled_train_set,y_train)

In [50]:
SVM_opt.score(scaled_test_set,y_test)

0.8959044368600683

# Perceptron

In [26]:
PClr = Perceptron(tol=1e-3, random_state=0)
PClr.fit(scaled_test_set,y_test)

In [27]:
PClr.score(scaled_test_set,y_test)

0.8626279863481229

In [28]:
PClr.get_params()

{'alpha': 0.0001,
 'class_weight': None,
 'early_stopping': False,
 'eta0': 1.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': None,
 'random_state': 0,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [29]:
PClrgrid = {'max_iter':[1,10,100,1000,10000],
            'penalty':['l2','l1','elasticnet',None],
            'class_weight':['balanced',None],
            'alpha':[0.0001,0.01,0.1],
            'n_jobs':[-1]
           }
RCV_PClr = RandomizedSearchCV(estimator = PClr,
                              param_distributions = PClrgrid,
                              n_iter = 20,
                              scoring = 'f1',
                              n_jobs = -1,
                              cv = 5)
RCV_PClr.fit(scaled_train_set,y_train)

In [30]:
RCV_PClr.best_params_

{'penalty': 'l2',
 'n_jobs': -1,
 'max_iter': 100,
 'class_weight': None,
 'alpha': 0.0001}

In [31]:
RCV_PClr.score(scaled_test_set,y_test)

0.8993448481238832

In [32]:
PClrgrid_ = {'max_iter':[10,100,1000],
            'penalty':['l2','l1',None],
            'class_weight':['balanced',None],
            'alpha':[0.0001,0.01],
            'n_jobs':[-1]
            }
GCV_PClr = GridSearchCV(estimator = PClr,
                         param_grid = PClrgrid_,
                         cv = 5,
                         n_jobs = -1,
                         scoring = 'f1')
GCV_PClr.fit(scaled_train_set,y_train)

In [33]:
GCV_PClr.best_params_

{'alpha': 0.0001,
 'class_weight': None,
 'max_iter': 10,
 'n_jobs': -1,
 'penalty': 'l2'}

In [34]:
GCV_PClr.score(scaled_test_set,y_test)

0.8993448481238832

# Save the Model 

In [51]:
import pickle
# create an iterator object with write permission - model.pkl
with open('base_model', 'wb') as files:
    pickle.dump(GCV_RFClr, files)

In [52]:
model = pickle.load(open('base_model', 'rb'))