In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as skl
import matplotlib
import matplotlib.pyplot as plt

from scipy.stats import uniform

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.model_selection import (RandomizedSearchCV, GridSearchCV)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import GaussianNB

In [4]:
mod_dat = pd.read_csv('Standardized_ALz_Data.csv')

In [5]:
mod_dat.describe()

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF
count,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0,373.0
mean,0.490617,-1.089388e-16,-4.161104e-16,-3.62534e-16,-2.047811e-16,-2.047811e-16,8.125761000000001e-17,-3.928939e-16,8.890714e-16
std,0.500583,1.001343,1.001343,1.001343,1.001343,1.001343,1.001343,1.001343,1.001343
min,0.0,-0.8667028,-2.229597,-2.993181,-1.29714,-1.29714,-2.172383,-2.307345,-2.316501
25%,0.0,-0.8667028,-0.7880533,-0.9043942,-0.3944662,-0.3944662,-0.7454601,-0.7973089,-0.6994664
50%,0.0,-0.8667028,-0.001756695,0.1399991,-0.3944662,-0.3944662,-0.1030607,-0.01532591,-0.01059503
75%,1.0,1.153798,0.6534905,0.4881302,0.508208,0.508208,0.6189281,0.7127272,0.7072815
max,1.0,1.153798,2.750282,2.925048,2.313556,2.313556,2.932703,2.896887,2.839157


In [8]:
train_01 = mod_dat.copy()
feature_df = train_01.drop(["Group"],axis=1)
feature_df2 = train_01["Group"].values
x = np.asarray(feature_df)
y = np.asarray(feature_df2)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 19)

In [7]:
print("{0:0.2f}% Train".format((len(x_train)/len(train_01.index)) * 100))
print("{0:0.2f}% Test".format((len(x_test)/len(train_01.index)) * 100))

79.89% Train
20.11% Test


In [8]:
print("Original Demented : {0} ({1:0.2f}%)".format(len(train_01.loc[train_01['Group'] == 1]), 100 * (len(train_01.loc[train_01['Group'] == 1]) / len(train_01))))
print("Original Nondemented : {0} ({1:0.2f}%)".format(len(train_01.loc[train_01['Group'] == 0]), 100 * (len(train_01.loc[train_01['Group'] == 0]) / len(train_01))))
print("")
print("Training Demented : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), 100 * (len(y_train[y_train[:] == 1]) / len(y_train))))
print("Training Nondemented : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), 100 * (len(y_train[y_train[:] == 0]) / len(y_train))))
print("")
print("Test Demented : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), 100 * (len(y_test[y_test[:] == 1]) / len(y_test))))
print("Test Nondemented : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), 100 * (len(y_test[y_test[:] == 0]) / len(y_test))))

Original Demented : 183 (49.06%)
Original Nondemented : 190 (50.94%)

Training Demented : 142 (47.65%)
Training Nondemented : 156 (52.35%)

Test Demented : 41 (54.67%)
Test Nondemented : 34 (45.33%)


In [8]:
#Tunning All hyperparameters using the difffrent methods
#Logistic Regression
penalty = ['l1', 'l2'] # l1 is Lasso, l2 is Ridge
solver= ['liblinear']
samplec = np.linspace(0.00002,1,100)

params = {
    'penalty':penalty,        
    'solver':solver,
    'C': samplec
}

linear_reg = LogisticRegression()
linear_gs = GridSearchCV(linear_reg, params, cv=3, verbose=1)
linear_gs.fit(x_train, y_train)

print ("Best Params", linear_gs.best_params_)
print ("Best Score", linear_gs.best_score_)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
Best Params {'C': 0.4747579797979798, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score 0.6778523489932886


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:    5.1s finished


In [9]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter distribution using uniform distribution
C = uniform(loc=0, scale=4)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create randomized search 5-fold cross validation and 100 iteration
rscv = RandomizedSearchCV(linear_reg, hyperparameters, random_state=19, n_iter=1000, cv=5, verbose=2, n_jobs=-1)

# Fit randomized search
best_model = rscv.fit(x_train, y_train)

# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   56.8s finished


Best Penalty: l2
Best C: 0.5525267499052782


In [10]:
best_model.best_score_

0.6644295302013423

In [11]:
# Number of trees in random forest
n_estimators = range(10,250)
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = range(1,40)
# Minimum number of samples required to split a node
min_samples_split = range(3,60)

# Create the random grid
parametro_rf = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}

model_forest = RandomForestClassifier(n_jobs=-1)
forest_random = RandomizedSearchCV(estimator = model_forest, param_distributions = parametro_rf, n_iter = 1000, cv = 10, 
                               verbose=2, random_state=19, n_jobs = -1, scoring='roc_auc')
forest_random.fit(x_train, y_train)

print ("Best Params", forest_random.best_params_)
print ("Best Score", forest_random.best_score_)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 30.9min
[Parallel(n_jobs=-1)]: Done 3277 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 46.1min
[Parallel(n_jobs=-1)]: Done 4897 tasks      | elapsed: 56.2min
[Parallel(n_jobs=-1)]: Done 5828 tasks      | elapsed: 66.2min
[Parallel(n_jobs=-1)]: Done 6841 tasks      | elapsed: 78.4min
[Parallel(n_jobs=-1)]: Done 7934 tasks      | elapsed: 91.1min
[Parallel(n_jobs=-1)]: Done 9109 tasks      | elapsed: 104.4min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed: 

Best Params {'n_estimators': 128, 'min_samples_split': 3, 'max_features': 'sqrt', 'max_depth': 31}
Best Score 0.8850850910834134


In [12]:
# Number of trees in Extra Trees
n_estimators = range(50,280)
# Maximum number of levels in tree
max_depth =  range(1,40)
# Minimum number of samples required to split a node
min_samples_leaf = [3,4,5,6,7,8,9,10,15,20,30,40,50,60]
# Create the random grid
parametro_Et = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf}
model_et = ExtraTreesClassifier(n_jobs=-1)
et_random = RandomizedSearchCV(estimator = model_et, param_distributions = parametro_rf, n_iter = 1000, cv = 10, 
                               verbose=2, random_state=19, n_jobs = -1, scoring='roc_auc')
et_random.fit(x_train, y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 23.2min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 32.6min
[Parallel(n_jobs=-1)]: Done 3277 tasks      | elapsed: 41.2min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 50.9min
[Parallel(n_jobs=-1)]: Done 4897 tasks      | elapsed: 61.0min
[Parallel(n_jobs=-1)]: Done 5828 tasks      | elapsed: 70.7min
[Parallel(n_jobs=-1)]: Done 6841 tasks      | elapsed: 81.2min
[Parallel(n_jobs=-1)]: Done 7934 tasks      | elapsed: 92.0min
[Parallel(n_jobs=-1)]: Done 9109 tasks      | elapsed: 104.6min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed: 

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=1000, n_jobs=-1,
          param_distributions={'n_estimators': range(10, 250), 'max_features': ['auto', 'sqrt'], 'max_depth': range(1, 40), 'min_samples_split': range(3, 60)},
          pre_dispatch='2*n_jobs', random_state=19, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [13]:
print ("Best Params", et_random.best_params_)
print ("Best Score", et_random.best_score_)

Best Params {'n_estimators': 241, 'min_samples_split': 3, 'max_features': 'auto', 'max_depth': 23}
Best Score 0.9240961569191434


In [9]:
#Optimizing Adabosst Classifier

n_estimators = range(10,200)
learning_rate = [0.0001, 0.001, 0.01, 0.1,0.2,0.3,0.4
,0.5,0.6,0.7,0.8,0.9,0.95,1]

# Create the random grid
parametros_ada = {'n_estimators': n_estimators,'learning_rate': learning_rate}

model_ada = AdaBoostClassifier()
ada_random = RandomizedSearchCV(estimator = model_ada, param_distributions = parametros_ada, n_iter = 1000,cv = 10, 
                                verbose=2, random_state=19, n_jobs = -1, scoring='roc_auc')

ada_random.fit(x_train, y_train)
print ("Best Params", ada_random.best_params_)
print ("Best Score", ada_random.best_score_)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 30.3min
[Parallel(n_jobs=-1)]: Done 3277 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 43.3min
[Parallel(n_jobs=-1)]: Done 4897 tasks      | elapsed: 50.6min
[Parallel(n_jobs=-1)]: Done 5828 tasks      | elapsed: 58.7min
[Parallel(n_jobs=-1)]: Done 6841 tasks      | elapsed: 67.9min
[Parallel(n_jobs=-1)]: Done 7934 tasks      | elapsed: 77.4min
[Parallel(n_jobs=-1)]: Done 9109 tasks      | elapsed: 88.0min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed: 9

Best Params {'n_estimators': 71, 'learning_rate': 1}
Best Score 0.7682146852029402


In [7]:
#Svm Classifer Optimization of parameters
C = [0.001, 0.10, 0.1, 10, 25, 50,65,70,80,90, 100, 1000]
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
gamma =[1e-2, 1e-3, 1e-4, 1e-5,1e-6,1]

# Create the grid search
parametros_svm = {'C': C,
                  'gamma': gamma,
                  'kernel': kernel}

model_svm = SVC()

svm_random = GridSearchCV(model_svm, parametros_svm, cv = 10, verbose=2,
                          n_jobs = -1, scoring='roc_auc' )
svm_random.fit(x_train, y_train)
print ("Best Params", svm_random.best_params_)
print ("Best Score", svm_random.best_score_)

Fitting 10 folds for each of 288 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   55.5s
[Parallel(n_jobs=-1)]: Done 513 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1212 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1917 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 161.2min finished


Best Params {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Best Score 0.9164789069990411


In [10]:
#4° Gradient Boosting

parametros_gb = {"loss":["deviance"],"learning_rate": [0.01, 0.025, 0.005,0.5, 0.075,0.1, 0.15, 0.2,0.3,0.8,0.9],
                 "min_samples_split": [0.01, 0.025, 0.005,0.4,0.5,0.075, 0.1, 0.15, 0.2,0.3,0.8,0.9],
                 "min_samples_leaf": [1,2,3,5,8,10,15,20,40,50,55,60,65,70,80,85,90,100],
                 "max_depth":[3,5,8,10,15,20,25,30,40,50],"max_features":["log2","sqrt"],
                 "criterion": ["friedman_mse", "mae"],"subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
                 "n_estimators":range(1,100)}

model_gb= GradientBoostingClassifier()
gb_random = RandomizedSearchCV(estimator = model_gb,param_distributions = parametros_gb, n_iter = 100, cv= 10,
                               verbose=2, random_state=19, n_jobs = -1, scoring='roc_auc')
gb_random.fit(x_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 997 out of 1000 | elapsed:  3.4min remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  3.4min finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'loss': ['deviance'], 'learning_rate': [0.01, 0.025, 0.005, 0.5, 0.075, 0.1, 0.15, 0.2, 0.3, 0.8, 0.9], 'min_samples_split': [0.01, 0.025, 0.005, 0.4, 0.5, 0.075, 0.1, 0.15, 0.2, 0.3, 0.8, 0.9], 'min_samples_leaf': [1, 2, 3, 5, 8, 10, 15, 20, 40, 50, 55, 60, 65, 70, 80, 85, 90, ...n_mse', 'mae'], 'subsample': [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0], 'n_estimators': ra

In [11]:
print ("Best Params", gb_random.best_params_)
print ("Best Score", gb_random.best_score_)

Best Params {'subsample': 0.8, 'n_estimators': 51, 'min_samples_split': 0.005, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 8, 'loss': 'deviance', 'learning_rate': 0.025, 'criterion': 'friedman_mse'}
Best Score 0.88592401725791


In [9]:
log_reg_param = {'C': 0.4747579797979798, 'penalty': 'l2', 'solver': 'liblinear'}
rand_for_pram ={'n_estimators': 128, 'min_samples_split': 3, 'max_features': 'sqrt', 'max_depth': 31}
extr_tre_pram ={'n_estimators': 241, 'min_samples_split': 3, 'max_features': 'auto', 'max_depth': 23}
ada_bost_pram = {'n_estimators': 71, 'learning_rate': 1}
svm_pram ={'C': 10, 'gamma': 1, 'kernel': 'rbf'}
gb_pram = {'subsample': 0.8, 'n_estimators': 51, 'min_samples_split': 0.005, 'min_samples_leaf': 1, 'max_features': 'sqrt', 
           'max_depth': 8, 'loss': 'deviance', 'learning_rate': 0.025, 'criterion': 'friedman_mse'}
knn_pram {'n_neighbors': 1, 'metric': 'cityblock'}


SyntaxError: invalid syntax (<ipython-input-9-67e958254fbe>, line 8)

In [6]:
rf_model = RandomForestClassifier(n_estimators= 128, min_samples_split= 3, max_features= 'sqrt', max_depth = 31)
et_model = ExtraTreesClassifier(n_estimators= 241, min_samples_split= 3, max_features= 'auto', max_depth = 23)
av_model = AdaBoostClassifier(n_estimators = 71, learning_rate = 0.8)
svm_model = SVC(C= 10, gamma = 1, kernel = 'rbf')
gb_model = GradientBoostingClassifier(subsample = 0.8, n_estimators = 51, min_samples_split = 0.005, min_samples_leaf = 1, 
                                      max_features = 'sqrt', max_depth = 8, loss = 'deviance', learning_rate = 0.025, 
                                      criterion = 'friedman_mse')
gnb_model = GaussianNB()

In [11]:
rf_model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=31, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=128,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
rf_y_pred = rf_model.predict(x_test)

In [13]:
rf_y_pred

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0], dtype=int64)

In [14]:
print(confusion_matrix(y_test,rf_y_pred))
w = rf_model.score(x_train, y_train)
v = rf_model.score(x_test, y_test)
print(f"Train accuracy: {w:0.2%}")
print(f"Test accuracy: {v:0.2%}")

[[29  5]
 [ 4 37]]
Train accuracy: 100.00%
Test accuracy: 88.00%


In [9]:
rf_model.fit(x_train, y_train)
et_model.fit(x_train, y_train)
av_model.fit(x_train, y_train)
svm_model.fit(x_train, y_train)
gb_model.fit(x_train, y_train)
gnb_model.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
knn_params = {"n_neighbors": np.arange(1, 41, 2),"metric": ["euclidean", "cityblock"],
              "algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute'] }
model_knn= KNeighborsClassifier()
knn_random = RandomizedSearchCV(estimator = model_knn,param_distributions = knn_params, n_iter = 100, cv= 10,
                               verbose=2, random_state=19, n_jobs = -1, scoring='roc_auc')
knn_random.fit(x_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   23.5s finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33,
       35, 37, 39]), 'metric': ['euclidean', 'cityblock'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
          pre_dispatch='2*n_jobs', random_state=19, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [17]:
print ("Best Params", knn_random.best_params_)
print ("Best Score", knn_random.best_score_)

Best Params {'n_neighbors': 1, 'metric': 'cityblock', 'algorithm': 'auto'}
Best Score 0.8901096596356665


In [20]:
knn_gs = GridSearchCV(model_knn, knn_params, cv=10, verbose=1)
knn_gs.fit(x_train, y_train)
print ("Best Params", knn_random.best_params_)
print ("Best Score", knn_random.best_score_)

Fitting 10 folds for each of 40 candidates, totalling 400 fits
Best Params {'n_neighbors': 1, 'metric': 'cityblock'}
Best Score 0.8901096596356665


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    7.2s finished


In [10]:
knn_model = KNeighborsClassifier(n_neighbors = 1, metric = 'cityblock', algorithm  = 'auto')
knn_model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cityblock',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [11]:
#Building Of ANN Model
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
 


ann_model = MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
                       beta_2=0.999, early_stopping=False, epsilon=1e-08,
                       hidden_layer_sizes=(16,14,9,2), learning_rate='constant',
                       learning_rate_init=0.001, max_iter=100000, momentum=0.9, nesterovs_momentum=True, power_t=0.5,
                       random_state=19, shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False, warm_start=False)
ann_model.fit(x_train, y_train)


MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(16, 14, 9, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=100000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=19, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [12]:
ann_y_pred = ann_model.predict(x_test)
et_y_pred = et_model.predict(x_test)
av_y_pred = av_model.predict(x_test)
svm_y_pred = svm_model.predict(x_test)
gb_y_pred = gb_model.predict(x_test)
gnb_y_pred = gnb_model.predict(x_test)
rf_y_pred = rf_model.predict(x_test)
knn_y_pred = knn_model.predict(x_test)

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc, f1_score
def accuracy(x_test, y_test, x_train, y_train, model):
    print(model.__class__)
    y_pred = model.predict(x_test)
    print(confusion_matrix(y_test, y_pred))
    w = model.score(x_train, y_train)
    v = model.score(x_test, y_test)
    print(f"Train accuracy: {w:0.2%}")
    print(f"Test accuracy: {v:0.2%}")
  

In [15]:
accuracy(x_test, y_test, x_train, y_train, ann_model)
accuracy(x_test, y_test, x_train, y_train, et_model)
accuracy(x_test, y_test, x_train, y_train, av_model)
accuracy(x_test, y_test, x_train, y_train, svm_model)
accuracy(x_test, y_test, x_train, y_train, gb_model)
accuracy(x_test, y_test, x_train, y_train, gnb_model)
accuracy(x_test, y_test, x_train, y_train, rf_model)
accuracy(x_test, y_test, x_train, y_train, knn_model)

<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>
[[29  5]
 [ 3 38]]
Train accuracy: 96.64%
Test accuracy: 89.33%
<class 'sklearn.ensemble.forest.ExtraTreesClassifier'>
[[32  2]
 [ 2 39]]
Train accuracy: 100.00%
Test accuracy: 94.67%
<class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>
[[25  9]
 [11 30]]
Train accuracy: 84.90%
Test accuracy: 73.33%
<class 'sklearn.svm.classes.SVC'>
[[32  2]
 [ 3 38]]
Train accuracy: 98.99%
Test accuracy: 93.33%
<class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
[[29  5]
 [ 6 35]]
Train accuracy: 100.00%
Test accuracy: 85.33%
<class 'sklearn.naive_bayes.GaussianNB'>
[[25  9]
 [13 28]]
Train accuracy: 66.78%
Test accuracy: 70.67%
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
[[29  5]
 [ 6 35]]
Train accuracy: 100.00%
Test accuracy: 85.33%
<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
[[32  2]
 [ 3 38]]
Train accuracy: 100.00%
Test accuracy: 93.33%


In [23]:
et_feature = et_model.feature_importances_
av_feature = av_model.feature_importances_

gb_feature = gb_model.feature_importances_
rf_feature = rf_model.feature_importances_


cols = feature_df.columns.tolist()
# Create a dataframe with features
feature_dataframe = pd.DataFrame( {'features': cols,
                                   'Random Forest feature importances': rf_feature,
                                   'AdaBoost feature importances': av_feature,
                                   'Gradient Boost feature importances': gb_feature,
                                   'Extra Trees  feature importances': et_feature,
                                  })

In [24]:
# Create the new column that contains the average of the values.
feature_dataframe['mean'] = feature_dataframe.mean(axis= 1) # axis = 1 computes the mean row-wise
feature_dataframe.head(10)

Unnamed: 0,features,Random Forest feature importances,AdaBoost feature importances,Gradient Boost feature importances,Extra Trees feature importances,mean
0,M/F,0.044188,0.070423,0.045143,0.06529,0.056261
1,Age,0.141147,0.098592,0.12584,0.134734,0.125078
2,EDUC,0.133309,0.140845,0.146221,0.173426,0.14845
3,SES,0.04774,0.042254,0.052355,0.061457,0.050951
4,MMSE,0.049743,0.028169,0.049001,0.060849,0.04694
5,eTIV,0.18035,0.211268,0.177238,0.165462,0.183579
6,nWBV,0.213182,0.225352,0.226993,0.179376,0.211226
7,ASF,0.190342,0.183099,0.177209,0.159406,0.177514


In [16]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_y_pred.ravel(),
                                        'AdaBoost': av_y_pred.ravel(),
                                        'GradientBoost': gb_y_pred.ravel(),
                                        'SVM': svm_y_pred.ravel(),
                                        'ANN': ann_y_pred.ravel(),
                                        'ExtraTrees': et_y_pred.ravel(),
                                        'KNN': knn_y_pred.ravel(),
                                        'GNB': gnb_y_pred.ravel(),
                                        'Real value': y_test                                
                                        })
base_predictions_train.head(15)

Unnamed: 0,RandomForest,AdaBoost,GradientBoost,SVM,XGB,ANN,ExtraTrees,KNN,GNB,Real value
0,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1
3,0,0,0,0,0,1,0,0,0,0
4,1,1,1,0,1,1,1,1,1,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,0,0
9,1,1,1,1,1,1,1,1,0,1


In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc, f1_score
def accuracy_dict(x_test, y_test, x_train, y_train, model):
    test_accuracy = '{0:0.2%}'.format(model.score(x_train, y_train))
    train_accuracy = '{0:0.2%}'.format(model.score(x_test, y_test))
    acc_dict = {'0': test_accuracy , '1': train_accuracy}
    return acc_dict

acc_ann = accuracy_dict(x_test, y_test, x_train, y_train, ann_model)
acc_et = accuracy_dict(x_test, y_test, x_train, y_train, et_model)
acc_av =accuracy_dict(x_test, y_test, x_train, y_train, av_model)
acc_svm = accuracy_dict(x_test, y_test, x_train, y_train, svm_model)
acc_gb = accuracy_dict(x_test, y_test, x_train, y_train, gb_model)
acc_gnb = accuracy_dict(x_test, y_test, x_train, y_train, gnb_model)
acc_rf = accuracy_dict(x_test, y_test, x_train, y_train, rf_model)
acc_knn = accuracy_dict(x_test, y_test, x_train, y_train, knn_model)
accu = {'0': 'Train Accuracy', '1': 'Test Accuracy'}

accuracy_dataframe = pd.DataFrame({'accuracy': accu,
                                    'Random Forest ': acc_rf,
                                    'AdaBoost': acc_av,
                                    'Gradient Boost': acc_gb,
                                    'Extra Trees': acc_et,
                                    'Xgboost': acc_xgb,
                                    'ANN': acc_ann,
                                    'Support Vector Machine': acc_svm,
                                    'k Nearest Neigbour': acc_knn,
                                    'Navies Bayes': acc_xgb
                                  })
accuracy_dataframe.head(10)

  if diff:
  if diff:


Unnamed: 0,accuracy,Random Forest,AdaBoost,Gradient Boost,Extra Trees,Xgboost,ANN,Support Vector Machine,k Nearest Neigbour,Navies Bayes
0,Train Accuracy,100.00%,84.90%,99.66%,100.00%,99.66%,95.64%,98.99%,100.00%,99.66%
1,Test Accuracy,84.00%,73.33%,88.00%,93.33%,82.67%,88.00%,93.33%,93.33%,82.67%


In [17]:
%matplotlib inline

In [21]:
def plot_cm(y_true, x_test, model, figsize=(10,10),):
    y_pred = model.predict(x_test)
    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=np.unique(y_true), columns=np.unique(y_true))
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(cm, cmap= "YlGnBu", annot=annot, fmt='', ax=ax)
    


In [22]:
plot_cm(y_train, x_test, ann_model)
plot_cm(y_train, x_test, et_model)
plot_cm(y_train, x_test, av_model)
plot_cm(y_train, x_test, svm_model)
plot_cm(y_train, x_test, gb_model)
plot_cm(y_train, x_test, gnb_model)
plot_cm(y_train, x_test,  rf_model)
plot_cm(y_train, x_test, knn_model)

ValueError: Found input variables with inconsistent numbers of samples: [298, 75]