In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier

## Importing sklearn packages!!

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.datasets import load_iris,load_diabetes
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,recall_score, confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import decomposition
from sklearn.decomposition import PCA

##**Importing Dataset**

In [4]:
df = pd.read_csv('diabetes.csv')
X = df.iloc[:,:-1].values
y = df.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.10,random_state=0)

## Training model on RFC 

In [6]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=20)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Making Confusion Matrix

In [7]:
from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test,y_pred)

[[45  6]
 [10 16]]


0.7922077922077922

## Applying K-fold cross validation

In [8]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier,X=X_train,y=y_train,cv=10)
print("Accuracy: {:.2f}%".format(accuracies.mean()*100))

Accuracy: 74.52%


## Hyperparameter Tuning using GridSearchCV

In [12]:
from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators':[10,20,30,40,50,60,70],
               'max_depth': [3,4,5,7],
               'criterion':['entropy'],
               'min_samples_split':[5,4,6,7,8],
               'max_features':['auto', 'sqrt', 'log2']
               }]
grid_search = GridSearchCV(estimator= classifier,
                           param_grid= parameters,
                           scoring = 'accuracy',
                           n_jobs = -1,
                           verbose=2,
                           cv = 10)

grid_search.fit(X_train,y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Fitting 10 folds for each of 420 candidates, totalling 4200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 1316 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 2448 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 3908 tasks      | elapsed:   49.2s


Best Accuracy: 77.43 %
Best Parameters: {'criterion': 'entropy', 'max_depth': 3, 'max_features': 'log2', 'min_samples_split': 7, 'n_estimators': 30}


[Parallel(n_jobs=-1)]: Done 4200 out of 4200 | elapsed:   53.7s finished


In [10]:
pip install hyperopt

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install hyperopt

Note: you may need to restart the kernel to use updated packages.


## Tuning Model Using Hyperopt

In [13]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [14]:
space = {'criterion' : hp.choice('criterion', ['entropy','gini']),
         'max_depth' : hp.quniform('max_depth',10,1200,10),
         'max_features' : hp.choice('max_features',['auto','sqrt','log2',None]),
         'min_samples_leaf' : hp.uniform('min_samples_leaf',0,0.5),
         'min_samples_split' : hp.uniform('min_samples_split',0,1),
         'n_estimators': hp.choice('n_estimators',[10,50,350,550,750,1200,1300,1500])
    
}

In [15]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x7fe83307e990>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x7fe833019cd0>,
 'max_features': <hyperopt.pyll.base.Apply at 0x7fe833019f90>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x7fe8334ba8d0>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x7fe83a7475d0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x7fe83316af90>}

In [17]:
space['max_depth']

<hyperopt.pyll.base.Apply at 0x7fe833019cd0>

In [26]:
def objective(space):
    model = RandomForestClassifier( criterion = space['criterion'],
                                    max_depth = space['max_depth'],
                                    max_features = space['max_features'],
                                    min_samples_leaf = space['min_samples_leaf'],
                                    min_samples_split = space['min_samples_split'],
                                    n_estimators = space['n_estimators']
    )
    
    accuracy = cross_val_score(model, X_train,y_train,cv=10).mean()
    return {'loss' : -accuracy , 'status' : STATUS_OK}

In [27]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn = objective,
            space=space,
            algo= tpe.suggest,
            max_evals=80,
            trials=trials
           )
best

100%|██████████| 80/80 [15:33<00:00, 11.66s/trial, best loss: -0.7569358178053831]


{'criterion': 0,
 'max_depth': 1140.0,
 'max_features': 3,
 'min_samples_leaf': 0.09415105426583024,
 'min_samples_split': 0.2254281615429092,
 'n_estimators': 4}

In [28]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

entropy
None
1200


In [31]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[46  5]
 [ 9 17]]
0.8181818181818182
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        51
           1       0.77      0.65      0.71        26

    accuracy                           0.82        77
   macro avg       0.80      0.78      0.79        77
weighted avg       0.81      0.82      0.81        77

