In [4]:
import pandas as pd
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", names=names)
print(df.shape)
print(df.head(10))

(768, 9)
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1
5     5   116    74     0     0  25.6  0.201   30      0
6     3    78    50    32    88  31.0  0.248   26      1
7    10   115     0     0     0  35.3  0.134   29      0
8     2   197    70    45   543  30.5  0.158   53      1
9     8   125    96     0     0   0.0  0.232   54      1


In [7]:
X = df.drop('class', axis=1)
y = df['class']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)

In [14]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [17]:
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')

In [18]:
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[149  27]
 [ 31  47]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       176
           1       0.64      0.60      0.62        78

    accuracy                           0.77       254
   macro avg       0.73      0.72      0.73       254
weighted avg       0.77      0.77      0.77       254



=== All AUC Scores ===
[0.7762963  0.83555556 0.8262963  0.7437037  0.81296296 0.8762963
 0.85888889 0.90148148 0.82       0.84384615]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8295327635327636


In [37]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 10000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(100, 1000, num = 10)]
max_depth.append(None)

In [38]:
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }

In [39]:
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [40]:
rfc_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 30.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [41]:
print(rfc_random.best_params_)

{'n_estimators': 100, 'max_features': 'auto', 'max_depth': 400}


=== Confusion Matrix ===
[[150  26]
 [ 34  44]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       176
           1       0.63      0.56      0.59        78

    accuracy                           0.76       254
   macro avg       0.72      0.71      0.71       254
weighted avg       0.76      0.76      0.76       254



=== All AUC Scores ===
[0.78592593 0.82962963 0.82888889 0.74148148 0.80777778 0.86
 0.86444444 0.90074074 0.80076923 0.85269231]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8272350427350428


[100, 1200, 2300, 3400, 4500, 5600, 6700, 7800, 8900, 10000]