In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from scipy.stats import randint as sp_randint
import warnings

In [None]:
# Suppress future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as random_state to the scikit-learn objects created further down.
seed = 2017
np.random.seed(seed=seed)

In [None]:
# Load the CSV into a DataFrame.
df = pd.read_csv("Data/Diabetes.csv")

# Features: the first eight columns, selected by position.
X = df.iloc[:, 0:8].values
# Target: the 'class' column.
y = df.loc[:, 'class'].values

In [None]:
# Standardize every feature column with StandardScaler (zero mean, unit variance).
# NOTE(review): the scaler is fit on the full X, before the train/test split in
# the next cell, so the held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Hold out 30% of the rows as a test set. stratify=y keeps the class proportions
# of y the same in the train and test partitions, matching the stratified
# cross-validation folds defined just below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed, stratify=y
)
# 5-fold stratified, shuffled CV splitter; shared by the hyperparameter search
# and by the cross-validated score of the final model.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [None]:
# Base random forest: library defaults everywhere except the RNG, which is pinned
# to the notebook seed. The remaining hyperparameters are chosen by the search.
clf_rf = RandomForestClassifier(
    random_state=seed,
)

In [None]:
# Search space sampled by the randomized hyperparameter search.
param_dist = {
    # Number of trees: integer drawn uniformly from [100, 1000) (upper bound exclusive).
    'n_estimators': sp_randint(100, 1000),
    # Split-quality measure.
    'criterion': ['gini', 'entropy'],
    # Features considered at each split (None = all features).
    # 'auto' is deliberately not listed: for classifiers it was an alias of 'sqrt',
    # was deprecated in scikit-learn 1.1 and removed in 1.3, where any sampled
    # candidate using it fails parameter validation.
    'max_features': [None, 'sqrt', 'log2'],
    # Maximum tree depth; None places no limit on depth.
    'max_depth': [None, 1, 3, 5, 7, 9],
}

In [None]:
# Randomized hyperparameter search: sample n_iter_search settings from
# param_dist and score each one on the stratified folds in kfold.
n_iter_search = 20
random_search = RandomizedSearchCV(
    estimator=clf_rf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=kfold,
    random_state=seed,  # makes the sampled candidates repeatable
    n_jobs=-1,          # use all available cores
    verbose=10,         # per-fit progress messages
)
random_search.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination selected by the search.
best_params = random_search.best_params_
print('Best Parameters: ', best_params)

In [None]:
# Score the best estimator found by the search three ways:
#   1. mean cross-validated accuracy on the training data (same folds as the search),
#   2. accuracy on the training data it was fit on,
#   3. accuracy on the held-out test data.
best_model = random_search.best_estimator_
results = cross_val_score(best_model, X_train, y_train, cv=kfold)
print("Accuracy - Train CV: ", results.mean())
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, but the
# documented order is used so the calls stay correct if the metric is swapped
# for an asymmetric one such as precision or recall.
print("Accuracy - Train: ", metrics.accuracy_score(y_train, best_model.predict(X_train)))
print("Accuracy - Test: ", metrics.accuracy_score(y_test, best_model.predict(X_test)))