<a href="https://colab.research.google.com/github/DanielBojchovski/hyperparameter_tuning_random_forest_classifier/blob/main/hyperparameter_tuning_random_forest_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

Data Collection & Processing

In [41]:
data_frame = pd.read_csv("/content/sample_data/diabetes.csv")

In [42]:
# print the first 5 rows of the dataframe
data_frame.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [43]:
# Get the count of Zeros in columns 
count = (data_frame['Glucose'] == 0).sum()
print('Count of zeros in Column  Glucose : ', count)
count = (data_frame['BloodPressure'] == 0).sum()
print('Count of zeros in Column  BloodPressure : ', count)
count = (data_frame['SkinThickness'] == 0).sum()
print('Count of zeros in Column  SkinThickness : ', count)
count = (data_frame['Insulin'] == 0).sum()
print('Count of zeros in Column  Insulin : ', count)
count = (data_frame['BMI'] == 0).sum()
print('Count of zeros in Column  BMI : ', count)
count = (data_frame['DiabetesPedigreeFunction'] == 0).sum()
print('Count of zeros in Column  DiabetesPedigreeFunction : ', count)
count = (data_frame['Age'] == 0).sum()
print('Count of zeros in Column  Age : ', count)

Count of zeros in Column  Glucose :  5
Count of zeros in Column  BloodPressure :  35
Count of zeros in Column  SkinThickness :  227
Count of zeros in Column  Insulin :  374
Count of zeros in Column  BMI :  11
Count of zeros in Column  DiabetesPedigreeFunction :  0
Count of zeros in Column  Age :  0


Replacing the zero values with the mean

In [44]:
avg = data_frame['Glucose'].mean()
data_frame['Glucose'] = data_frame['Glucose'].replace(0, avg)
avg = data_frame['BloodPressure'].mean()
data_frame['BloodPressure'] = data_frame['BloodPressure'].replace(0, avg)
avg = data_frame['SkinThickness'].mean()
data_frame['SkinThickness'] = data_frame['SkinThickness'].replace(0, avg)
avg = data_frame['Insulin'].mean()
data_frame['Insulin'] = data_frame['Insulin'].replace(0, avg)
avg = data_frame['BMI'].mean()
data_frame['BMI'] = data_frame['BMI'].replace(0, avg)

Separating the features and target

In [45]:
X = data_frame.drop(columns='Outcome', axis=1)
Y = data_frame['Outcome']

Splitting the data into training data & Testing data

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [47]:
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


Standardize the data

In [48]:
scaler = StandardScaler()

X_train_std = scaler.fit_transform(X_train)

X_test_std = scaler.transform(X_test)

Model training

In [49]:
model = RandomForestClassifier()

In [50]:
max_features_range = np.arange(1, 6, 1)
n_estimators_range = np.arange(10, 210, 10)
param_grid = dict(max_features = max_features_range, n_estimators = n_estimators_range)

grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = 5)

In [51]:
grid.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': array([1, 2, 3, 4, 5]),
                         'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200])})

In [52]:
print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

The best parameters are {'max_features': 1, 'n_estimators': 170} with a score of 0.77
