In [4]:
import pandas as pd
import numpy as np
import time

# We will use Random Forest for Model Building and Grid Search for find the Best Parameters for the Random Forest Model
# Importing the required libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Our Business Use case is a Classification Problem Statement
# Importing necessary evaluation metric modules
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

In [5]:
# Loading the clean and scaled data into our dataframes
X_train = pd.read_csv("xtrain.csv")
X_test = pd.read_csv("xtest.csv")
y_train = pd.read_csv("ytrain.csv")
y_test = pd.read_csv("ytest.csv")

In [9]:
# Creating Random Forest Class Object
# We will be creating 100 decision trees
classifier = RandomForestClassifier(n_estimators= 100)

# Creating Parameter dictionary with experimenting values
# We will use Entropy Criterion for this execution
param_grid = {"max_depth" : [3,None],
             "max_features" : [4,5,6],
             "min_samples_split" : [2,5,10],
             "min_samples_leaf" : [1,5,10],
             "bootstrap" : [True, False],
             "criterion" : ["entropy"]}


# Creating Grid Search Object with required Parameters
grid_search = GridSearchCV(estimator= classifier, param_grid= param_grid, scoring= "accuracy", cv = 5, n_jobs= -1)

# Running Grid Search and Fitting our model to train data
start_time = time.time()
grid_search = grid_search.fit(X_train, y_train)
end_time = time.time()

# Evaluating the time take to run through all combinations
print(f"Time taken for Model Training: {end_time - start_time}")

  self.best_estimator_.fit(X, y, **fit_params)


Time taken for Model Training: 858.0229263305664


In [13]:
# Capturing the best features and best score
rf_best_score = grid_search.best_score_
rf_best_param = grid_search.best_params_

print(rf_best_score, rf_best_param)

0.6345804515600243 {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 10, 'min_samples_split': 2}


In [15]:
## Training the instance with hyper parameters selected as Best Param using grid search
rand_entropy_best_model = RandomForestClassifier(n_estimators= 100, bootstrap = False, criterion = 'entropy', max_depth = None, max_features = 4, min_samples_leaf = 10, min_samples_split = 2)
rand_entropy_best_model.fit(X_train,y_train)

# Using trained model to predict our Training Data
y_rand_entropy_pred = rand_entropy_best_model.predict(X_test)

# Evaluating the performance of our prediction
rand_entropy_acc = accuracy_score(y_test, y_rand_entropy_pred)
rand_entropy_precision = precision_score(y_test, y_rand_entropy_pred)
rand_entropy_recall = recall_score(y_test, y_rand_entropy_pred)
rand_entropy_f1 = f1_score(y_test, y_rand_entropy_pred)

performance = pd.DataFrame(data = [['Random Forest (n_estimation = 100) Entropy', rand_entropy_acc, rand_entropy_precision, rand_entropy_recall, rand_entropy_f1]], columns= ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
performance

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest (n_estimation = 100) Entropy,0.633724,0.645146,0.710062,0.676049


In [16]:
# Creating Parameter dictionary with experimenting values
# We will use Entropy Criterion for this execution
param_grid = {"max_depth" : [3,None],
             "max_features" : [4,5,6],
             "min_samples_split" : [2,5,10],
             "min_samples_leaf" : [1,5,10],
             "bootstrap" : [True, False],
             "criterion" : ["gini"]}


# Creating Grid Search Object with required Parameters
grid_search = GridSearchCV(estimator= classifier, param_grid= param_grid, scoring= "accuracy", cv = 5, n_jobs= -1)

# Running Grid Search and Fitting our model to train data
start_time = time.time()
grid_search = grid_search.fit(X_train, y_train)
end_time = time.time()

# Evaluating the time take to run through all combinations
print(f"Time taken for Model Training: {end_time - start_time}")

# Capturing the best features and best score
rf_best_score = grid_search.best_score_
rf_best_param = grid_search.best_params_

print(rf_best_score, rf_best_param)

  self.best_estimator_.fit(X, y, **fit_params)


Time taken for Model Training: 645.8046817779541
0.6352087481686354 {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 10, 'min_samples_split': 2}


In [17]:
## Training the instance with hyper parameters selected as Best Param using grid search
rand_gini_best_model = RandomForestClassifier(n_estimators= 100, bootstrap = False, criterion = 'gini', max_depth = None, max_features = 4, min_samples_leaf = 10, min_samples_split = 2)
rand_gini_best_model.fit(X_train,y_train)

# Using trained model to predict our Training Data
y_rand_gini_pred = rand_gini_best_model.predict(X_test)

# Evaluating the performance of our prediction
rand_gini_acc = accuracy_score(y_test, y_rand_gini_pred)
rand_gini_precision = precision_score(y_test, y_rand_gini_pred)
rand_gini_recall = recall_score(y_test, y_rand_gini_pred)
rand_gini_f1 = f1_score(y_test, y_rand_gini_pred)

performance = performance.append(pd.DataFrame(data = [['Random Forest (n_estimation = 100) Gini', rand_gini_acc, rand_gini_precision, rand_gini_recall, rand_gini_f1]], columns= ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']))
performance

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest (n_estimation = 100) Entropy,0.633724,0.645146,0.710062,0.676049
0,Random Forest (n_estimation = 100) Gini,0.635678,0.647841,0.707988,0.67658


## We can see that using Precision with evaluated parameters provides us the Best Model for prediction
## Hence we will use the same for our model creation