In [None]:
import gzip
import json
import pickle

import matplotlib.pyplot as plt
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from IPython.display import VimeoVideo
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline

A random forest uses many decision trees to predict the outcome, much like asking several people for their opinions before reaching a conclusion. Each tree in the forest is built on a random sample of the data and predicts the class label of the outcome variable. Random forest is a bagging technique: the predictions from all the individual trees are collected and combined, and the class with the most votes (or the highest likelihood) is returned as the model's prediction.

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the dimensions of every piece of the split.
for label, split in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(label, split.shape)

In [None]:
# Resample the training split only (the test split is left untouched);
# the seed keeps the resampling reproducible.
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(
    X_train,
    y_train,
)

print("X_train_over shape:", X_train_over.shape)
# Preview the first rows of the resampled features.
X_train_over.head()

In [None]:
# Two-step pipeline: fill in missing values, then classify with a random forest.
# Both steps keep their defaults here apart from the forest's fixed seed.
clf = make_pipeline(SimpleImputer(), RandomForestClassifier(random_state=42))
print(clf)

## Tuning hyperparameters using cross-validation

In [None]:
# Baseline: 5-fold cross-validation of the untuned pipeline.
# n_jobs=-1 runs the folds in parallel on all available CPU cores.
# NOTE(review): X_train_over was produced by RandomOverSampler, so a resampled
# row and its source row may end up in different folds -- these scores are
# probably optimistic; confirm before relying on them.
cv_acc_scores = cross_val_score(
    estimator=clf,
    X=X_train_over,
    y=y_train_over,
    cv=5,
    n_jobs=-1,
)
print(cv_acc_scores)

In [None]:
# Hyperparameter grid for the search. Each key is "<pipeline step>__<argument>",
# where the step names are the lowercased class names of the pipeline steps.
params = {
    # How missing values are filled in
    "simpleimputer__strategy": ["mean", "median"],
    # Number of trees in the forest: 25, 50, 75
    "randomforestclassifier__n_estimators": range(25, 100, 25),
    # Maximum depth of each tree: 10, 20, 30, 40
    "randomforestclassifier__max_depth": range(10, 50, 10),
}
params

In [None]:
# Combine the pipeline and the parameter grid into one grid search:
# every combination in `params` is scored with 5-fold cross-validation,
# in parallel on all cores (n_jobs=-1), with progress messages (verbose=1).
model = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, verbose=1)
model

In [None]:
# Run the grid search on the oversampled training data
model.fit(X=X_train_over, y=y_train_over)

In [None]:
# Load the grid-search results into a DataFrame and preview the first ten rows
cv_results = pd.DataFrame(data=model.cv_results_)
cv_results.head(n=10)

In [None]:
# Hyperparameter combination that scored best during the grid search
model.best_params_

In [None]:

# GridSearchCV.predict() delegates to the best estimator found by the search,
# which is refit on all the data passed to fit() (refit=True is the default).
# Training accuracy is measured on the original X_train, not the oversampled copy.
acc_train = accuracy_score(y_train, model.predict(X_train))
acc_test = accuracy_score(y_test, model.predict(X_test))

print("Training Accuracy:", round(acc_train, 4))
print("Test Accuracy:", round(acc_test, 4))

In [None]:
# Plot the confusion matrix of the tuned model on the held-out test set.
# The trailing semicolon keeps the display object's repr out of the cell output.
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

In [None]:
# Importances come from the random-forest step of the best pipeline
importances = (
    model.best_estimator_
    .named_steps["randomforestclassifier"]
    .feature_importances_
)
# Column labels of the training data give each importance its name
features = X_train.columns
# Pair names with importances, sorted ascending so the largest values come last
feat_imp = pd.Series(data=importances, index=features).sort_values()

# Horizontal bar chart of the ten largest importances
feat_imp.tail(10).plot.barh()
plt.title("Feature Importance")
plt.xlabel("Gini Importance")
plt.ylabel("Feature");