In [1]:
#  Imports

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
import time

In [2]:
# Load data

df = pd.read_csv(os.path.join('..', 'Resources', 'Created_Files', 'Clustered_df.csv'))
df

Unnamed: 0,Age,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Class,Response
0,23,0,1,29648.0,22,0,1
1,74,0,1,2630.0,68,0,0
2,22,0,1,35674.0,116,0,0
3,26,0,0,27920.0,120,1,0
4,44,1,0,31630.0,129,1,0
...,...,...,...,...,...,...,...
500435,28,0,1,2630.0,135,0,1
500436,59,0,1,2630.0,237,0,1
500437,33,1,0,29282.0,273,1,1
500438,45,0,1,61984.0,184,0,1


In [3]:
# Define X

X = df.drop(['Response'], axis=1)
X.head(3)

Unnamed: 0,Age,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Class
0,23,0,1,29648.0,22,0
1,74,0,1,2630.0,68,0
2,22,0,1,35674.0,116,0


In [4]:
# Define y

y = df['Response']
y.value_counts()

1    250220
0    250220
Name: Response, dtype: int64

In [5]:
# Split data into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Scale the dataset

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Random Forest

start_time = time.time()

n_estimator = 100
rf_model = RandomForestClassifier(n_estimators=n_estimator, random_state=1) 
rf_model = rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)

end_time = time.time()

In [8]:
# Evaluation

acc = accuracy_score(y_test, y_pred)
b_acc = balanced_accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)
elapsed_time  = end_time - start_time

print(f'''Random Forest - Number of Estimators: {n_estimator}

Accuracy: {round(acc*100,2)}%
Balanced_Accuracy: {round(b_acc*100,2)}%

Classification Report:
{classification_report}

Training Time: {round(elapsed_time,1)} Seconds''')

Random Forest - Number of Estimators: 100

Accuracy: 92.64%
Balanced_Accuracy: 92.65%

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     62677
           1       0.88      0.99      0.93     62433

    accuracy                           0.93    125110
   macro avg       0.93      0.93      0.93    125110
weighted avg       0.93      0.93      0.93    125110


Training Time: 44.8 Seconds


In [9]:
# Feature Importance

importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2689407214322844, 'Annual_Premium'),
 (0.23818260383540946, 'Vintage'),
 (0.1434738177592814, 'Class'),
 (0.1358815500819696, 'Vehicle_Damage'),
 (0.10941838293824208, 'Previously_Insured'),
 (0.10410292395281306, 'Age')]