In [1]:
#  Imports

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
import time

In [2]:
# Load data

# Build the path to Resampled_df.csv (relative to the current working
# directory) separately from the read so it is easy to inspect or change.
csv_path = os.path.join('..', 'Resources', 'Created_Files', 'Resampled_df.csv')
df = pd.read_csv(csv_path)
df

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,23,35.0,0,0,1,29648.0,154.0,22,1
1,1,74,3.0,0,0,1,2630.0,156.0,68,0
2,0,22,2.0,0,1,1,35674.0,152.0,116,0
3,0,26,46.0,0,1,0,27920.0,152.0,120,0
4,0,44,28.0,1,0,0,31630.0,124.0,129,0
...,...,...,...,...,...,...,...,...,...,...
500435,1,28,47.0,0,1,1,2630.0,160.0,135,1
500436,1,59,48.0,0,0,1,2630.0,15.0,237,1
500437,0,33,28.0,1,0,0,29282.0,154.0,273,1
500438,1,45,28.0,0,0,1,61984.0,26.0,184,1


In [3]:
# Define X

# Feature matrix: every column of df except the 'Response' target.
X = df.drop(columns='Response')
X.head(3)

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,1,23,35.0,0,0,1,29648.0,154.0,22
1,1,74,3.0,0,0,1,2630.0,156.0,68
2,0,22,2.0,0,1,1,35674.0,152.0,116


In [4]:
# Define y

# Target vector: the 'Response' column of df. The counts displayed below
# show how many rows fall in each class.
y = df.loc[:, 'Response']
y.value_counts()

1    250220
0    250220
Name: Response, dtype: int64

In [5]:
# Split data into train and test

# Hold out 25% of the rows (scikit-learn's default, written out here);
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): the input file name suggests the data was resampled before
# this split. If so, copies of the same row may land in both train and
# test and inflate the scores reported later -- confirm how
# Resampled_df.csv was built.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [6]:
# Scale the dataset

# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows so no test-set statistics enter the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_scaler = scaler  # fit() returns the scaler itself; keep this name bound
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Random Forest

n_estimator = 100
rf_model = RandomForestClassifier(n_estimators=n_estimator, random_state=1)

# Time only the fit: the evaluation cell reports this interval as
# "Training Time", so prediction must stay outside the timed region.
# perf_counter() is a monotonic clock meant for measuring intervals, unlike
# time.time(), which can jump if the system clock is adjusted.
start_time = time.perf_counter()
rf_model = rf_model.fit(X_train_scaled, y_train)
end_time = time.perf_counter()

# Predict on the held-out rows (not part of the timed training interval).
y_pred = rf_model.predict(X_test_scaled)

In [8]:
# Evaluation

acc = accuracy_score(y_test, y_pred)
b_acc = balanced_accuracy_score(y_test, y_pred)
# Keep the text report under its own name: assigning it to
# `classification_report` would overwrite the imported function and make
# this cell fail with "'str' object is not callable" when run a second time.
report = classification_report(y_test, y_pred)
elapsed_time = end_time - start_time

print(f'''Random Forest - Number of Estimators: {n_estimator}

Accuracy: {round(acc*100,2)}%
Balanced_Accuracy: {round(b_acc*100,2)}%

Classification Report:
{report}

Training Time: {round(elapsed_time,1)} Seconds''')

Random Forest - Number of Estimators: 100

Accuracy: 94.5%
Balanced_Accuracy: 94.51%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.89      0.94     62677
           1       0.90      1.00      0.95     62433

    accuracy                           0.94    125110
   macro avg       0.95      0.95      0.94    125110
weighted avg       0.95      0.94      0.94    125110


Training Time: 49.9 Seconds


In [9]:
# Feature Importance

# Pair each of the fitted forest's importance scores with its column name
# and list them from highest to lowest score.
importances = rf_model.feature_importances_
feature_ranking = sorted(zip(importances, X.columns), reverse=True)
feature_ranking

[(0.19226947152059387, 'Vintage'),
 (0.18525727525509367, 'Vehicle_Damage'),
 (0.16929537420162147, 'Annual_Premium'),
 (0.15923190819780186, 'Previously_Insured'),
 (0.12941963632595077, 'Age'),
 (0.08312671863555278, 'Region_Code'),
 (0.05882910424083333, 'Policy_Sales_Channel'),
 (0.012765846993809471, 'Vehicle_Age'),
 (0.009804664628742873, 'Gender')]