In [1]:
# Import modules

import pandas as pd
import os
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Read the cleaned dataset; the path is relative to the notebook's working directory
cleaned_csv_path = os.path.join('..', 'Resources', 'Created_Files', 'Cleaned_df.csv')
df = pd.read_csv(cleaned_csv_path)
df

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,44,28.0,0,2,1,40454.0,26.0,217,1
1,1,76,3.0,0,0,0,33536.0,26.0,183,0
2,1,47,28.0,0,2,1,38294.0,26.0,27,1
3,1,21,11.0,1,1,0,28619.0,152.0,203,0
4,0,29,41.0,1,1,0,27496.0,152.0,39,0
...,...,...,...,...,...,...,...,...,...,...
380292,1,74,26.0,1,0,0,30170.0,26.0,88,0
380293,1,30,37.0,1,1,0,40016.0,152.0,131,0
380294,1,21,30.0,1,1,0,35118.0,160.0,161,0
380295,0,68,14.0,0,2,1,44617.0,124.0,74,0


In [3]:
# Define X: every column of df except the 'Response' target

X = df.drop(columns='Response')
X.head(3)

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,1,44,28.0,0,2,1,40454.0,26.0,217
1,1,76,3.0,0,0,0,33536.0,26.0,183
2,1,47,28.0,0,2,1,38294.0,26.0,27


In [4]:
# Define y: the 'Response' column; the last line displays how many rows fall in each class

y = df.loc[:, 'Response']
y.value_counts()

0    333628
1     46669
Name: Response, dtype: int64

In [5]:
# Split data into train and test
# stratify=y keeps the class proportions of y in both partitions;
# random_state fixes the shuffle so the split is repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

## Random Oversampling

In [6]:
# Randomly oversample the training split only (the test split is not resampled),
# then display the class counts of the resampled target.
ros = RandomOverSampler(random_state=1)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X=X_train, y=y_train)
y_resampled_ros.value_counts()

1    250220
0    250220
Name: Response, dtype: int64

In [7]:
# Logistic Regression - With Random Oversampling
# Fit on the randomly oversampled training data and score on the test split.

model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=200)
model.fit(X_resampled_ros, y_resampled_ros)
y_pred = model.predict(X_test)

acc = balanced_accuracy_score(y_test, y_pred)

# The `.2%` format spec multiplies by 100 and always prints two decimals
# (e.g. 75.50%), whereas round(acc*100, 2) drops trailing zeros (e.g. 75.5%).
print(f'''Logistic Regression - With Random Oversampling

Balanced_Accuracy: {acc:.2%}

Classification Report:
{classification_report_imbalanced(y_test, y_pred)}''')

Logistic Regression - With Random Oversampling

Balanced_Accuracy: 77.63%

Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.61      0.94      0.75      0.76      0.56     83408
          1       0.25      0.94      0.61      0.40      0.76      0.59     11667

avg / total       0.90      0.65      0.90      0.71      0.76      0.56     95075



## SMOTE Oversampling

In [8]:
# SMOTE Oversampling
# Resample the training split only, then display the resulting class counts.

smote = SMOTE(sampling_strategy='auto', random_state=1)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)
y_resampled_smote.value_counts()

1    250220
0    250220
Name: Response, dtype: int64

In [9]:
# Logistic Regression - With SMOTE Oversampling
# Fit on the SMOTE-resampled training data and score on the test split.

model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=200)
model.fit(X_resampled_smote, y_resampled_smote)
y_pred = model.predict(X_test)

acc = balanced_accuracy_score(y_test, y_pred)

# The `.2%` format spec multiplies by 100 and always prints two decimals
# (e.g. 75.50%), whereas round(acc*100, 2) drops trailing zeros (e.g. 75.5%).
print(f'''Logistic Regression - With SMOTE Oversampling

Balanced_Accuracy: {acc:.2%}

Classification Report:
{classification_report_imbalanced(y_test, y_pred)}''')

Logistic Regression - With SMOTE Oversampling

Balanced_Accuracy: 74.07%

Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.96      0.67      0.81      0.79      0.74      0.54     83408
          1       0.26      0.81      0.67      0.39      0.74      0.55     11667

avg / total       0.88      0.69      0.79      0.74      0.74      0.54     95075



## SMOTEENN

In [10]:
# Combination Sampling - SMOTEENN
# Resample the training split only, then display the resulting class counts.
# NOTE(review): random_state is 0 here while the other samplers in this notebook use 1 - confirm intentional.
# NOTE(review): the target variable is spelled `smoteen` (one "n") while the features use `smoteenn`;
# the name is kept as-is because the model-fitting cell refers to it.

smote_enn = SMOTEENN(random_state=0)
X_resampled_smoteenn, y_resampled_smoteen = smote_enn.fit_resample(X=X_train, y=y_train)
y_resampled_smoteen.value_counts()

1    209645
0    147573
Name: Response, dtype: int64

In [11]:
# Logistic Regression - With Combination Sampling (SMOTEENN)
# Fit on the SMOTEENN-resampled training data and score on the test split.

model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=200)
model.fit(X_resampled_smoteenn, y_resampled_smoteen)
y_pred = model.predict(X_test)

acc = balanced_accuracy_score(y_test, y_pred)

# The `.2%` format spec multiplies by 100 and always prints two decimals
# (e.g. 75.50%), whereas round(acc*100, 2) drops trailing zeros (e.g. 75.5%).
print(f'''Logistic Regression - With Combination Sampling (SMOTEENN)

Balanced_Accuracy: {acc:.2%}

Classification Report:
{classification_report_imbalanced(y_test, y_pred)}''')

Logistic Regression - With Combination Sampling (SMOTEENN)

Balanced_Accuracy: 75.5%

Classification Report:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.64      0.87      0.77      0.75      0.55     83408
          1       0.25      0.87      0.64      0.39      0.75      0.57     11667

avg / total       0.88      0.67      0.84      0.73      0.75      0.55     95075



## Create Resampled DataFrame

In [12]:
# Place the randomly oversampled features and their target side by side in a single frame
resampled_df = pd.concat(objs=[X_resampled_ros, y_resampled_ros], axis='columns')
resampled_df

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,23,35.0,0,0,1,29648.0,154.0,22,1
1,1,74,3.0,0,0,1,2630.0,156.0,68,0
2,0,22,2.0,0,1,1,35674.0,152.0,116,0
3,0,26,46.0,0,1,0,27920.0,152.0,120,0
4,0,44,28.0,1,0,0,31630.0,124.0,129,0
...,...,...,...,...,...,...,...,...,...,...
500435,1,28,47.0,0,1,1,2630.0,160.0,135,1
500436,1,59,48.0,0,0,1,2630.0,15.0,237,1
500437,0,33,28.0,1,0,0,29282.0,154.0,273,1
500438,1,45,28.0,0,0,1,61984.0,26.0,184,1


In [13]:
# Export DataFrame to CSV file
# index=False leaves the row index out of the written file.

resampled_csv_path = os.path.join('..', 'Resources', 'Created_Files', 'Resampled_df.csv')
resampled_df.to_csv(resampled_csv_path, index=False)