In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
sns.set(style='whitegrid')
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [23]:
train = pd.read_csv('E:/Workspace_DS/Insurance_prediction/Data_Sets/train.csv')
test = pd.read_csv('E:/Workspace_DS/Insurance_prediction/Data_Sets/test.csv')

In [24]:
ptrain= train
ptest = test

In [18]:
train.size,test.size,ptrain.size,ptest.size


(4573308, 1397407, 4573308, 1397407)

In [27]:
ptrain['Vehicle_Age']=train['Vehicle_Age'].replace({'< 1 Year':0,'1-2 Year':1,'> 2 Years':2})
ptrain['Gender']=train['Gender'].replace({'Male':1,'Female':0})
ptrain['Vehicle_Damage']=train['Vehicle_Damage'].replace({'Yes':1,'No':0})

In [65]:
ptest['Vehicle_Age']=test['Vehicle_Age'].replace({'< 1 Year':0,'1-2 Year':1,'> 2 Years':2})
ptest['Gender']=test['Gender'].replace({'Male':1,'Female':0})
ptest['Vehicle_Damage']=test['Vehicle_Damage'].replace({'Yes':1,'No':0})

In [67]:
ptest

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,381110,1,25,1,11.0,1,0,0,35786.0,152.0,53
1,381111,1,40,1,28.0,0,1,1,33762.0,7.0,111
2,381112,1,47,1,28.0,0,1,1,40050.0,124.0,199
3,381113,1,24,1,27.0,1,0,1,37356.0,152.0,187
4,381114,1,27,1,28.0,1,0,0,59097.0,152.0,297
...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,0,26,1,37.0,1,0,0,30867.0,152.0,56
127033,508143,0,38,1,28.0,0,1,1,28700.0,122.0,165
127034,508144,1,21,1,46.0,1,0,0,29802.0,152.0,74
127035,508145,1,71,1,28.0,1,1,0,62875.0,26.0,265


In [30]:
#First, I delete the "id" column as it will not contribute to model training.
#If inplace were set to False, the drop method would return a new DataFrame with the specified column removed, 
#and train would remain unchanged.
ptrain.drop(columns="id", inplace=True, errors="ignore")

In [32]:
#I define the "Response" column to y and the other columns to X.

X = ptrain[ptrain.columns[:-1]]
y = ptrain[ptrain.columns[-1]]

In [33]:
type(X),type(y), X.size, y.size

(pandas.core.frame.DataFrame, pandas.core.series.Series, 3811090, 381109)

In [35]:
X

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,1,44,1,28.0,0,2,1,40454.0,26.0,217
1,1,76,1,3.0,0,1,0,33536.0,26.0,183
2,1,47,1,28.0,0,2,1,38294.0,26.0,27
3,1,21,1,11.0,1,0,0,28619.0,152.0,203
4,0,29,1,41.0,1,0,0,27496.0,152.0,39
...,...,...,...,...,...,...,...,...,...,...
381104,1,74,1,26.0,1,1,0,30170.0,26.0,88
381105,1,30,1,37.0,1,0,0,40016.0,152.0,131
381106,1,21,1,30.0,1,0,0,35118.0,160.0,161
381107,0,68,1,14.0,0,2,1,44617.0,124.0,74


In [40]:
# Define preprocessing for numeric features
numeric_features = ['Gender', 'Age','Driving_License','Region_Code','Previously_Insured','Vehicle_Age','Annual_Premium','Policy_Sales_Channel','Vintage']  # Update with your numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

In [44]:
# Create a preprocessor that applies transformations to numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Create pipelines for each model

In [52]:
# Decision Tree pipeline
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

In [46]:
# Random Forest Classifier Pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply preprocessing
    ('classifier', RandomForestClassifier(random_state=1))  # Random Forest Classifier model
])

In [47]:
# K-Nearest Neighbors Classifier Pipeline
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply preprocessing
    ('classifier', KNeighborsClassifier(n_neighbors=11, metric='minkowski', p=2))  # KNN Classifier model
])

In [48]:
# Bagging Classifier Pipeline
bagging_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply preprocessing
    ('classifier', BaggingClassifier())  # Bagging Classifier model
])


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [51]:
X_train.size, y_train.size,X_test.size, y_test.size,ptrain.size,ptest.size

(3048870, 304887, 762220, 76222, 4192199, 1397407)

In [54]:
# Fit the pipeline on your data
dt_pipeline.fit(X_train, y_train)

# Example usage: Predict with the fitted pipeline
y_pred_tree = dt_pipeline.predict(X_test)
y_pred_tree

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [55]:
# Fit the pipeline on your data
rf_pipeline.fit(X_train, y_train)

# Example usage: Predict with the fitted pipeline
y_pred_rf = rf_pipeline.predict(X_test)
y_pred_rf

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [56]:
# Fit the pipeline on your data
knn_pipeline.fit(X_train, y_train)

# Example usage: Predict with the fitted pipeline
y_pred_knn = knn_pipeline.predict(X_test)
y_pred_knn

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [57]:
# Fit the pipeline on your data
bagging_pipeline.fit(X_train, y_train)

# Example usage: Predict with the fitted pipeline
y_pred_bagging = bagging_pipeline.predict(X_test)
y_pred_bagging

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [62]:
print("DecisionTreeClassifier Accuracy = ",accuracy_score(y_test, y_pred_tree))
print("RandomForestClassifier Accuracy = ",accuracy_score(y_test,y_pred_rf))
print("KNeighborsClassifier Accuracy = ",accuracy_score(y_test,y_pred_knn))
print("BaggingClassifier Accuracy = ",accuracy_score(y_test,y_pred_bagging))

DecisionTreeClassifier Accuracy =  0.8188055941854058
RandomForestClassifier Accuracy =  0.8652619978483902
KNeighborsClassifier Accuracy =  0.8713363595812232
BaggingClassifier Accuracy =  0.8603290388601716


In [None]:
It seems that KNeighborsClassifier has the best accuracy score. So I am going to use this model on submission.csv

In [66]:
# Predicting response for unseen test data sets
test_KNN = ptest.drop(columns="id", inplace=False, errors="ignore")
predictions_test_KNN = knn_pipeline.predict(test_KNN)
predictions_test_KNN

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [70]:
submission_Pipeline = pd.DataFrame(data = {'id': ptest['id'], 'Response': predictions_test_KNN})
submission_Pipeline.to_csv('submission_Pipeline.csv', index = False)
submission_Pipeline.head()

Unnamed: 0,id,Response
0,381110,0
1,381111,0
2,381112,0
3,381113,0
4,381114,0
