Vehicle Insurance Claims Prediction

The "Vehicle Insurance Claims Prediction" project is based on the "Allstate Claim Prediction Challenge" hosted on Kaggle (13-07-2011), which focuses on predicting insurance claim payments. The goal is to build predictive models that estimate claim payments from historical data and relevant features, applying machine learning techniques, data preprocessing, and feature engineering to improve prediction accuracy.

Import necessary libraries:

In [36]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress every Python warning raised from this point on. Note that this is a
# blanket filter: it also hides deprecation warnings and other notices that
# could point at real problems in later cells.
warnings.filterwarnings('ignore')

Import data:

In [46]:
# Directory that holds the competition CSV files. Kept in one place so the
# notebook can be pointed at a different location by editing a single line.
DATA_DIR = Path(r'D:\ML_Projects\Vehicle_Insurance_Claims_Prediction')

# Training set: the feature columns plus the 'Claim_Amount' target.
train_data = pd.read_csv(DATA_DIR / 'train_set.csv')
y_train = train_data['Claim_Amount']

# Test set: 'Claim_Amount' is taken from example_entry.csv by joining on 'Row_ID'.
# how='left' keeps every row of the test file even if it has no match.
# NOTE(review): example_entry.csv looks like a sample-submission file - confirm
# that its 'Claim_Amount' values are real labels before evaluating against y_test.
test_data = pd.read_csv(DATA_DIR / 'test_set.csv')
example_entry = pd.read_csv(DATA_DIR / 'example_entry.csv')
test_data = test_data.merge(example_entry, on='Row_ID', how='left')
y_test = test_data['Claim_Amount']

In [60]:
# Row counts of each split and of the combined data, computed once.
n_train = len(train_data)
n_test = len(test_data)
n_total = n_train + n_test

print('Number of training samples: ', format(n_train, ','))
print('Number of test samples: ', format(n_test, ','))
# Scale to percent *before* rounding. Rounding the fraction first and then
# multiplying by 100 can expose binary floating-point noise in the printed
# value (e.g. 0.57 * 100 == 56.99999999999999); rounding last always prints
# a clean one-decimal number.
print('Train % : ', round(n_train / n_total * 100, 1), '%')
print('Test %  : ', round(n_test / n_total * 100, 1), '%')

Number of training samples:  13,184,290
Number of test samples:  4,314,865
Train % :  75.3 %
Test %  :  24.7 %


In [63]:
# List the column index of each split, training first, then test.
for heading, frame in (
    ('Features in training data: \n', train_data),
    ('Features in test data: \n', test_data),
):
    print(heading, frame.columns)

Features in training data: 
 Index(['Row_ID', 'Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year',
       'Blind_Make', 'Blind_Model', 'Blind_Submodel', 'Cat1', 'Cat2', 'Cat3',
       'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8', 'Cat9', 'Cat10', 'Cat11',
       'Cat12', 'OrdCat', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6',
       'Var7', 'Var8', 'NVCat', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
       'Claim_Amount'],
      dtype='object')
Features in test data: 
 Index(['Row_ID', 'Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year',
       'Blind_Make', 'Blind_Model', 'Blind_Submodel', 'Cat1', 'Cat2', 'Cat3',
       'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8', 'Cat9', 'Cat10', 'Cat11',
       'Cat12', 'OrdCat', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6',
       'Var7', 'Var8', 'NVCat', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
       'Claim_Amount'],
      dtype='object')


In [37]:
# Preview the first rows of the training data (head() shows 5 rows by default).
train_data.head()

Unnamed: 0,Household_ID,Vehicle,Calendar_Year,Model_Year,Blind_Make,Blind_Model,Blind_Submodel,Cat1,Cat2,Cat3,...,Var4,Var5,Var6,Var7,Var8,NVCat,NVVar1,NVVar2,NVVar3,NVVar4
0,1,3,2005,2005,K,K.78,K.78.2,D,C,F,...,0.908351,1.008912,0.26104,0.907793,-0.077998,M,-0.23153,-0.266117,-0.272337,-0.251419
1,2,2,2005,2003,Q,Q.22,Q.22.3,B,C,A,...,0.485509,1.240851,0.432987,-0.726459,0.204785,O,-0.23153,-0.266117,-0.272337,-0.251419
2,3,1,2005,1998,AR,AR.41,AR.41.1,B,?,A,...,-1.679445,-0.971487,-1.405797,-0.837048,-1.176858,F,-0.23153,-0.266117,-0.272337,-0.251419
3,3,1,2006,1998,AR,AR.41,AR.41.1,B,?,A,...,-1.679445,-0.971487,-1.405797,-0.837048,-1.176858,F,-0.23153,-0.266117,-0.272337,-0.251419
4,3,2,2005,2001,D,D.20,D.20.0,J,C,B,...,1.838605,0.812656,2.112691,1.534462,2.34726,F,-0.23153,-0.266117,-0.272337,-0.251419


In [66]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_data.describe()

Unnamed: 0,Row_ID,Household_ID,Vehicle,Calendar_Year,Model_Year,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,NVVar1,NVVar2,NVVar3,NVVar4,Claim_Amount
count,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0,13184290.0
mean,6592146.0,4128242.0,1.894002,2006.052,1999.312,-0.01011925,-0.06508702,-0.02543391,-0.05456792,0.003838594,-0.04012271,-0.02421288,-0.05856059,0.0146841,0.01751169,0.01354226,0.01851376,1.360658
std,3805977.0,2248415.0,1.173861,0.812372,5.211866,0.9800609,0.9684165,1.018902,0.968017,0.991049,0.9792078,1.006433,1.003954,1.03104,1.038212,1.027748,1.034274,39.00103
min,1.0,1.0,1.0,2005.0,1981.0,-2.578222,-2.493393,-2.790335,-2.508216,-3.350344,-2.376657,-2.778491,-2.163042,-0.2315299,-0.2661168,-0.2723372,-0.2514189,0.0
25%,3296073.0,2184932.0,1.0,2005.0,1996.0,-0.6658971,-0.8161519,-0.8696874,-0.7830189,-0.6860239,-0.688765,-0.8984857,-0.651768,-0.2315299,-0.2661168,-0.2723372,-0.2514189,0.0
50%,6592146.0,4257083.0,2.0,2006.0,2000.0,-0.3123581,-0.1245062,-0.2217581,-0.1064709,-0.1150981,-0.2372568,-0.4684193,-0.2568567,-0.2315299,-0.2661168,-0.2723372,-0.2514189,0.0
75%,9888218.0,6281433.0,2.0,2007.0,2003.0,0.4429298,0.4806838,0.7269956,0.4855086,0.5331405,0.4973212,0.8217801,0.3409799,-0.2315299,-0.2661168,-0.2723372,-0.2514189,0.0
max,13184290.0,7542113.0,29.0,2007.0,2009.0,5.143392,7.82942,5.563325,7.589263,4.018167,4.584289,4.127148,47.35074,6.62711,8.883081,8.691144,6.388802,11440.75
