# Kaggle Playground Series: Regression with an Insurance Dataset

### Project Prerequisites

In [95]:
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

### Preparing the Data

In [88]:
df = pd.read_csv("Data/train.csv")
df.drop(columns = ["Health Score","Policy Start Date","Customer Feedback","Marital Status"],inplace = True)
df.set_index("id",inplace = True)
df.head()

Unnamed: 0_level_0,Age,Gender,Annual Income,Number of Dependents,Education Level,Occupation,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Exercise Frequency,Property Type,Premium Amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,19.0,Female,10049.0,1.0,Bachelor's,Self-Employed,Urban,Premium,2.0,17.0,372.0,5.0,No,Weekly,House,2869.0
1,39.0,Female,31678.0,3.0,Master's,,Rural,Comprehensive,1.0,12.0,694.0,2.0,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,3.0,High School,Self-Employed,Suburban,Premium,1.0,14.0,,3.0,Yes,Weekly,House,567.0
3,21.0,Male,141855.0,2.0,Bachelor's,,Rural,Basic,1.0,0.0,367.0,1.0,Yes,Daily,Apartment,765.0
4,21.0,Male,39651.0,1.0,Bachelor's,Self-Employed,Rural,Premium,0.0,8.0,598.0,4.0,Yes,Weekly,House,2022.0


In [89]:
print(df["Education Level"].value_counts(),end = "\n\n")
print(df["Occupation"].value_counts(),end = "\n\n")
print(df["Location"].value_counts(),end = "\n\n")
print(df["Policy Type"].value_counts(),end = "\n\n")
print(df["Property Type"].value_counts(),end = "\n\n")
print(df["Exercise Frequency"].value_counts())

Education Level
Master's       303818
PhD            303507
Bachelor's     303234
High School    289441
Name: count, dtype: int64

Occupation
Employed         282750
Self-Employed    282645
Unemployed       276530
Name: count, dtype: int64

Location
Suburban    401542
Rural       400947
Urban       397511
Name: count, dtype: int64

Policy Type
Premium          401846
Comprehensive    399600
Basic            398554
Name: count, dtype: int64

Property Type
House        400349
Apartment    399978
Condo        399673
Name: count, dtype: int64

Exercise Frequency
Weekly     306179
Monthly    299830
Rarely     299420
Daily      294571
Name: count, dtype: int64


In [90]:
df = pd.concat(
    [df, pd.get_dummies(df["Education Level"], prefix="Education_Level").astype(int)], axis=1
)
df = pd.concat(
    [df, pd.get_dummies(df["Exercise Frequency"], prefix="Exercise_Frequency").astype(int)], axis=1
)
df = pd.concat(
    [df, pd.get_dummies(df["Location"], prefix="Location").astype(int)], axis=1
)
df = pd.concat(
    [df, pd.get_dummies(df["Occupation"], prefix="Occupation").astype(int)], axis=1
)
df = pd.concat(
    [df, pd.get_dummies(df["Policy Type"], prefix="Policy_Type").astype(int)], axis=1
)

df = pd.concat(
    [df, pd.get_dummies(df["Property Type"], prefix="Property_Type").astype(int)], axis=1
)

In [91]:
df["Gender"] = df["Gender"].apply(lambda x: 1 if x=="Male" else 0)
df["Smoking Status"] = df["Smoking Status"].apply(lambda x: 1 if x=="Yes" else 0)

df.drop(["Education Level", "Exercise Frequency", "Location", "Occupation","Policy Type","Property Type"], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,Age,Gender,Annual Income,Number of Dependents,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Premium Amount,...,Location_Urban,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Policy_Type_Basic,Policy_Type_Comprehensive,Policy_Type_Premium,Property_Type_Apartment,Property_Type_Condo,Property_Type_House
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,19.0,0,10049.0,1.0,2.0,17.0,372.0,5.0,0,2869.0,...,1,0,1,0,0,0,1,0,0,1
1,39.0,0,31678.0,3.0,1.0,12.0,694.0,2.0,1,1483.0,...,0,0,0,0,0,1,0,0,0,1
2,23.0,1,25602.0,3.0,1.0,14.0,,3.0,1,567.0,...,0,0,1,0,0,0,1,0,0,1
3,21.0,1,141855.0,2.0,1.0,0.0,367.0,1.0,1,765.0,...,0,0,0,0,1,0,0,1,0,0
4,21.0,1,39651.0,1.0,0.0,8.0,598.0,4.0,1,2022.0,...,0,0,1,0,0,0,1,0,0,1


In [92]:
print(df["Number of Dependents"].dtype)
print(df["Previous Claims"].dtype)
df["Number of Dependents"] = pd.to_numeric(df["Number of Dependents"], errors="coerce")
df["Previous Claims"] = pd.to_numeric(df["Previous Claims"], errors="coerce")
print(df["Number of Dependents"].dtype)
print(df["Previous Claims"].dtype)

float64
float64
float64
float64


In [93]:
df["Number of Dependents"].fillna(df["Number of Dependents"].median(),inplace = True)
df["Previous Claims"].fillna(df["Previous Claims"].median(),inplace = True)
df["Vehicle Age"].dropna(how = "any",inplace=True)
df["Credit Score"].fillna(df["Credit Score"].mean(),inplace = True)
df.dropna(how = "any",axis = 1,inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Number of Dependents"].fillna(df["Number of Dependents"].median(),inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Previous Claims"].fillna(df["Previous Claims"].median(),inplace = True)
The behavior will change in pandas 3.0. This inplace method will never

Unnamed: 0_level_0,Gender,Number of Dependents,Previous Claims,Credit Score,Smoking Status,Premium Amount,Education_Level_Bachelor's,Education_Level_High School,Education_Level_Master's,Education_Level_PhD,...,Location_Urban,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Policy_Type_Basic,Policy_Type_Comprehensive,Policy_Type_Premium,Property_Type_Apartment,Property_Type_Condo,Property_Type_House
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1.0,2.0,372.0,0,2869.0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,1
1,0,3.0,1.0,694.0,1,1483.0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
2,1,3.0,1.0,592.92435,1,567.0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
3,1,2.0,1.0,367.0,1,765.0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,1.0,0.0,598.0,1,2022.0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
