# Predicting the claim amount from an automobile insurance data set.

# Load and take a look at the data

In [None]:
# Load relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load data
# Read the claims CSV into the DataFrame `ds`; the path is relative to the
# current working directory.
ds=pd.read_csv('Auto_Insurance_Claims_amount.csv')

In [None]:
# Preview the first rows of the raw data.
ds.head()

In [None]:
# (number of rows, number of columns) of the raw data.
ds.shape

In [None]:
# Data type of every column.
ds.dtypes

# Check and treat null values

In [None]:
# Heatmap of the boolean null mask: missing cells stand out from present ones.
sns.heatmap(ds.isnull())

In [None]:
# Number of missing values in each column.
ds.isnull().sum()

# Check correlation

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# text columns at this point; older pandas skipped them silently, while
# recent pandas versions raise an error when asked to correlate them.
dscor=ds.select_dtypes(include='number').corr()
sns.heatmap(dscor, annot=True)

So far, the columns that correlate best with the target column are:
    Monthly Premium Auto, Total Claim Amount. Will check again after encoding the data.

# Summary statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
ds.describe()

The standard deviation is very large in some cases.

# Data visualizations

In [None]:
# List the column names available for plotting.
ds.columns

In [None]:
# Check for outliers
# Box plot of the target column 'Claim Amount'; points drawn beyond the
# whiskers are outlier candidates.
ds['Claim Amount'].plot.box()

Some outliers present, but not extreme ones.

In [None]:
# Count the number of rows for each value of 'Response'.
ds.groupby('Response')['Response'].count()

In [None]:
# Bar chart of the number of rows per 'Response' value. The column is named
# by keyword: recent seaborn releases treat the first positional argument as
# `data`, not as the x variable.
sns.countplot(x='Response', data=ds)

Far more claims were rejected than granted.

In [None]:
# Bar height is the mean 'Claim Amount' for each 'Response' value, split by 'Coverage'.
sns.catplot(data=ds, kind='bar', x='Response', y='Claim Amount', hue='Coverage')

The type of coverage affects the amounts that people claim for proportionally. It does not reflect in the amounts that are approved though. The Basic and Extended coverages have very similar claims amounts that are approved.

In [None]:
# Bar height is the mean 'Total Claim Amount' for each 'Response' value, split by 'Coverage'.
sns.catplot(data=ds, kind='bar', x='Response', y='Total Claim Amount', hue='Coverage')

The total claim amounts reflect the coverage levels proportionally. It does not seem to affect the approval of claims otherwise.

In [None]:
# Bar height is the mean 'Monthly Premium Auto' for each 'Response' value, split by 'Coverage'.
sns.catplot(data=ds, kind='bar', x='Response', y='Monthly Premium Auto', hue='Coverage')

Higher monthly payments generally lead to greater claim approval rates except for the Basic coverage.

In [None]:
# Bar height is the mean 'Monthly Premium Auto' for each 'Response' value, split by 'Gender'.
sns.catplot(data=ds, kind='bar', x='Response', y='Monthly Premium Auto', hue='Gender')

Females who have their claims approved seem to pay a noticeably higher monthly premium.

# Encode categorical columns to numeric values

In [None]:
# NOTE: plain assignment does not copy. `eds` is a second name for the same
# DataFrame object as `ds`, so any change made through `eds` (such as the
# encoding below) is also visible through `ds`.
eds=ds

In [None]:
# Column names, used to pick the columns to encode.
eds.columns

In [None]:
# Column data types before encoding.
eds.dtypes

In [None]:
# Convert the categorical columns listed in `cols` to integer codes.
# One LabelEncoder is fitted per column and kept in `encoders`, so every code
# can be mapped back to its label with encoders[col].inverse_transform(...).
# NOTE: LabelEncoder numbers the labels in sorted order; if 'Effective To Date'
# is stored as text, its codes follow text order rather than calendar order.

from sklearn.preprocessing import LabelEncoder
cols=['State','Response', 'Coverage', 'Education','Effective To Date',
       'EmploymentStatus', 'Gender', 'Location Code',
       'Marital Status', 'Policy Type', 'Policy', 'Claim Reason',
       'Sales Channel', 'Vehicle Class', 'Vehicle Size']
encoders={}
for col in cols:
    encoders[col]=LabelEncoder()
    eds[col] = encoders[col].fit_transform(eds[col])
# Kept for compatibility: `le` used to be the single shared encoder, which
# ended up fitted on the last column of `cols`.
le=encoders[cols[-1]]

In [None]:
# Preview the frame after encoding.
eds.head()

# Check correlation after encoding

In [None]:
# Correlation matrix after encoding, over numeric columns only. Columns that
# were not label-encoded may still hold text, which recent pandas versions
# refuse to correlate (older versions skipped such columns silently).
dscor=eds.select_dtypes(include='number').corr()
sns.heatmap(dscor)

Strongest correlation with the target column:
    Total Claim Amount, Monthly Premium Auto, Coverage
Most of the other columns show some correlation.
Claim Reason has the weakest correlation.

# Drop irrelevant columns

In [None]:
# Column names, used to choose the columns to drop.
eds.columns

In [None]:
# Column data types after encoding.
eds.dtypes

In [None]:
# Drop the columns that are not used for modelling - dataset renamed to nds (new dataset).
# The drop starts from the encoded frame `eds`: the z-score step further down
# needs numeric data, and starting from `ds` only worked because `eds` and
# `ds` currently refer to the same object.
nds=eds.drop(columns=['Customer', 'Country', 'State Code'])

# Find and remove outliers

In [None]:
# (rows, columns) before outlier removal.
nds.shape

In [None]:
# Show a box plot for every column to check for outliers.
collist=nds.columns.values
ncol=6
# Just enough rows to hold one panel per column (at least one row).
nrows=max(1,int(np.ceil(len(collist)/ncol)))

plt.figure(figsize=(4*ncol,4*nrows))
# Enumerate from 0 so the first column is plotted too and no subplot slot is
# left empty (subplot positions are 1-based, hence i+1).
for i,col in enumerate(collist):
    plt.subplot(nrows,ncol,i+1)
    # y= gives a vertical box and keeps working with seaborn versions that
    # treat the first positional argument as `data`.
    sns.boxplot(y=nds[col], color='red')
# Adjust spacing once, after all panels exist.
plt.tight_layout()

In [None]:
# Remove outliers - dataset renamed to cds (clean dataset)
from scipy import stats
from scipy.stats import zscore

# Absolute z-score of every value, computed per column.
z_score=np.abs(zscore(nds))
print(nds.shape)
# Keep only the rows in which every column has |z| < 3.
# .copy() makes cds an independent frame, so later column assignments on cds
# do not operate on a slice of nds.
cds=nds.loc[(z_score<3).all(axis=1)].copy()
print(cds.shape)

# Check and adjust skewness

In [None]:
# Show the skewness of each column (less than 0.55 is ok)
cds.skew()

In [None]:
# Treat skewness using log
# Skewness is computed once: transforming one column does not change the
# skewness of the others, so recomputing it for the whole frame on every
# iteration is wasted work.
# NOTE: every column above the threshold is transformed, which includes the
# target 'Claim Amount' when it is skewed.
skewness=cds.skew()
skewed_cols=skewness[skewness>0.55].index
cds=cds.copy()  # assign into an independent frame, not a slice of another one
for col in skewed_cols:
    cds[col]=np.log1p(cds[col])  # log(1 + x)

In [None]:
# Skewness of each column after the log transform.
cds.skew()

# Split data into x and y

In [None]:
# Column names, used to separate the target from the inputs.
cds.columns

In [None]:
# Line up input features and target values
cds_x=cds.drop(columns=['Claim Amount'])
# The target is selected as a 1-D Series: scikit-learn regressors expect y of
# shape (n_samples,) and emit a conversion warning for a one-column DataFrame.
y=cds['Claim Amount']

# Apply scaler

In [None]:
#Scaling for linear regression
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so the test rows contribute to the scaling statistics.
sc=StandardScaler()
x=sc.fit_transform(cds_x)
# Rebuild the DataFrame with the original row labels as well as the column
# names, so the rows of x stay aligned with the rows of cds they came from.
x=pd.DataFrame(x,columns=cds_x.columns,index=cds_x.index)

In [None]:
# Skewness of the scaled features.
x.skew()

# Train-Test split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=55,
)

# Linear Model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# `sklearn.externals.joblib` has been removed from scikit-learn, so joblib is
# imported directly; the bundled copy is only a fallback for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn import linear_model

In [None]:
# Try the train/test splits produced by random_state 42..99 and remember the
# one on which a linear regression reaches the highest test-set R2.
# NOTE(review): picking the split by its test score gives an optimistic
# figure; the cross-validated score is the more trustworthy estimate.
max_r_score=-np.inf   # R2 can be negative, so 0 is not a safe starting value
final_r_state=None    # stays defined even if no split beats the start value
for r_state in range(42,100):
    x_train,x_test, y_train, y_test = train_test_split(x,y, random_state=r_state, test_size=0.20)
    regr=linear_model.LinearRegression()
    regr.fit(x_train,y_train)
    y_pred=regr.predict(x_test)
    r2_scr=r2_score(y_test,y_pred)
    if r2_scr>max_r_score:
        max_r_score=r2_scr
        final_r_state=r_state
print("Max r2 score for",final_r_state,"is", max_r_score)

In [None]:
# Crossvalidation: mean R2 of a linear regression over 10 folds of the full data
lin_cv_scores=cross_val_score(
    linear_model.LinearRegression(),
    x,
    y,
    scoring='r2',
    cv=10,
)
lin_cv_scores.mean()

In [None]:
# The R2 search gave a random state of 71: redo the split with it, then fit
# the linear model and predict on the held-out rows.
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=71)
lreg=linear_model.LinearRegression().fit(x_train,y_train)  # fit() returns the estimator itself
y_pred=lreg.predict(x_test)

In [None]:
# Hold-out metrics of the linear model: R2 and root mean squared error
test_r2=r2_score(y_test,y_pred)
test_rmse=np.sqrt(mean_squared_error(y_test, y_pred))
print('r score is: ',test_r2)
print('RMSE is: ', test_rmse)

Very poor performance

# Other regression models

In [None]:
# Compare SVR kernels
from sklearn.svm import SVR

kernellist=['linear','poly','rbf']
for i in kernellist:
    sv=SVR(kernel=i)
    # SVR expects a 1-D target; ravel accepts a Series or a one-column frame.
    sv.fit(x_train,np.ravel(y_train))
    # Print the kernel name and the held-out R2 next to the training R2: the
    # training score alone does not show how well the model generalises.
    print(i,'train r2:',sv.score(x_train,y_train),'test r2:',sv.score(x_test,y_test))

In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
# Candidate regularisation strengths to compare
alphavalue={'alpha':[1,0.1,0.01,0.001,0.0001,0]}
# Grid-search a Ridge regression over the candidates on the training split
model=Ridge()
grid=GridSearchCV(param_grid=alphavalue,estimator=model).fit(x_train,y_train)

# Alpha of the best estimator found by the search

print(grid.best_estimator_.alpha)

In [None]:
# Create and fit a Lasso regression model to test each alpha
# alpha=0 is left out of the grid: scikit-learn's Lasso documentation advises
# against it for numerical reasons (it is plain least squares; LinearRegression
# is the recommended estimator for that case).
lasso_alphas={'alpha':[a for a in alphavalue['alpha'] if a>0]}
model2=Lasso()
grid=GridSearchCV(estimator=model2,param_grid=lasso_alphas)
grid.fit(x_train,y_train)

#Summarize the results of the grid search

print(grid.best_estimator_.alpha)

In [None]:
# Ridge regression with alpha=1 fitted on the training split.
# The displayed value is R2 measured on that same training data.
rd=Ridge(alpha=1).fit(x_train,y_train)  # fit() returns the estimator itself
rd.score(x_train,y_train)

In [None]:
# Cross validation score for Ridge Regressor: mean R2 over 10 folds of the full data
ridge_cv_scores=cross_val_score(
    linear_model.Ridge(alpha=1),
    x,
    y,
    scoring='r2',
    cv=10,
)
ridge_cv_scores.mean()

In [None]:
# Lasso regression with alpha=0.001 fitted on the training split.
# The displayed value is R2 measured on that same training data.
las=Lasso(alpha=0.001).fit(x_train,y_train)  # fit() returns the estimator itself
las.score(x_train,y_train)

In [None]:
# Use Gradient Boosting technique with GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Search over learning rate and number of trees with 5-fold cross-validation
# on the full data. GridSearchCV trains clones, so `gbr` itself stays unfitted.
gbr=GradientBoostingRegressor()
parameters={
    'learning_rate':[0.001,0.01,0.1,1],
    'n_estimators':[10,100,500,1000],
}
clf=GridSearchCV(gbr,parameters, cv=5).fit(x,y)
clf.best_params_

In [None]:
# Use CrossValScore with Gradient Boosting to check r2 mean and standard deviation
# Cross-validate once and reuse the fold scores for both statistics: running
# it twice doubles the training time, and since the regressor is created
# without a random_state the two runs may not produce the same scores.
gbr_cv_scores=cross_val_score(gbr,x,y,cv=5,scoring='r2')
print('Mean r2 score for GradientBoosting Regression:', gbr_cv_scores.mean())
print('Standard deviation in r2 score for GradientBoosting Regression:',gbr_cv_scores.std())

I chose the GradientBoosting Regression model because it has the best scores.
Mean r2 score: 91.4
Standard deviation in r2 score : 0.4


# Save the model

In [None]:
#save model as a pickle file
# `sklearn.externals.joblib` has been removed from scikit-learn, so joblib is
# imported directly; the bundled copy is only a fallback for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Save the fitted estimator selected by the grid search. `gbr` itself is never
# fitted (GridSearchCV and cross_val_score train clones of it), so dumping
# `gbr` would store an untrained model that cannot predict.
joblib.dump(clf.best_estimator_,'Claims.pkl')

In [None]:
# Load and use model to make a prediction
# The file name must match the one written by joblib.dump ('Claims.pkl').
model=joblib.load('Claims.pkl')
model.predict(x_test)