# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the Data for ML Project

In [2]:
df=pd.read_csv('insurance_renewal.csv')

In [3]:
df.shape

(9134, 23)

In [4]:
df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/2011,Employed,F,56274,...,32,5,0,1,Corporate Auto,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/2011,Unemployed,F,0,...,13,42,0,8,Personal Auto,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/2011,Employed,F,48767,...,18,38,0,2,Personal Auto,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/2011,Unemployed,M,0,...,18,65,0,7,Corporate Auto,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/2011,Employed,M,43836,...,12,44,0,1,Personal Auto,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [5]:
df.dtypes

Customer                          object
State                             object
Customer Lifetime Value          float64
Response                          object
Coverage                          object
Education                         object
Effective To Date                 object
EmploymentStatus                  object
Gender                            object
Income                             int64
Location Code                     object
Marital Status                    object
Monthly Premium Auto               int64
Months Since Last Claim            int64
Months Since Policy Inception      int64
Number of Open Complaints          int64
Number of Policies                 int64
Policy Type                       object
Renew Offer Type                  object
Sales Channel                     object
Total Claim Amount               float64
Vehicle Class                     object
Vehicle Size                      object
dtype: object

# Setting Display options to ensure feature name visibility

In [6]:
pd.set_option('display.max_columns',None)

# Warning Suppression 

In [7]:
import warnings
# NOTE: called without a category or module filter, this silences every
# warning for the rest of the session, including library deprecation and
# convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

# How many rows have missing ID ?

In [9]:
df['Customer'].isnull().sum()

0

# Drop ID Feature from the dataset

In [10]:
# Remove the 'Customer' identifier column from the working frame.
df = df.drop(columns=['Customer'])

In [11]:
# Drop the following feature since we have 'Months Since Policy Inception'
# Effective To Date

In [12]:
# Remove the 'Effective To Date' column from the working frame.
df = df.drop(columns=['Effective To Date'])

In [13]:
# Labeling the Target Variable (1/0)

In [14]:
df['Response'].value_counts()

No     7826
Yes    1308
Name: Response, dtype: int64

In [15]:
# Binary target: 1 where Response is "Yes", 0 for every other value.
df['Target'] = df['Response'].eq("Yes").astype(int)

In [16]:
# 'Response' is now represented by 'Target', so drop the original column.
df = df.drop(columns=['Response'])

# Defining Target and Independent Features

In [17]:
# Y stays a one-column DataFrame (later cells index it as Y['Target']);
# X holds every remaining column.
Y = df.loc[:, ['Target']]
X = df.drop(columns=['Target'])

# Get the Average Response Rate (share of customers with Response = "Yes")

In [18]:
Y.mean()

Target    0.143201
dtype: float64

# Split features into Numerical and Categorical

In [19]:
# Numeric columns go to `num`; object-typed (string) columns go to `char`.
num = X.select_dtypes(include=[np.number])
char = X.select_dtypes(include=[object])

In [20]:
num.head()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
0,2763.519279,56274,69,32,5,0,1,384.811147
1,6979.535903,0,94,13,42,0,8,1131.464935
2,12887.43165,48767,108,18,38,0,2,566.472247
3,7645.861827,0,106,18,65,0,7,529.881344
4,2813.692575,43836,73,12,44,0,1,138.130879


In [22]:
# checking the levels within the data. 
def unique_level(x):
    """Return the number of distinct non-null values in the Series ``x``."""
    return x.nunique()
# One row per numeric feature, holding its count of distinct values
# (the single column is labelled 0 until it is renamed).
df_value_counts = num.apply(unique_level).to_frame()
df_value_counts

Unnamed: 0,0
Customer Lifetime Value,8041
Income,5694
Monthly Premium Auto,202
Months Since Last Claim,36
Months Since Policy Inception,100
Number of Open Complaints,6
Number of Policies,9
Total Claim Amount,5106


In [23]:
df_value_counts.columns=['feature_levels']
df_value_counts

Unnamed: 0,feature_levels
Customer Lifetime Value,8041
Income,5694
Monthly Premium Auto,202
Months Since Last Claim,36
Months Since Policy Inception,100
Number of Open Complaints,6
Number of Policies,9
Total Claim Amount,5106


In [21]:
# Dropping Number of Open Complaints and Number of Policies from the num dataframe. We will treat them as categorical features

In [24]:
# Move the two low-cardinality count features out of `num` and into `ind`.
low_cardinality_cols = ['Number of Open Complaints', 'Number of Policies']
ind = num.loc[:, low_cardinality_cols]
num = num.drop(columns=low_cardinality_cols)

In [25]:
ind.head(2)

Unnamed: 0,Number of Open Complaints,Number of Policies
0,0,1
1,0,8


In [26]:
ind.shape

(9134, 2)

# Outlier Analysis of Numerical Features

In [27]:
num.describe(percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Total Claim Amount
count,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0
mean,8004.940475,37657.380009,93.219291,15.097,48.064594,434.088794
std,6870.967608,30379.904734,34.407967,10.073257,27.905991,290.500092
min,1898.007675,0.0,61.0,0.0,0.0,0.099007
1%,2230.433731,0.0,61.0,0.0,1.0,10.402835
5%,2475.109047,0.0,62.0,1.0,4.0,52.261227
10%,2661.757835,0.0,64.0,2.0,10.0,104.084855
25%,3994.251794,0.0,68.0,6.0,24.0,272.258244
50%,5780.182197,33889.5,83.0,14.0,48.0,383.945434
75%,8962.167041,62320.0,109.0,23.0,71.0,547.514839


# Capping and Flooring of outliers

In [28]:
def outlier_cap(x, lower_q=0.01, upper_q=0.99):
    """Cap and floor a numeric Series at two of its own quantiles.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to winsorize.
    lower_q : float, default 0.01
        Quantile used as the floor; smaller values are raised to it.
    upper_q : float, default 0.99
        Quantile used as the cap; larger values are lowered to it.

    Returns
    -------
    pandas.Series
        A clipped copy of ``x`` with the same index.

    Raises
    ------
    ValueError
        If ``lower_q`` is greater than ``upper_q``.
    """
    if lower_q > upper_q:
        raise ValueError("lower_q must not exceed upper_q")
    # Take both bounds from the untouched data. Computing the cap after the
    # floor has been applied shifts the upper quantile on very small samples.
    floor, cap = x.quantile([lower_q, upper_q])
    return x.clip(lower=floor, upper=cap)

In [29]:
# Cap and floor every numeric column independently.
num = num.apply(outlier_cap)

In [30]:
num.describe(percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Total Claim Amount
count,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0
mean,7913.817819,37645.405877,92.922448,15.097,48.065141,430.55138
std,6344.938386,30355.858544,33.02085,10.073257,27.875048,273.819399
min,2230.433731,0.0,61.0,0.0,1.0,10.402835
1%,2230.539986,0.0,61.0,0.0,1.0,10.409413
5%,2475.109047,0.0,62.0,1.0,4.0,52.261227
10%,2661.757835,0.0,64.0,2.0,10.0,104.084855
25%,3994.251794,0.0,68.0,6.0,24.0,272.258244
50%,5780.182197,33889.5,83.0,14.0,48.0,383.945434
75%,8962.167041,62320.0,109.0,23.0,71.0,547.514839


# Missing Value Analysis - Numerical

In [31]:
num.isnull().mean()

Customer Lifetime Value          0.0
Income                           0.0
Monthly Premium Auto             0.0
Months Since Last Claim          0.0
Months Since Policy Inception    0.0
Total Claim Amount               0.0
dtype: float64

# Missing Value Handling - Numerical Features (Imputation with Mean)

In [None]:
# We are not performing this step since Missing Values are not present

# Missing Value Analysis- Categorical

In [32]:
char.isnull().mean()

State               0.0
Coverage            0.0
Education           0.0
EmploymentStatus    0.0
Gender              0.0
Location Code       0.0
Marital Status      0.0
Policy Type         0.0
Renew Offer Type    0.0
Sales Channel       0.0
Vehicle Class       0.0
Vehicle Size        0.0
dtype: float64

# Missing Value Handling - Categorical Features (Imputation with Mode)

In [33]:
# We are not performing this step since Missing Values are not present

# Feature Selection - Numerical Features 

# Part 1 : Remove Features with 0 Variance

In [34]:
from sklearn.feature_selection import VarianceThreshold

# Fit the zero-variance filter; only the fitted support mask is needed.
varselector = VarianceThreshold(threshold=0).fit(num)
# Positions of the columns that survive, then the matching sub-frame.
cols = varselector.get_support(indices=True)
num_1 = num[num.columns[cols]]

In [35]:
num_1.iloc[0]

Customer Lifetime Value           2763.519279
Income                           56274.000000
Monthly Premium Auto                69.000000
Months Since Last Claim             32.000000
Months Since Policy Inception        5.000000
Total Claim Amount                 384.811147
Name: 0, dtype: float64

# Part 2 - Bi Variate Analysis (Feature Discretization)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
# Discretize each numeric feature into up to 10 quantile bins (ordinal codes).
discrete = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
binned_values = discrete.fit_transform(num_1)
num_binned = pd.DataFrame(
    binned_values,
    index=num_1.index,
    columns=[f"{name}_Rank" for name in num_1.columns],
)
num_binned.head()

In [None]:
#Check if the features show a slope at all
#If they do, then do you see some deciles below the population average and some higher than population average?
#If that is the case then the slope will be strong
#Conclusion: A strong slope is indicative of the features' ability to discriminate the event from non event
#            making it a good predictor

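# Example of a derived variable (feature engineering): percentage_income_goes_into_installments = installment / annual_inc
# NOTE(review): these columns do not appear in this insurance dataset; the example looks carried over from another project.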

# Join the target with the binned numeric features on their shared index.
# (The original call was split across lines as `pd.co` / `ncat(...)`,
# which is a syntax error.)
X_bin_combined = pd.concat([Y, num_binned], axis=1, join='inner')

# Population-average event rate, drawn in red as a reference on every chart.
overall_rate = X_bin_combined['Target'].mean()
for col in num_binned.columns:
    plt.figure()
    sns.lineplot(x=col, y=overall_rate, data=X_bin_combined, color='red')
    # Bar height = mean of Target (event rate) within each bin.
    sns.barplot(x=col, y="Target", data=X_bin_combined, estimator=np.mean)
plt.show()

In [None]:
num_1.dtypes

# Part 3 - Select K Best

In [None]:
# We are not performing this step since we are selecting all the features

In [None]:
select_features_df_num=num_1

# Feature Selection - Categorical Features 

# Part 1 - Bi Variate Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Join the target with the string features and the two count features in `ind`.
X_char_merged = pd.concat([Y, char, ind], axis=1, join='inner')

# `ind` was merged above but never plotted; chart it alongside `char` so every
# feature treated as categorical gets a bivariate view.
overall_rate = X_char_merged['Target'].mean()
for col in list(char.columns) + list(ind.columns):
    plt.figure()
    # Bar height = mean of Target (event rate) within each level.
    ax = sns.barplot(x=col, y="Target", data=X_char_merged, estimator=np.mean)
    # Horizontal reference at the population-average event rate; axhline spans
    # the whole axis regardless of how the category positions are laid out.
    ax.axhline(overall_rate, color='red')
plt.show()

In [None]:
# Create dummy features with n-1 levels.
# `ind` (Number of Open Complaints, Number of Policies) was split out of `num`
# to be treated as categorical, but it was never encoded, so both features
# dropped out of the model. Cast them to category so get_dummies encodes them
# together with the string features.
char_all = pd.concat([char, ind.astype('category')], axis=1)
X_char_dum = pd.get_dummies(char_all, drop_first=True)
X_char_dum.shape

# Part 2 - Select K Best

In [None]:
# We are not performing this step as we are choosing all the features

In [None]:
select_features_df_char=X_char_dum

# Creating the Master Feature Set for Model Development

In [None]:
X_all=pd.concat([select_features_df_char,select_features_df_num],axis=1,join="inner")

In [None]:
Y['Target'].value_counts()

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Stratify on the target: only about 14% of rows are positive, so an
# unstratified split can leave train and test with different response rates.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, Y, test_size=0.3, random_state=42, stratify=Y['Target']
)

In [None]:
print("Shape of Training Data",X_train.shape)
print("Shape of Testing Data",X_test.shape)
print("Response Rate in Training Data",y_train.mean())
print("Response Rate in Testing Data",y_test.mean())

In [None]:
# Non Linearity in feature relationships are observed which makes tree methods a good choice
# There are few options to consider among tree methods
# White Box (Completely Explainable Set of Rules) - Decision Tree
# Ensemble Methods - Random Forest (With Bagging)
# Ensemble Methods - GBM/XGBoost (Boosting)

In [None]:
from sklearn.linear_model import LogisticRegression
# max_iter is raised from the default because the features are unscaled and
# the solver may otherwise stop before converging (that warning is hidden by
# the global warnings filter).
logreg = LogisticRegression(random_state=0, max_iter=1000)
# y_train is a one-column DataFrame; flatten it to the 1-D array that
# scikit-learn classifiers expect.
logreg.fit(X_train, y_train.values.ravel())

In [None]:
# Pair every model input column with its fitted logistic-regression coefficient.
coeff_df = pd.DataFrame({
    'features': X_all.columns,
    'Coefficient Estimate': logreg.coef_[0],
})
coeff_df

In [None]:
# Building a Decision Tree Model
from sklearn.tree import DecisionTreeClassifier
dtree=DecisionTreeClassifier(criterion='gini',random_state=0)

In [None]:
np.random.seed(44)
from sklearn.model_selection import GridSearchCV
# Candidate depths and minimum split sizes, searched exhaustively with 10-fold CV.
param_dist = {
    'max_depth': [3, 5, 6, 7],
    'min_samples_split': [60, 80, 100, 120, 140],
}
tree_grid = GridSearchCV(estimator=dtree, param_grid=param_dist, cv=10, n_jobs=3)
tree_grid.fit(X_train, y_train)
print('Best Parameters using grid search: \n', tree_grid.best_params_)

In [None]:
# Final decision tree with fixed depth and minimum split size.
tree_params = dict(criterion='gini', random_state=0, max_depth=7, min_samples_split=100)
dtree = DecisionTreeClassifier(**tree_params)
dtree.fit(X_train, y_train)

In [None]:
from sklearn import tree
import pydotplus
import matplotlib.pyplot as plt
plt.figure(figsize=[50,10])
# Pass feature names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
tree.plot_tree(dtree, filled=True, fontsize=15, rounded=True,
               feature_names=list(X_all.columns))
plt.show()

In [None]:
# Building a Random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion='gini', random_state=0, max_depth=7, min_samples_split=100)
# y_train is a one-column DataFrame; flatten it to the 1-D array that
# scikit-learn classifiers expect (the conversion warning is otherwise hidden
# by the global warnings filter).
rf.fit(X_train, y_train.values.ravel())

In [None]:
import pandas as pd
# Random-forest importances, one row per training column, largest first.
feature_importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns, name='importance')
    .sort_values(ascending=False)
    .to_frame()
)
feature_importances

In [None]:
# Building a Gradient Boosting Model
from sklearn.ensemble import GradientBoostingClassifier
# 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and the old
# spelling was removed in 1.2; the two are equivalent.
gbm = GradientBoostingClassifier(criterion='squared_error', random_state=0,
                                 max_depth=7, min_samples_split=100)
# y_train is a one-column DataFrame; flatten it to the 1-D array that
# scikit-learn classifiers expect.
gbm.fit(X_train, y_train.values.ravel())

In [None]:
import pandas as pd
# Gradient-boosting importances, one row per training column, largest first.
feature_importances = (
    pd.Series(gbm.feature_importances_, index=X_train.columns, name='importance')
    .sort_values(ascending=False)
    .to_frame()
)
feature_importances

In [None]:
# Model Evaluation on training data
y_pred_logreg_train=logreg.predict(X_train)
y_pred_tree_train=dtree.predict(X_train)
y_pred_rf_train=rf.predict(X_train)
y_pred_gbm_train=gbm.predict(X_train)

In [None]:
# Model Evaluation on test data
y_pred_logreg=logreg.predict(X_test)
y_pred_tree=dtree.predict(X_test)
y_pred_rf=rf.predict(X_test)
y_pred_gbm=gbm.predict(X_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn import metrics
# Test-set scores for the logistic regression.
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("f1_score", metrics.f1_score),
):
    print(label, score_fn(y_test, y_pred_logreg))

In [None]:
# Evaluate on the held-out test split only; the original used X_all / Y,
# which mixes training rows into the matrix. `plot_confusion_matrix` was
# removed from scikit-learn; `ConfusionMatrixDisplay.from_estimator`
# (scikit-learn >= 1.0) is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(logreg, X_test, y_test)

In [None]:
from sklearn import metrics
# Test-set scores for the decision tree.
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("f1_score", metrics.f1_score),
):
    print(label, score_fn(y_test, y_pred_tree))

In [None]:
# Evaluate on the held-out test split only; the original used X_all / Y,
# which mixes training rows into the matrix. `plot_confusion_matrix` was
# removed from scikit-learn; `ConfusionMatrixDisplay.from_estimator`
# (scikit-learn >= 1.0) is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(dtree, X_test, y_test)

In [None]:
from sklearn import metrics
# Test-set scores for the random forest.
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("f1_score", metrics.f1_score),
):
    print(label, score_fn(y_test, y_pred_rf))

In [None]:
# Evaluate on the held-out test split only; the original used X_all / Y,
# which mixes training rows into the matrix. `plot_confusion_matrix` was
# removed from scikit-learn; `ConfusionMatrixDisplay.from_estimator`
# (scikit-learn >= 1.0) is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test)

In [None]:
from sklearn import metrics
# Test-set scores for the gradient boosting model.
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("f1_score", metrics.f1_score),
):
    print(label, score_fn(y_test, y_pred_gbm))

# Check for Model Performance Consistency

In [None]:
from sklearn import metrics
# Training-set scores for the gradient boosting model, for comparison with
# its test-set scores.
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("f1_score", metrics.f1_score),
):
    print(label, score_fn(y_train, y_pred_gbm_train))

In [None]:
# Evaluate on the held-out test split only; the original used X_all / Y,
# which mixes training rows into the matrix. `plot_confusion_matrix` was
# removed from scikit-learn; `ConfusionMatrixDisplay.from_estimator`
# (scikit-learn >= 1.0) is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(gbm, X_test, y_test)

In [None]:
# Lorenz Curve

In [None]:
# Decision Tree Lorenz Curve

In [None]:
# Random Forest Lorenz Curve

In [None]:
# Score every row of the master feature set with the GBM and keep the
# probability of the positive class (Target == 1).
# NOTE: X_all contains the training rows as well, so this table is not a
# pure out-of-sample view.
y_pred_prob = gbm.predict_proba(X_all)[:, 1]
# Attach the scores as a Series carrying X_all's index so the row mapping is
# explicit rather than relying on a default RangeIndex lining up.
df['pred_prob'] = pd.Series(y_pred_prob, index=X_all.index)
# rank(method='first') breaks ties so qcut can form equal-sized groups;
# the highest group number holds the highest scores.
df['P_Rank_GBM'] = pd.qcut(df['pred_prob'].rank(method='first').values, 10, duplicates='drop').codes + 1

# Per-group size, actual event rate and mean predicted probability.
# ('Actutal_event_rate' keeps its original spelling: later cells use it.)
score_groups = df.groupby('P_Rank_GBM')
rank_df_actuals = (
    score_groups['Target']
    .agg(['count', 'mean'])
    .rename(columns={'mean': 'Actutal_event_rate'})
)
rank_df_predicted = (
    score_groups['pred_prob']
    .agg(['mean'])
    .rename(columns={'mean': 'Predicted_event_rate'})
)
rank_df = pd.concat([rank_df_actuals, rank_df_predicted], axis=1, join="inner")

# Best-scored group first, so cumulative columns read as "top N groups".
sorted_rank_df = rank_df.sort_index(ascending=False)

# Events captured per group and cumulatively, as a share of all events.
sorted_rank_df['N_events'] = sorted_rank_df['count'] * sorted_rank_df['Actutal_event_rate']
sorted_rank_df['cum_events'] = sorted_rank_df['N_events'].cumsum()
sorted_rank_df['event_cap'] = sorted_rank_df['N_events'] / sorted_rank_df['N_events'].sum()
sorted_rank_df['cum_event_cap'] = sorted_rank_df['event_cap'].cumsum()

# Same for non-events.
sorted_rank_df['N_non_events'] = sorted_rank_df['count'] - sorted_rank_df['N_events']
sorted_rank_df['cum_non_events'] = sorted_rank_df['N_non_events'].cumsum()
sorted_rank_df['non_event_cap'] = sorted_rank_df['N_non_events'] / sorted_rank_df['N_non_events'].sum()
sorted_rank_df['cum_non_event_cap'] = sorted_rank_df['non_event_cap'].cumsum()

# KS: gap between cumulative event capture and cumulative non-event capture.
sorted_rank_df['KS'] = (sorted_rank_df['cum_event_cap'] - sorted_rank_df['cum_non_event_cap']).round(4)

# Baseline: share of the population in each group (what random targeting captures).
sorted_rank_df['random_cap'] = sorted_rank_df['count'] / sorted_rank_df['count'].sum()
sorted_rank_df['cum_random_cap'] = sorted_rank_df['random_cap'].cumsum()

# Decile 1 is the highest-scored group.
sorted_reindexed = sorted_rank_df.reset_index()
sorted_reindexed['Decile'] = sorted_reindexed.index + 1
# Lift: each decile's actual event rate relative to the overall event rate.
overall_event_rate = sorted_reindexed['N_events'].sum() / sorted_reindexed['count'].sum()
sorted_reindexed['Lift_over_Avg'] = sorted_reindexed['Actutal_event_rate'] / overall_event_rate
sorted_reindexed

In [None]:
# Cumulative event capture of the model versus the random-targeting baseline.
for capture_col in ("cum_event_cap", "cum_random_cap"):
    ax = sns.lineplot(x="Decile", y=capture_col, data=sorted_reindexed)

In [None]:
# Actual (red) versus predicted (grey) event rate by decile.
for rate_col, line_color in (
    ("Actutal_event_rate", "red"),
    ("Predicted_event_rate", "grey"),
):
    ax = sns.lineplot(x="Decile", y=rate_col, data=sorted_reindexed, color=line_color)

In [None]:
# Two panels sharing the decile axis: actual event rate (left) and lift (right).
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(10, 5))
rate_ax, lift_ax = axes
fig.suptitle('Effectiveness of Deciles based on Model Probabilities')
rate_ax.set_title('Rank Ordering of Actual Event Rate')
lift_ax.set_title('Lift over Mean Event Rate')
sns.lineplot(data=sorted_reindexed, x="Decile", y="Actutal_event_rate", color='red', ax=rate_ax)
sns.barplot(data=sorted_reindexed, x="Decile", y="Lift_over_Avg", color='grey', ax=lift_ax)

In [None]:
# Project Conclusion :- 
# The GBM Model has performed the best and will be used for Customer targeting with Renewal offers
# NOTE(review): this conclusion previously cited "Monthly Income" and "Existing EMI" as the most important
# features; neither exists in this insurance dataset. Confirm the top drivers from the GBM feature-importance table.
# We will build a Business Value Metric based on Total Claim Amount deciles (Low / Mid / High segments)
# Within each value segment, we can leverage the model (Top2 predicted deciles) to identify the best targets

In [None]:
# Bucket customers into 10 equal-sized groups by Total Claim Amount
# (ties broken by order of appearance); group 1 holds the smallest amounts.
claim_order = df['Total Claim Amount'].rank(method='first').values
df['Claim_RANK'] = pd.qcut(claim_order, q=10, duplicates='drop').codes + 1

In [None]:
df.groupby('Claim_RANK')['Total Claim Amount'].max()

In [None]:
df['Total Claim Amount'].mean()

In [None]:
# Ranks 1-6 -> "Low", ranks 9 and above -> "High", everything else -> "Mid".
df['Claim_RANK_SEGMENT'] = np.select(
    [df['Claim_RANK'] <= 6, df['Claim_RANK'] >= 9],
    ["Low", "High"],
    default="Mid",
)
df.Claim_RANK_SEGMENT.value_counts()

In [None]:
# Score groups above 8 are labelled "Top2"; all others "Bottom8".
in_top_two = df['P_Rank_GBM'] > 8
df['Predicted_Renewal_Rank'] = np.where(in_top_two, "Top2", "Bottom8")
df['Predicted_Renewal_Rank'].value_counts()

In [None]:
pd.crosstab(index=df['Claim_RANK_SEGMENT'], columns=df['Predicted_Renewal_Rank'],values=df['Target'],aggfunc='mean')

In [None]:
# Customers in the top two predicted score groups.
x_chk1 = df[df['Predicted_Renewal_Rank'].eq('Top2')]

In [None]:
# Customers in the bottom eight predicted score groups.
x_chk2 = df[df['Predicted_Renewal_Rank'].eq('Bottom8')]

In [None]:
pd.crosstab(index=x_chk1['Renew Offer Type'], columns=x_chk1['Policy Type'],values=x_chk1['Target'],aggfunc='mean')

In [None]:
pd.crosstab(index=x_chk1['Renew Offer Type'], columns=x_chk1['Policy Type'],values=x_chk1['Target'],aggfunc='count')