In [25]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [43]:
data = pd.read_csv('Data_Marketing_Customer_Analysis_Round3.csv')

data.head()

Unnamed: 0,region,customer_lifetime_value,response,coverage,education,effective_to_date,month,employment_status,gender,income,...,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
0,central,4809,no,basic,college,2/18/11,feb,employed,m,48029,...,52,0,9,corporate auto,corporate l3,offer3,agent,292,four-door car,medsize
1,west region,2228,no,basic,college,1/18/11,jan,unemployed,f,92260,...,26,0,1,personal auto,personal l3,offer4,call center,744,four-door car,medsize
2,east,14947,no,basic,bachelor,2/10/11,feb,employed,m,22139,...,31,0,2,personal auto,personal l3,offer3,call center,480,suv,medsize
3,north west,22332,yes,extended,college,1/11/11,jan,employed,m,49078,...,3,0,2,corporate auto,corporate l3,offer2,branch,484,four-door car,medsize
4,north west,9025,no,premium,bachelor,1/17/11,jan,medical leave,f,23675,...,31,0,7,personal auto,personal l2,offer1,branch,707,four-door car,medsize


#### Defining X, y

In [44]:
X = data.drop(columns=["total_claim_amount"], axis = 1)
y = data["total_claim_amount"]

#### Data splitting

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

In [46]:
X_train.describe()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
count,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0
mean,7994.902701,51817.509063,93.295287,15.13554,48.19296,0.375395,2.983511
std,6848.846659,24717.379264,34.575537,10.13316,27.849503,0.899706,2.398456
min,1898.0,10074.0,61.0,0.0,0.0,0.0,1.0
25%,4020.5,29435.0,68.0,6.0,25.0,0.0,1.0
50%,5764.0,50446.0,83.0,14.0,48.0,0.0,2.0
75%,8964.0,72194.5,109.0,23.0,71.0,0.0,4.0
max,74228.0,99981.0,298.0,35.0,99.0,5.0,9.0


In [47]:
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features


X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

#display(X_train)
print("Initial number of numerical columns: ",X_train.shape)
print()


selector = VarianceThreshold(threshold=100) # Default threshold value is 0
# Features with a training-set variance lower than this threshold will be removed.
selector.fit(X_train)

kept_features_indexes = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features_indexes].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final number of numerical columns: ",X_train.shape)
print()
X_train

Initial number of numerical columns:  (8551, 7)

Final number of numerical columns:  (8551, 5)



Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception
0,21423,22379,65,9,31
1,8391,40211,106,5,98
2,3969,49544,101,3,29
3,14914,45963,63,3,73
4,18060,57882,115,1,61
...,...,...,...,...,...
8546,7610,98701,94,22,66
8547,35186,86134,98,17,78
8548,4241,19834,64,26,8
8549,12941,77060,106,23,90


#### Correlation matrix

import seaborn as sns
import matplotlib.pyplot as plt

c = abs(data.corr())
#c

#fig, ax = plt.subplots(figsize=(14,14))
#sns.heatmap(c, annot=True);

#c['total_claim_amount']
c_last = c['total_claim_amount'].sort_values(ascending=False)
#c_last
c_thr = .3
cols_to_keep = list(c_last[c_last > c_thr].index)[1:] + [list(c_last[c_last > c_thr].index)[0]]
print(cols_to_keep)

data[cols_to_keep]

#### Recursive feature elimination

In [48]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE  ## recursive feature elemination technique

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test  = pd.DataFrame(X_test, columns=X.columns)

#X_train.isna().sum()
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()
#nulls.head()
nulls.columns = ['Column','nas']
#nulls.head()
#nulls[nulls['nas'] > 0].head()
cols_to_drop = nulls[nulls['nas'] > 0]['Column'] # Too drastic, but made on pourpose for quick filtering (don't do this in production!!)

X_train.drop(columns=cols_to_drop, axis=1, inplace = True)
X_test.drop(columns=cols_to_drop, axis=1, inplace = True)

#display(X_train)

lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 1) # Step is how many features to add or drop everytime
selector.fit(X_train, y_train)

kept_features = selector.get_support(indices = True) #returns an array of integers corresponding to nonremoved features
kept_features = list(X_train.iloc[:,kept_features].columns)

X_train = selector.transform(X_train)
X_test  = selector.transform(X_test)

X_train = pd.DataFrame(X_train, columns=kept_features)
X_test  = pd.DataFrame(X_test, columns=kept_features)

print("Final selected features: ")
display(X_train)

Final selected features: 


Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,21423,22379,65,9,31,0,2
1,8391,40211,106,5,98,2,6
2,3969,49544,101,3,29,0,1
3,14914,45963,63,3,73,2,2
4,18060,57882,115,1,61,0,2
...,...,...,...,...,...,...,...
8546,7610,98701,94,22,66,0,3
8547,35186,86134,98,17,78,0,2
8548,4241,19834,64,26,8,4,8
8549,12941,77060,106,23,90,0,2


### Embedded Methods

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

In [51]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train=imp_mean.fit_transform(X_train)

In [52]:
X_train

array([[8.6630e+03, 4.2169e+04, 8.3000e+01, ..., 9.0000e+01, 1.0000e+00,
        2.0000e+00],
       [4.2130e+03, 1.2160e+04, 1.0900e+02, ..., 3.4000e+01, 0.0000e+00,
        1.0000e+00],
       [2.3590e+03, 1.9864e+04, 6.3000e+01, ..., 9.6000e+01, 0.0000e+00,
        1.0000e+00],
       ...,
       [4.2410e+03, 1.9834e+04, 6.4000e+01, ..., 8.0000e+00, 4.0000e+00,
        8.0000e+00],
       [1.2941e+04, 7.7060e+04, 1.0600e+02, ..., 9.0000e+01, 0.0000e+00,
        2.0000e+00],
       [6.9470e+03, 6.3406e+04, 1.0000e+02, ..., 5.4000e+01, 0.0000e+00,
        7.0000e+00]])

### OLS

In [53]:
model=LinearRegression()
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

LinearRegression: Train -> 0.4086926440650056, Test -> 0.41140062170553826


##### lasso model could drop features and make it a feature selection technique

In [54]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
model=Lasso(alpha=0.05)

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Lasso: Train -> 0.4086926032425867, Test -> 0.41141620918570554


###### Ridge

In [55]:
model=Ridge(alpha=10000)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

Ridge: Train -> 0.40867692596088767, Test -> 0.4116852305026222


##### ElasticNet

In [57]:
model=ElasticNet(alpha=0.1)
model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")

ElasticNet: Train -> 0.40869238511331907, Test -> 0.4114386909519817


#### OLS Fitting

In [59]:
import statsmodels.api as sm
X = data.drop(columns=["total_claim_amount"], axis = 1)
y = data["total_claim_amount"]
X_added_constant = sm.add_constant(X)
X_added_constant

Unnamed: 0,const,region,customer_lifetime_value,response,coverage,education,effective_to_date,month,employment_status,gender,...,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,1.0,central,4809,no,basic,college,2/18/11,feb,employed,m,...,7,52,0,9,corporate auto,corporate l3,offer3,agent,four-door car,medsize
1,1.0,west region,2228,no,basic,college,1/18/11,jan,unemployed,f,...,3,26,0,1,personal auto,personal l3,offer4,call center,four-door car,medsize
2,1.0,east,14947,no,basic,bachelor,2/10/11,feb,employed,m,...,34,31,0,2,personal auto,personal l3,offer3,call center,suv,medsize
3,1.0,north west,22332,yes,extended,college,1/11/11,jan,employed,m,...,10,3,0,2,corporate auto,corporate l3,offer2,branch,four-door car,medsize
4,1.0,north west,9025,no,premium,bachelor,1/17/11,jan,medical leave,f,...,33,31,0,7,personal auto,personal l2,offer1,branch,four-door car,medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10684,1.0,central,15563,no,premium,bachelor,1/19/11,jan,unemployed,f,...,12,40,0,7,personal auto,personal l1,offer3,web,luxury car,medsize
10685,1.0,north west,5259,no,basic,college,1/6/11,jan,employed,f,...,7,68,0,6,personal auto,personal l3,offer2,branch,four-door car,medsize
10686,1.0,central,23893,no,extended,bachelor,2/6/11,feb,employed,f,...,11,63,0,2,corporate auto,corporate l3,offer1,web,luxury suv,medsize
10687,1.0,west region,11971,no,premium,college,2/13/11,feb,employed,f,...,0,27,4,6,personal auto,personal l1,offer1,branch,suv,medsize
