In [163]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge ,ElasticNet, LinearRegression
from sklearn.feature_selection import RFE
import statsmodels.api as sm

df = pd.read_csv('Data_Marketing_Customer_Analysis_Round3.csv')
numerical = df.select_dtypes(include=np.number)
numerical.dropna(inplace=True)

# Select and encode categorical data (from lab4.3.2)
categorical = df.select_dtypes(include=object)
categorical.dropna(inplace=True)

ord_encoded = categorical[['education','month','vehicle_size']]
ord_encoded = ord_encoded.replace({
    'education':
        {'high school or below': 1, 'college': 2, 'bachelor': 3, 'master': 4, 'doctor': 5},
    'month': {'jan': 1, 'feb': 2},
    'vehicle_size': {'small': 1, 'medsize': 2, 'large': 3}
})

hot_encoded = categorical.drop(['education','month','vehicle_size','effective_to_date'], axis=1)
hot_encoded = pd.get_dummies(hot_encoded, drop_first=False, dtype=int)

cat_encoded = pd.concat([ord_encoded,hot_encoded], axis=1)
all_data = pd.concat([numerical,cat_encoded], axis=1)
all_data

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,education,month,...,sales_channel_agent,sales_channel_branch,sales_channel_call center,sales_channel_web,vehicle_class_four-door car,vehicle_class_luxury car,vehicle_class_luxury suv,vehicle_class_sports car,vehicle_class_suv,vehicle_class_two-door car
0,4809,48029,61,7,52,0,9,292,2,2,...,1,0,0,0,1,0,0,0,0,0
1,2228,92260,64,3,26,0,1,744,2,1,...,0,0,1,0,1,0,0,0,0,0
2,14947,22139,100,34,31,0,2,480,3,2,...,0,0,1,0,0,0,0,0,1,0
3,22332,49078,97,10,3,0,2,484,2,1,...,0,1,0,0,1,0,0,0,0,0
4,9025,23675,117,33,31,0,7,707,3,1,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10684,15563,61541,253,12,40,0,7,1214,3,1,...,0,0,0,1,0,1,0,0,0,0
10685,5259,61146,65,7,68,0,6,273,2,1,...,0,1,0,0,1,0,0,0,0,0
10686,23893,39837,201,11,63,0,2,381,3,2,...,0,0,0,1,0,0,1,0,0,0
10687,11971,64195,158,0,27,4,6,618,2,2,...,0,1,0,0,0,0,0,0,1,0


In [164]:
# 1. fit the models LinearRegression, Lasso and Ridge and compare the model performances

X = all_data.drop('total_claim_amount', axis=1)
y = all_data['total_claim_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(np.array(X_train))
X_train_scaled = scaler.transform(np.array(X_train))
X_test_scaled = scaler.transform(np.array(X_test))

# Linear Regression
model = LinearRegression()
model.fit(X_train_scaled,y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train_scaled, y_train)}, Test -> {model.score(X_test_scaled, y_test)}")

# Lasso
model = Lasso(alpha=1)
model.fit(X_train_scaled, y_train)
print(f"{model.__class__.__name__}\t\t: Train -> {model.score(X_train_scaled, y_train)}, Test -> {model.score(X_test_scaled, y_test)}")

# Ridge
model = Ridge(alpha=1)
model.fit(X_train_scaled, y_train)
print(f"{model.__class__.__name__}\t\t: Train -> {model.score(X_train_scaled, y_train)}, Test -> {model.score(X_test_scaled, y_test)}")

# ElasticNet
model = ElasticNet(alpha=1)
model.fit(X_train_scaled, y_train)
print(f"{model.__class__.__name__}\t: Train -> {model.score(X_train_scaled, y_train)}, Test -> {model.score(X_test_scaled, y_test)}")

LinearRegression: Train -> 0.7690443655683407, Test -> 0.7741037711606412
Lasso		: Train -> 0.7692432525282759, Test -> 0.7739283843433649
Ridge		: Train -> 0.7696625682810757, Test -> 0.7738950476275487
ElasticNet	: Train -> 0.7301743744897586, Test -> 0.7348950672642125


In [165]:
# 2. Define a function that takes a list of models and trains (and tests) them so we can try a lot of them without repeating code

def compare_models(df, models):
    
    X = df.drop('total_claim_amount', axis=1)
    y = df['total_claim_amount']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    scaler.fit(np.array(X_train))
    X_train_scaled = scaler.transform(np.array(X_train))
    X_test_scaled = scaler.transform(np.array(X_test))
    results = pd.DataFrame(columns=['Model', 'Train', 'Test'])
    
    for i in range(len(models)):
        if models[i] == 'linear':
            model = LinearRegression()
        elif models[i] == 'lasso':
            model = Lasso(alpha=1)
        elif models[i] == 'ridge':
            model = Ridge(alpha=1)
        elif models[i] == 'elastic':
            model = ElasticNet(alpha=1)
        else:
            return print(f"Model name {models[i]} incorrect or not supported")
            
        model.fit(X_train_scaled, y_train)
        results.loc[i] = [model.__class__.__name__, round(model.score(X_train_scaled, y_train),5), round(model.score(X_test_scaled, y_test),5)]
        
    return results

compare_models(all_data, ['linear','lasso','ridge','elastic'])

Unnamed: 0,Model,Train,Test
0,LinearRegression,0.76904,0.7741
1,Lasso,0.76924,0.77393
2,Ridge,0.76966,0.7739
3,ElasticNet,0.73017,0.7349


In [166]:
#
#
#
# 
#
#
#
#
#
#

In [176]:
# 3.1 Use P-Value feature selection to select subset of features to train the model with.

# see P-Value feature selection in Data Processing lesson
X_train_named = pd.DataFrame(X_train_scaled, columns = X.columns)
#X_added_constant = sm.add_constant(X_train_named) # constant needed for the intercept
X_added_constant = X_train_named
model = sm.OLS(np.array(y_train),X_added_constant).fit()
#model.summary()


In [177]:
sig_features = model.params[list(np.where(model.pvalues < 0.05)[0])].iloc[0:].index.tolist()
significant_df = X_added_constant[sig_features]
if 'const' in sig_features: sig_features.remove('const')
sig_features.append('total_claim_amount')
sig_features

['monthly_premium_auto',
 'employment_status_employed',
 'employment_status_unemployed',
 'location_code_rural',
 'location_code_suburban',
 'marital_status_divorced',
 'marital_status_married',
 'marital_status_single',
 'total_claim_amount']

In [175]:
compare_models(all_data[sig_features], ['linear','lasso','ridge','elastic'])

Unnamed: 0,Model,Train,Test
0,LinearRegression,0.42922,0.4207
1,Lasso,0.42918,0.42082
2,Ridge,0.42922,0.42069
3,ElasticNet,0.38215,0.37763


In [170]:
#
#
#
# 
#
#
#
#
#
#

In [171]:
lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 0)
selector.fit(X_train_named, y_train)

kept_features = selector.get_support(indices = True)
kept_features = list(X_train_named.iloc[:,kept_features].columns)
kept_features.append('total_claim_amount')

kept_features

['coverage_basic',
 'policy_type_corporate auto',
 'policy_type_personal auto',
 'policy_type_special auto',
 'sales_channel_agent',
 'sales_channel_branch',
 'sales_channel_call center',
 'sales_channel_web',
 'total_claim_amount']

In [172]:
compare_models(all_data[kept_features], ['linear','lasso','ridge','elastic'])

Unnamed: 0,Model,Train,Test
0,LinearRegression,0.05512,0.0549
1,Lasso,0.05514,0.05506
2,Ridge,0.05519,0.05474
3,ElasticNet,0.04886,0.04932
