In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load the raw data, standardize the column names (lower case, spaces -> underscores)
# and use the `customer` column as the row index.
df = (
    pd.read_csv("../lab-customer-analysis-round-5/files_for_lab/csv_files/marketing_customer_analysis.csv")
    .rename(columns=lambda col_name: col_name.lower().replace(' ', '_'))
    .set_index('customer')
)
df.sample(10)  # random preview of 10 rows

Unnamed: 0_level_0,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,location_code,...,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BF56053,Washington,9320.240511,No,Premium,Bachelor,2/24/11,Medical Leave,F,18643,Suburban,...,41,0,5,Corporate Auto,Corporate L2,Offer1,Agent,571.2,Four-Door Car,Medsize
LZ81665,Washington,9655.442955,No,Basic,High School or Below,1/8/11,Unemployed,M,0,Suburban,...,4,2,2,Personal Auto,Personal L2,Offer2,Call Center,720.0,SUV,Medsize
SH48087,California,10144.3094,No,Extended,College,1/29/11,Medical Leave,F,27231,Suburban,...,7,0,2,Personal Auto,Personal L2,Offer1,Branch,422.4,Four-Door Car,Medsize
RS24501,Oregon,13350.1209,No,Premium,College,1/11/11,Employed,F,28919,Suburban,...,88,0,3,Personal Auto,Personal L2,Offer3,Agent,830.4,SUV,Medsize
ME11422,California,5085.023369,No,Basic,College,2/12/11,Unemployed,M,0,Suburban,...,18,1,3,Personal Auto,Personal L3,Offer3,Agent,571.531582,Four-Door Car,Small
XC67861,Arizona,5011.751577,No,Basic,Bachelor,1/9/11,Employed,F,28859,Suburban,...,51,3,4,Corporate Auto,Corporate L2,Offer1,Call Center,321.6,Two-Door Car,Medsize
KG84303,California,3942.917284,No,Extended,Master,1/29/11,Employed,M,38820,Suburban,...,59,0,1,Personal Auto,Personal L3,Offer2,Call Center,547.619785,Four-Door Car,Small
NU93648,Arizona,20774.02463,No,Basic,Bachelor,2/5/11,Unemployed,M,0,Urban,...,40,0,2,Corporate Auto,Corporate L1,Offer1,Call Center,243.208647,Two-Door Car,Medsize
NI18835,California,2927.734329,No,Basic,Bachelor,2/15/11,Employed,M,51991,Rural,...,95,0,1,Personal Auto,Personal L3,Offer2,Web,124.376921,Four-Door Car,Medsize
RN36393,California,15114.40241,Yes,Basic,High School or Below,1/19/11,Medical Leave,F,28513,Suburban,...,22,1,2,Personal Auto,Personal L2,Offer2,Call Center,480.0,SUV,Medsize


In [2]:
# X-y split: `total_claim_amount` is the target, every other column is a feature

X = df.drop(columns=['total_claim_amount'])
y = df['total_claim_amount']
X.columns

Index(['state', 'customer_lifetime_value', 'response', 'coverage', 'education',
       'effective_to_date', 'employmentstatus', 'gender', 'income',
       'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'vehicle_class',
       'vehicle_size'],
      dtype='object')

In [3]:
# Split the features by dtype: numeric columns vs. everything else

X_num, X_cat = (
    X.select_dtypes(include='number'),
    X.select_dtypes(exclude='number'),
)

In [4]:
# Normalize (numerical)

def boxcox_transform(df):
    """Apply a Box-Cox transformation to every numeric column of ``df``.

    Box-Cox is only defined for strictly positive data, so values <= 0 are
    treated as missing and replaced (together with any existing NaN) by the
    mean of the column's remaining values before the transform is fitted.

    Note: ``df`` is modified in place -- its numeric columns are overwritten
    with the transformed values. Pass a copy to keep the original frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are transformed; other columns are left
        untouched.

    Returns
    -------
    tuple
        ``(df, lambdas)`` where ``df`` is the same (mutated) frame and
        ``lambdas`` maps each numeric column name to a one-element list
        holding the lambda that ``scipy.stats.boxcox`` fitted for it.
    """
    numeric_cols = df.select_dtypes(np.number).columns
    fitted_lambdas = {column: None for column in numeric_cols}
    for column in numeric_cols:
        # No column is expected to hold zero/negative values; blank them out so
        # the transform never sees non-positive input. Series.mask inserts NaN,
        # which avoids the `np.NAN` alias that NumPy 2.0 removed.
        positive_only = df[column].mask(df[column] <= 0)
        imputed = positive_only.fillna(positive_only.mean())
        # With no explicit lmbda, boxcox returns the transformed data and the
        # lambda that maximizes the log-likelihood.
        transformed_data, fitted_lambda = stats.boxcox(imputed.to_numpy())
        df[column] = transformed_data
        fitted_lambdas[column] = [fitted_lambda]
    return df, fitted_lambdas

# IMPORTANT: hand over a copy -- boxcox_transform overwrites the columns of the frame it receives
X_num_trans, _ci = boxcox_transform(df=X_num.copy())
X_num_trans

Unnamed: 0_level_0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BU79786,2.703839,1247.937066,0.685347,10.876059,3.017887,0.648045,0.000000
QZ44356,2.754926,1167.961720,0.685871,5.808248,18.780377,0.648045,1.424359
AI49188,2.780772,1144.063081,0.686039,7.347815,17.389171,0.648045,0.607328
WW63253,2.759125,1167.961720,0.686018,7.347815,26.160800,0.648045,1.363462
HB64268,2.704995,1072.375744,0.685461,5.471294,19.461641,0.648045,0.000000
...,...,...,...,...,...,...,...
LA72316,2.801170,1448.531450,0.685461,7.347815,33.090025,0.648045,0.607328
PK87824,2.711030,697.790558,0.685606,6.134207,13.709541,0.648045,0.000000
TD14365,2.762062,1167.961720,0.685725,4.379465,17.034934,1.176115,0.607328
UP19263,2.758397,704.383672,0.685898,11.319628,1.667871,0.648045,0.893486


In [6]:
# EXERCISE: Label Encoding (categorical)

#X_cat_encoded = pd.get_dummies(X_cat, drop_first=True) # get_dummies does not remember the categories it saw, so it cannot encode new data consistently; not suitable for production pipelines
#X_cat_encoded

Unnamed: 0_level_0,state_California,state_Nevada,state_Oregon,state_Washington,response_Yes,coverage_Extended,coverage_Premium,education_College,education_Doctor,education_High School or Below,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury Car,vehicle_class_Luxury SUV,vehicle_class_SUV,vehicle_class_Sports Car,vehicle_class_Two-Door Car,vehicle_size_Medsize,vehicle_size_Small
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BU79786,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
QZ44356,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
AI49188,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
WW63253,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
HB64268,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LA72316,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
PK87824,1,0,0,0,1,1,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
TD14365,1,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
UP19263,1,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [5]:
# EXERCISE: One Hot Encoding (categorical)

# The first level of every feature is dropped: it is implied whenever all of
# that feature's remaining dummy columns are 0.
encoder = OneHotEncoder(drop='first', handle_unknown='error')
encoder.fit(X_cat)
X_cat_encoded = encoder.transform(X_cat).toarray()  # convert the encoder output to a dense ndarray
X_cat_encoded

array([[0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 1., 0.]])

In [6]:
# Concat DataFrames

column_names = list(X_num_trans.columns) # get list of numerical column names
# Add the dummified categorical column names. get_feature_names() was deprecated
# in scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back to the old method only on releases that do not have it yet.
if hasattr(encoder, 'get_feature_names_out'):
    column_names.extend(encoder.get_feature_names_out())
else:
    column_names.extend(encoder.get_feature_names())

# Stack the transformed numeric block and the one-hot block side by side and
# restore the original row index.
X_numcat = np.concatenate([X_num_trans, X_cat_encoded], axis=1)
X_ready = pd.DataFrame(data=X_numcat, index=X.index, columns=column_names)
X_ready

Unnamed: 0_level_0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,x0_California,x0_Nevada,x0_Oregon,...,x12_Branch,x12_Call Center,x12_Web,x13_Luxury Car,x13_Luxury SUV,x13_SUV,x13_Sports Car,x13_Two-Door Car,x14_Medsize,x14_Small
customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BU79786,2.703839,1247.937066,0.685347,10.876059,3.017887,0.648045,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
QZ44356,2.754926,1167.961720,0.685871,5.808248,18.780377,0.648045,1.424359,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
AI49188,2.780772,1144.063081,0.686039,7.347815,17.389171,0.648045,0.607328,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
WW63253,2.759125,1167.961720,0.686018,7.347815,26.160800,0.648045,1.363462,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
HB64268,2.704995,1072.375744,0.685461,5.471294,19.461641,0.648045,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LA72316,2.801170,1448.531450,0.685461,7.347815,33.090025,0.648045,0.607328,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
PK87824,2.711030,697.790558,0.685606,6.134207,13.709541,0.648045,0.000000,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TD14365,2.762062,1167.961720,0.685725,4.379465,17.034934,1.176115,0.607328,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
UP19263,2.758397,704.383672,0.685898,11.319628,1.667871,0.648045,0.893486,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# EXERCISE: Train-test split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_ready,
    y,
    random_state=42,
    test_size=0.3,
)

In [8]:
# EXERCISE: Apply linear regression

model = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator itself
model

LinearRegression()

In [9]:
# EXERCISE: Model validation

prediction = model.predict(X_test)

R2 = r2_score(y_test, prediction)
# The `squared` argument of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in later releases. Calling it without the flag returns the MSE on
# every version; for this single-target problem the RMSE is simply its square root.
MSE = mean_squared_error(y_test, prediction)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, prediction)

print("R2:", round(R2,2))
print("MSE:", round(MSE,2))
print("RMSE:", round(RMSE,2))
print("MAE:", round(MAE,2))

R2: 0.74
MSE: 21149.99
RMSE: 145.43
MAE: 98.47
