# Customer analysis - ML modeling

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import r2_score
from scipy.stats import boxcox
# Display settings: cap printed rows at 50, but let wide frames show (practically) every column.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 999)

## 1. Importing and initial data cleaning

In [2]:
def load_original_data(path="Data_Marketing_Customer_Analysis_Round2.csv"):
    """Read the raw customer-analysis CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. The default is the file name this notebook
        has always used, resolved relative to the current working directory,
        so calling the function without arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``; no cleaning is applied.
    """
    return pd.read_csv(path)

Cleaning function:
- Standardizing column names
- Dropping some columns
- Dropping the ~630 rows that have NaN values in several columns (e.g. state and response)
- Grouping the education column
- Grouping luxury cars and luxury suv in luxury
- Converting the number of open complaints to a string
- Eliminating the outliers in the customer_lifetime_value: rows where this column is bigger than 30000 (this step is applied in a later cell, after the cleaning function has run)



In [37]:
def education_clean(value):
    """Map a raw education label to its grouped category.

    "Master" and "Doctor" become "Master_doctor", "College" and
    "High School or Below" become "HS_College", and "Bachelor" stays
    "Bachelor". Any other label raises ``KeyError``.
    """
    members_by_group = {
        "Master_doctor": ("Master", "Doctor"),
        "Bachelor": ("Bachelor",),
        "HS_College": ("College", "High School or Below"),
    }
    # Invert the grouping into a flat raw-label -> group lookup table.
    group_of = {
        raw_label: group
        for group, raw_labels in members_by_group.items()
        for raw_label in raw_labels
    }
    return group_of[value]


def clean_data(df):
    """Return a cleaned copy of the raw customer data.

    Steps, in order:

    1. Standardize column names (lower case, spaces replaced by underscores).
    2. Drop the columns ``unnamed:_0``, ``vehicle_type``, ``customer``,
       ``effective_to_date``, ``months_since_last_claim`` and
       ``months_since_policy_inception``.
    3. Drop every row that still contains a missing value.
    4. Store ``number_of_open_complaints`` as a string (cast through ``int``
       first so the text has no decimal part).
    5. Round ``total_claim_amount`` and ``customer_lifetime_value`` to whole
       numbers.
    6. Group the education levels with ``education_clean``.
    7. Collapse every ``vehicle_class`` starting with "Luxury" into "Luxury".

    The input frame is not modified: earlier versions renamed the caller's
    columns in place as a side effect, which made the raw frame depend on
    whether this function had already been run.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data as returned by ``load_original_data``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied; the original row labels of
        the surviving rows are kept.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing, or if an education value is
        not known to ``education_clean``.
    """
    dropped_columns = [
        "unnamed:_0", "vehicle_type", "customer",
        "effective_to_date", "months_since_last_claim", "months_since_policy_inception",
    ]
    # rename/drop/dropna each return a new frame, so the caller's data stays untouched.
    cleaned = (
        df.rename(columns=lambda name: name.lower().replace(" ", "_"))
        .drop(columns=dropped_columns)
        .dropna()
    )
    is_luxury = cleaned["vehicle_class"].str.startswith("Luxury", na=False)
    # assign() overwrites the existing columns in place, so column order is preserved.
    return cleaned.assign(
        number_of_open_complaints=cleaned["number_of_open_complaints"].astype(int).astype(str),
        total_claim_amount=cleaned["total_claim_amount"].round(0),
        education=cleaned["education"].apply(education_clean),
        vehicle_class=cleaned["vehicle_class"].mask(is_luxury, "Luxury"),
        customer_lifetime_value=cleaned["customer_lifetime_value"].round(0),
    )

In [46]:
# Load the raw CSV and pass it straight through the cleaning function.
customers = clean_data(load_original_data())

I am dropping the rows that have income 0 because, on inspection, they all belong to unemployed customers. They can still be useful for a separate analysis, so we store them in their own dataframe (`unemployed`), where income will not be a variable because it is zero for all of them.

We also store in another dataframe the rows that have a customer_lifetime_value over 30000, because they are outliers. We will call this dataframe `over30k`.

In [39]:
# Set two subsets aside for later, separate analyses:
# unemployed customers, and customers whose lifetime value is above 30000.
unemployed = customers.loc[customers["employmentstatus"].eq("Unemployed")]
over30k = customers.loc[customers["customer_lifetime_value"].gt(30000)]

In [47]:
# Eliminate the rows stored above from the final customers DF.
# The lifetime-value bound is inclusive: over30k holds the rows with a value
# strictly above 30000, so a (rounded) value of exactly 30000 is not an outlier
# and must stay here instead of silently disappearing from every frame.
keep_row = (customers["employmentstatus"] != "Unemployed") & (
    customers["customer_lifetime_value"] <= 30000
)
# reset_index() returns a new frame (no in-place change on a filtered slice) and,
# as before, keeps the previous row labels in an `index` column.
# NOTE(review): `index` is numeric, so it shows up among the numeric columns —
# make sure it is excluded before it is used as a model feature.
customers = customers[keep_row].reset_index()

print(f"The unemployed DF has {unemployed.shape[0]} rows.")
print(f"The over30k DF has {over30k.shape[0]} rows.")
print(f"The final customers DF has {customers.shape[0]} rows and {customers.shape[1]} columns.")


The unemployed DF has 2317 rows.
The over30k DF has 189 rows.
The final customers DF has 6664 rows and 22 columns.


In [41]:
# Summary statistics of the numeric columns, selected through the public
# select_dtypes API rather than the private _get_numeric_data helper.
customers.select_dtypes(include="number").describe()

Unnamed: 0,index,customer_lifetime_value,income,monthly_premium_auto,number_of_policies,total_claim_amount
count,6664.0,6664.0,6664.0,6664.0,6664.0,6664.0
mean,5520.362395,7435.235444,50450.181122,92.0006,2.972839,373.121098
std,3161.941925,5111.416843,24333.310673,33.668347,2.413507,246.179264
min,0.0,2120.0,10037.0,61.0,1.0,0.0
25%,2757.75,4015.75,28418.0,68.0,1.0,209.0
50%,5548.5,5734.0,48089.5,81.0,2.0,341.0
75%,8296.25,8827.0,70211.75,108.0,4.0,494.0
max,10908.0,29973.0,99981.0,297.0,9.0,2893.0


In [42]:
# Show only the object-dtype (categorical / text) columns.
customers.select_dtypes(include=object)

Unnamed: 0,state,response,coverage,education,employmentstatus,gender,location_code,marital_status,number_of_open_complaints,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,Arizona,No,Basic,HS_College,Employed,M,Suburban,Married,0,Corporate Auto,Corporate L3,Offer3,Agent,Four-Door Car,Medsize
2,Washington,No,Basic,Bachelor,Employed,M,Suburban,Single,0,Personal Auto,Personal L3,Offer3,Call Center,SUV,Medsize
3,Oregon,Yes,Extended,HS_College,Employed,M,Suburban,Single,0,Corporate Auto,Corporate L3,Offer2,Branch,Four-Door Car,Medsize
4,California,No,Basic,Master_doctor,Employed,F,Urban,Married,3,Corporate Auto,Corporate L2,Offer2,Branch,Four-Door Car,Medsize
5,California,No,Basic,HS_College,Employed,M,Urban,Single,0,Personal Auto,Personal L2,Offer2,Branch,Two-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9128,California,No,Extended,HS_College,Employed,F,Suburban,Married,2,Personal Auto,Personal L2,Offer2,Agent,Four-Door Car,Medsize
9129,Arizona,No,Basic,HS_College,Employed,M,Suburban,Married,4,Personal Auto,Personal L2,Offer2,Branch,Two-Door Car,Medsize
9131,Oregon,No,Basic,HS_College,Employed,F,Urban,Married,0,Personal Auto,Personal L3,Offer2,Branch,Four-Door Car,Medsize
9132,Arizona,No,Extended,Bachelor,Employed,F,Rural,Married,0,Corporate Auto,Corporate L3,Offer1,Web,Luxury,Medsize
