In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Estimator instance created alongside the imports; `lm` is rebound to a new
# linear_model.LinearRegression() immediately before fitting in the round 7 section.
lm = LinearRegression()
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model 

In [None]:
%matplotlib inline

## Lab | Customer Analysis Round 2

### Dealing with the data

In [None]:
#Show the dataframe shape.

In [None]:
# Load the round 2 dataset; the path is relative to the notebook's working directory.
file = pd.read_csv('marketing_customer_analysis.csv')

In [None]:
# (rows, columns) of the raw frame.
file.shape

In [None]:
# Column dtypes and non-null counts.
file.info()

In [None]:
# Summary statistics (describe() defaults to the numeric columns).
file.describe()

In [None]:
#2. Standardize header names.

In [None]:
file.columns

In [None]:
cols = []
for col in file.columns: 
    column = col.lower()
    column = column.replace(" ", "_")
    cols.append(column)
cols

In [None]:
file.columns = cols
file.head()

In [None]:
#3. Which columns are numerical?
#4. Which columns are categorical?

In [None]:
# Per-column dtypes, used to tell numerical from categorical columns.
file.dtypes

In [None]:
# dtype names treated as numerical in this notebook.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
file.select_dtypes(include=numerics).columns


In [None]:
# Columns stored with dtype object are treated as the categorical ones.
file.select_dtypes(include='object').columns

In [None]:
# One pass over the categorical columns instead of one copy-pasted cell per column.
categorical_columns = [
    'state', 'response', 'coverage', 'education', 'effective_to_date',
    'employmentstatus', 'gender', 'location_code', 'marital_status',
    'policy_type', 'policy', 'renew_offer_type', 'sales_channel',
    'vehicle_class', 'vehicle_size', 'vehicle_type',
]
# Map each column name to the array of distinct values it contains.
{column: file[column].unique() for column in categorical_columns}

###  Check and deal with `NaN` values

In [None]:
# Count of missing values per column.
file.isna().sum()

In [None]:
# Drop two columns up front: 'vehicle_type' (too many missing values) and
# 'unnamed:_0' (carries no information). drop() returns a new frame, so `file` stays intact.
filedr = file.drop(columns=['vehicle_type', 'unnamed:_0'])
filedr.head(1)

In [None]:
filedr.dropna(axis=0,inplace=False)

In [None]:
filedr.isna().sum()

In [None]:
#We drop the column response.

In [None]:
filedr['response'].value_counts(dropna=False)

In [None]:
filedr = file.drop(columns=['response'])
filedr.head()

### Dealing with the date column

In [None]:
def get_month(text):
    """Return the part of `text` that precedes its first '/' (the whole string if there is none)."""
    month, _, _ = text.partition('/')
    return month

In [None]:
filedr['month'] = filedr.effective_to_date.apply(get_month)
filedr.head()

In [None]:
#6. Datetime format - 
#Extract the months from the dataset and store in a separate column. 
#Then filter the data to show only the information for the first quarter , ie. January, February and March. _
# #Hint_: If data from March does not exist, consider only January and February.
# selecting rows based on condition 
quarter = ['1', '2', '3'] 
quarter_1 = filedr[filedr['month'].isin(quarter)] 

In [None]:
quarter_1.head()

In [None]:
#7. BONUS: Put all the previously mentioned data transformations into a function

## Lab | Customer Analysis Round 3

In [None]:
#For this lab, we still keep using the marketing_customer_analysis.csv file. 
#You can find the file in the files_for_lab folder.

In [None]:
# Round 3 works from its own CSV. Its headers are not normalized: later cells address
# columns by their original labels such as 'Response' and 'Total Claim Amount'.
filel3 = pd.read_csv('marketing_customer_analysislab3.csv')

### EDA (Exploratory Data Analysis) 


In [None]:
# Column dtypes and non-null counts of the round 3 frame.
filel3.info()

In [None]:
# describe() output transposed, giving one row per described column.
filel3.describe().T

In [None]:
# Summary of the object-dtype (text) columns, one row per column. The builtin `object`
# is used because the `np.object` alias was deprecated and later removed from NumPy.
filel3.describe(include=[object]).T

### Plotting

In [None]:
#Show a plot of the total number of responses.

In [None]:
# Counts of each 'Response' value.
sns.displot(filel3['Response'])
plt.show()

In [None]:
#Show a plot of the response rate by the sales channel.

In [None]:
# One bar per sales channel, stacked by 'Response'.
sns.displot(data=filel3, x="Sales Channel", hue="Response", multiple="stack")
plt.show()

In [None]:
#Show a plot of the response rate by the total claim amount.

In [None]:
sns.displot(data=filel3, x="Total Claim Amount", hue="Response")
# NOTE(review): this sns.set() call runs after displot() has already built its figure,
# so it cannot resize the plot above; it only changes settings for whatever is drawn next.
sns.set(rc={"figure.figsize":(2, 2)}) 
plt.show()

In [None]:
#Show a plot of the response rate by income.

In [None]:
sns.displot(data=filel3, x="Income", hue="Response", multiple="stack")
# NOTE(review): same ordering as the previous cell; the (8, 4) setting stays in effect
# for later cells. displot() presumably takes its size from height/aspect - confirm.
sns.set(rc={"figure.figsize":(8, 4)}) 
plt.show()

## Lab | Customer Analysis Round 4

In [None]:
#In today's lesson we talked about continuous distributions (mainly normal distribution), 
#linear regression and how multicollinearity can impact the model.
#In this lab, we will test your knowledge on those things using the marketing_customer_analysis.csv file. 
#You have been using the same data in the previous labs (round 2 and 3). You can continue using the same jupyter file. 
#The file can be found in the files_for_lab folder.
#Please note that we will use the column total_claim_amount later as the target variable.

In [None]:
#Check the data types of the columns. 
#Get the numeric data into dataframe called numerical and categorical columns in a dataframe called categoricals. 
#(You can use np.number and np.object to select the numerical data types and categorical data types respectively)

In [None]:
# Everything that is not a NumPy numeric dtype counts as categorical.
categorical = filel3.select_dtypes(exclude=[np.number])
categorical.head(1)

In [None]:
# The numeric columns; this includes the future target 'Total Claim Amount'.
numerical = filel3.select_dtypes(include=[np.number])
numerical.head(1)

### Plotting numerical variables

In [None]:
#Now we will try to check the normality of the numerical variables visually
#Use seaborn library to construct distribution plots for the numerical variables
#Use Matplotlib to construct histograms

In [None]:
# Bin count used for each numerical column's histogram.
bins_by_column = {
    'Total Claim Amount': 40,
    'Number of Policies': 40,
    'Number of Open Complaints': 40,
    'Months Since Policy Inception': 30,
    'Months Since Last Claim': 30,
    'Monthly Premium Auto': 30,
    'Income': 6,
    'Customer Lifetime Value': 30,
}

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale draws the
# equivalent picture. One figure per column, each titled so it can be read on its own.
for column, bins in bins_by_column.items():
    fig, ax = plt.subplots()
    sns.histplot(
        numerical[column],
        bins=bins,
        kde=True,
        stat="density",
        kde_kws=dict(cut=3),  # extend the KDE past the data range, as distplot did
        ax=ax,
    )
    ax.set_title(f"Distribution of {column}")
    plt.show()

### Checking for correlations

In [None]:
#Write code for both the correlation matrix and for seaborn heatmap. 
#If there is no pair of features that have a high correlation, then do not drop any features
#For the numerical variables, check the multicollinearity between the features.
#Please note that we will use the column total_claim_amount later as the target variable.
#Drop one of the two features that show a high correlation between them (greater than 0.9). 
#Write code for both the correlation matrix and for seaborn heatmap. 
#If there is no pair of features that have a high correlation, then do not drop any features

In [None]:
sns.pairplot(numerical)
plt.show()

In [None]:
numcorr = numerical.corr()
numcorr

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.heatmap(numcorr, annot=True)
plt.show()

No pair of features shows a correlation higher than 0.9, so we will not drop any feature. The three features that seem to have the highest explanatory capacity for total claim amount are income, monthly premium auto and customer lifetime value.


## Lab | Customer Analysis Round 5

In [None]:
# Put the numerical and categorical halves of filel3 back side by side (numerical columns first).
df = pd.concat([numerical, categorical], axis = 1)


In [None]:
# Remove the 'Policy' and 'Customer' columns in place.
df.drop(['Policy', 'Customer'], axis=1, inplace=True)

In [None]:
# NOTE(review): drops whichever column sits at position 12 after the previous drop; the
# intended column is not named here - confirm which one it is. Because the drop is
# positional and in place, running this cell a second time removes a further column.
df.drop(df.columns[[12]], axis = 1, inplace = True)


In [None]:
df.dropna()

In [None]:
df.isna().sum()

In [None]:
def remove_perc(string):
    """Convert a value such as '12.5%' to a float.

    Parameters
    ----------
    string : str or number
        Text with an optional trailing percent sign, or an already numeric value.

    Returns
    -------
    float
        The numeric value without the percent sign.

    Raises
    ------
    ValueError
        If the text does not represent a number.
    TypeError
        If the value is neither text nor convertible by float().
    """
    if isinstance(string, str):
        text = string.strip()
        # Remove the last character only when it really is a percent sign; cutting it
        # unconditionally would turn '12.5' into 12.0.
        if text.endswith('%'):
            text = text[:-1]
        return float(text)
    # Numbers (and anything else float() accepts) pass straight through.
    return float(string)


# The frame built from filel3 keeps its original headers, so the column is
# 'Customer Lifetime Value'; the snake_case name is not a column of `df`.
# The converted values are written back, since apply() returns a new Series.
df['Customer Lifetime Value'] = df['Customer Lifetime Value'].apply(remove_perc)
df['Customer Lifetime Value'].head()

### X-y split and separating numerical and categorical

In [None]:
y = df['Total Claim Amount']
X = df.drop(['Total Claim Amount'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 1)

In [None]:
X_train_num = X_train.select_dtypes(include=[np.number])


In [None]:
X_train_cat = X_train.select_dtypes(exclude=[np.number])


In [None]:
X_test_num = X_test.select_dtypes(include=[np.number])


In [None]:
X_test_cat = X_test.select_dtypes(exclude=[np.number])


### Normalizing numerical variables

In [None]:
#Do the distributions for different numerical variables look like a normal distribution
#normalize values in every column
#view normalized DataFrame

In [None]:
# Fit the Yeo-Johnson power transform on the training numerics only, then apply the
# fitted transform to both splits so the test data never influences the fit.
powertransformer = PowerTransformer(
    method='yeo-johnson', standardize=True, copy=True
).fit(X_train_num)
X_train_PT = powertransformer.transform(X_train_num)
X_test_PT = powertransformer.transform(X_test_num)

In [None]:
# Wrap the transformed arrays as DataFrames that keep the row labels of the split they
# came from. With a default 0..n-1 index, the later axis=1 concat with the encoded
# categoricals (which are indexed by the original row labels) would align the wrong
# rows and introduce NaNs.
X_train_df = pd.DataFrame(X_train_PT, columns=X_train_num.columns, index=X_train_num.index)
X_test_df = pd.DataFrame(X_test_PT, columns=X_test_num.columns, index=X_test_num.index)

## Lab | Customer Analysis Round 6

### One Hot/Label Encoding (categorical).




In [None]:
# One-hot encoder configured with drop='first' and handle_unknown='ignore'.
# NOTE(review): presumably this drops one level per feature and encodes categories
# unseen at fit time as all zeros - confirm against the installed scikit-learn version.
encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
type(encoder)

In [None]:
# Learn the categories from the training split only; .toarray() densifies the
# transform output.
encoder.fit(X_train_cat)
enc_X_train_cat = encoder.transform(X_train_cat).toarray()
enc_X_train_cat

In [None]:
# Names of the generated columns, used as DataFrame headers below.
enc_col = encoder.get_feature_names_out()

In [None]:
# Back to a DataFrame that carries the row labels of the training split.
enc_X_train_cat = pd.DataFrame(enc_X_train_cat, index=X_train_cat.index, columns=enc_col, dtype=int)

In [None]:
# Encode the test split with the encoder that was fitted on the training data.
enc_X_test_cat = encoder.transform(X_test_cat).toarray()
enc_col = encoder.get_feature_names_out()
enc_X_test_cat = pd.DataFrame(enc_X_test_cat, index=X_test_cat.index, columns=enc_col, dtype=int)

### Concatenating and Scaling numerical and categorical train/test sets

In [None]:
# axis=1 concatenation aligns rows by index label, so both frames need to carry the same labels.
X_train = pd.concat([X_train_df, enc_X_train_cat], axis = 1)

In [None]:
# Same assembly for the test split. Note that X_train / X_test are rebound here to the processed frames.
X_test = pd.concat([X_test_df, enc_X_test_cat], axis = 1)

In [None]:
MinMaxtransformer = MinMaxScaler()

MinMaxtransformer.fit(X_train)

X_train_nor = MinMaxtransformer.transform(X_train)
X_test_nor  = MinMaxtransformer.transform(X_test)



In [None]:
X_train_nor = pd.DataFrame(X_train_nor, columns=X_train.columns)
X_test_nor = pd.DataFrame(X_test_nor, columns = X_test.columns)

## Lab | Customer Analysis Round 7

In [None]:
# Re-create the linear model and fit it on the scaled training features.
lm = linear_model.LinearRegression()
lm.fit(X_train_nor,y_train)

In [None]:
# R^2 on the training data.
y_train_pred = lm.predict(X_train_nor)
r2_score(y_train, y_train_pred)


In [None]:
# R^2 on the held-out test data.
y_test_pred = lm.predict(X_test_nor)
r2_score(y_test, y_test_pred)


In [None]:
# Fitted intercept term.
lm.intercept_

In [None]:
# Fitted coefficients, one per column of X_train_nor.
lm.coef_

In [None]:
# Root mean squared error on the training data.
np.sqrt(mean_squared_error(y_train,y_train_pred)) # MSE -> RMSE

In [None]:
# Root mean squared error on the test data.
np.sqrt(mean_squared_error(y_test,y_test_pred)) # MSE -> RMSE