In [1]:
#libraries
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
%matplotlib inline
from numpy import random
import seaborn as sns
import pandas as pd
import numpy as np
import scipy as sc
from ipywidgets import interactive
import ipywidgets as widgets
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import boxcox
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
#Function for reading the data:
def read_customer_data(path="customer_analysis_clean.csv"):
    """Load the customer data set from a CSV file.

    Parameters
    ----------
    path : str or path-like, default "customer_analysis_clean.csv"
        CSV file to read. The default preserves the original behaviour;
        pass another path to load a different extract without editing
        this function.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(path)

In [3]:
df = read_customer_data()

In [4]:
list(df)

['Unnamed: 0',
 'state',
 'customer_lifetime_value',
 'response',
 'coverage',
 'education',
 'effective_to_date',
 'employmentstatus',
 'gender',
 'income',
 'location_code',
 'marital_status',
 'monthly_premium_auto',
 'months_since_last_claim',
 'months_since_policy_inception',
 'number_of_open_complaints',
 'number_of_policies',
 'policy_type',
 'policy',
 'renew_offer_type',
 'sales_channel',
 'total_claim_amount',
 'vehicle_class',
 'vehicle_size',
 'vehicle_type',
 'month_effective_to_date']

In [None]:
#Function for standardizing column names:
def standardize(df):
    """Normalise the column labels of ``df`` in place.

    Every label is lower-cased and each space is replaced by an
    underscore. The same (mutated) DataFrame is returned.
    """
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace(" ", "_")
    return df

In [None]:
#Function for dropping customer and unnamed:_0 columns
def drop_2cols(df):
    """Remove the ``unnamed:_0`` and ``customer`` columns from ``df`` in place.

    Either column may be absent (for example when the already-cleaned CSV
    is loaded); missing columns are skipped instead of raising ``KeyError``.
    The same (mutated) DataFrame is returned.
    """
    df.drop(columns=["unnamed:_0", "customer"], inplace=True, errors="ignore")
    return df

In [None]:
#Drop duplicates and reset index
def dd(df):
    """Return ``df`` without duplicate rows and with a fresh 0..n-1 index.

    ``reset_index(drop=True)`` discards the old index directly, so a real
    data column that happens to be called ``index`` is left untouched.
    The input DataFrame is not modified.
    """
    return df.drop_duplicates().reset_index(drop=True)

In [None]:
#Replacing null values
def fill_null(df):
    """Fill missing values in four columns of ``df`` in place and return it.

    * ``number_of_open_complaints`` -> 0
    * ``vehicle_type`` -> "M"
    * ``months_since_last_claim`` and ``income`` -> the rounded column mean
    """
    constant_fills = {"number_of_open_complaints": 0, "vehicle_type": "M"}
    for column, filler in constant_fills.items():
        df[column] = df[column].fillna(value=filler)

    # The mean is taken before filling, so it only reflects observed values.
    for column in ("months_since_last_claim", "income"):
        rounded_mean = round(df[column].mean())
        df[column] = df[column].fillna(value=rounded_mean)
    return df

In [None]:
#Creating a months column for the effective_to_date column only
def month_creation(df):
    """Parse ``effective_to_date`` and add a month-name column, in place.

    ``effective_to_date`` is converted with ``pd.to_datetime(errors='coerce')``,
    so unparseable values become ``NaT``. ``month_effective_to_date`` then
    holds the full month name ("January", ...) for each row and a missing
    value where the date is ``NaT`` (calling ``strftime`` on ``NaT`` raises,
    which is why the vectorised ``.dt`` accessor is used).

    Returns the same (mutated) DataFrame.
    """
    df["effective_to_date"] = pd.to_datetime(df["effective_to_date"], errors="coerce")
    df["month_effective_to_date"] = df["effective_to_date"].dt.month_name()
    return df

### Running the rest of the Pipeline

In [None]:
df = standardize(df)
df.info()

In [None]:
df = month_creation(df)

In [None]:
df= drop_2cols(df)

In [None]:
df = dd(df)

In [None]:
df = fill_null(df)

In [None]:
df

In [None]:
df.info()

In [None]:
## Export the DataFrame into a CSV and import it to Tableau for visualization
## like this you will save it in the same folder as the notebook is saved

df.to_csv("customer_analysis_clean.csv", index = False)


In [None]:
df2 = pd.read_csv("customer_analysis_clean.csv")
df2

In [None]:
df["vehicle_type"].value_counts()

In [None]:
df["vehicle_type"].hist()

## Activity 5

### Check the data types of the columns. Get the numeric data into dataframe called numerical and categorical columns in a dataframe called categoricals. (You can use np.number and np.object to select the numerical data types and categorical data types respectively)

In [None]:
df.info()

In [None]:
# Identify all the cathegorical columns 
[col for col in df.columns if not col in df._get_numeric_data().columns]

In [None]:
# save a new subset -> numerical columns as the new DF
# (public API instead of the private df._get_numeric_data(); .copy() keeps
#  later edits to `numerical` from touching `df`)
numerical = df.select_dtypes(include=np.number).copy()

In [None]:
# collect the NAMES of the categorical (object dtype) columns;
# note this is a plain list of column labels, not a DataFrame
categorical=[i for i in df.columns if df.dtypes[i]=='object']
categorical

### Check for Outliers: Use the boxplot for looking at the values for income


In [None]:
plt.figure(figsize=(12,6))
sns.boxplot(y='income' , x='response', data=df)
plt.ylabel('Response by Income')
plt.show()

Findings:
The income distributions of responders and non-responders are quite similar, except at the low end: many low-income customers do not respond, whereas only very few customers with an income of up to about 20k do respond.

### Now we will try to check the normality of the numerical variables visually

- Use seaborn library to construct distribution plots for the numerical variables
- Use Matplotlib to construct histograms
- Do the distributions for different numerical variables look like a normal distribution


In [None]:
# Identify all the numerical columns 
[col for col in df.columns if col in df._get_numeric_data().columns]

In [None]:
numerical["customer_lifetime_value"].value_counts()

In [None]:
numerical.describe()

In [None]:
sns.pairplot(df)
plt.show()

Finding:
We see that total_claim_amount and monthly_premium_auto appear to be correlated.

### Do some normalization on the features

In [None]:
# Use seaborn library to construct distribution plots for the numerical variables
# -> probability density function (histogram on a density scale plus KDE curve;
#    sns.distplot is deprecated in favour of histplot/displot)

sns.histplot(numerical["monthly_premium_auto"], kde=True, stat="density")

# FINDING: Looks like a log-normal distribution and should be transformed towards a more normal distribution

In [None]:
# Using Box-Cox Normalization method without lambda, seamingly calculating the optimal lambda

from scipy.stats import boxcox

monthly,l=boxcox(numerical['monthly_premium_auto'])
# the l after monthly is the optimal lambda calculated


In [None]:
monthly,l

In [None]:
sns.histplot(monthly)

In [None]:
# Using Box-Cox Normalization method with lamda=0.02

from scipy.stats import boxcox

monthly11=boxcox(numerical['monthly_premium_auto'],lmbda=0.02)



In [None]:
sns.histplot(monthly11)


In [None]:
numerical["customer_lifetime_value"].describe()

In [None]:
# Power Transformer -> makes the distribution more Gaussian-like
# (sklearn defaults: Yeo-Johnson transform, then standardisation to zero mean / unit variance)
# The result is stored as customer_lifetime_value_normalized, so the column
# transformed must be customer_lifetime_value (not monthly_premium_auto).

from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()
customer_lifetime_value_normalized = pt.fit_transform(numerical["customer_lifetime_value"].to_numpy().reshape(-1,1))



In [None]:
sns.histplot(customer_lifetime_value_normalized)

In [None]:
# Quantile Transformer

from sklearn.preprocessing import QuantileTransformer

qt = QuantileTransformer()

monthly2=qt.fit_transform(numerical['monthly_premium_auto'].to_numpy().reshape(-1,1))


In [None]:
sns.histplot(monthly2)

In [None]:
# histogram on a density scale plus KDE curve (replacement for the deprecated sns.distplot)
sns.histplot(numerical["customer_lifetime_value"], kde=True, stat="density")


In [None]:
# Using Box-Cox Normalization method without lambda, seamingly calculating the optimal lambda

from scipy.stats import boxcox

monthly12,l=boxcox(numerical['customer_lifetime_value'])
# the l after monthly is the optimal lambda calculated

In [None]:
sns.histplot(monthly12)

In [None]:
# PLOT FITTED LINE Between two (best) correlating variables/features 

sns.regplot(x='customer_lifetime_value',y='total_claim_amount', data=numerical, scatter_kws={"color": "olive"}, line_kws={"color": "black"})

In [None]:
# histogram on a density scale plus KDE curve (replacement for the deprecated sns.distplot)
sns.histplot(numerical["income"], kde=True, stat="density")

In [None]:
# histogram on a density scale plus KDE curve (replacement for the deprecated sns.distplot)
sns.histplot(numerical["total_claim_amount"], kde=True, stat="density")

In [None]:
from matplotlib import pyplot
pyplot.plot(numerical["total_claim_amount"])
pyplot.show()

# histogram
pyplot.hist(numerical["total_claim_amount"])
pyplot.show()

In [None]:
# looks like a uniform distirbution
# (histogram on a density scale plus KDE curve; replacement for the deprecated sns.distplot)

sns.histplot(numerical["number_of_policies"], kde=True, stat="density")

In [None]:
# plot the histogram
plt.hist(numerical["total_claim_amount"], bins=10, density=True)


###  Normalize (numericals)

### MAX ABSOLUTE SCALING


In [None]:
# copy the data
numerical_max_scaled = numerical.copy()

In [None]:
# max-absolute scaling: divide every column by its largest absolute value
column_abs_max = numerical_max_scaled.abs().max()
numerical_max_scaled = numerical_max_scaled.div(column_abs_max, axis="columns")

# view normalized data
display(numerical_max_scaled)

### Using The MIN-MAX FEATURE SCALING


In [None]:
# min-max scaling: (value - column minimum) / (column maximum - column minimum)
column_min = numerical.min()
column_range = numerical.max() - column_min
numerical_min_max_scaled = numerical.sub(column_min, axis="columns").div(column_range, axis="columns")

# view normalized data
numerical_min_max_scaled

In [None]:
# histogram on a density scale plus KDE curve (replacement for the deprecated sns.distplot)
sns.histplot(numerical_min_max_scaled["income"], kde=True, stat="density")

### Using The Z-SCORE METHOD -> Huge Benefit

In [None]:
# -> HUGE BENEFIT TO TAKE MULTIPLE NUMERICAL independent VARIABLES INTO A MODEL. They are same scaled!

# z-score: subtract each column's mean and divide by its (sample) standard deviation
column_mean = numerical.mean()
column_std = numerical.std()
numerical_z_scaled = numerical.sub(column_mean, axis="columns").div(column_std, axis="columns")

# view normalized data
display(numerical_z_scaled)

### For the numerical variables, check the multicollinearity between the features. Please note that we will use the column total_claim_amount later as the target variable.

In [None]:
fig,ax = plt.subplots(figsize = (12,10))
sns.heatmap(numerical.corr(), annot = True, ax=ax)

FINDING: Largest positive correlation is between total_claim_amount and monthly_premium_auto
this can be explained by more expensive cars require higher premium payments
and if there is a damage, these more expensive cars create higher claim amounts.

### For the numerical variables, check the multicollinearity between the features. Please note that we will use the column total_claim_amount later as the target variable

RESULT: As we can see in the correlation matrix above, no pair of independent
variables/features has a correlation stronger than ±0.9. Hence, we do not drop any of them.

## Transform Categorical Values

### Get cathegorical Columns

In [None]:
categoricals = df.select_dtypes(object)
categoricals.head()

### Conduct One Hot Encoder data fitting on policy type

In [None]:
categoricals["policy_type"].value_counts()

In [None]:
from sklearn import preprocessing

In [None]:
enc = preprocessing.OneHotEncoder(handle_unknown="ignore")
X = [["Personal Auto", 1], ["Corporate Auto", 2], ["Special Auto", 3]]
enc.fit(X)

In [None]:
input_data = pd.DataFrame ([["Personal Auto", 1], ["Corporate Auto", 2], ["Special Auto", 3]])

In [None]:
enc = preprocessing.OneHotEncoder(handle_unknown="ignore")
enc.fit_transform(input_data).toarray()

In [None]:
### work with two categorical variables.
enc = preprocessing.OneHotEncoder(handle_unknown="ignore")
categorical_hot_encoder = enc.fit_transform(np.array(categoricals.loc[:,["policy_type","marital_status"]])).toarray()

In [None]:
categorical_hot_encoder 

### Put the cathegorical clumns into numerical binary values

In [None]:
categorical_numerical =pd.get_dummies(categoricals)
categorical_numerical


COMMENT FOR STUFF BELOW: We tried to get the above numerical results into the respective cathegorical column. Unfortunately did not work

In [None]:
def numericalize_policy_type(n_p_t):
    """Return a copy of ``n_p_t`` with ``policy_type`` one-hot encoded.

    The ``policy_type`` column is replaced by one float indicator column per
    distinct value, named ``policy_type_<value>`` (1.0 where the row has that
    policy type, 0.0 otherwise). All other columns are kept unchanged and the
    input DataFrame is not modified.

    Parameters
    ----------
    n_p_t : pandas.DataFrame
        Frame containing a ``policy_type`` column.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.get_dummies(n_p_t, columns=["policy_type"], dtype=float)

In [None]:
categoricals1 = numericalize_policy_type(categoricals)
categoricals1.info()

## Activity 6

# Processing Data
(Further processing...)


- Normalize (numerical). (done)
- One Hot/Label Encoding (categorical).



- Concat DataFrames

We decided to concatenate the numerical_z_scaled DataFrame and the categoricals DataFrame. We do this below and call the result z_scaled_full_df.

In [None]:
z_scaled_full_df = numerical_z_scaled.join(categoricals, lsuffix="_left")
z_scaled_full_df

- X-y split

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [None]:
#x = independent variable (feature): monthly_premium_auto
#y = dependent variable (target): total_claim_amount
x =z_scaled_full_df[["monthly_premium_auto"]] 
y = z_scaled_full_df[["total_claim_amount"]] 
x.head(3)

# Linear Regression
- Train-test split.
- Apply linear regression.

In [None]:
# Train test set split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = 100)
x_train

In [None]:
## Apply linear regression

lm = LinearRegression() 
model = lm.fit(x_train,y_train)
predictions = model.predict(x_test)
predictions

In [None]:
## Calculating the robustness of the model (maybe R-squared)
model.score(x_test,y_test)

In [None]:
## Isolating alpha and beta of the linear regression

# coef_ / intercept_ are arrays here (y was passed as a one-column DataFrame);
# pull out the single element explicitly instead of calling float() on an array.
print("The slope of the regression is: {:.5f}".format(np.ravel(model.coef_)[0]))
print("The intercept of the regression is: {:.5f}".format(np.ravel(model.intercept_)[0]))



In [None]:
## Isolating alpha and beta of the linear regression

print(model.coef_)

print(model.intercept_)


In [None]:
## R-Squared on the held-out test set (compare the true test targets with the test predictions)
from sklearn.metrics import mean_squared_error, r2_score

print("The R2 is {:.2f}".format(r2_score(y_test, predictions)))

### Create the model based on non-normalized data

In [None]:
#xx = independent variable (feature): monthly_premium_auto
#yy = dependent variable (target): total_claim_amount
xx = numerical[["monthly_premium_auto"]] 
yy = numerical[["total_claim_amount"]] 
xx.head(3)

In [None]:
# Train-test split.

xx_train, xx_test, yy_train, yy_test = train_test_split(xx, yy, test_size=0.3, random_state = 100)
xx_train

In [None]:
yy_train

In [None]:
yy_train.describe()

In [None]:
## Apply linear regression

lm1 = LinearRegression()
lm1.fit(xx_train,yy_train)
predicted_yy = lm1.predict(xx_test)
predicted_yy

In [None]:
## Calculating the robustness of the model (maybe R-squared)
lm1.score(xx_test,yy_test)

In [None]:
print(lm1.coef_)
print(lm1.intercept_)



In [None]:
## Plot the model:



# Model Validation
Description:
- MSE.
- RMSE.
- MAE.
- R2.

1. MSE.
    (Mean Squared Error) represents the difference between the original and predicted values extracted by squared the average difference over the data set.
2. RMSE.
    (Root Mean Squared Error) is the error rate by the square root of MSE.
3. MAE.
    (Mean absolute error) represents the difference between the original and predicted values extracted by averaged the absolute difference over the data set.
4. R2.
    (Coefficient of determination) represents how well the predicted values fit the original values. On the training data it lies between 0 and 1 and can be read as the share of variance explained; on unseen data it can even be negative if the model is worse than predicting the mean. The higher the value, the better the model.

In [None]:
#MSE
mse = mean_squared_error(yy_test, predicted_yy)

mse

In [None]:
#RMSE

rmse = np.sqrt(mse)
rmse

In [None]:
# MAE: mean of the absolute differences between actual and predicted test values.
# Work on plain arrays so the result is a single float rather than a pandas object.

MAE = np.abs(yy_test.to_numpy() - predicted_yy).mean()
MAE


In [None]:
#R2 = model.score()
from sklearn.metrics import r2_score
r2_score(yy_test, predicted_yy)

### Long version (did not work)
FROM the CODE ALONG Notebook LINEAR REGRESSION, we ARE USING THE SAME CODE:
http://localhost:8888/notebooks/Desktop/BI%20or%20DnA%20BootCamps/Ironhack/Used%20for%20Exercises/Machine%20Learining%20Linear%20Regression/Notebook_Code_Along_Linear_Regression.ipynb

In [None]:
# Preparing the data

x = x_test
y = y_test

x

In [None]:
# turn the above x-test-set data frame into a numpy array

x = pd.DataFrame(x)
x.to_numpy()

In [None]:
# turn the above y-test-set data frame into a numpy array
# (assign to y, not x -- otherwise x is overwritten with the target values)

y = pd.DataFrame(y)
y.to_numpy()

In [None]:
# Closed-form simple linear regression on the test split.
# Flatten both one-column frames to 1-D float arrays first: multiplying two
# DataFrames aligns on column labels, and formatting a pandas object with
# "{:.2f}" fails.
x_arr = x_test.to_numpy().ravel()
y_arr = y_test.to_numpy().ravel()

# slope = cov(x, y) / var(x)
b = (np.mean(x_arr * y_arr) - (np.mean(x_arr) * np.mean(y_arr))) / (np.mean(x_arr * x_arr) - np.mean(x_arr) ** 2)
# Intercept
a = np.mean(y_arr) - np.mean(x_arr) * b


print("The slope of the regression is: {:.2f}".format(b))
print("The intercept of the regression is: {:.2f}".format(a))


## Creating a Linear Model based on the template from Rafa

### Copied from section X-Y Split in the follwing Notebook:
https://github.com/raafat-hantoush/IH_RH_DA_FT_AUG_2021_Labs_Activities_Solutions/blob/main/Labs_Solutions/Pandas/Lab_Customer_Analysis_Case_Study.ipynb


### Reducing amount of relevant columns/features for Model


1) Plot all the categorical columns to see what can be dropped or bucketed. Be aware: the fewer categories and categorical columns you have, the fewer dummy-variable columns (and hence features) you will have in your model. This did not work here, but it would normally add significantly to the strength of the model; it just needs experience to have good judgement on this.

In [None]:
for c in df.select_dtypes(object):
    # Take labels and heights from the SAME value_counts() result:
    # df[c].unique() is in order of appearance while value_counts() is sorted
    # by frequency, so pairing the two puts the wrong label on each bar.
    counts = df[c].value_counts()
    plt.figure(figsize=(10,5))
    plt.bar(counts.index.astype(str), counts.to_numpy())
    plt.title(c)
    plt.ylabel("count")
    plt.show();
    

2) Result of the above visualization shows that you can do the following below transformation (according to Rafa)

In [None]:
def transform(df):
    """Return a reduced, bucketed copy of the customer DataFrame.

    Steps (the input frame is left unmodified):

    1. keep only rows with ``income > 0``;
    2. round ``total_claim_amount`` and ``customer_lifetime_value`` to whole numbers;
    3. merge vehicle classes: "Luxury SUV" -> "Luxury Car", "SUV" -> "Four-Door Car";
    4. collapse ``policy`` levels L1-L3 into "Personal", "Corporate", "Special";
    5. drop the ``month`` and ``education`` columns if they are present;
    6. cap ``number_of_policies`` at 3 (every value above 2 becomes 3).
    """
    # .copy() so the assignments below act on an independent frame and do not
    # trigger SettingWithCopyWarning on a filtered view.
    df = df[df["income"] > 0].copy()

    for column in ("total_claim_amount", "customer_lifetime_value"):
        df[column] = df[column].round(0)

    # (column, values to merge, replacement) -- applied with .loc instead of
    # chained indexing so the write is guaranteed to hit `df` itself.
    buckets = [
        ("vehicle_class", ["Luxury Car", "Luxury SUV"], "Luxury Car"),
        ("vehicle_class", ["Four-Door Car", "SUV"], "Four-Door Car"),
        ("policy", ["Personal L3", "Personal L2", "Personal L1"], "Personal"),
        ("policy", ["Corporate L3", "Corporate L2", "Corporate L1"], "Corporate"),
        ("policy", ["Special L3", "Special L2", "Special L1"], "Special"),
    ]
    for column, members, label in buckets:
        df.loc[df[column].isin(members), column] = label

    # errors="ignore": the cleaned data set may not contain a plain "month" column.
    df = df.drop(columns=["month", "education"], errors="ignore")

    df.loc[df["number_of_policies"] > 2, "number_of_policies"] = 3
    return df

In [None]:
# Execution would be done via this code in line with pipelining at the beginning ->
# df = transform(df)

### X-Y Splitting 
splitting the target variable "total_claim_amount" from the cleaned df called df.

In [None]:
df.info()

In [None]:
X=df.drop('total_claim_amount', axis=1)
y=df.total_claim_amount

# we drop some useless columns
X=X.drop(columns=['month_effective_to_date'])

### Get the numeric data


In [None]:
numericals_features =X._get_numeric_data()
# numericals =data.select_dtypes(np.number)
numericals_features.head()

In [None]:
# rounding up the customer_lifetime_value column
numericals_features["customer_lifetime_value"]=numericals_features["customer_lifetime_value"].round()

In [None]:
numericals_features.head()

In [None]:
numericals_features.describe()

### Normalize via StandardScaler  -> not used going foward

In [None]:
##def normalize(X):
##    X_mean=X.mean(axis=0)
##    X_std=X.std(axis=0)
##    X_std[X_std==0]=1.0
##    X=(X-X_mean)/X_std
##    return X

##X_num=normalize(X_num)

In [None]:
from sklearn.preprocessing import StandardScaler

X_scaled=StandardScaler().fit_transform(numericals_features)

In [None]:
X_scaled

### Get Categorical data

In [None]:
categoricals_features = X.select_dtypes(object)
categoricals_features.head()

In [None]:
# get dummies pandas

categoricals_features=pd.get_dummies(categoricals_features, drop_first=True)

##pd.DataFrame(OneHotEncoder(drop='first').fit_transform(categoricals_features).toarray(),
## columns=OneHotEncoder(drop='first').fit(categoricals_features).get_feature_names(input_features=categoricals_features.columns)).head()

categoricals_features.head()

### Concatinating the Numerical and Categorical data

In [None]:
all_features=pd.concat([numericals_features, categoricals_features], axis=1) # concat numerical and categorical transformations
all_features.head()

### Train Test Split (80% train & 20% test)

In [None]:
from sklearn.model_selection import train_test_split

# fixed random_state (same seed as the earlier splits) so the split -- and every
# metric computed from it below -- is reproducible across runs
X_train, X_test, y_train, y_test=train_test_split(all_features, y, test_size=.20, random_state=100)

In [None]:
X_test

### Apply Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

model=LinearRegression()    # model
model.fit(X_train.to_numpy(), y_train)   # model train
y_pred=model.predict(X_test.to_numpy())   # test model y-predictor
y_pred_train=model.predict(X_train.to_numpy()) # train y-predictor

In [None]:
# fit the model
print("Model-Intercept: ", model.intercept_)


print("Model-Coefficient: ", model.coef_)


In [None]:
# make predictions
# 1) turn the x-values into an array

np.array([12, 33, 54, 60, 55, 32, 74, 39, 2, 39, 12, 34, 40, 21])

In [None]:
# make predictions
# 2) Yield the y-predictions
# `model` is fitted on all features at this point, so it cannot accept a single
# column. Use the single-feature model lm1 (monthly_premium_auto -> total_claim_amount)
# and pass the values under the column name it was trained with.

new_premiums = pd.DataFrame({"monthly_premium_auto": [12, 33, 54, 60, 55, 32, 74, 39, 2, 39, 12, 34, 40, 21]})
pred = lm1.predict(new_premiums)
pred

In [None]:
# the model was fitted on a numpy array, so score it on one as well (consistent with the cells below)
model.score(X_test.to_numpy(), y_test) # Intermezzo -> checking for the robustness of the model

In [None]:
# first plot on the top left shows the y-predictor compared to the real y data in the test set
# points close to the diagonal mean the predictions are close to the real values
# (real values go on the x-axis and predictions on the y-axis, matching the axis labels)

fig, ax = plt.subplots(2,2)
ax[0,0].plot(y_test, y_pred, 'o')
ax[0,0].set_xlabel("y_test")
ax[0,0].set_ylabel("y_pred")
ax[0,0].set_title("Test Set -Predicted vs real")

# second plot on the top right shows the distribution of the error values/residuals of the above comparison
# remember that OLS assumes normally distributed residuals: look for a bell shape centred on zero
# (a histogram alone does not demonstrate homoscedasticity; that needs residuals vs. predictions)

ax[0,1].hist(y_test - y_pred)
ax[0,1].set_xlabel("Test y-y_pred")
ax[0,1].set_title("Test Set Residual histogram")

# Third plot on the bottom left shows the trained y-predictor compared to the real y data in the train set

ax[1,0].plot(y_train, y_pred_train, 'o')
ax[1,0].set_xlabel("y_train")
ax[1,0].set_ylabel("y_pred_train")
ax[1,0].set_title("Train set Predicted vs real")



# Last plot on the bottom right shows the distribution of the training residuals
# (same reading as the test-set histogram above)

ax[1,1].hist(y_train - y_pred_train)
ax[1,1].set_xlabel("Train y-y_pred")
ax[1,1].set_title("Train Residual histogram")

# keep titles and axis labels of the four panels from overlapping
fig.tight_layout()


### Model Validation


MSE

In [None]:
# Import under the full name: other cells assign a float to a variable called
# `mse`, which would silently replace a function alias of the same name.
from sklearn.metrics import mean_squared_error

train_mse=mean_squared_error(y_train,model.predict(X_train.to_numpy()))

test_mse=mean_squared_error(y_test,model.predict(X_test.to_numpy()))

print ('train MSE: {} -- test MSE: {}'.format(train_mse, test_mse))

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
mse

RMSE -> square root of MSE

In [None]:
print ('train RMSE: {} -- test RMSE: {}'.format(train_mse**.5, test_mse**.5))

MAE

In [None]:
from sklearn.metrics import mean_absolute_error as mae

train_mae=mae(y_train,model.predict(X_train.to_numpy()))
test_mae=mae(y_test,model.predict(X_test.to_numpy()))

# report the MAE values computed above (not the MSE values from the earlier cell)
print ('train MAE: {} -- test MAE: {}'.format(train_mae, test_mae))

# Unlike MSE, MAE is expressed in the units of the target (total_claim_amount)

R2

In [None]:
r2= model.score(X_test.to_numpy(), y_test)

In [None]:
print ('train R2: {} -- test R2: {}'.format(model.score(X_train.to_numpy(), y_train),
                                            model.score(X_test.to_numpy(), y_test)))

# Same result as for model.score above

adjusted R2

In [None]:
# r2 was computed on the test set, so N must be the number of test observations,
# and p is the number of features the model was fitted with (not 1).
N = X_test.shape[0]
p = X_test.shape[1]
# adjusted R2 = 1 - (1 - R2) * (N - 1) / (N - p - 1)
# (uses its own names instead of x / y so the feature/target variables are not overwritten)
unexplained_share = (1 - r2)
dof_correction = (N - 1) / (N - p - 1)
adj_rsquared = (1 - (unexplained_share * dof_correction))
print("Adjusted-R2 : " , adj_rsquared)

### Feature Importance¶

Here we see which of the features contribute most to predicting the target value. The higher a feature's coefficient (in absolute terms), the more it contributes.

In [None]:
# NOTE: the model was fitted on the unscaled features, so the size of a
# coefficient also reflects the unit/scale of its feature.
features_importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': model.coef_
})
# rank by absolute coefficient (a large negative coefficient matters as much as
# a large positive one) while keeping the signed value in the table
abs_order = features_importances['Importance'].abs().sort_values(ascending=False).index
features_importances = features_importances.loc[abs_order]

In [None]:
features_importances

In [None]:
plt.bar(x=features_importances['Attribute'], height=features_importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
giving the features.

How can we use this model in practice?

How can we visualize the model with the fitted line?

We actually have a section for this in the Jupyter notebook. With several features there is no one-dimensional fitted line; the fit is multi-dimensional.

How do we include real data to estimate values, and how do we treat the standardization?
-> You need to pass the new data through the same StandardScaler (same transformation) before you can apply the model to it.





get models in statsmodel.