# Project name: Predicting Customer Lifetime Value for an auto insurance company using supervised machine learning
# Dataset can be downloaded from the below link
https://www.kaggle.com/datasets/somjee/auto-insurance-customerlifetimevalue
# The main aim of this project is to predict the Customer Lifetime Value based on various features

# In this project we do the following steps
 1. Finding missing Values
 2. Listing out the Numerical Variables
 3. Distribution of the Numerical Variables
 4. Categorical Variables
 5. Cardinality of Categorical Variables
 6. Encoding
 7. Splitting the data.
 8. Scaling.
 9. Modelling the data
 10. Testing the data.

In [None]:
## First we import the necessary libraries 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
## Display every column when a DataFrame is shown (no column truncation)
pd.set_option("display.max_columns",None)
## Load the dataset. The Windows path is written as a raw string so the backslashes are
## taken literally: "\D" and "\C" are not valid escape sequences and make a normal string
## literal emit a SyntaxWarning (DeprecationWarning before Python 3.12).
## NOTE(review): this is a machine-specific absolute path - adjust it to where the CSV lives.
dataset=pd.read_csv(r"F:\DATA SETS\CustomerlifetimeValue-copy.csv")

In [None]:
## Check the shape of the data, i.e. the (number of rows, number of columns) pair
dataset.shape

In [None]:
## Check the number of dimensions of the data
dataset.ndim

In [None]:
## Show the first 5 records of the data
dataset.head()

In [None]:
## Count the null values in each column to see whether any data is missing
dataset.isnull().sum()

In [None]:
## we find the number of numerical features present in the dataset
numerical_features=[feature for feature in dataset.columns if dataset[feature].dtypes!="O"]
print("count of numerical features:",len(numerical_features))

In [None]:
## we see the content of the numerical features
dataset[numerical_features].head()

In [None]:
## Keep the temporal column "Effective To Date" in its own variable and display it
temp_var=dataset["Effective To Date"]
temp_var

In [None]:
## Look for a relationship between the temporal variable and the target variable (CLV).
## "Effective To Date" is an object (text) column, so grouping on the raw values orders the
## dates alphabetically (text sorts "10" before "2") and the line plot jumps back and forth
## in time. We group on a parsed datetime key instead so the x-axis is chronological.
## The column inside `dataset` is left untouched because later cells still expect the text column.
## NOTE(review): assumes pandas can infer the date format of this column - confirm against the CSV,
## and re-check any "cyclical fluctuation" reading of the plot once the dates are in order.
effective_date=pd.to_datetime(dataset["Effective To Date"])
dataset.groupby(effective_date)["Customer Lifetime Value"].median().plot()
plt.xlabel("Effective To Date")
plt.ylabel("median customer lifetime value")
plt.title("date vs clv")
plt.show()

In [None]:
## We find the number of discrete numerical features in the dataset
discrete_features=[feature for feature in numerical_features if len(dataset[feature].unique())<20]
print("count of discrete variables :",len(discrete_features))

In [None]:
## We print the name of the discrete features
print(discrete_features)

In [None]:
## We print the data in the discrete features 
dataset[discrete_features].head()

In [None]:
## For each discrete feature, draw a bar chart of the median CLV per distinct value
for discrete_column in discrete_features:
    median_clv_by_value=dataset.groupby(discrete_column)["Customer Lifetime Value"].median()
    median_clv_by_value.plot.bar()
    plt.ylabel("median CLV")
    plt.xlabel(discrete_column)
    plt.show()

In [None]:
## We find the number of continous numerical features in the dataset
continous_features=[feature for feature in numerical_features if feature not in discrete_features]
print("count of continous features:",len(continous_features))

In [None]:
## We print the name of the continous features
print(continous_features)

In [None]:
## We print the data in continous features
dataset[continous_features].head()

In [None]:
## Draw a 20-bin histogram per continuous feature to inspect its distribution
for continuous_column in continous_features:
    column_values=dataset[continuous_column]
    plt.hist(column_values,bins=20)
    plt.title(continuous_column)
    plt.xlabel(continuous_column)
    plt.ylabel("count")
    plt.show()

In [None]:
## The continuous features are skewed, so we log-transform them and plot the new distributions.
## np.log is only defined for strictly positive inputs, so a feature that contains a zero
## OR a negative value is skipped; checking for exact zeros alone would let negative values
## through and turn them into NaN.
## NOTE: the columns are overwritten in place - run this cell once only, re-running it
## applies the logarithm a second time.
## NOTE(review): continous_features is derived from all non-object columns, so the target
## column may be transformed here as well - confirm this is intended.
for feature in continous_features:
    if (dataset[feature]<=0).any():
        continue
    dataset[feature]=np.log(dataset[feature])
    plt.hist(dataset[feature],bins=20)
    plt.xlabel(feature)
    plt.ylabel("count")
    plt.title(feature)
    plt.show()

In [None]:
## Draw one box plot per continuous feature to reveal outliers
for continuous_column in continous_features:
    column_values=dataset[continuous_column]
    sns.boxplot(y=column_values)
    plt.xlabel(continuous_column)
    plt.show()

In [None]:
## We find the list of categorical features present in the dataset
categorical_features=[feature for feature in dataset.columns if dataset[feature].dtypes=="O"]
categorical_features

In [None]:
## We remove the "Customer" column as it is irrelevant
categorical_features.remove("Customer")

In [None]:
## We remove "Effective To Date" column as it is irrelevant to treat it as a categorical feature
categorical_features.remove("Effective To Date")

In [None]:
## Now we print the updated list of categorical features
categorical_features

In [None]:
## We find the number of categorical features present in the dataset
len(categorical_features)

In [None]:
## We print the data in the categorical features
dataset[categorical_features].head()

In [None]:
## We find the cardinality of the categorical features,i.e, the number of sub-categories present in each categorical feature
for feature in categorical_features:
    print("feature is {} and number of sub-categories are {}".format(feature,len(dataset[feature].unique())))

In [None]:
## We plot barplot to observe the relationship between the categorical features and the median of the target variable(CLV)
for feature in categorical_features:
    dataset.groupby(feature)["Customer Lifetime Value"].median().plot.bar()
    plt.xlabel(feature)
    plt.ylabel("median Customer Lifetime Value")
    plt.show()

In [None]:
## Find the categorical (object dtype) features that contain missing values and report
## the percentage of rows that are missing for each of them.
## A feature qualifies as soon as it has at least one NaN (a ">1" count check would skip
## columns with exactly one missing value), and the fraction returned by .mean() is
## multiplied by 100 so the printed number really is a percentage.
categorical_nan=[feature for feature in dataset.columns if dataset[feature].isnull().any() and dataset[feature].dtypes=="O"]
for feature in categorical_nan:
    print("{} has {} % missing values".format(feature,np.round(dataset[feature].isnull().mean()*100,2)))

In [None]:
## We fill the nan values in the categorical features with a label named "missing"
dataset[categorical_nan]=dataset[categorical_nan].fillna("missing")
## after replacing the nan values we check the whether there is any nan value present
dataset[categorical_nan].isnull().sum()

In [None]:
## We print the data of the categorical features which had nan values earlier
dataset[categorical_nan].head()

In [None]:
## Find the numerical (non-object dtype) features that contain missing values and report
## the percentage of rows that are missing for each of them.
## A feature qualifies as soon as it has at least one NaN (a ">1" count check would skip
## columns with exactly one missing value), and the fraction returned by .mean() is
## multiplied by 100 so the printed number really is a percentage.
numerical_nan=[feature for feature in dataset.columns if dataset[feature].isnull().any() and dataset[feature].dtypes!="O"]
for feature in numerical_nan:
    print("{} has {} % missing values".format(feature,np.round(dataset[feature].isnull().mean()*100,2)))

In [None]:
## Outliers were observed in the numerical features, so NaN values are replaced with the
## median of the feature (the median is robust to outliers, unlike the mean).
## The filled column is assigned back explicitly: calling fillna(..., inplace=True) on
## dataset[feature] operates on an intermediate object, which pandas deprecates and which
## does not update `dataset` at all when Copy-on-Write is enabled.
for feature in numerical_nan:
    median_value=dataset[feature].median()
    dataset[feature]=dataset[feature].fillna(median_value)
## Confirm that no NaN value is left in those columns
dataset[numerical_nan].isnull().sum()

In [None]:
## Display the first rows to check whether the NaN values have been replaced by the median
dataset.head()

In [None]:
## Sub-categories that cover 1% of the rows or less are merged into a single "rare_var" label.
## Note that the threshold applied is 0.01 (1%), not 10%; change rare_threshold to adjust it.
rare_threshold=0.01
for feature in categorical_features:
    ## share of rows per sub-category (rows with a non-null target, divided by all rows)
    category_share=dataset.groupby(feature)["Customer Lifetime Value"].count()/len(dataset)
    ## labels that are frequent enough to be kept as they are
    frequent_categories=category_share[category_share>rare_threshold].index
    dataset[feature]=np.where(dataset[feature].isin(frequent_categories),dataset[feature],"rare_var")

In [None]:
## We print the dataset to observe the change
dataset.head(20)

In [None]:
## We drop the unnecessary features from the dataset
dataset=dataset.drop(["Customer","Effective To Date"],axis=1)

In [None]:
## We perform one hot encoding on categorical variables
from sklearn.preprocessing import OneHotEncoder
## The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
## so the new keyword is tried first and the old one is used only on older releases.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
columns_to_one_hot = ['State','Response','Coverage','Education','EmploymentStatus','Gender','Location Code','Marital Status','Policy Type','Policy','Renew Offer Type','Sales Channel','Vehicle Class','Vehicle Size']
encoded_array = enc.fit_transform(dataset.loc[:,columns_to_one_hot])
## Reuse dataset's index so that the concat below lines the rows up one-to-one even when
## the index of `dataset` is not the default 0..n-1 range
dataset_encoded = pd.DataFrame(encoded_array,columns=enc.get_feature_names_out(),index=dataset.index)
dataset_sklearn_encoded = pd.concat([dataset,dataset_encoded],axis=1)
## The original categorical columns are now redundant, keep only their encoded versions
dataset_sklearn_encoded = dataset_sklearn_encoded.drop(columns=columns_to_one_hot)

In [None]:
## We print the encoded dataset
dataset_sklearn_encoded

In [None]:
## We separate the independent and dependent variables from the dataset
x=dataset_sklearn_encoded.drop("Customer Lifetime Value",axis=1)
y=dataset_sklearn_encoded["Customer Lifetime Value"]

In [None]:
## We print the independent varibale
x

In [None]:
## We print the dependent variable
y=pd.DataFrame(y)
y

In [None]:
## We split our data into train and test data 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=18)

In [None]:
## We use Standardization to scale the features and the target.
## Features and target each get their OWN scaler: re-fitting a single StandardScaler on the
## target overwrites the statistics learned from the features, so the feature scaler could no
## longer be used to transform new inputs, and predictions could not be mapped back either.
## Both scalers are fitted on the training split only and then applied to the test split.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)
## y_scaler.inverse_transform(...) converts scaled predictions back to the unscaled target
y_scaler=StandardScaler()
y_train=y_scaler.fit_transform(y_train)
y_test=y_scaler.transform(y_test)

In [None]:
## Display x_train after scaling
x_train

In [None]:
## Display y_train after scaling
y_train

In [None]:
## We import certain modules from the sklearn library
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
## We fit our dataset into lasso regression to select relevant features of the dataset to be considered for modelling
feature_sel_model=SelectFromModel(Lasso(alpha=0.005,random_state=0))
feature_sel_model.fit(x,y)

In [None]:
## We print an array indicating which features are selected
feature_sel_model.get_support()

In [None]:
## We count the number of selected features  
selected_feat=x.columns[(feature_sel_model.get_support())]
print("number of features selected: {}".format(len(selected_feat)))

In [None]:
## We create a list of the selected features and print them
selected_feat=list(selected_feat)
print(selected_feat)

In [None]:
## We create a list of independent variables and build the x_train dataframe with it
feature_scale=[feature for feature in dataset_sklearn_encoded.columns if feature not in ["Customer Lifetime Value"]]
x_train=pd.DataFrame(x_train,columns=feature_scale)
x_train

In [None]:
x_test=pd.DataFrame(x_test,columns=feature_scale)
x_test

In [None]:
## Now we reduce the dimension of x_train data by considering only the selected features
x_train=x_train[['Monthly Premium Auto', 'Months Since Last Claim', 'Months Since Policy Inception', 'Number of Open Complaints', 'Number of Policies', 'Total Claim Amount', 'EmploymentStatus_Employed', 'Marital Status_Single', 'Renew Offer Type_Offer1', 'Renew Offer Type_Offer2', 'Vehicle Class_Four-Door Car', 'Vehicle Class_SUV']]

In [None]:
## Now we reduce the dimension of x_test data by considering only the selected features
x_test=x_test[['Monthly Premium Auto', 'Months Since Last Claim', 'Months Since Policy Inception', 'Number of Open Complaints', 'Number of Policies', 'Total Claim Amount', 'EmploymentStatus_Employed', 'Marital Status_Single', 'Renew Offer Type_Offer1', 'Renew Offer Type_Offer2', 'Vehicle Class_Four-Door Car', 'Vehicle Class_SUV']]

In [None]:
## Convert the x_train dataframe into a NumPy array and display it
x_train=np.array(x_train)
x_train

In [None]:
## Convert the x_test dataframe into a NumPy array and display it
x_test=np.array(x_test)
x_test

In [None]:
## Display the y_train values
y_train

In [None]:
## Display the y_test values
y_test

In [None]:
## Check the shape of the x_train data
x_train.shape

In [None]:
## Check the shape of the x_test data
x_test.shape

In [None]:
## Check the shape of the y_train data
y_train.shape

In [None]:
## Check the shape of the y_test data
y_test.shape

In [None]:
## Train a random forest regressor with 500 trees; random_state makes the run reproducible
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor(n_estimators = 500,random_state=6)
## y_train is a single-column 2-D array after scaling; scikit-learn expects a 1-D target for
## single-output regression and emits a DataConversionWarning otherwise, so it is flattened here
regressor.fit(x_train,y_train.ravel())

In [None]:
## Predict the target for the x_test data
y_pred=regressor.predict(x_test)

In [None]:
## Print the predicted values
print(y_pred)

In [None]:
## Actual target value of the test sample at index 1
y_test[1]

In [None]:
## Feature values of the test sample at index 1
x_test[1]

In [None]:
## Prediction for that single test sample (wrapped in a list to form a one-row 2-D input)
print(regressor.predict([x_test[1]]))

In [None]:
## Prediction for a hand-written sample of 12 feature values
## NOTE(review): these look like raw, unscaled values, while the regressor was fitted on
## features transformed by StandardScaler - confirm whether the sample should be scaled first.
print(regressor.predict([[100,3,12,0,1,1200,1,1,1,1,0,0]]))

In [None]:
## We construct a dataframe to display the actual and predicted values together
df=pd.DataFrame({"actual":y_test.ravel(),"predicted":y_pred.ravel()})

In [None]:
## We print the dataframe
df

In [None]:
## Score of the model on the train data (for scikit-learn regressors, score is R^2)
regressor.score(x_train,y_train)

In [None]:
## Score of the model on the test data
regressor.score(x_test,y_test)

In [None]:
## R^2 computed from the actual and the predicted test values
metrics.r2_score(y_test,y_pred)

In [None]:
## Mean squared error between the actual and the predicted test values
metrics.mean_squared_error(y_test,y_pred)

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
## Keep only the selected features in x, reusing `selected_feat` from the feature-selection
## step rather than repeating the column names by hand, so the list cannot drift from the selection
x=x[selected_feat]

In [None]:
## sizes=[1,5,10,15,20,25,30,50,100,150,300]
## sizes,training_scores,testing_scores=learning_curve(regressor,x,y,cv=5,scoring='neg_mean_squared_error',train_sizes=sizes)

In [None]:
# mean_training=np.mean(training_scores,axis=1)
# mean_testing=np.mean(testing_scores,axis=1)

In [None]:
# plt.plot(sizes,mean_training,color='r',linestyle='--',label='Training score')
# plt.plot(sizes,mean_testing,color='y',label='Testing score')
# plt.title('Learning curve for the regressor model',fontsize=20)
# plt.xlabel('Training set size',fontsize=15)
# plt.ylabel('mean squared error',fontsize=15)
# plt.legend(loc='best',fontsize=15)
# plt.show()

In [None]:
import pickle 

In [None]:
## Serialise the trained regressor to model.pkl; the with-block closes the file handle
## (flushing the data to disk) even if pickling fails
with open('model.pkl','wb') as model_file:
    pickle.dump(regressor,model_file)

In [None]:
## Load the regressor back from model.pkl; the with-block closes the file once loading is done.
## NOTE: pickle.load can execute arbitrary code - only load pickle files you created or trust.
with open('model.pkl','rb') as model_file:
    model=pickle.load(model_file)