In [None]:
# importing libraries
from datetime import datetime,timedelta,date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.cluster import KMeans
from __future__ import division

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import KFold,cross_val_score,train_test_split,GridSearchCV,KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [None]:
# Load the raw transactions and normalise the customer-id column name.
# NOTE(review): the path is absolute and machine-specific.
raw_csv_path = 'C:/Users/user/Data_Driven/online_retail_II.csv'
cus_df = pd.read_csv(raw_csv_path).rename(columns={'Customer ID': 'CustomerID'})

cus_df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
cus_df.info()

In [None]:
# Number of missing values per column.
cus_df.isnull().sum()

In [None]:
# Parse InvoiceDate into datetimes so that it can be compared with timestamps
# and used for date arithmetic in the later cells.

cus_df['InvoiceDate'] = pd.to_datetime(cus_df['InvoiceDate'])

# Re-check the dtypes to confirm the conversion.
cus_df.info()

In [None]:
# Drop every row that has a missing value in any column.
cus_df = cus_df.dropna()

# Confirm that no missing values remain.
cus_df.isna().sum()

In [None]:
# Summary of InvoiceDate, shown as a one-column table.
cus_df['InvoiceDate'].describe().to_frame()

In [None]:
# Historical (feature) window: 2010-03-01 up to, but excluding, 2011-06-01.
# The frame is named cus_1y_2m, but this window spans 15 months.
# Future (target) window: the following six months, 2011-06-01 up to, but
# excluding, 2011-12-01.
invoice_dates = cus_df['InvoiceDate']
history_start = pd.Timestamp(2010,3,1)
history_end = pd.Timestamp(2011,6,1)
future_end = pd.Timestamp(2011,12,1)

in_history = (invoice_dates >= history_start) & (invoice_dates < history_end)
in_future = (invoice_dates >= history_end) & (invoice_dates < future_end)

cus_1y_2m = cus_df[in_history].reset_index(drop=True)
cus_6m = cus_df[in_future].reset_index(drop=True)

In [None]:
# New frame with one row per distinct customer; the cluster labels are attached to it below.
cus_ctr = pd.DataFrame({'CustomerID': cus_df['CustomerID'].unique()})
cus_ctr.head()

In [None]:
# Relabel cluster ids so that their numeric order follows the mean of a target column.
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    """Return a copy of ``df`` whose cluster labels are ranked by target mean.

    Parameters
    ----------
    cluster_field_name : str
        Column holding the current (arbitrary) cluster labels.
    target_field_name : str
        Column whose per-cluster mean decides the new label order.
    df : pandas.DataFrame
        Frame containing both columns; it is not modified.
    ascending : bool
        True gives label 0 to the cluster with the smallest mean,
        False gives label 0 to the cluster with the largest mean.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``cluster_field_name`` replaced by the rank (0..k-1) of
        each row's cluster; the relabelled column is moved to the end.
    """
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean()
    ranked = cluster_means.sort_values(ascending=ascending).reset_index()
    # After the sort, the positional index is the new label of each cluster.
    ranked['index'] = ranked.index
    relabelled = df.merge(ranked[[cluster_field_name, 'index']], on=cluster_field_name)
    return (
        relabelled
        .drop(columns=[cluster_field_name])
        .rename(columns={'index': cluster_field_name})
    )

In [None]:
# Recency: days between a customer's last purchase and the most recent
# last-purchase date among all customers in the historical window.
recency_df = (
    cus_1y_2m.groupby('CustomerID')['InvoiceDate']
    .max()
    .rename('LastPurchaseDate')
    .reset_index()
)
latest_purchase = recency_df['LastPurchaseDate'].max()
recency_df['Recency'] = (latest_purchase - recency_df['LastPurchaseDate']).dt.days

In [None]:
# Inspect each customer's last purchase date and the derived Recency in days.
recency_df

In [None]:
# Inner join: only customers that appear in recency_df are kept.
cus_ctr = cus_ctr.merge(recency_df[['CustomerID', 'Recency']], on='CustomerID')

In [None]:
# Preview the customers together with their Recency.
cus_ctr.head()

In [None]:
# form clusters
# A fixed random_state makes the k-means initialisation, and therefore the
# cluster assignments, reproducible across kernel restarts. This estimator
# instance is re-fitted for the Frequency and TotalRevenue clusters further down.
kmeans = KMeans(n_clusters=4, random_state=24)
kmeans.fit(cus_ctr[['Recency']])
cus_ctr['RecencyCluster'] = kmeans.predict(cus_ctr[['Recency']])

cus_ctr.head()

In [None]:
# Order the recency clusters by mean Recency, largest first, so that label 0 is
# the cluster with the highest mean Recency.
cus_ctr = order_cluster(
    cluster_field_name='RecencyCluster',
    target_field_name='Recency',
    df=cus_ctr,
    ascending=False,
)

cus_ctr.head()

In [None]:
# Recency statistics per relabelled cluster (label 0 = highest mean Recency).
cus_ctr.groupby('RecencyCluster')['Recency'].describe()

In [None]:
# Frequency: number of transaction rows per customer in the historical window.
frequency_df = (
    cus_1y_2m.groupby('CustomerID')['InvoiceDate']
    .count()
    .rename('Frequency')
    .reset_index()
)
cus_ctr = cus_ctr.merge(frequency_df, on='CustomerID')
cus_ctr.head()

In [None]:
# Re-fit the existing k-means estimator on Frequency and label every customer.
frequency_values = cus_ctr[['Frequency']]
kmeans.fit(frequency_values)
cus_ctr['FrequencyCluster'] = kmeans.predict(frequency_values)
cus_ctr.head()

In [None]:
# Order the frequency clusters by mean Frequency, smallest first, so that
# label 0 is the cluster with the lowest mean Frequency.
cus_ctr = order_cluster(
    cluster_field_name='FrequencyCluster',
    target_field_name='Frequency',
    df=cus_ctr,
    ascending=True,
)

cus_ctr.head()

In [None]:
# Frequency statistics per relabelled cluster (label 0 = lowest mean Frequency).
cus_ctr.groupby('FrequencyCluster')['Frequency'].describe()

In [None]:
# Revenue of every transaction row in the historical window.
cus_1y_2m['Revenue'] = cus_1y_2m['Quantity'].mul(cus_1y_2m['Price'])

per_customer_revenue = cus_1y_2m.groupby('CustomerID')['Revenue']

# Total revenue per customer.
revenue_df = per_customer_revenue.sum().rename('TotalRevenue').reset_index()

# Average revenue per transaction row, per customer.
revenue_mean = per_customer_revenue.mean().rename('MeanRevenue').reset_index()

cus_ctr = (
    cus_ctr
    .merge(revenue_df, on='CustomerID')
    .merge(revenue_mean, on='CustomerID')
)
cus_ctr.head()

In [None]:
# Re-fit the existing k-means estimator on TotalRevenue and label every customer.
total_revenue_values = cus_ctr[['TotalRevenue']]
kmeans.fit(total_revenue_values)
cus_ctr['TotalRevenueCluster'] = kmeans.predict(total_revenue_values)
cus_ctr.head()

In [None]:
# Order the revenue clusters by mean TotalRevenue, smallest first, so that
# label 0 is the cluster with the lowest mean TotalRevenue.
cus_ctr = order_cluster(
    cluster_field_name='TotalRevenueCluster',
    target_field_name='TotalRevenue',
    df=cus_ctr,
    ascending=True,
)

cus_ctr.head()

In [None]:
# TotalRevenue statistics per relabelled cluster (label 0 = lowest mean TotalRevenue).
cus_ctr.groupby('TotalRevenueCluster')['TotalRevenue'].describe()

In [None]:
# Overall score: sum of the three ordered cluster labels.
cus_ctr['Overall_Score'] = (
    cus_ctr['RecencyCluster']
    .add(cus_ctr['FrequencyCluster'])
    .add(cus_ctr['TotalRevenueCluster'])
)

# Segments: score <= 2 is Low-Value, 3-4 is Mid-Value, above 4 is High-Value.
overall_score = cus_ctr['Overall_Score']
segment = pd.Series('Low-Value', index=cus_ctr.index)
segment = segment.mask(overall_score > 2, 'Mid-Value')
segment = segment.mask(overall_score > 4, 'High-Value')
cus_ctr['Segment'] = segment

In [None]:
# Display the per-customer table with cluster labels, overall score and segment.
cus_ctr

## Creating the target LTV variable
There is no cost specified in the dataset. That’s why Revenue becomes our LTV directly.

In [None]:
# Revenue per transaction row in the six-month target window, summed per customer.
cus_6m['Revenue'] = cus_6m['Quantity'].mul(cus_6m['Price'])
cus_user_6m = (
    cus_6m.groupby('CustomerID')['Revenue']
    .sum()
    .rename('m6_Revenue')
    .reset_index()
)

In [None]:
# Preview the six-month revenue per customer.
cus_user_6m.head()

In [None]:
# Distribution of each customer's revenue over the six-month target window.
fig, ax = plt.subplots(figsize=(20,6))
sns.histplot(data=cus_user_6m, x='m6_Revenue', ax=ax)
ax.set_title('6 Months Revenue');

Histogram clearly shows we have customers with negative LTV. We have some outliers too. 

In [None]:
# merging the two dataframes together
# The left join keeps every customer already in cus_ctr; customers without a
# purchase in the six-month target window get NaN for m6_Revenue.
cus_ctr = pd.merge(cus_ctr,cus_user_6m,on='CustomerID',how='left')
# fillna returns a new frame, so the result has to be assigned back; otherwise
# the NaNs stay in place and those customers never count as zero-revenue.
cus_ctr = cus_ctr.fillna(0)

cus_ctr.head()

In [None]:
# Scatter plot of six-month revenue against the overall score, coloured by segment.
fig, ax = plt.subplots(figsize=(20,6))
sns.scatterplot(data=cus_ctr, x='Overall_Score', y='m6_Revenue', hue='Segment', ax=ax)
ax.set_title('LTV');

In [None]:
# removing outliers: keep customers strictly below the 90th percentile of
# six-month revenue. The explicit .copy() gives an independent frame, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
ltv_cutoff = cus_ctr['m6_Revenue'].quantile(0.9)
cus_ctr = cus_ctr[cus_ctr['m6_Revenue'] < ltv_cutoff].copy()

# creating three clusters; the fixed random_state keeps the cluster
# boundaries reproducible between runs
kmeans = KMeans(n_clusters=3, random_state=24)
kmeans.fit(cus_ctr[['m6_Revenue']])
cus_ctr['LTVClusters'] = kmeans.predict(cus_ctr[['m6_Revenue']])

# order cluster numbers by mean LTV: label 0 = lowest mean m6_Revenue
cus_ctr = order_cluster('LTVClusters','m6_Revenue',cus_ctr,True)

# making a copy to use for modelling
cus_cluster = cus_ctr.copy()

In [None]:
# Preview the modelling frame with the ordered LTV cluster labels.
cus_cluster.head()

In [None]:
# Describe six-month revenue within each LTV cluster (label 0 = lowest mean revenue).

cus_cluster.groupby('LTVClusters')['m6_Revenue'].describe()

Cluster 2 is the best, with an average LTV of about 1.3k, whereas cluster 0 is the worst, with an average of 238.

In [None]:
# Convert categorical data to numerical: get_dummies one-hot encodes the
# non-numeric columns (the string 'Segment' labels assigned earlier).

cus_cluster = pd.get_dummies(cus_cluster)
cus_cluster.head()

In [None]:
# Check the correlation between the features and the LTV label.
corr = cus_cluster.corr()

fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(corr, annot=True, fmt='.2F', ax=ax);

## Model building

In [None]:
# Candidate classifiers, keyed by the label shown in the comparison table.
# Estimators with a stochastic component get a fixed random_state so that the
# cross-validated scores are reproducible between runs.
models = {
    'LogisticRegression' : LogisticRegression(),
    'RandomForestClalssifier' : RandomForestClassifier(random_state=24),
    'XGb.xgb' : xgb.XGBClassifier(random_state=24),
    'kneighbors' : KNeighborsClassifier(),
    'gradientboost' : GradientBoostingClassifier(random_state=24),
    'Decisiontree' : DecisionTreeClassifier(random_state=24)
}

# Scoring names passed to cross_val_score for every model.
class_scorer = ['accuracy']

In [None]:
# Cross-validate every model on every metric and tabulate the mean scores.
def fit_and_score(model,x,y,scorer):
    """Return a table of mean cross-validation scores.

    Parameters
    ----------
    model : dict
        Maps a display name to an estimator.
    x, y
        Features and labels, passed to ``cross_val_score`` unchanged.
    scorer : list of str
        Scoring names accepted by ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``model`` (indexed by its key) and one column per
        entry of ``scorer``; each cell is the mean score over the folds.
    """
    # Two shuffled folds with a fixed seed, shared by all models and metrics.
    splitter = KFold(n_splits=2, random_state=24, shuffle=True)
    mean_scores = [
        np.mean(cross_val_score(estimator, x, y, scoring=metric_name, cv=splitter))
        for estimator in model.values()
        for metric_name in scorer
    ]
    table = pd.DataFrame(np.array(mean_scores).reshape(len(model), len(scorer)))
    table.columns = scorer
    table.index = list(model.keys())
    return table

In [None]:
# splitting into x & y
# CustomerID is an arbitrary identifier, not a behavioural feature. Leaving it
# in lets a model key on individual customers (even more so once rows are
# duplicated by oversampling), so it is dropped together with the target columns.
x = cus_cluster.drop(['LTVClusters','m6_Revenue','CustomerID'],axis=1)
y = cus_cluster['LTVClusters']

In [None]:
# Number of customers in each LTV cluster, to check the class balance.
fig, ax = plt.subplots(figsize=(15,6))
sns.countplot(data=cus_cluster, x='LTVClusters', ax=ax)

In [None]:
# balancing imbalanced data
from imblearn.over_sampling import RandomOverSampler
# 'not majority' resamples every class except the largest one. The fixed
# random_state makes the randomly duplicated rows reproducible between runs.
ros = RandomOverSampler(sampling_strategy='not majority',random_state=24)
x_over,y_over = ros.fit_resample(x,y)

In [None]:
# Class counts after oversampling.
y_over.value_counts()

In [None]:
# Compare all candidate models by mean cross-validated accuracy.
# NOTE(review): the inputs are the oversampled data, so copies of the same row
# can fall into both the training and the validation fold; the scores are
# therefore likely optimistic.
fit_and_score(models,x_over,y_over,class_scorer)

`xgboost` performs better than the other models, but its accuracy is only just over 70%, so there is a lot of room for improvement.

In [None]:
# splitting into training and test data
# Split the original (un-resampled) data first and oversample only the training
# part. Splitting after oversampling puts copies of the same row in both sets,
# which leaks test rows into training and inflates the reported scores.
# stratify=y keeps the class proportions of the small test set representative.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.05,random_state=56,stratify=y)
x_train,y_train = ros.fit_resample(x_train,y_train)

### Hyperparameter tuning with `GridSearchCV`

In [None]:
# Five shuffled folds with a fixed seed, used by the grid search.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

# Search grid: tree depth 1..10 crossed with three learning rates.
xgb_grid = dict(
    max_depth=list(range(1, 11)),
    learning_rate=[0.01, 0.1, 0.4],
    objective=['multi:softprob'],
)

xgb_gs = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=xgb_grid,
    scoring='accuracy',
    cv=kfold,
    verbose=1,
)

xgb_gs.fit(x_train,y_train)

In [None]:
# Mean cross-validated accuracy of the best parameter combination in the grid.
xgb_gs.best_score_

In [None]:
# Parameter setting with the best mean cross-validated score.
xgb_gs.best_params_

### cross validation

In [None]:
# Passing the GridSearchCV object as the estimator re-runs the whole grid
# search inside every outer fold (nested cross-validation).
nested_cv_scores = cross_val_score(xgb_gs, x_over, y_over, cv=kfold, verbose=2)
print(f'cross_val_score {np.mean(nested_cv_scores)}')

#### Classification report

In [None]:
# Predict the LTV cluster of the held-out customers.
y_preds = xgb_gs.predict(x_test)

# classification_report expects (y_true, y_pred). Passing them the other way
# round swaps precision with recall and reports the support of the predictions
# instead of the true labels.
print(classification_report(y_test,y_preds))

In [None]:
## Saving model
from joblib import dump

# Persist the fitted grid-search object.
# NOTE(review): the output path is absolute and machine-specific.
model_path = 'C:/Users/user/Data_Driven/models/Customer_lifetime_value_model.joblib_1'
dump(xgb_gs, model_path)