In [1]:
# Customer Value = Purchase Frequency * Average Order Value
# The BG/NBD (Beta Geometric / Negative Binomial Distribution) model estimates the Expected Number of Transactions.
# CLTV = Expected Number of Transactions * Expected Average Profit
# CLTV = BG/NBD Model (Buy Till You Die) * Gamma-Gamma Submodel
# To estimate the Expected Number of Transactions, the BG/NBD model describes two processes probabilistically:
# Transaction Process (Buy) + Dropout Process (Till You Die)

# Transaction Process (Buy)
# While a customer is alive, the number of transactions they make in a given time period follows a Poisson distribution whose parameter is the customer's transaction rate.
# In other words, as long as a customer is alive, they keep purchasing randomly around their own transaction rate.
# Transaction rates vary from customer to customer and are Gamma distributed across the whole population (r, alpha --> parameters of the Gamma distribution).

# Dropout Process (Till You Die)
# Each customer has a dropout rate (dropout probability) p.
# After making a purchase, a customer drops out with a certain probability.
# Dropout rates vary from customer to customer and are Beta distributed across the whole population (a, b --> parameters of the Beta distribution).

In [2]:
# Gamma-Gamma Submodel --> used to estimate how much profit a customer is expected to bring per transaction on average.
# The monetary value of a customer's transactions is randomly distributed around the mean of that customer's transaction values.
# The average transaction value varies between customers, but does not change over time for an individual customer.
# As with BG/NBD, what matters in the Gamma-Gamma model is this population-level view rather than the characteristics of any single individual.
# The average transaction value is Gamma distributed across all customers.

In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import datetime as dt
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from lifetimes.plotting import plot_period_transactions
from sklearn.preprocessing import MinMaxScaler

# Notebook-wide pandas display settings: do not truncate the column list,
# allow up to 500 characters per printed row, and render floats with four decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [4]:
# Read the raw data once into df_ and hand a separate copy (df) to the rest of
# the notebook, so the frame as loaded from disk stays available for comparison.
df_ = pd.read_csv("flo_data_20K.csv")
df = df_.copy()

In [None]:
def outlier_thresholds(dataframe, variable):
    """Return the rounded IQR-based outlier bounds for one column.

    The bounds are the classic Tukey fences: 1.5 times the interquartile
    range below the first quartile and above the third quartile.

    Args:
        dataframe: DataFrame holding the column to inspect.
        variable: Name of the column.

    Returns:
        A ``(lower_limit, upper_limit)`` tuple, each value passed through
        the built-in ``round`` (no decimals kept).
    """
    series = dataframe[variable]
    first_quartile, third_quartile = (series.quantile(q) for q in (0.25, 0.75))
    # Distance of each fence from its quartile.
    whisker = 1.5 * (third_quartile - first_quartile)
    return round(first_quartile - whisker), round(third_quartile + whisker)

def replace_with_thresholds(dataframe, variable):
    """Cap one column at its outlier thresholds, modifying the frame in place.

    Values below the lower bound returned by ``outlier_thresholds`` are set
    to that bound, and values above the upper bound are set to the upper
    bound. Nothing is returned; ``dataframe[variable]`` is overwritten.

    Args:
        dataframe: DataFrame to modify.
        variable: Name of the column to cap.
    """
    floor, ceiling = outlier_thresholds(dataframe, variable)
    current = dataframe[variable]
    # Raise too-small values first, then lower too-large ones.
    floored = np.where(current < floor, floor, current)
    capped = np.where(floored > ceiling, ceiling, floored)
    dataframe[variable] = capped

def create_cltv_df(dataframe, analysis_date=None):
    """Build a per-customer CLTV table with BG/NBD and Gamma-Gamma models.

    Steps: cap outliers in the online/offline order-count and value columns,
    derive total orders and total value, drop customers with zero orders or
    zero value, compute weekly recency / T / frequency / average monetary
    value, fit both models, and attach 3- and 6-month expected sales, the
    expected average value per transaction, the 6-month CLTV and a quartile
    segment (D = lowest, A = highest).

    The input frame is not modified: all work happens on a copy, so calling
    this function repeatedly on the same frame yields the same result.

    Args:
        dataframe: Customer-level frame containing ``master_id``,
            ``first_order_date``, ``last_order_date`` and the four
            ``order_num_total_ever_*`` / ``customer_value_total_ever_*``
            columns. Every column whose name contains ``"date"`` is parsed
            with ``pd.to_datetime``.
        analysis_date: Reference date used to compute customer age (T).
            Defaults to 2021-06-01 when omitted.

    Returns:
        DataFrame with columns ``customer_id``, ``recency_cltv_weekly``,
        ``T_weekly``, ``frequency``, ``monetary_cltv_avg``,
        ``exp_sales_3_month``, ``exp_sales_6_month``, ``exp_average_value``,
        ``cltv`` and ``cltv_segment``, restricted to customers with more than
        one purchase.
    """
    if analysis_date is None:
        analysis_date = dt.datetime(2021, 6, 1)

    # Work on a private copy: the outlier capping and the derived columns below
    # would otherwise leak into the caller's frame and make re-runs of the
    # calling cell produce different thresholds.
    dataframe = dataframe.copy()

    # Columns whose extreme values are capped before modelling.
    columns = [
        "order_num_total_ever_online",
        "order_num_total_ever_offline",
        "customer_value_total_ever_online",
        "customer_value_total_ever_offline",
    ]
    for col in columns:
        replace_with_thresholds(dataframe, col)

    # Total order count and total customer value across both channels.
    dataframe["order_num_total"] = dataframe["order_num_total_ever_online"] + dataframe["order_num_total_ever_offline"]
    dataframe["customer_value_total"] = dataframe["customer_value_total_ever_online"] + dataframe["customer_value_total_ever_offline"]

    # Remove customers with zero orders or zero value. The explicit copy makes
    # the filtered frame independent, so the column assignment below is not a
    # chained assignment on a slice (SettingWithCopyWarning).
    dataframe = dataframe[~((dataframe["customer_value_total"] == 0) | (dataframe["order_num_total"] == 0))].copy()

    # Convert every column with "date" in its name to datetime.
    date_columns = dataframe.columns[dataframe.columns.str.contains("date")]
    dataframe[date_columns] = dataframe[date_columns].apply(pd.to_datetime)

    cltv_df = pd.DataFrame(
        {
            "customer_id": dataframe["master_id"],
            # Recency in weeks: time between first and last purchase.
            "recency_cltv_weekly": (dataframe["last_order_date"] - dataframe["first_order_date"]).dt.days / 7,
            # T in weeks: time between first purchase and the analysis date.
            "T_weekly": (analysis_date - dataframe["first_order_date"]).dt.days / 7,
            # Total number of purchases, used as the model frequency.
            "frequency": dataframe["order_num_total"],
            # Average value per purchase.
            "monetary_cltv_avg": dataframe["customer_value_total"] / dataframe["order_num_total"],
        }
    )
    # Keep only customers with more than one purchase; copy so the prediction
    # columns added below are written to an independent frame.
    cltv_df = cltv_df[cltv_df["frequency"] > 1].copy()

    # Fit the BG/NBD model on weekly frequency / recency / T.
    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(cltv_df["frequency"], cltv_df["recency_cltv_weekly"], cltv_df["T_weekly"])

    # Expected sales for 3 months, with a month approximated as 4 weeks.
    cltv_df["exp_sales_3_month"] = bgf.predict(
        4 * 3,
        cltv_df["frequency"],
        cltv_df["recency_cltv_weekly"],
        cltv_df["T_weekly"],
    )

    # Expected sales for 6 months (24 weeks).
    cltv_df["exp_sales_6_month"] = bgf.predict(
        4 * 6,
        cltv_df["frequency"],
        cltv_df["recency_cltv_weekly"],
        cltv_df["T_weekly"],
    )

    # Fit the Gamma-Gamma model on frequency and average monetary value.
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(cltv_df["frequency"], cltv_df["monetary_cltv_avg"])
    # Expected average profit per transaction.
    cltv_df["exp_average_value"] = ggf.conditional_expected_average_profit(
        cltv_df["frequency"], cltv_df["monetary_cltv_avg"]
    )

    # 6-month CLTV. Per the lifetimes documentation, ``time`` is expressed in
    # months while ``freq`` names the unit T is measured in (weeks here).
    cltv_df["cltv"] = ggf.customer_lifetime_value(
        bgf,
        cltv_df["frequency"],
        cltv_df["recency_cltv_weekly"],
        cltv_df["T_weekly"],
        cltv_df["monetary_cltv_avg"],
        time=6,
        freq="W",
        discount_rate=0.01,
    )
    # Segment customers by CLTV quartile: D is the lowest, A the highest.
    cltv_df["cltv_segment"] = pd.qcut(cltv_df["cltv"], 4, labels=["D", "C", "B", "A"])

    return cltv_df


# Build the CLTV table from the working copy of the data and preview its first rows.
cltv_df = create_cltv_df(df)
cltv_df.head()

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_3_month,exp_sales_6_month,exp_average_value,cltv,cltv_segment
0,cc294636-19f0-11eb-8d74-000d3a38a36f,17.0,30.5714,5.0,187.874,0.8412,1.6824,193.602,341.7447,A
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,209.8571,224.8571,10.0,138.097,0.5301,1.0601,140.2849,156.0383,B
2,69b69676-1a40-11ea-941b-000d3a38a36f,52.2857,78.8571,5.0,117.064,0.6233,1.2467,120.9493,158.2034,B
3,1854e56c-491f-11eb-806e-000d3a38a36f,1.5714,20.8571,2.0,60.985,0.6245,1.249,67.2962,88.1924,D
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,83.1429,95.4286,2.0,104.99,0.3951,0.7902,114.2804,94.7512,D
