In [24]:
# Generate fictitious data for one client : over 5000 days : frequency of purchases per year,
# average amount spent per purchase
# date when the client stops purchasing


import numpy as np
import pandas as pd
import scipy
print("Test : les versions suivantes sont utilisées :",pd.__version__, np.__version__, scipy.__version__)

Test : les versions suivantes sont utilisées : 2.3.3 1.25.2 1.11.4


In [25]:
n=15000 # number of clients
T=25  # number of years
y_start=2020
y_end=y_start+T
y_current=pd.Timestamp.now().year
np.random.seed(42)
mu_unit=np.log(12)-0.32  # average amount spent per purchase
sigma_unit=0.8  # standard deviation of amount spent per purchase

# For each client, for each year
nb_purchases = np.random.poisson(lam=5, size=n)  # average 5 purchases per year
# total amount spent per year
tot_purchases = np.array([np.sum(np.random.lognormal(mu_unit, sigma_unit, nb_purchases[i])) for i in range(n)])
# yearly satisfaction score
yearly_satisfaction=np.minimum(nb_purchases * np.random.uniform(0, 5, size=n), 5)
# churn
churn=np.array([ np.random.binomial(n=1, p=1/(1+8*yearly_satisfaction[i]), size=1)[0] for i in range(n) ])  # the more satisfied, the less likely to churn
# Client IDs
client_ids = np.array([f"C-{i:05d}" for i in range(1, n+1)])

# Checks
print(nb_purchases[10:15])
print(tot_purchases[10:15])
print("Average total purchases :", np.mean(tot_purchases))
print("Average number of purchases :", np.mean(nb_purchases))

records = []

for year in range(y_start, y_end):
    nb_purchases = np.random.poisson(lam=5, size=n)
    tot_purchases = np.array([
        np.sum(np.random.lognormal(mu_unit, sigma_unit, nb_purchases[i]))
        for i in range(n)
    ])
    
    satisfaction_raw = nb_purchases * np.random.uniform(0, 5, size=n)
    yearly_satisfaction = np.minimum(satisfaction_raw, 5)

    churn = np.random.binomial(1, 1 / (1 + 8* yearly_satisfaction), size=n)

    year_df = pd.DataFrame({
        "Client_ID": client_ids,
        "Year": year,
        "Nb_Purchases": nb_purchases,
        "Tot_Purchases": tot_purchases,
        "Yearly_Satisfaction": yearly_satisfaction,
        "Churn": churn
    })

    records.append(year_df)

panel_data = pd.concat(records, ignore_index=True)

panel_data = panel_data.sort_values(["Client_ID", "Year"])

panel_data["Has_Churned"] = (
    panel_data
    .groupby("Client_ID")["Churn"]
    .cumsum()
)

panel_data["Period_since_churn"]=panel_data.groupby("Client_ID")["Has_Churned"].cumsum()


final_data = panel_data[panel_data["Period_since_churn"] <= 1].drop(columns=["Has_Churned", "Period_since_churn"])

 # Example: show data for some clients
# display all rows
pd.set_option('display.max_rows', None)
print(final_data.loc[final_data['Client_ID'].isin(range(10))].sort_values(by=['Client_ID', 'Year']))
# reset the display option
pd.reset_option('display.max_rows')
print(final_data.head())
print(final_data.describe())

print(final_data[final_data['Client_ID']=='C-00014'])



# Export to csv in the src/data directory
final_data.to_csv(
    r"C:\Users\fquet\Documents\TRAVAIL\Travail_personnel\Projet - Vuciliette\src\data\data.csv",
    index=False
)



[2 5 5 6 4]
[ 34.80127398 155.95525541  54.86407472  49.8628846   34.21441015]
Average total purchases : 60.12852155172513
Average number of purchases : 5.0065333333333335
Empty DataFrame
Columns: [Client_ID, Year, Nb_Purchases, Tot_Purchases, Yearly_Satisfaction, Churn]
Index: []
      Client_ID  Year  Nb_Purchases  Tot_Purchases  Yearly_Satisfaction  Churn
0       C-00001  2020             2      12.812465             0.109683      0
15000   C-00001  2021             1      47.340007             0.595477      0
30000   C-00001  2022             6      76.713955             5.000000      0
45000   C-00001  2023             8      98.784379             5.000000      0
60000   C-00001  2024             7      76.901815             5.000000      0
                Year   Nb_Purchases  Tot_Purchases  Yearly_Satisfaction  \
count  219880.000000  219880.000000  219880.000000        219880.000000   
mean     2029.476387       5.003111      59.990124             4.327223   
std         6.95016

In [28]:
# Dataset de target : CLV per client
CLV_data = final_data.groupby("Client_ID").agg({
    "Tot_Purchases": "sum"}).rename(columns={"Tot_Purchases": "CLV"})
print(CLV_data.head())
CLV_data.to_csv(
    r"C:\Users\fquet\Documents\TRAVAIL\Travail_personnel\Projet - Vuciliette\src\data\CLV_data.csv",
    index=True)
print(final_data.head(10))

# Training data set : final_data for a single year 
sales_2025= final_data[final_data["Year"]==2025]
sales_2025.head(10)
sales_2025.describe()
sales_2025.to_csv(
    r"C:\Users\fquet\Documents\TRAVAIL\Travail_personnel\Projet - Vuciliette\src\data\sales_2025.csv",
    index=False
)   

                   CLV
Client_ID             
C-00001    1900.264883
C-00002     186.866652
C-00003    1287.019534
C-00004     682.498552
C-00005     482.287294
       Client_ID  Year  Nb_Purchases  Tot_Purchases  Yearly_Satisfaction  \
0        C-00001  2020             2      12.812465             0.109683   
15000    C-00001  2021             1      47.340007             0.595477   
30000    C-00001  2022             6      76.713955             5.000000   
45000    C-00001  2023             8      98.784379             5.000000   
60000    C-00001  2024             7      76.901815             5.000000   
75000    C-00001  2025             4      78.548358             5.000000   
90000    C-00001  2026             5      61.751787             5.000000   
105000   C-00001  2027             8      88.747531             2.104069   
120000   C-00001  2028             3     162.815844             5.000000   
135000   C-00001  2029             6      78.333196             5.000000   

  