In [107]:
##################
# IMPORT LIBRARIES
##################

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.tseries.offsets import Week
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

- **'id'** = interaction id
- **'Customer ID'** = UNIQUE CUSTOMER ID *unique = 19976 / value_counts.sum = 19976 / 17, 12, 8, 7, 7....*
- 'Customer Name' = PROBLEM multiple customer name of a single customer ID
*unique = 17738 / value_counts.sum = 20443 / 33, 17, 11, 11, 9.....*
- 'Customer Description' = Migration or special Gifted Status
- 'Plan' = plan_1 = 20442 / plan_2 = 1
- 'Product' = Supper Club = 20442 / renew test = 1
- 'Interval' = year = 20442 / day = 1
- 'Amount' = 69 = 20442 / 1 = 1
- **'Status'** = 'trialing', 'active', 'canceled', 'past_due', 'incomplete_expired'
- **'Created'** = interaction date *'Created (UTC)' is the first date in all rows: True*
- **'kind (metadata)'** = Migrated OG Member 552
- **''Cancel At Period End'** = False : 16878 / True : 3565





In [108]:
##################
# VISUAL SETTINGS
##################

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (15, 8)
plt.rcParams['font.size'] = 12
sns.set_palette("viridis")

In [109]:
##################
# LOADING CSV
##################

# Toggle this flag to True in production
RENAME_FILES = False

data_dir = 'data'

# List and sort files by creation time
files = [
    os.path.join(data_dir, f)
    for f in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, f)) and f.endswith('.csv')
]

sorted_files = sorted(files, key=os.path.getctime, reverse=True)

# Loop over files
for file_path in sorted_files:
    created_at = datetime.fromtimestamp(os.path.getctime(file_path))
    timestamp_str = created_at.strftime('%Y-%m-%d_%H-%M-%S')
    original_name = os.path.basename(file_path)
    new_name = f"{timestamp_str}_{original_name}"
    new_path = os.path.join(data_dir, new_name)

    if RENAME_FILES:
        if not original_name.startswith(timestamp_str):
            os.rename(file_path, new_path)
            print(f"Renamed: {original_name} → {new_name}")
            file_path = new_path
        else:
            print(f"Already renamed: {original_name}")
    else:
        print(f"[DEV] Would rename: {original_name} → {new_name}")



df_raw = pd.read_csv(file_path)

[DEV] Would rename: DishpatchSubscriptionData_NIklas_Sanitised - subscriptions (2).csv → 2025-05-26_17-52-36_DishpatchSubscriptionData_NIklas_Sanitised - subscriptions (2).csv


In [110]:
##################
# DATA PREPROCESSING
# 1
###################

df = df_raw.copy()

# Date conversion
date_cols = [col for col in df.columns if '(UTC)' in col]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Order by Created (UTC)
df = df.sort_values(by='Created (UTC)')

# Column selection
df = df[['id', 
        'Customer ID',
        'Customer Name',
        'Status',
        'Cancellation Reason',
        'Created (UTC)',
        'Start (UTC)',
        'Start Date (UTC)',
        'Current Period Start (UTC)',
        'Current Period End (UTC)',
        'Trial Start (UTC)',
        'Trial End (UTC)',
        'Canceled At (UTC)',
        'Ended At (UTC)',
       'senderShopifyCustomerId (metadata)']]


# Renaming and selection
df.rename(columns={'Customer ID' : 'customer_id',
                   'Customer Name' : 'customer_name',
                   'Status' : 'status',
                   'Created (UTC)': 'created_date_utc',
                   'Start (UTC)': 'start_date_utc',
                   'Start Date (UTC)': 'start_date_date_utc',
                   'Current Period Start (UTC)': 'current_period_start_date_utc',
                   'Current Period End (UTC)': 'current_period_end_date_utc',
                   'Trial Start (UTC)': 'trial_start_date_utc',
                   'Trial End (UTC)': 'trial_end_date_utc',
                   'Canceled At (UTC)': 'canceled_at_date_utc',
                   'Ended At (UTC)': 'ended_at_date_utc',
                   'senderShopifyCustomerId (metadata)': 'is_gift_member'
                }, inplace=True)


# Reference date for analysis
reference_date = datetime.now()


df['period_duration'] = np.where(df['ended_at_date_utc'].notnull(),
                            (df['ended_at_date_utc'] - df['current_period_start_date_utc']),
                            (df['current_period_end_date_utc'] - df['current_period_start_date_utc']))

########################################################################
# Remove Gifted User from analysis_df

# # Identification of gifted members vs. regular signups
# df['is_gift_member'] = df['gift_sender_id'].notna()
# df['is_regular_signup'] = ~df['is_gift_member']

# # Filter to keep only regular signups
# analysis_df = df[df['is_regular_signup']].copy()
# print(f"Number of regular signups for analysis: {len(analysis_df):,}")

### OR ###
# Keed Gifted User from analysis_df

# TRUE OR FALSEIdentification of gifted members vs. regular signups
#df['is_gifted_member'] = df['is_gift_member'].notna()

analysis_df = df.copy()

########################################################################

In [111]:
##################
# DEFINITION OF KEY PERFORMANCE INDICATORS
# 2
###################

# A. Initial full menber conversion
########################################################################
# # Include past_due status (Active or Past_due) -  - Include no Trial
# analysis_df['is_full_member'] = (
#     (analysis_df['is_gift_member'].isna()) &
#     (analysis_df['status'].isin(['active', 'past_due'])) &
#     (analysis_df['canceled_at_date_utc'].isna()) &
#     (analysis_df['trial_end_date_utc'] < reference_date) | analysis_df['trial_end_date_utc'].isna()
# )

# ## OR ##
# # Include only Active status (Active) - Include no Trial
# analysis_df['is_full_member'] = (
#     (analysis_df['is_gift_member'].isna()) &
#     (analysis_df['status'] == 'active') &
#     (analysis_df['canceled_at_date_utc'].isna()) &
#     (analysis_df['trial_end_date_utc'] < reference_date) | analysis_df['trial_end_date_utc'].isna()
# )

## OR ##
# Include only Active status (Active) - Exclude no Trial member
analysis_df['is_full_member'] = (
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['status'] == 'active') &
    (analysis_df['canceled_at_date_utc'].isna()) &
    (analysis_df['trial_end_date_utc'] < reference_date)
)
########################################################################


# B. Client canceled during trial
analysis_df['canceled_during_trial'] = (
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['trial_end_date_utc'].notna()) &
    (analysis_df['canceled_at_date_utc'].notna()) &
    (analysis_df['canceled_at_date_utc'] <= analysis_df['trial_end_date_utc'])
)


# C. Client canceled during Churn periode (14 days after trial end)
########################################################################
# Client had a trial
analysis_df['canceled_during_churn'] = (
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['trial_end_date_utc'].notna()) &
    (analysis_df['canceled_at_date_utc'] < analysis_df['trial_end_date_utc'] + pd.Timedelta(days=14))
)

## OR ##
# # Client had no trial
# analysis_df['canceled_during_churn'] = (
#     (analysis_df['is_gift_member'].isna()) &
#     (analysis_df['trial_end_date_utc'].isna()) &
#     (analysis_df['canceled_at_date_utc'].notna()) &
#     (analysis_df['canceled_at_date_utc'] < analysis_df['created_at_date_utc'] + pd.Timedelta(days=14))
# )
########################################################################


# D. Client still in Trial period
analysis_df['is_currently_trialing'] = (
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['trial_end_date_utc'] > reference_date) &
    (analysis_df['canceled_at_date_utc'].isna()))

# E. Client still in Churn period
analysis_df['is_currently_in_churn_period'] = (
    (analysis_df['is_gift_member'].isna()) &
    ((analysis_df['trial_end_date_utc'] + pd.Timedelta(days=14)) > reference_date) &
    (analysis_df['canceled_at_date_utc'].isna()))

# F. Start date of the paid subscription
analysis_df['subscription_start_date'] = analysis_df['trial_end_date_utc']

# D. Refund
analysis_df['is_refund'] =(
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['status'] == 'canceled') & 
    (analysis_df['canceled_at_date_utc'].notna()) &
    (analysis_df['trial_end_date_utc'].notna()) &
    ((analysis_df['canceled_at_date_utc'] - analysis_df['trial_end_date_utc']).dt.days <= 14) &
    ((analysis_df['canceled_at_date_utc'] - analysis_df['trial_end_date_utc']).dt.days >= 0)  # Éviter les valeurs négatives
)


extra_col = ['is_full_member', 
'canceled_during_churn', 
'canceled_during_trial', 
'is_currently_trialing', 
'is_currently_in_churn_period', 
'is_refund']


for i in extra_col:
    print(analysis_df[i].sum(), i)



4988 is_full_member
10879 canceled_during_churn
10020 canceled_during_trial
88 is_currently_trialing
183 is_currently_in_churn_period
5638 is_refund


In [112]:
##################
# AGGREGATION AT THE UNIQUE CLIENT LEVEL
# 2
###################

########################################################################
# GROUP BY customer_id
customer_df = analysis_df.groupby('customer_id').agg({
    'created_date_utc': 'first',
    'status': 'last',
    'current_period_start_date_utc': 'last',
    'current_period_end_date_utc': 'last',
    'trial_start_date_utc': 'first',
    'trial_end_date_utc': 'last',
    'canceled_at_date_utc': 'last',
    'ended_at_date_utc': 'last',
    'is_gift_member': list,
    'period_duration': sum,
    'is_full_member': list,
    'canceled_during_churn': list,
    'canceled_during_trial': list,
    'is_currently_trialing': list,
    'is_currently_in_churn_period': list,
    'is_refund': list
})


# ## OR ##
# # GROUP BY customer_name
# customer_df = df.groupby('customer_name').agg({
#     'created_date_utc': 'first',
#     'status': 'last',
#     'current_period_start_date_utc': 'last',
#     'current_period_end_date_utc': 'last',
#     'trial_start_date_utc': 'first',
#     'trial_end_date_utc': 'last',
#     'canceled_at_date_utc': 'last',
#     'ended_at_date_utc': 'last',
#     'is_gift_member': list,
#     'period_duration': sum
# })
# ########################################################################


#print(f"Number of unique Customer IDs : {len(customer_df):,}")
#print(f"Number of unique Customer Names : {analysis_df['customer_name'].nunique():,}")
customer_df[customer_df['status'] == 'active']

Unnamed: 0_level_0,created_date_utc,status,current_period_start_date_utc,current_period_end_date_utc,trial_start_date_utc,trial_end_date_utc,canceled_at_date_utc,ended_at_date_utc,is_gift_member,period_duration,is_full_member,canceled_during_churn,canceled_during_trial,is_currently_trialing,is_currently_in_churn_period,is_refund
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
cus_OhagJTEOs7LZLk,2023-09-25 09:54:00,active,2024-10-25 09:54:00,2025-10-25 09:54:00,2023-09-25 09:54:00,2023-10-25 09:54:00,NaT,NaT,[nan],365 days,[True],[False],[False],[False],[False],[False]
cus_Ohc4lazRKnDFZ3,2023-09-25 11:20:00,active,2024-10-25 11:20:00,2025-10-25 11:20:00,2023-09-25 11:20:00,2023-10-25 11:20:00,NaT,NaT,[nan],365 days,[True],[False],[False],[False],[False],[False]
cus_OhdZKUNDUARcWt,2023-09-25 12:53:00,active,2024-10-25 12:53:00,2025-10-25 12:53:00,2023-09-25 12:53:00,2023-10-25 12:53:00,NaT,NaT,[nan],365 days,[True],[False],[False],[False],[False],[False]
cus_Ohedmx3IFvbqdx,2023-09-25 14:00:00,active,2024-10-25 14:00:00,2025-10-25 14:00:00,2023-09-25 14:00:00,2023-10-25 14:00:00,NaT,NaT,[nan],365 days,[True],[False],[False],[False],[False],[False]
cus_OhfLGWZKdHkZrF,2023-09-25 14:44:00,active,2024-10-25 14:44:00,2025-10-25 14:44:00,2023-09-25 14:44:00,2023-10-25 14:44:00,NaT,NaT,[nan],365 days,[True],[False],[False],[False],[False],[False]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cus_SK4pCXYZZuwaeH,2025-05-16 15:59:00,active,2025-05-16 15:59:00,2026-05-16 15:59:00,NaT,NaT,2025-05-16 15:59:00,NaT,[6880407978026.0],365 days,[False],[False],[False],[False],[False],[False]
cus_SK5m1zI3dksJhK,2025-05-16 16:59:00,active,2025-05-16 16:59:00,2026-05-16 16:59:00,NaT,NaT,2025-05-16 16:59:00,NaT,[3184448208938.0],365 days,[False],[False],[False],[False],[False],[False]
cus_SKok7wMYgqrIT7,2025-05-18 15:26:00,active,2025-05-18 15:26:00,2026-05-18 15:26:00,NaT,NaT,2025-05-18 15:26:00,NaT,[23249636262272.0],365 days,[False],[False],[False],[False],[False],[False]
cus_SL5k89KFbjYZMh,2025-05-19 09:00:00,active,2025-05-19 09:00:00,2026-05-19 09:00:00,NaT,NaT,2025-05-19 09:00:00,NaT,[22969593495936.0],365 days,[False],[False],[False],[False],[False],[False]


In [113]:
# analysis_df[analysis_df['period_duration'] == '365 days 00:00:00']
# #analysis_df['period_duration'].value_counts()
# #analysis_df[analysis_df['customer_id'] == 'cus_PFQYQkdFuy9n4A]']
# #analysis_df[analysis_df['customer_id'] == 'cus_OhagJTEOs7LZLk']


analysis_df[analysis_df['customer_id'] == 'cus_Pqte3EKNLHrKDd']
#customer_df[customer_df.index == 'cus_RSV2R19DHU3jgw']

Unnamed: 0,id,customer_id,customer_name,status,Cancellation Reason,created_date_utc,start_date_utc,start_date_date_utc,current_period_start_date_utc,current_period_end_date_utc,...,ended_at_date_utc,is_gift_member,period_duration,is_full_member,canceled_during_trial,canceled_during_churn,is_currently_trialing,is_currently_in_churn_period,subscription_start_date,is_refund
11677,sub_1P1BrKCZ9aYYH5wi5ZqKGHpp,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-04-02 18:14:00,2024-04-08 08:47:00,2024-04-02 18:14:00,2024-04-02 18:14:00,2024-04-12 18:14:00,...,2024-04-12 18:14:00,,10 days 00:00:00,False,True,True,False,False,2024-04-12 18:14:00,False
10874,sub_1PEXT4CZ9aYYH5wiZrvRe0Dc,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-05-09 13:57:00,2024-05-09 13:57:00,2024-05-09 13:57:00,2024-05-09 13:57:00,2024-05-19 13:57:00,...,2024-05-09 13:57:00,,0 days 00:00:00,False,True,True,False,False,2024-05-19 13:57:00,False
10873,sub_1PEXTTCZ9aYYH5wi3nv7PPVk,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-05-09 13:57:00,2024-05-09 13:57:00,2024-05-09 13:57:00,2024-05-09 13:57:00,2024-05-19 13:57:00,...,2024-05-09 13:58:00,,0 days 00:01:00,False,True,True,False,False,2024-05-19 13:57:00,False
10872,sub_1PEXVoCZ9aYYH5wiJzSRsNmP,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-05-09 13:59:00,2024-05-09 13:59:00,2024-05-09 13:59:00,2024-05-09 13:59:00,2024-05-19 13:59:00,...,2024-05-09 14:01:00,,0 days 00:02:00,False,True,True,False,False,2024-05-19 13:59:00,False
10871,sub_1PEXXfCZ9aYYH5wimqIxWg2R,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-05-09 14:01:00,2024-05-09 14:01:00,2024-05-09 14:01:00,2024-05-09 14:01:00,2024-05-19 14:01:00,...,2024-05-09 14:16:00,,0 days 00:15:00,False,True,True,False,False,2024-05-19 14:01:00,False
10867,sub_1PEYTRCZ9aYYH5wihC6SeI8i,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-05-09 15:01:00,2024-05-09 15:01:00,2024-05-09 15:01:00,2024-05-09 15:01:00,2024-05-19 15:01:00,...,2024-05-09 15:01:00,,0 days 00:00:00,False,True,True,False,False,2024-05-19 15:01:00,False
10866,sub_1PEYTTCZ9aYYH5wibRaQLKFD,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-05-09 15:01:00,2024-05-09 15:01:00,2024-05-09 15:01:00,2024-05-09 15:01:00,2024-05-19 15:01:00,...,2024-05-09 15:14:00,,0 days 00:13:00,False,True,True,False,False,2024-05-19 15:01:00,False
10861,sub_1PEa9zCZ9aYYH5wivVxXxkfY,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-05-09 16:49:00,2024-05-09 16:59:00,2024-05-09 16:49:00,2024-05-09 16:59:00,2025-05-09 16:59:00,...,2024-06-05 16:22:00,,26 days 23:23:00,False,False,False,False,False,2024-05-09 16:59:00,False
10387,sub_1POMBICZ9aYYH5wiZTEx8t6o,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-06-05 15:55:00,2024-06-05 15:55:00,2024-06-05 15:55:00,2024-06-05 15:55:00,2024-06-15 15:55:00,...,2024-06-05 16:02:00,,0 days 00:07:00,False,True,True,False,False,2024-06-15 15:55:00,False
10386,sub_1POME6CZ9aYYH5wioPGJLc6M,cus_Pqte3EKNLHrKDd,Customer8877,canceled,cancellation_requested,2024-06-05 15:58:00,2024-06-05 15:58:00,2024-06-05 15:58:00,2024-06-05 15:58:00,2024-06-15 15:58:00,...,2024-06-05 16:02:00,,0 days 00:04:00,False,True,True,False,False,2024-06-15 15:58:00,False


In [114]:
multi_sub_customers = analysis_df.groupby('customer_name').size()
multi_sub_customers = multi_sub_customers[multi_sub_customers > 1].sort_values(ascending=False)
if len(multi_sub_customers) > 0:
    print(f"\n📊 Clients avec multiples abonnements détectés : {len(multi_sub_customers)}")
    print(f"   Exemple - {multi_sub_customers.index[0]} : {multi_sub_customers.iloc[0]} abonnements")
    print(f"   Total d'abonnements multiples : {(multi_sub_customers - 1).sum()}")
else:
    print("\n✅ Aucun client avec multiples abonnements détecté")


# Indicateurs au niveau client
customer_df['paid_initial_subscription'] = customer_df['is_full_member']
customer_df['trial_canceled'] = customer_df['is_trial_cancellation']
customer_df['refunded_initial'] = customer_df['is_refund']
customer_df['currently_trialing'] = customer_df['is_currently_trialing']
customer_df['first_subscription_start_date'] = customer_df['subscription_start_date']

customer_df['eligible_for_1st_renewal'] = (
    customer_df['paid_initial_subscription'] & 
    customer_df['first_subscription_start_date'].notna() & 
    ((reference_date - customer_df['first_subscription_start_date']).dt.days >= 365)  # >= au lieu de >
)



📊 Clients avec multiples abonnements détectés : 2189
   Exemple - Customer5019 : 33 abonnements
   Total d'abonnements multiples : 2705


KeyError: 'is_trial_cancellation'

In [None]:
customer_df