In [1]:
##################
# IMPORT LIBRARIES
##################

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.tseries.offsets import Week
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

- **'id'** = interaction id
- **'Customer ID'** = UNIQUE CUSTOMER ID *unique = 19976 / value_counts.sum = 19976 / 17, 12, 8, 7, 7....*
- 'Customer Name' = PROBLEM multiple customer name of a single customer ID
*unique = 17738 / value_counts.sum = 20443 / 33, 17, 11, 11, 9.....*
- 'Customer Description' = Migration or special Gifted Status
- 'Plan' = plan_1 = 20442 / plan_2 = 1
- 'Product' = Supper Club = 20442 / renew test = 1
- 'Interval' = year = 20442 / day = 1
- 'Amount' = 69 = 20442 / 1 = 1
- **'Status'** = 'trialing', 'active', 'canceled', 'past_due', 'incomplete_expired'
- **'Created'** = interaction date *'Created (UTC)' is the first date in all rows: True*
- **'kind (metadata)'** = Migrated OG Member 552
- **''Cancel At Period End'** = False : 16878 / True : 3565





In [2]:
##################
# VISUAL SETTINGS
##################

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (15, 8)
plt.rcParams['font.size'] = 12
sns.set_palette("viridis")

In [3]:
##################
# LOADING CSV
##################

# Toggle this flag to True in production
RENAME_FILES = False

data_dir = 'data'

# List and sort files by creation time
files = [
    os.path.join(data_dir, f)
    for f in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, f)) and f.endswith('.csv')
]

sorted_files = sorted(files, key=os.path.getctime, reverse=True)

# Loop over files
for file_path in sorted_files:
    created_at = datetime.fromtimestamp(os.path.getctime(file_path))
    timestamp_str = created_at.strftime('%Y-%m-%d_%H-%M-%S')
    original_name = os.path.basename(file_path)
    new_name = f"{timestamp_str}_{original_name}"
    new_path = os.path.join(data_dir, new_name)

    if RENAME_FILES:
        if not original_name.startswith(timestamp_str):
            os.rename(file_path, new_path)
            print(f"Renamed: {original_name} → {new_name}")
            file_path = new_path
        else:
            print(f"Already renamed: {original_name}")
    else:
        print(f"[DEV] Would rename: {original_name} → {new_name}")



df_raw = pd.read_csv(file_path)

[DEV] Would rename: DishpatchSubscriptionData_NIklas_Sanitised - subscriptions (2).csv → 2025-05-27_20-40-12_DishpatchSubscriptionData_NIklas_Sanitised - subscriptions (2).csv


In [4]:
##################
# DATA PREPROCESSING
# 1
###################

df = df_raw.copy()

# Date conversion
date_cols = [col for col in df.columns if '(UTC)' in col]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Order by Created (UTC)
df = df.sort_values(by='Created (UTC)')

# Column selection
df = df[['id', 
        'Customer ID',
        'Customer Name',
        'Status',
        'Cancellation Reason',
        'Created (UTC)',
        'Start (UTC)',
        'Start Date (UTC)',
        'Current Period Start (UTC)',
        'Current Period End (UTC)',
        'Trial Start (UTC)',
        'Trial End (UTC)',
        'Canceled At (UTC)',
        'Ended At (UTC)',
       'senderShopifyCustomerId (metadata)']]


# Renaming and selection
df.rename(columns={'Customer ID' : 'customer_id',
                   'Customer Name' : 'customer_name',
                   'Status' : 'status',
                   'Created (UTC)': 'created_utc',
                   'Start (UTC)': 'start_utc',
                   'Start Date (UTC)': 'start_date_utc',
                   'Current Period Start (UTC)': 'current_period_start_utc',
                   'Current Period End (UTC)': 'current_period_end_date_utc',
                   'Trial Start (UTC)': 'trial_start_utc',
                   'Trial End (UTC)': 'trial_end_utc',
                   'Canceled At (UTC)': 'canceled_at_utc',
                   'Ended At (UTC)': 'ended_at_utc',
                   'senderShopifyCustomerId (metadata)': 'is_gift_member'
                }, inplace=True)


# Reference date for analysis
reference_date = datetime.now()

# Hardcleaning

# df['period_duration'] = np.where(df['ended_at_utc'].notnull(),
#                             (df['ended_at_utc'] - df['current_period_start_utc']),
#                             (df['current_period_end_date_utc'] - df['current_period_start_utc']))

########################################################################
# Remove Gifted User from analysis_df

# # Identification of gifted members vs. regular signups
# df['is_gift_member'] = df['gift_sender_id'].notna()
# df['is_regular_signup'] = ~df['is_gift_member']

# # Filter to keep only regular signups
# analysis_df = df[df['is_regular_signup']].copy()
# print(f"Number of regular signups for analysis: {len(analysis_df):,}")

### OR ###
# Keed Gifted User from analysis_df

# TRUE OR FALSEIdentification of gifted members vs. regular signups
#df['is_gifted_member'] = df['is_gift_member'].notna()

analysis_df = df.copy()

########################################################################

#print(len(analysis_df))

# REMOVING High transaction volume from (Teams Account)
team_account = df[df.groupby('customer_id')['id'].transform('count') > 4]
# remove team_account from analysis_df
analysis_df = analysis_df[~analysis_df['customer_id'].isin(team_account['customer_id'])]

team_account = df[df.groupby('customer_name')['id'].transform('count') > 4]
# remove team_account from analysis_df
analysis_df = analysis_df[~analysis_df['customer_name'].isin(team_account['customer_name'])]

#print(len(analysis_df))

In [5]:
##################
# DEFINITION OF KEY PERFORMANCE INDICATORS
# 2
###################

# A. Initial full menber conversion
########################################################################
# # Include past_due status (Active or Past_due) -  - Include no Trial
# analysis_df['is_full_member'] = (
#     (analysis_df['is_gift_member'].isna()) &
#     (analysis_df['status'].isin(['active', 'past_due'])) &
#     (analysis_df['canceled_at_utc'].isna()) &
#     (analysis_df['trial_end_utc'] < reference_date) | analysis_df['trial_end_utc'].isna()
# )

# ## OR ##
# Include only Active status (Active) - Include no Trial
analysis_df['is_full_member'] = (
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['status'] == 'active') &
    (analysis_df['canceled_at_utc'].isna()) &
    (analysis_df['trial_end_utc'] < reference_date) | analysis_df['trial_end_utc'].isna()
)

## OR ##
# Include only Active status (Active) - Exclude no Trial member
# analysis_df['is_full_member'] = (
#     (analysis_df['is_gift_member'].isna()) &
#     (analysis_df['status'] == 'active') &
#     (analysis_df['canceled_at_utc'].isna()) &
#     (analysis_df['trial_end_utc'] < reference_date)
# )
########################################################################


# B. Client canceled during trial
analysis_df['canceled_during_trial'] = (
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['trial_end_utc'].notna()) &
    (analysis_df['canceled_at_utc'].notna()) &
    (analysis_df['canceled_at_utc'] <= analysis_df['trial_end_utc'])
)


# C. Client canceled during Churn periode (14 days after trial end)
########################################################################
# Client had a trial
analysis_df['canceled_during_churn'] = (
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['trial_end_utc'].notna()) &
    (analysis_df['canceled_at_utc'] < analysis_df['trial_end_utc'] + pd.Timedelta(days=14)) &
    (analysis_df['canceled_at_utc'] > analysis_df['trial_end_utc'])
)

## OR ##
# # Client had no trial
# analysis_df['canceled_during_churn'] = (
#     (analysis_df['is_gift_member'].isna()) &
#     (analysis_df['trial_end_utc'].isna()) &
#     (analysis_df['canceled_at_utc'].notna()) &
#     (analysis_df['canceled_at_utc'] < analysis_df['created_at_date_utc'] + pd.Timedelta(days=14))
# )
########################################################################


# D. Client still in Trial period
analysis_df['is_currently_trialing'] = (
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['trial_end_utc'] > reference_date) &
    (analysis_df['canceled_at_utc'].isna()))

# E. Client still in Churn period
analysis_df['is_currently_in_churn_period'] = (
    (analysis_df['is_gift_member'].isna()) &
    ((analysis_df['trial_end_utc'] + pd.Timedelta(days=14)) > reference_date) &
    (analysis_df['canceled_at_utc'].isna()))

# F. Start date of the paid subscription
analysis_df['subscription_start_date'] = analysis_df['trial_end_utc']

# D. Refund
analysis_df['is_refund'] =(
    (analysis_df['is_gift_member'].isna()) &
    (analysis_df['status'] == 'canceled') & 
    (analysis_df['canceled_at_utc'].notna()) &
    (analysis_df['trial_end_utc'].notna()) &
    ((analysis_df['canceled_at_utc'] - analysis_df['trial_end_utc']).dt.days <= 14) &
    ((analysis_df['canceled_at_utc'] - analysis_df['trial_end_utc']).dt.days >= 0)  # Éviter les valeurs négatives
)

# E. Initial Conversion
analysis_df['paid_after_trial'] = (
    analysis_df['is_gift_member'] == False &
    analysis_df['trial_end_utc'].notna() &  # Doit avoir une date de fin d'essai
    (
        # Cas 1: Client toujours actif (n'a jamais été annulé)
        (analysis_df['status'] == 'active') & analysis_df['canceled_at_utc'].isna()) |
        # Cas 2: Client annulé APRÈS la fin de l'essai (donc a payé au moins une fois)
        (
            (analysis_df['status'] == 'canceled') &
            (analysis_df['canceled_at_utc'].notna()) &
            (analysis_df['canceled_at_utc'] > analysis_df['trial_end_utc'])
        )
    )



extra_col = ['is_full_member', 
'canceled_during_churn', 
'canceled_during_trial', 
'is_currently_trialing', 
'is_currently_in_churn_period', 
'is_refund']


for i in extra_col:
    print(analysis_df[i].sum(), i)

7216 is_full_member
859 canceled_during_churn
9891 canceled_during_trial
80 is_currently_trialing
183 is_currently_in_churn_period
5560 is_refund


In [6]:
##################
# AGGREGATION AT THE UNIQUE CLIENT LEVEL
# 2
###################

########################################################################
# GROUP BY customer_name
customer_df = analysis_df.groupby('customer_name').agg({
    'customer_id': 'first',
    'created_utc': 'first',
    'status': 'last',
    'current_period_start_utc': 'last',
    'current_period_end_date_utc': 'last',
    'trial_start_utc': 'first',
    'trial_end_utc': 'last',
    'canceled_at_utc': 'last',
    'ended_at_utc': 'last',
    'is_gift_member': list,
    #'period_duration': sum,
    'is_full_member': list,
    'canceled_during_churn': list,
    'canceled_during_trial': list,
    'is_currently_trialing': list,
    'is_currently_in_churn_period': list,
    'is_refund': list,
    'subscription_start_date': 'first', 
    'paid_after_trial': list
})


# ## OR ##
# # GROUP BY customer_id
# customer_df = analysis_df.groupby('customer_id').agg({
#     'customer_id': 'first',
#     'created_utc': 'first',
#     'status': 'last',
#     'current_period_start_utc': 'last',
#     'current_period_end_date_utc': 'last',
#     'trial_start_utc': 'first',
#     'trial_end_utc': 'last',
#     'canceled_at_utc': 'last',
#     'ended_at_utc': 'last',
#     'is_gift_member': list,
#     #'period_duration': sum,
#     'is_full_member': list,
#     'canceled_during_churn': list,
#     'canceled_during_trial': list,
#     'is_currently_trialing': list,
#     'is_currently_in_churn_period': list,
#     'is_refund': list
# })


# ########################################################################


#print(f"Number of unique Customer IDs : {len(customer_df):,}")
#print(f"Number of unique Customer Names : {analysis_df['customer_name'].nunique():,}")

In [7]:
multi_sub_customers = analysis_df.groupby('customer_name').size()
multi_sub_customers = multi_sub_customers[multi_sub_customers > 1].sort_values(ascending=False)
if len(multi_sub_customers) > 0:
    print(f"\n📊 Clients avec multiples abonnements détectés : {len(multi_sub_customers)}")
    print(f"   Exemple - {multi_sub_customers.index[0]} : {multi_sub_customers.iloc[0]} abonnements")
    print(f"   Total d'abonnements multiples : {(multi_sub_customers - 1).sum()}")
else:
    print("\n✅ Aucun client avec multiples abonnements détecté")



📊 Clients avec multiples abonnements détectés : 2161
   Exemple - Customer1394 : 4 abonnements
   Total d'abonnements multiples : 2532


In [8]:
# Indicateurs au niveau client
customer_df['paid_initial_subscription'] = customer_df['is_full_member']
customer_df['trial_canceled'] = customer_df['canceled_during_trial']
customer_df['refunded_initial'] = customer_df['is_refund']
customer_df['currently_trialing'] = customer_df['is_currently_trialing']
customer_df['first_subscription_start_date'] = customer_df['subscription_start_date']

customer_df['eligible_for_1st_renewal'] = (
    customer_df['paid_initial_subscription'] & 
    customer_df['first_subscription_start_date'].notna() & 
    ((reference_date - customer_df['first_subscription_start_date']).dt.days >= 365)  # >= au lieu de >
)


TypeError: unsupported operand type(s) for &: 'list' and 'bool'

In [None]:
customer_df[customer_df['status'] == 'active']
analysis_df[analysis_df['customer_name'] == 'Customer9990']

Unnamed: 0,id,customer_id,customer_name,status,Cancellation Reason,created_utc,start_utc,start_date_utc,current_period_start_utc,current_period_end_date_utc,...,ended_at_utc,is_gift_member,is_full_member,canceled_during_trial,canceled_during_churn,is_currently_trialing,is_currently_in_churn_period,subscription_start_date,is_refund,paid_after_trial
11158,sub_1P9SqNCZ9aYYH5winYAPV7Uz,cus_PzRjNgID3iA78J,Customer9990,canceled,cancellation_requested,2024-04-25 14:00:00,2024-04-25 14:00:00,2024-04-25 14:00:00,2024-04-25 14:00:00,2024-05-05 14:00:00,...,2024-05-05 14:00:00,,False,True,False,False,False,2024-05-05 14:00:00,True,False
11140,sub_1P9pmwCZ9aYYH5wi16AWqIqd,cus_PzpRRFgmD9KZbb,Customer9990,canceled,cancellation_requested,2024-04-26 14:30:00,2024-04-26 14:30:00,2024-04-26 14:30:00,2024-04-26 14:30:00,2024-05-06 14:30:00,...,2024-05-06 14:30:00,,False,True,False,False,False,2024-05-06 14:30:00,True,False
11134,sub_1P9wSiCZ9aYYH5wiOjm4USYJ,cus_PzwLbud851rs9f,Customer9990,canceled,cancellation_requested,2024-04-26 21:37:00,2024-04-26 21:37:00,2024-04-26 21:37:00,2024-04-26 21:37:00,2024-05-06 21:37:00,...,2024-05-06 21:37:00,,False,True,False,False,False,2024-05-06 21:37:00,True,False
11133,sub_1P9xvRCZ9aYYH5wikDULIUJq,cus_PzxrUEUBtyLtF1,Customer9990,active,,2024-04-26 23:11:00,2024-04-26 23:11:00,2024-04-26 23:11:00,2025-05-06 23:11:00,2026-05-06 23:11:00,...,NaT,,True,False,False,False,False,2024-05-06 23:11:00,False,False


In [None]:
customer_df[customer_df.index == 'Customer9990']

Unnamed: 0_level_0,customer_id,created_utc,status,current_period_start_utc,current_period_end_date_utc,trial_start_utc,trial_end_utc,canceled_at_utc,ended_at_utc,is_gift_member,...,is_currently_trialing,is_currently_in_churn_period,is_refund,subscription_start_date,paid_after_trial,paid_initial_subscription,trial_canceled,refunded_initial,currently_trialing,first_subscription_start_date
customer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Customer9990,cus_PzRjNgID3iA78J,2024-04-25 14:00:00,active,2025-05-06 23:11:00,2026-05-06 23:11:00,2024-04-25 14:00:00,2024-05-06 23:11:00,2024-05-06 21:37:00,2024-05-06 21:37:00,"[nan, nan, nan, nan]",...,"[False, False, False, False]","[False, False, False, False]","[True, True, True, False]","[2024-05-05 14:00:00, 2024-05-06 14:30:00, 202...","[False, False, False, False]","[False, False, False, True]","[True, True, True, False]","[True, True, True, False]","[False, False, False, False]","[2024-05-05 14:00:00, 2024-05-06 14:30:00, 202..."
