In [63]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)
import seaborn as sns
import os
from datetime import datetime
import shutil
import glob

In [64]:
plt.rcParams.update({'font.size': 11, 'axes.labelsize': 10, 'axes.titlesize': 16})
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['axes.edgecolor'] = 'black'
#plt.rcParams['xtick.color'] = 'black'
plt.rcParams['xtick.color'] = 'white'
#plt.rcParams['ytick.color'] = 'black'
plt.rcParams['ytick.color'] = 'white'
plt.rcParams['figure.figsize'] = (22, 11)

# Grid with opacity and in background
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.color'] = 'lightgray'
plt.rcParams['grid.alpha'] = 0.5
plt.rcParams['axes.axisbelow'] = True

plt.rcParams['axes.titleweight'] = 'bold'
#plt.rcParams['axes.titlecolor'] = 'black'
plt.rcParams['axes.titlecolor'] = 'white'
#plt.rcParams['axes.labelcolor'] = 'black'
plt.rcParams['legend.labelcolor'] = 'black'
plt.rcParams['legend.facecolor'] = 'white'
plt.rcParams['legend.edgecolor'] = 'gray'
plt.rcParams['text.color'] = 'black'
sns.set_palette("viridis")


In [65]:
# Set TODAY DATE
# today_date = pd.Timestamp.now(tz='UTC')
today_date = pd.Timestamp('2025-05-30', tz='UTC')  # For testing purposes


# Set REFUND PERDIOD DURATION
REFUND_PERIOD_DAYS = 14  # Duration of the refund period in days

# Set thresholds for cleaning
HIGH_VOLUME_THRESHOLD = 5
DUPLICATE_THRESHOLD_MINUTES = 15


# Set DIRECTORIES
data_dir = 'both_csv_go_here'
archive_csv_dir = 'archive/csv'
archive_png_dir = 'archive/analysis/png'
archive_pdf_dir = 'archive/analysis/pdf'
analysis_dir = 'analysis'

In [66]:
def get_file_creation_date(file_path):
    """
    Get the creation date of a file and return it as a formatted string
    Returns format: YYYY-MM-DD
    """
    try:
        # Get file creation time (or modification time if creation not available)
        if os.name == 'nt':  # Windows
            creation_time = os.path.getctime(file_path)
        else:  # Unix/Linux/Mac
            creation_time = os.path.getmtime(file_path)

        # Convert to datetime and format
        creation_date = datetime.fromtimestamp(creation_time)
        return creation_date.strftime('%Y-%m-%d')

    except Exception as e:
        print(f"❌ Error getting creation date for {file_path}: {e}")
        # Fallback to today's date
        return datetime.now().strftime('%Y-%m-%d')


def transfer_files_to_archive():
    """
    Enhanced version with date-based organization
    Transfer PNG files from analysis_dir to archive_png_dir/YYYY-MM-DD/
    Transfer PDF files from analysis_dir to archive_pdf_dir/YYYY-MM-DD/
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # === TRANSFER PNG FILES ===
    png_files = glob.glob(os.path.join(analysis_dir, "*.png"))
    png_transferred = 0

    for png_file in png_files:
        filename = os.path.basename(png_file)

        # Get creation date for organization
        creation_date = get_file_creation_date(png_file)

        # Create date-based directory in archive
        date_archive_dir = os.path.join(archive_png_dir, creation_date)
        os.makedirs(date_archive_dir, exist_ok=True)

        # Set destination with date organization
        destination = os.path.join(date_archive_dir, filename)

        try:
            # Copy file to archive (keep original in analysis_dir)
            shutil.copy2(png_file, destination)
            print(f"📊 PNG archived: {creation_date}/{filename}")
            png_transferred += 1
        except Exception as e:
            print(f"❌ Error archiving PNG {filename}: {e}")

    # === TRANSFER PDF FILES ===
    pdf_files = glob.glob(os.path.join(analysis_dir, "*.pdf"))
    pdf_transferred = 0

    for pdf_file in pdf_files:
        filename = os.path.basename(pdf_file)

        # Get creation date for organization
        creation_date = get_file_creation_date(pdf_file)

        # Create date-based directory in archive
        date_archive_dir = os.path.join(archive_pdf_dir, creation_date)
        os.makedirs(date_archive_dir, exist_ok=True)

        # Set destination with date organization
        destination = os.path.join(date_archive_dir, filename)

        try:
            # Copy file to archive (keep original in analysis_dir)
            shutil.copy2(pdf_file, destination)
            print(f"📄 PDF archived: {creation_date}/{filename}")
            pdf_transferred += 1
        except Exception as e:
            print(f"❌ Error archiving PDF {filename}: {e}")

    # === SUMMARY ===
    print(f"\n📦 ARCHIVING SUMMARY ({timestamp}):")
    print(f"   PNG files transferred: {png_transferred}")
    print(f"   PDF files transferred: {pdf_transferred}")
    print(f"   Total files archived: {png_transferred + pdf_transferred}")

    return png_transferred, pdf_transferred


def clean_analysis_dir_after_archive():
    """
    OPTIONAL: Remove files from analysis_dir after successful archiving
    USE WITH CAUTION - This will delete the original files!
    Enhanced with better logging and date information
    """
    # Get all PNG and PDF files in analysis_dir
    png_files = glob.glob(os.path.join(analysis_dir, "*.png"))
    pdf_files = glob.glob(os.path.join(analysis_dir, "*.pdf"))
    all_files = png_files + pdf_files

    if not all_files:
        print("🗑️  No files to clean in analysis directory")
        return 0

    print(f"🗑️  Cleaning {len(all_files)} files from {analysis_dir}...")

    cleaned_files = 0

    for file_path in all_files:
        try:
            filename = os.path.basename(file_path)
            creation_date = get_file_creation_date(file_path)

            os.remove(file_path)
            print(f"🗑️  Cleaned: {filename} (was from {creation_date})")
            cleaned_files += 1

        except Exception as e:
            print(f"❌ Error cleaning {file_path}: {e}")

    print(f"🧹 Cleanup complete: {cleaned_files} files removed from {analysis_dir}")
    return cleaned_files


transfer_files_to_archive()
clean_analysis_dir_after_archive()



📦 ARCHIVING SUMMARY (2025-06-27 09:52:45):
   PNG files transferred: 0
   PDF files transferred: 0
   Total files archived: 0
🗑️  No files to clean in analysis directory


0

In [67]:
# LOADING CSV

# Toggle this flag to True in production
RENAME_FILES = False
MOVE_FILES = False

# Ensure archive directory exists
os.makedirs(archive_csv_dir, exist_ok=True)


# List and sort files by creation time
files = [
    os.path.join(data_dir, f)
    for f in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, f)) and f.endswith('.csv')]
sorted_files = sorted(files, key=os.path.getctime, reverse=True)

# Check if we have exactly 2 CSV files
if len(sorted_files) != 2:
    print(f"Error: Expected 2 CSV files, found {len(sorted_files)}")
    print("Files found:", [os.path.basename(f) for f in sorted_files])
    exit(1)

for i, file_path in enumerate(sorted_files, 1):
    print(f"  File {i}:\n {os.path.basename(file_path)}")

# Loop over files
processed_files = []
for file_path in sorted_files:
    created_at = datetime.fromtimestamp(os.path.getctime(file_path))
    timestamp_str = created_at.strftime('%Y-%m-%d_%H-%M')
    original_name = os.path.basename(file_path)
    new_name = f"{timestamp_str}_{original_name}"

    if RENAME_FILES:
        if not original_name.startswith(timestamp_str):
            new_path = os.path.join(data_dir, new_name)
            os.rename(file_path, new_path)
            print(f"Renamed:\n {original_name} →\n {new_name}\n")
            processed_files.append(new_path)
        else:
            processed_files.append(file_path)
    else:
        processed_files.append(file_path)

# Load both CSV files into pandas DataFrames
file1_path, file2_path = processed_files[0], processed_files[1]
print(f"\nLoading CSV files:")
print(f"  File 1: {os.path.basename(file1_path)}")
print(f"  File 2: {os.path.basename(file2_path)}")

try:
    sub_raw = pd.read_csv(file1_path, low_memory=False)
    inv_raw = pd.read_csv(file2_path, low_memory=False)
    print(f"\nSuccessfully loaded:")
    print(f"  sub_raw: {sub_raw.shape[0]} rows, {sub_raw.shape[1]} columns")
    print(f"  inv_raw: {inv_raw.shape[0]} rows, {inv_raw.shape[1]} columns")
except Exception as e:
    print(f"Error loading CSV files: {e}")
    exit(1)

# Move files to archive
if MOVE_FILES:
    for file_path in processed_files:
        file_name = os.path.basename(file_path)
        archive_path = os.path.join(archive_csv_dir, file_name)

        if not os.path.exists(archive_path):
            os.rename(file_path, archive_path)
            print(f"Moved: {file_name} to archive")
        else:
            print(f"Already archived: {file_name}")
else:
    for file_path in processed_files:
        file_name = os.path.basename(file_path)

print("\nDataFrames available as: sub_raw, inv_raw")
print("\nProcessing complete!")
print('***************************************************')


  File 1:
 DishpatchSubscriptionData_NIklas_Sanitised - subscriptions.csv
  File 2:
 DishpatchInvoiceData_NIklas_Sanitised - invoices.csv

Loading CSV files:
  File 1: DishpatchSubscriptionData_NIklas_Sanitised - subscriptions.csv
  File 2: DishpatchInvoiceData_NIklas_Sanitised - invoices.csv

Successfully loaded:
  sub_raw: 20443 rows, 34 columns
  inv_raw: 33239 rows, 53 columns

DataFrames available as: sub_raw, inv_raw

Processing complete!
***************************************************


In [68]:
# DATA PREPROCESSING (customer_df)
def preprocess_data(input_df):
    """Clean and preprocess the subscription data"""
    df = input_df.copy()

    # Date conversion
    date_cols = [col for col in df.columns if '(UTC)' in col]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)

    df = df.sort_values(by='Created (UTC)')

    # Column selection and renaming
    columns_to_keep = [
        'id', 'Customer Name', 'Customer ID', 'Status', 'Cancellation Reason',
        'Created (UTC)', 'Start (UTC)', 'Current Period Start (UTC)',
        'Current Period End (UTC)', 'Trial Start (UTC)', 'Trial End (UTC)',
        'Canceled At (UTC)', 'Ended At (UTC)', 'senderShopifyCustomerId (metadata)'
    ]

    df = df[columns_to_keep]

    df.rename(columns={
        'id': 'subscription_id',
        'Customer ID': 'customer_id',
        'Customer Name': 'customer_name',
        'Status': 'status',
        'Cancellation Reason': 'cancellation_reason',
        'Created (UTC)': 'created_utc',
        'Start (UTC)': 'start_utc',
        'Current Period Start (UTC)': 'current_period_start_utc',
        'Current Period End (UTC)': 'current_period_end_utc',
        'Trial Start (UTC)': 'trial_start_utc',
        'Trial End (UTC)': 'trial_end_utc',
        'Canceled At (UTC)': 'canceled_at_utc',
        'Ended At (UTC)': 'ended_at_utc',
        'senderShopifyCustomerId (metadata)': 'is_gifted_member'
    }, inplace=True)

    # Convert is_gifted_member to boolean
    df['is_gifted_member'] = df['is_gifted_member'].notna()


    # Reference date for analysis
    print(f"📅 Reference date (TODAY) : {today_date.strftime('%d-%m-%Y')}")
    print(f"{len(df)} entries loaded from {file_path}")
    print('***************************************************')

    return df

sub_df = preprocess_data(sub_raw)

📅 Reference date (TODAY) : 30-05-2025
20443 entries loaded from both_csv_go_here/DishpatchInvoiceData_NIklas_Sanitised - invoices.csv
***************************************************


In [69]:
# REMOVE ALL MULTI-SUB customer_id from sub_df, put them im a new df multisub_df
def remove_multi_subscriptions(df):
    """Remove customers with multiple subscriptions and return a new DataFrame"""
    df = df.copy()

    # Count subscriptions per customer
    subscription_counts = df['customer_id'].value_counts()

    # Get customers with more than one subscription
    multi_sub_customers = subscription_counts[subscription_counts > 1].index.tolist()

    # Filter out these customers from the main DataFrame
    single_sub_df = df[~df['customer_id'].isin(multi_sub_customers)]

    # Create a new DataFrame for multi-subscription customers
    multi_sub_df = df[df['customer_id'].isin(multi_sub_customers)]

    print(f"Removed {len(multi_sub_customers)} customers with multiple subscriptions.")
    print(f"Total single_sub_df: {len(single_sub_df)}, with {len(single_sub_df['customer_id'].unique())} unique customers")
    print(f"Total multi_sub_df: {len(multi_sub_df)}, with {len(multi_sub_df['customer_id'].unique())} unique customers")


    return single_sub_df, multi_sub_df


sub_df, multisub_df = remove_multi_subscriptions(sub_df)

Removed 377 customers with multiple subscriptions.
Total single_sub_df: 19599, with 19599 unique customers
Total multi_sub_df: 844, with 377 unique customers


In [70]:
# DATA PREPROCESSING (invoices df)
def preprocess_data_invoice(input_df):
    """Clean and preprocess the subscription data"""
    df = input_df.copy()

    # Date conversion
    date_cols = [col for col in df.columns if '(UTC)' in col]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)


    # Column selection and renaming
    columns_to_keep = [
        'id', 'Customer Name', 'Customer', 'Amount Due', 'Amount Paid', 'Paid', 'Billing', 'Charge', 'Closed',
        'Date (UTC)', 'Description', 'Number', 'Finalized At (UTC)',
        'Paid At (UTC)', 'Minimum Line Item Period Start (UTC)', 'Maximum Line Item Period End (UTC)',
        'Period End (UTC)', 'Subscription', 'Total Discount Amount', 'Applied Coupons', 'Status'
        ]

    df = df[columns_to_keep]

    df.rename(columns={
        'id': 'invoice_id',
        'Status': 'inv_status',
        'Customer': 'customer_id',
        'Customer Name': 'customer_name',
        'Date (UTC)' : 'date_utc',
        'Description': 'description',
        'Paid At (UTC)': 'paid_at_utc',
        'Amount Paid': 'amount_paid',
        'Subscription': 'subscription_id',
    }, inplace=True)

    return df

inv_df = preprocess_data_invoice(inv_raw)

In [71]:
# MERGING DATAFRAMES
def merge_dataframes(sub_df, inv_df):
    """Merge two DataFrames on 'subscription_id' and 'customer_id'"""
    # Ensure both DataFrames have the same columns for merging

    merged_df = pd.merge(sub_df, inv_df, on=['subscription_id'], how='outer')


    return merged_df

#df = merge_dataframes(sub_df, inv_df)

In [72]:
# Removing customers with more than 5 subscriptions (Probably testing accounts)
def remove_high_volume_customers(df, threshold=HIGH_VOLUME_THRESHOLD):
    """Remove customers with more than a specified number of subscriptions"""
    df = df.copy()

    original_count = len(df)

    customer_counts = df['customer_id'].value_counts()
    high_volume_customers = customer_counts[customer_counts > threshold].index

    df = df[~df['customer_id'].isin(high_volume_customers)]

    print(f'{original_count - len(df)} subscriptions removed from \
{len(high_volume_customers)} customers with more than {threshold} subscriptions')
    print('***************************************************')

    return df


sub_df = remove_high_volume_customers(sub_df)

0 subscriptions removed from 0 customers with more than 5 subscriptions
***************************************************


In [73]:
# CANCEL DURING TRIAL PERIOD
def cancel_during_trial(df):
    """Check if a member canceled during their trial period"""
    df =df.copy()

    df['canceled_during_trial'] = (
        (df['canceled_at_utc'].notna()) &
        (df['trial_end_utc'] > df['canceled_at_utc'])
    )
    return df

sub_df = cancel_during_trial(sub_df)

In [74]:
# OLD  SETTING REFUND PERIOD END UTC
def refund_period_end_utc(df, REFUND_PERIOD_DAYS):
    df = df.copy()
    df['refund_period_end_utc'] = np.where(
        (df['trial_end_utc'].notna()) & (df['trial_end_utc'] > df['current_period_start_utc']),
        df['trial_end_utc'] + pd.Timedelta(days=REFUND_PERIOD_DAYS),
        df['current_period_start_utc'] + pd.Timedelta(days=REFUND_PERIOD_DAYS)
    )

    return df

sub_df = refund_period_end_utc(sub_df, REFUND_PERIOD_DAYS)


In [75]:
# REFUND PERIOD END TIME
# This function sets the 'refund_period_end_utc' column for each subscription.
# The refund period end date is calculated differently depending on whether the subscription started with a trial:
# - If 'trial_start_utc' is not null (i.e., the subscription had a trial), the refund period ends REFUND_PERIOD_DAYS after the trial ends ('trial_end_utc').
# - If there was no trial ('trial_start_utc' is null), the refund period ends REFUND_PERIOD_DAYS after the current paid period starts ('current_period_start_utc').
def refund_period_end_utc(df, REFUND_PERIOD_DAYS):
    df = df.copy()
    df['refund_period_end_utc'] = np.where(
        df['trial_start_utc'].notna() &
        (df['trial_end_utc'] > df['current_period_start_utc']),
        df['trial_end_utc'] + pd.Timedelta(days=REFUND_PERIOD_DAYS),
        df['current_period_start_utc'] + pd.Timedelta(days=REFUND_PERIOD_DAYS)
    )
    return df

# Apply the function to the subscriptions DataFrame
sub_df = refund_period_end_utc(sub_df, REFUND_PERIOD_DAYS)

In [76]:
# CANCEL DURRING REFUND PERIOD
def canceled_during_refund_period(df):
    """Check if a member canceled during their refund period"""
    df = df.copy()

    df['canceled_during_refund_period'] = (
        (df['canceled_at_utc'].notna()) &
        (df['canceled_during_trial'] == False) &
        (df['refund_period_end_utc'] > df['canceled_at_utc'])
    )
    return df

sub_df = canceled_during_refund_period(sub_df)


In [77]:
# FULL MEMBER STATUS
def full_member_status(df):
    """Determine if a customer is a full member based on business logic"""
    df = df.copy()

    # Full member if:
    # 1. Not canceled during trial
    # 2. Not canceled during refund period
    # 3. Not gifted
    # 4. Trial ended more than 14 days ago (if no trial, current_period_start_utc > 14 days ago)

    no_early_cancellation = (
        (~df['canceled_during_trial']) &
        (~df['canceled_during_refund_period'])
    )

    not_gifted = (~df['is_gifted_member'])

    refund_period_passed = (
            (today_date > df['refund_period_end_utc'])
            )

    df['is_full_member'] = (
        no_early_cancellation &
        not_gifted &
        refund_period_passed
    )

    return df

sub_df = full_member_status(sub_df)


In [78]:
# PAYING MEMBERS
def paying_members(df):
    """Determine if a customer is a paying member"""
    df = df.copy()

    # Paying member if:
    # 1. Not canceled
    # 2. Not gifted

    no_early_cancellation = (
        (~df['canceled_during_trial']) &
        (~df['canceled_during_refund_period'])
    )

    not_gifted = (~df['is_gifted_member'])


    df['is_paying_member'] = (
        no_early_cancellation &
        not_gifted
    )

    return df

sub_df = paying_members(sub_df)

In [79]:
# add ended_at_utc when needed
def add_ended_at_utc(df, today_date):
    """add ended_at_utc when needed"""
    df = df.copy()

    # if canceled during trial, set ended_at_utc to trial_end_utc
    df['ended_at_utc'] = np.where(
        (df['ended_at_utc'].isna()) & (df['canceled_during_trial']),
        df['trial_end_utc'],
        df['ended_at_utc']
    )

    # if canceled during refund period, set ended_at_utc to canceled_at_utc
    df['ended_at_utc'] = np.where(
        (df['ended_at_utc'].isna()) &
        (df['canceled_during_refund_period']) &
        (~df['canceled_during_trial']),
        df['canceled_at_utc'],
        df['ended_at_utc']
    )

    # if canceled after refund period, set ended_at_utc to current_period_end_utc
    df['ended_at_utc'] = np.where(
        (df['ended_at_utc'].isna()) &
        (df['canceled_at_utc'].notna()) &
        (~df['canceled_during_refund_period']) &
        (~df['canceled_during_trial']),
        np.minimum(df['current_period_end_utc'], today_date),
        df['ended_at_utc']
    )


    return df


sub_df = add_ended_at_utc(sub_df, today_date)


In [80]:
# CALCULATING DURATIONS
def calculate_duration(df, today_date):  # ← AJOUTER today_date en paramètre
    """Calculate various durations in days with proper business logic"""

    # Trial duration (if trial exists)
    df['trial_duration_planned'] = (df['trial_end_utc'] - df['trial_start_utc']).dt.days.fillna(0)

    # Pour les annulations pendant trial, limiter trial_duration à la vraie utilisation
    df['trial_duration'] = np.where(
        df['ended_at_utc'] < df['trial_end_utc'],  # Annulé pendant trial
        np.maximum(0, (df['ended_at_utc'] - df['trial_start_utc']).dt.days),  # Durée réelle
        df['trial_duration_planned']  # Sinon durée prévue
    )

    # Current period duration
    df['current_period_duration'] = (df['current_period_end_utc'] - df['current_period_start_utc']).dt.days

    # Trial-only subscription
    df['trial_only_subscription'] = (
        df['trial_start_utc'].notna() &
        df['trial_end_utc'].notna() &
        (df['trial_duration'] == df['current_period_duration'])
    )

    # Gift duration (only for gifted members)
    df['gift_duration'] = df['current_period_duration'].where(df['is_gifted_member'], 0)

    # Days until end for active subscriptions
    df['end_in'] = ((df['current_period_end_utc'] - today_date).dt.days).where(df['status'] == 'active', np.nan)

    # ← MODIFICATION ICI : Limiter real_duration à la durée max possible
    df['real_duration'] = np.where(
        df['ended_at_utc'].notna() & (df['status'] != 'trialing'),
        (df['ended_at_utc'] - df['created_utc']).dt.days,
        (today_date - df['created_utc']).dt.days
    )

    # ← AJOUTER CETTE VALIDATION
    max_possible = (today_date - df['created_utc'].min()).days
    df['real_duration'] = np.minimum(df['real_duration'], max_possible)

    df['paid_duration'] = df['real_duration'] - df['trial_duration']

    return df

# ← MODIFICATION DE L'APPEL
sub_df = calculate_duration(sub_df, today_date)


In [81]:
# WEEKS ARE FROM MONDAY TO SUNDAY
def get_specific_past_week(weeks_back=1, reference_date=None):
    """
    Get specific date for a specific week.
    weeks_back=1 : last week (from Monday to Sunday)
    weeks_back=2 : previous week (from Monday to Sunday)
    weeks_back=3 : three weeks ago (from Monday to Sunday)
    """

    if reference_date is None:
        today = pd.Timestamp.now(tz='UTC')
    else:
        if hasattr(reference_date, 'tz') and reference_date.tz is not None:
            today = pd.to_datetime(reference_date).tz_convert('UTC')
        else:
            today = pd.to_datetime(reference_date).tz_localize('UTC')


    # Finding the Monday of the target week
    days_since_monday = today.weekday()
    this_monday = today - pd.Timedelta(days=days_since_monday)
    target_monday = this_monday - pd.Timedelta(days=7 * weeks_back)
    target_sunday = target_monday + pd.Timedelta(days=6)

    week_start = target_monday.normalize()  # 00:00:00
    week_end = target_sunday.normalize() + pd.Timedelta(hours=23, minutes=59, seconds=59)  # 23:59:59

    monday = target_monday.strftime('%d-%m-%y')
    sunday = target_sunday.strftime('%d-%m-%y')

    # Las week info
    week_info = {
        'weeks_ago': weeks_back,
        'week_start': week_start,
        'week_end': week_end,
        'year': target_monday.year,
        'week_number': target_monday.isocalendar().week,
        'year_week': f"{target_monday.year}-W{target_monday.isocalendar().week:02d}",
        'monday': monday,
        'sunday': sunday,
    }

    return week_info


In [82]:
def get_full_members_count(df):
    """Count the number of full members"""
    df = df.copy()

    df = df[df['is_full_member'] == True]
    df_active = df[df['status'] == 'active']
    df_not_active = df[df['status'] != 'active']

    active = len(df_active)
    print(f"Total Active full member: {active}")
    print(f"Total not active full member: {len(df_not_active)}")

    dict_full_members = {'active': active,
                         'not_active': len(df_not_active)
                         }

    return dict_full_members


dict_full_member = get_full_members_count(sub_df)


Total Active full member: 5416
Total not active full member: 1632


In [83]:
# how many trial this week
def get_new_trial_last_week(df, weeks_back=1):
    """Count new trials started this week"""
    week_info = get_specific_past_week(weeks_back=weeks_back, reference_date=today_date)

    # Filter for the current week
    df_week = df[(df['trial_start_utc'] >= week_info['week_start']) &
                 (df['trial_start_utc'] < week_info['week_end'])]

    # Count new trials
    new_trials = df_week.shape[0]

    print(f"New trials last week ({week_info['year_week']}): {new_trials}")

    return new_trials


new_trial_last_week = get_new_trial_last_week(sub_df, weeks_back=1)
new_trial_prev_week = get_new_trial_last_week(sub_df, weeks_back=2)


New trials last week (2025-W21): 70
New trials last week (2025-W20): 110


In [84]:
# how many trial this week
def get_new_trial_last_week(df, weeks_back=1):
    """Count new trials started this week using pandas dt.isocalendar().week and year"""
    df = df.copy()
    # Ensure trial_start_utc is datetime
    df['trial_start_utc'] = pd.to_datetime(df['trial_start_utc'])

    # Get the current week and year
    today = pd.to_datetime(today_date)
    current_year, current_week = today.isocalendar().year, today.isocalendar().week

    # Calculate the target week and year
    # Subtract weeks_back from current week, handle year change
    target_week = current_week - weeks_back
    target_year = current_year
    while target_week <= 0:
        target_year -= 1
        # Get the number of weeks in the previous year
        last_year_weeks = pd.Timestamp(f"{target_year}-12-28").isocalendar().week
        target_week += last_year_weeks

    # Extract week and year from trial_start_utc
    df['trial_year'] = df['trial_start_utc'].dt.isocalendar().year
    df['trial_week'] = df['trial_start_utc'].dt.isocalendar().week

    # Filter for the target week and year
    df_week = df[(df['trial_year'] == target_year) & (df['trial_week'] == target_week)]

    new_trials = df_week.shape[0]

    print(f"New trials last week ({target_year}-W{target_week}): {new_trials}")

    return new_trials

new_trial_last_week = get_new_trial_last_week(sub_df, weeks_back=1)
new_trial_prev_week = get_new_trial_last_week(sub_df, weeks_back=2)


New trials last week (2025-W21): 70
New trials last week (2025-W20): 110


In [85]:
# Count trials that converted to full members
def get_conversion_rate(df):
    """Calculate conversion rate from trial to full member"""
    df = df.copy()

    new_customers = df[df['trial_start_utc'].notna()].copy()

    mature_trials = new_customers[new_customers['refund_period_end_utc'] < today_date]

    total_trials = len(mature_trials)
    conversions = len(mature_trials[mature_trials['is_full_member'] == True])

    conversion_rate = (conversions / total_trials * 100)


    conversion_rate_dict = {
        'total_trials': total_trials,
        'mature_trials': mature_trials,
        'conversion_rate': round(conversion_rate, 2)
    }

    print(f"Total trials: {conversion_rate_dict['total_trials']}, Full Members converted = {conversions}, Conversion rate: {conversion_rate_dict['conversion_rate']}")

    return conversion_rate_dict


conversion_rate_dict = get_conversion_rate(sub_df)


Total trials: 17524, Full Members converted = 6585, Conversion rate: 37.58


In [86]:
# Count trials that converted to full members
def get_conversion_rate(df):
    """Calculate conversion rate from trial to full member"""
    df = df.copy()

    # Use .dt.year and .dt.week for demonstration (though not strictly needed here)
    if 'trial_start_utc' in df.columns and pd.api.types.is_datetime64_any_dtype(df['trial_start_utc']):
        df['trial_year'] = df['trial_start_utc'].dt.year
        df['trial_week'] = df['trial_start_utc'].dt.week

    new_customers = df[df['trial_start_utc'].notna()].copy()

    mature_trials = new_customers[new_customers['refund_period_end_utc'] < today_date]

    total_trials = len(mature_trials)
    conversions = len(mature_trials[mature_trials['is_full_member'] == True])

    conversion_rate = (conversions / total_trials * 100) if total_trials > 0 else 0.0

    conversion_rate_dict = {
        'total_trials': total_trials,
        'mature_trials': mature_trials,
        'conversion_rate': round(conversion_rate, 2)
    }

    return conversion_rate_dict

conversion_rate_dict = get_conversion_rate(sub_df)

  df['trial_week'] = df['trial_start_utc'].dt.week


In [87]:
# Count trials that converted to full members
def get_conversion_rate_last_week(df, weeks_back=1):
    """Calculate conversion rate from trial to full member"""
    df = df.copy()

    week_info = get_specific_past_week(weeks_back=weeks_back, reference_date=today_date)
    # Filter for the current week
    mature_customers = df[(df['refund_period_end_utc'] >= week_info['week_start']) &
                          (df['refund_period_end_utc'] < week_info['week_end']) &
                          (df['trial_start_utc'].notna())].copy()


    total_trials = len(mature_customers)
    conversions = len(mature_customers[mature_customers['is_full_member'] == True])

    conversion_rate = (conversions / total_trials * 100)


    return conversion_rate


last_week_conversion_rate = get_conversion_rate_last_week(sub_df, weeks_back=1)
prev_week_conversion_rate = get_conversion_rate_last_week(sub_df, weeks_back=2)


In [88]:
def get_new_full_members_last_week(df, today_date, weeks_back=1):
    """
    Get new full members from last week
    """
    week_info = get_specific_past_week(weeks_back=weeks_back, reference_date=today_date)

    # New full members = those whose refund period ended last week
    last_week_full_members = df[
        (df['refund_period_end_utc'] >= week_info['week_start']) &
        (df['refund_period_end_utc'] <= week_info['week_end']) &
        (df['is_full_member'] == True)
    ]

    last_week_full_members = len(last_week_full_members)

    return last_week_full_members

last_week_full_members = get_new_full_members_last_week(sub_df, today_date, weeks_back=1)
prev_week_full_members = get_new_full_members_last_week(sub_df, today_date, weeks_back=2)


In [89]:
def get_churn_members_last_week(df, today_date, weeks_back=1):
    """
    Get churned members from last week (using the function from previous artifact)
    """
    week_info = get_specific_past_week(weeks_back=weeks_back, reference_date=today_date)

    # Churned members = those who canceled last week
    churned_members = df[
        (df['canceled_at_utc'] >= week_info['week_start']) &
        (df['canceled_at_utc'] <= week_info['week_end']) &
        (df['is_full_member'] == True)
    ]

    last_week_churned_members = len(churned_members)

    return last_week_churned_members

last_week_churned_members = get_churn_members_last_week(sub_df, today_date, weeks_back=1)
prev_week_churned_members = get_churn_members_last_week(sub_df, today_date, weeks_back=2)


In [90]:
def cus_renewal(df):
    df = df.copy()

    df = df[~df['is_gifted_member']]

    # number of customers who had trial
    all_customers = df[df['created_utc'].notna()]

    all_active_full_member = all_customers[~all_customers['is_gifted_member']]

    # number of customers who became full members (from trial)
    trial_to_full_member = all_customers[
        (~all_customers['canceled_during_trial']) &
        (~all_customers['canceled_during_refund_period']) &
        (all_customers['paid_duration'] > 14) &
        (~all_customers['is_gifted_member'])
    ]

    print(f"trial customers who became full members: {len(trial_to_full_member)}")

    # trial > full member conversion rate
    conversion_rate = (len(trial_to_full_member) / len(all_customers) * 100)
    print(f"conversion rate: {conversion_rate:.2f}%")


    # 1st year customers
    customers_in_y1 = trial_to_full_member[trial_to_full_member['paid_duration'] <= 366]

    # active in 1st year
    active_in_y1 = customers_in_y1[customers_in_y1['status'] == 'active']

    # chrun during 1st year
    canceled_during_y1 = customers_in_y1[customers_in_y1['canceled_at_utc'].notna()]

    # cancelation rate during y1
    y1_cancelation_rate = (len(canceled_during_y1) / len(customers_in_y1) * 100)





    # customers eligible to year 2
    eligible_for_y2 = trial_to_full_member[trial_to_full_member['paid_duration'] >= 364]

    # customer currently in year 2
    customers_in_y2 = eligible_for_y2[eligible_for_y2['paid_duration'] <= 730]

    # currently active in y2
    active_in_y2 = customers_in_y2[customers_in_y2['status'] == 'active']

    # customer who renewed for a second year
    renewed_to_y2 = eligible_for_y2[eligible_for_y2['paid_duration'] >= (364 + REFUND_PERIOD_DAYS)]
    #renewed_to_y2 = eligible_for_y2[eligible_for_y2['is_full_member']]

    # customer who canceled in year 2
    canceled_during_y2 = customers_in_y2[customers_in_y2['canceled_at_utc'].notna()]

    # renewal rate from y1 to y2
    renewal_rate_y1_to_y2 = (len(renewed_to_y2) / len(eligible_for_y2) * 100)

    # cancelation rate during y2
    y2_cancelation_rate = (len(canceled_during_y2) / len(customers_in_y2) * 100)





    # customers eligible to year 3
    eligible_for_y3 = trial_to_full_member[trial_to_full_member['paid_duration'] >= 729]

    # customer currently in year 3
    customers_in_y3 = trial_to_full_member[trial_to_full_member['paid_duration'] >= 729]

    # currently active in y3
    active_in_y3 = customers_in_y3[customers_in_y3['status'] == 'active']

    # customer who renewed for a second year
    renewed_to_y3 = eligible_for_y3[eligible_for_y3['paid_duration'] >= (729 + REFUND_PERIOD_DAYS)]

    # customer who canceled in year 3
    canceled_during_y3 = customers_in_y3[customers_in_y3['canceled_at_utc'].notna()]

    # renewal rate from y2 to y3
    renewal_rate_y2_to_y3 = (len(renewed_to_y3) / len(eligible_for_y3) * 100) if len(eligible_for_y3) > 0 else 0

    # cancelation rate during y3
    y3_cancelation_rate = (len(canceled_during_y3) / len(eligible_for_y3) * 100) if len(eligible_for_y3) > 0 else 0



    renewal_dict = {
        'all_customers_df' : all_customers,
        'all_customer' : len(all_customers),

        'all_active_full_member_df' : all_active_full_member,
        'all_active_full_member' : len(all_active_full_member),

        'trial_to_full_member_df' : trial_to_full_member,
        'trial_to_full_member' : len(trial_to_full_member),

        'conversion_rate' : conversion_rate,

        'customers_in_y1_df' : customers_in_y1,
        'customers_in_y1' : len(customers_in_y1),

        'active_in_y1_df' : active_in_y1,
        'active_in_y1' : len(active_in_y1),

        'canceled_during_y1_df' : canceled_during_y1,
        'canceled_during_y1' : len(canceled_during_y1),

        'y1_cancelation_rate' : y1_cancelation_rate,

        'eligible_for_y2_df' : eligible_for_y2,
        'eligible_for_y2' : len(eligible_for_y2),

        'customer_in_y2_df' : customers_in_y2,
        'customer_in_y2' : len(customers_in_y2),

        'active_in_y2_df' : active_in_y2,
        'active_in_y2' : len(active_in_y2),

        'renewed_to_y2_df' : renewed_to_y2,
        'renewed_to_y2' : len(renewed_to_y2),

        'canceled_during_y2_df' : canceled_during_y2,
        'canceled_during_y2' : len(canceled_during_y2),

        'y2_cancelation_rate' : y2_cancelation_rate,
        'renewal_rate_y1_to_y2' : renewal_rate_y1_to_y2,

        'eligible_for_y3_df' : eligible_for_y3,
        'eligible_for_y3' : len(eligible_for_y3),

        'customer_in_y3_df' : customers_in_y3,
        'customer_in_y3' : len(customers_in_y3),

        'active_in_y3_df' : active_in_y3,
        'active_in_y3' : len(active_in_y3),

        'renewed_to_y3_df' : renewed_to_y3,
        'renewed_to_y3' : len(renewed_to_y3),

        'canceled_during_y3_df' : canceled_during_y3,
        'canceled_during_y3' : len(canceled_during_y3),

        'y3_cancelation_rate' : y3_cancelation_rate,
        'renewal_rate_y2_to_y3' : renewal_rate_y2_to_y3}

    return renewal_dict


renewal_dict = cus_renewal(sub_df)

trial customers who became full members: 7055
conversion rate: 38.42%


In [91]:
print("Key, Value\n")
for key, value in renewal_dict.items():
    if not key.endswith('_df'):
        print(f"{key}: {value}")

Key, Value

all_customer: 18361
all_active_full_member: 18361
trial_to_full_member: 7055
conversion_rate: 38.42383312455749
customers_in_y1: 5078
active_in_y1: 3677
canceled_during_y1: 1420
y1_cancelation_rate: 27.96376526191414
eligible_for_y2: 2955
customer_in_y2: 2955
active_in_y2: 1769
renewed_to_y2: 1934
canceled_during_y2: 1181
y2_cancelation_rate: 39.96615905245347
renewal_rate_y1_to_y2: 65.44839255499154
eligible_for_y3: 0
customer_in_y3: 0
active_in_y3: 0
renewed_to_y3: 0
canceled_during_y3: 0
y3_cancelation_rate: 0
renewal_rate_y2_to_y3: 0


In [93]:
def weekly_flow_all_time(cus_df, today_date):
    """
    Create a dual-axis chart with weekly metrics for ALL TIME
    North: Conversions + Renewals (stacked)
    South: Churn full members
    + Cumulative line plot
    """

    renewal_dict = cus_renewal(cus_df)

    all_customers_df = renewal_dict['all_customers_df']
    trial_to_full_member_df = renewal_dict['trial_to_full_member_df']
    customers_in_y1 = renewal_dict['customers_in_y1_df']
    customer_in_y2 = renewal_dict['customer_in_y2_df']
    customer_in_y3  = renewal_dict['customer_in_y3_df']
    renewed_to_y2_df = renewal_dict['renewed_to_y2_df']
    renewed_to_y3_df = renewal_dict['renewed_to_y3_df']


    # Use all data since first date
    first_date = cus_df['created_utc'].min()
    num_weeks = int((today_date - first_date).days / 7) + 1
    print(f"Analysis since first date: {first_date.strftime('%d-%m-%Y')} ({num_weeks} weeks)")

    conversion_customers = trial_to_full_member_df

    weekly_conversions = conversion_customers.groupby(
        pd.Grouper(key='created_utc', freq='W-MON')).size()

    y1_to_y2_customers = renewed_to_y2_df.copy()
    y1_to_y2_customers['renewal_date'] = y1_to_y2_customers['current_period_start_utc']

    weekly_renewals_y1 = y1_to_y2_customers.groupby(
        pd.Grouper(key='renewal_date', freq='W-MON')).size()

    y2_to_y3_customers = renewed_to_y3_df.copy()
    y2_to_y3_customers['renewal_date'] = y2_to_y3_customers['current_period_start_utc']

    weekly_renewals_y2 = y2_to_y3_customers.groupby(
        pd.Grouper(key='renewal_date', freq='W-MON')).size()

    churn_customers = trial_to_full_member_df[trial_to_full_member_df['canceled_at_utc'].notna()]

    weekly_churn = churn_customers.groupby(
        pd.Grouper(key='canceled_at_utc', freq='W-MON')).size()



    all_dates = []
    for series in [weekly_conversions, weekly_renewals_y1, weekly_renewals_y2, weekly_churn]:
        if len(series) > 0:
            all_dates.extend(series.index.tolist())

    if not all_dates:
        print("❌ No data found")
        return {}

    start_date = min(all_dates)
    end_date = max(all_dates)

    full_date_range = pd.date_range(start=start_date, end=end_date, freq='W-MON')

    x_pos = range(len(full_date_range))
    week_labels = [f"{date.strftime('%d-%m-%y')} > {(date + pd.Timedelta(days=6)).strftime('%d-%m-%y')}"
               for date in full_date_range]

    # Reindex all series to same range (fill missing weeks with 0)
    weekly_conversions = weekly_conversions.reindex(full_date_range, fill_value=0)
    weekly_renewals_y1 = weekly_renewals_y1.reindex(full_date_range, fill_value=0)
    weekly_renewals_y2 = weekly_renewals_y2.reindex(full_date_range, fill_value=0)
    weekly_churn = weekly_churn.reindex(full_date_range, fill_value=0)

    # Calculate net weekly change and cumulative
    net_weekly = weekly_conversions + weekly_renewals_y1 + weekly_renewals_y2 - weekly_churn
    net_cumulative = net_weekly.cumsum()

       # === CREATE CHART - ALL TIME ===
    fig, ax = plt.subplots(1, 1, figsize=(22, 8))

    Format dates for X axis
    weeks_labels = [week_info['monday'] + ' > ' + week_info['sunday'] for week_info in week_data]
    x_pos = range(len(week_range))

    # === POSITIVE BARPLOT (NORTH) ===
    ax.bar(x_pos, weekly_conversions, label='Conversions (Trial→Full)', color='green')
    ax.bar(x_pos, weekly_renewals_y1, bottom=weekly_conversions,label='Renewals Y1', color='lightgreen')
    ax.bar(x_pos, weekly_renewals_y2, \
           bottom=weekly_conversions + weekly_renewals_y1, \
           label='Renewals Y2+', color='orange')

    # === NEGATIVE BARPLOT (SOUTH) ===
    ax.bar(x_pos, -weekly_churn, label='Churn Full Members', color='red')

    # === CUMULATIVE LINE PLOT ===
    ax_twin = ax.twinx()
    ax_twin.plot(x_pos, net_cumulative, color='darkblue', linewidth=1, \
                 label='Net Cumulative (Gains - Losses)')

    # === AXIS CONFIGURATION ===
    ax.set_ylabel('Full Members per week\n(Positive: Gains | Negative: Losses)',
                  fontsize=12, fontweight='bold')
    ax.set_xlabel('Weeks (Monday - Sunday)', fontsize=12, fontweight='bold')

    for i, (conv, ren1, ren2, churn) in enumerate(zip(weekly_conversions, weekly_renewals_y1, weekly_renewals_y2, weekly_churn)):
        total_gains = conv + ren1 + ren2
        if total_gains > 0:
            ax.text(i, total_gains + 0.5, str(int(total_gains)),
                   ha='center', va='bottom', fontsize=7, color='darkgreen', fontweight='bold')

        if churn > 0:
            ax.text(i, -churn - 0.5, str(int(churn)),
                   ha='center', va='top', fontsize=7, color='darkred')

    ax_twin.set_ylabel('Net Cumulative Total', fontsize=12, fontweight='bold')
    ax_twin.tick_params(axis='y', labelcolor='darkblue')

    # === VISUAL CONFIGURATION ===
    ax.axhline(y=0, color='black', linestyle='-', linewidth=1, alpha=0.5)
    ax.grid(True, alpha=0.3)

    ax.set_xlim(-0.3, len(x_pos) - 0.5)

    # Adjust Y limits
    y_max = max(weekly_conversions + weekly_renewals_y1 + weekly_renewals_y2) * 1.2
    y_min = -max(weekly_churn) * 1.2
    ax.set_ylim(y_min, y_max)

    # X axis configuration - reduce labels for long periods
    step = max(1, len(x_pos) // 15)
    ax.set_xticks(x_pos[::step])
    ax.set_xticklabels([week_labels[i] for i in x_pos[::step]],
                       rotation=45, ha='right', fontsize=9)


    # === GREY ZONE FOR IMMATURE PERIODS ===
    immature_cutoff = today_date - pd.Timedelta(days=24)  # Extended for customers
    immature_weeks = [i for i, date in enumerate(full_date_range) if date >= immature_cutoff]

    if immature_weeks:
        start_idx = min(immature_weeks) - 0.5
        end_idx = max(immature_weeks) + 0.5
        ax.axvspan(start_idx, end_idx, alpha=0.15, color='gray',
                   label='Immature Period (< 50 days)', zorder=0)
        print(f"🔍 Immature period: {len(immature_weeks)} recent weeks")



    # === TITLES AND LEGENDS ===
    period_text = f'(from {start_date} to {end_date})'
    ax.set_title(f'WEEKLY FULL MEMBERS FLOW - ALL TIME\n{period_text}', fontsize=18, fontweight='bold', pad=30)

    # Combine legends
    lines1, labels1 = ax.get_legend_handles_labels()
    lines2, labels2 = ax_twin.get_legend_handles_labels()
    ax.legend(lines1 + lines2, labels1 + labels2, loc='best', fontsize=10)

    # === SUMMARY METRICS ===
    print("=== CALCULATING METRICS ===")



    plt.tight_layout()

    # === SAVE ===
    filename = f"weekly_flow_all_time_{today_date.strftime('%Y-%m-%d')}.png"
    plt.savefig(os.path.join(analysis_dir, filename), dpi=300, bbox_inches='tight')
    print(f"All time chart saved: {filename}")
    plt.show()


    print(weekly_churn, weekly_renewals_y2)

weekly_flow_all_time(sub_df, today_date)


SyntaxError: invalid syntax (4041314398.py, line 80)