In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)
import seaborn as sns
import os
from datetime import datetime
import shutil
import glob

In [19]:
# Global Matplotlib defaults for every figure produced by this notebook.
# NOTE(review): tick labels and axes titles are set to 'white' while the figure
# background is also 'white' -- presumably intentional for how the figures are
# displayed; switch these three entries to 'black' if labels must be visible
# on the white background.
plt.rcParams.update({
    # Typography and canvas size
    'font.size': 11,
    'axes.labelsize': 10,
    'axes.titlesize': 16,
    'axes.titleweight': 'bold',
    'figure.figsize': (22, 11),

    # Backgrounds and frame
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.edgecolor': 'black',

    # Tick and title colours
    'xtick.color': 'white',
    'ytick.color': 'white',
    'axes.titlecolor': 'white',

    # Light, semi-transparent grid drawn behind the plotted data
    'axes.grid': True,
    'grid.color': 'lightgray',
    'grid.alpha': 0.5,
    'axes.axisbelow': True,

    # Legend and free text
    'legend.labelcolor': 'black',
    'legend.facecolor': 'white',
    'legend.edgecolor': 'gray',
    'text.color': 'black',
})

# Default seaborn colour cycle
sns.set_palette("viridis")


In [20]:
# Set TODAY DATE
# Reference date printed by preprocess_data. It is pinned to a fixed day so
# re-runs are reproducible; swap in the commented line below to use the
# current UTC time instead.
# today_date = pd.Timestamp.now(tz='UTC')
today_date = pd.Timestamp('2025-05-30', tz='UTC')  # For testing purposes


# Set REFUND PERIOD DURATION
# NOTE(review): not referenced in the cells visible here -- presumably used by later analysis.
REFUND_PERIOD_DAYS = 14  # Duration of the refund period in days

# Set thresholds for cleaning
# Default threshold of remove_high_volume_customers: customers with MORE than
# this many rows are dropped.
HIGH_VOLUME_THRESHOLD = 6
# NOTE(review): not referenced in the cells visible here -- presumably a
# time window (in minutes) for duplicate detection; confirm against later cells.
DUPLICATE_THRESHOLD_MINUTES = 15


# Set DIRECTORIES
# Input folder: the CSV loading cell expects exactly two .csv files here.
data_dir = 'both_csv_go_here'
# Destination of the input CSVs when MOVE_FILES is enabled.
archive_csv_dir = 'archive/csv'
# Date-organised archives (<dir>/YYYY-MM-DD/) for figures and reports.
archive_png_dir = 'archive/analysis/png'
archive_pdf_dir = 'archive/analysis/pdf'
# Working folder whose *.png / *.pdf files get archived and then cleaned.
analysis_dir = 'analysis'

In [21]:
def get_file_creation_date(file_path):
    """
    Return the date associated with a file as a 'YYYY-MM-DD' string.

    On Windows (os.name == 'nt') the date comes from os.path.getctime; on every
    other platform it comes from os.path.getmtime, i.e. the last modification
    time. The timestamp is interpreted in local time.

    If the timestamp cannot be read for any reason, the error is printed and
    today's date is returned instead, so this function never raises.
    """
    date_format = '%Y-%m-%d'

    try:
        # Pick the timestamp reader that matches the platform
        read_timestamp = os.path.getctime if os.name == 'nt' else os.path.getmtime
        return datetime.fromtimestamp(read_timestamp(file_path)).strftime(date_format)

    except Exception as e:
        print(f"❌ Error getting creation date for {file_path}: {e}")
        # Fallback to today's date
        return datetime.now().strftime(date_format)


def transfer_files_to_archive():
    """
    Copy analysis outputs into date-organised archive folders.

    PNG files from analysis_dir go to archive_png_dir/YYYY-MM-DD/ and PDF files
    go to archive_pdf_dir/YYYY-MM-DD/, where the date comes from
    get_file_creation_date. Files are copied (not moved), so the originals stay
    in analysis_dir. A file that cannot be archived is reported and skipped.

    Returns:
        tuple[int, int]: (number of PNG files copied, number of PDF files copied)
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    png_transferred = _archive_files_by_date("*.png", archive_png_dir, "PNG", "📊")
    pdf_transferred = _archive_files_by_date("*.pdf", archive_pdf_dir, "PDF", "📄")

    # === SUMMARY ===
    print(f"\n📦 ARCHIVING SUMMARY ({timestamp}):")
    print(f"   PNG files transferred: {png_transferred}")
    print(f"   PDF files transferred: {pdf_transferred}")
    print(f"   Total files archived: {png_transferred + pdf_transferred}")

    return png_transferred, pdf_transferred


def _archive_files_by_date(pattern, archive_root, label, icon):
    """
    Copy every file in analysis_dir matching `pattern` to archive_root/YYYY-MM-DD/.

    Args:
        pattern: glob pattern relative to analysis_dir (e.g. "*.png").
        archive_root: archive directory that receives the dated sub-folders.
        label: short file-type name used in log messages (e.g. "PNG").
        icon: emoji prefix for the success message.

    Returns:
        int: number of files copied successfully.
    """
    transferred = 0

    for source_path in glob.glob(os.path.join(analysis_dir, pattern)):
        filename = os.path.basename(source_path)
        creation_date = get_file_creation_date(source_path)

        try:
            # Directory creation is inside the try so that one unwritable
            # folder is reported for that file instead of aborting the run.
            date_archive_dir = os.path.join(archive_root, creation_date)
            os.makedirs(date_archive_dir, exist_ok=True)

            # copy2 keeps the original in analysis_dir and preserves metadata
            shutil.copy2(source_path, os.path.join(date_archive_dir, filename))
            print(f"{icon} {label} archived: {creation_date}/{filename}")
            transferred += 1
        except Exception as e:
            print(f"❌ Error archiving {label} {filename}: {e}")

    return transferred


def clean_analysis_dir_after_archive(verify_archived=True):
    """
    OPTIONAL: Remove PNG/PDF files from analysis_dir after archiving.
    USE WITH CAUTION - This deletes the original files!

    Args:
        verify_archived: when True (default), a file is deleted only if a copy
            with the same name and size exists at
            <archive_png_dir|archive_pdf_dir>/YYYY-MM-DD/<filename>, which is
            where transfer_files_to_archive puts it. This prevents data loss
            when a copy failed. Pass False to delete unconditionally.

    Returns:
        int: number of files removed.
    """
    # Archive root for each file extension handled here
    archive_roots = {'.png': archive_png_dir, '.pdf': archive_pdf_dir}

    # Get all PNG and PDF files in analysis_dir
    png_files = glob.glob(os.path.join(analysis_dir, "*.png"))
    pdf_files = glob.glob(os.path.join(analysis_dir, "*.pdf"))
    all_files = png_files + pdf_files

    if not all_files:
        print("🗑️  No files to clean in analysis directory")
        return 0

    print(f"🗑️  Cleaning {len(all_files)} files from {analysis_dir}...")

    cleaned_files = 0

    for file_path in all_files:
        try:
            filename = os.path.basename(file_path)
            creation_date = get_file_creation_date(file_path)

            if verify_archived:
                extension = os.path.splitext(filename)[1].lower()
                archived_copy = os.path.join(archive_roots[extension], creation_date, filename)
                archived_ok = (
                    os.path.isfile(archived_copy)
                    and os.path.getsize(archived_copy) == os.path.getsize(file_path)
                )
                if not archived_ok:
                    # Keep the original: it is the only remaining copy
                    print(f"⚠️  Skipped (no matching archived copy): {filename}")
                    continue

            os.remove(file_path)
            print(f"🗑️  Cleaned: {filename} (was from {creation_date})")
            cleaned_files += 1

        except Exception as e:
            print(f"❌ Error cleaning {file_path}: {e}")

    print(f"🧹 Cleanup complete: {cleaned_files} files removed from {analysis_dir}")
    return cleaned_files


# Archive the outputs left in analysis_dir by a previous run, then empty the
# folder. The cleanup call is the cell's last expression, so its return value
# (number of files removed) is shown as the cell output.
transfer_files_to_archive()
clean_analysis_dir_after_archive()



📦 ARCHIVING SUMMARY (2025-06-27 15:49:20):
   PNG files transferred: 0
   PDF files transferred: 0
   Total files archived: 0
🗑️  No files to clean in analysis directory


0

In [22]:
# LOADING CSV
# Reads the two CSV exports found in data_dir into sub_raw (subscriptions) and
# inv_raw (invoices). Optionally prefixes the files with a timestamp
# (RENAME_FILES) and moves them to archive_csv_dir (MOVE_FILES).

# Toggle these flags to True in production
RENAME_FILES = False
MOVE_FILES = False

# Ensure archive directory exists
os.makedirs(archive_csv_dir, exist_ok=True)


# List CSV files, newest first.
# NOTE: os.path.getctime is the creation time on Windows but the time of the
# last metadata change on Unix, so this ordering is only a heuristic there.
files = [
    os.path.join(data_dir, f)
    for f in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, f)) and f.endswith('.csv')]
sorted_files = sorted(files, key=os.path.getctime, reverse=True)

# Exactly two CSV files are required. Raise instead of calling exit():
# exit() is an interactive helper whose effect depends on the front end, so it
# is not a reliable way to stop a notebook cell.
if len(sorted_files) != 2:
    found_names = [os.path.basename(f) for f in sorted_files]
    raise RuntimeError(
        f"Error: Expected 2 CSV files, found {len(sorted_files)}. Files found: {found_names}"
    )

for i, file_path in enumerate(sorted_files, 1):
    print(f"  File {i}:\n {os.path.basename(file_path)}")

# Optionally prefix each file name with its timestamp (YYYY-MM-DD_HH-MM_)
processed_files = []
for file_path in sorted_files:
    created_at = datetime.fromtimestamp(os.path.getctime(file_path))
    timestamp_str = created_at.strftime('%Y-%m-%d_%H-%M')
    original_name = os.path.basename(file_path)
    new_name = f"{timestamp_str}_{original_name}"

    # A file counts as already prefixed when it starts with ANY timestamp of
    # the expected shape, not only the current one: renaming a file can itself
    # change its ctime on Unix, which would otherwise add a second prefix on
    # the next run.
    try:
        datetime.strptime(original_name[:16], '%Y-%m-%d_%H-%M')
        already_prefixed = original_name[16:17] == '_'
    except ValueError:
        already_prefixed = False
    already_prefixed = already_prefixed or original_name.startswith(timestamp_str)

    if RENAME_FILES and not already_prefixed:
        new_path = os.path.join(data_dir, new_name)
        os.rename(file_path, new_path)
        print(f"Renamed:\n {original_name} →\n {new_name}\n")
        processed_files.append(new_path)
    else:
        processed_files.append(file_path)

# By default the newest file is treated as the subscriptions export and the
# older one as the invoices export. If the file names unambiguously say the
# opposite, trust the names and swap; otherwise keep the time-based order.
first_name = os.path.basename(processed_files[0]).lower()
second_name = os.path.basename(processed_files[1]).lower()
names_say_swapped = (
    'invoice' in first_name and 'subscription' not in first_name
    and 'subscription' in second_name and 'invoice' not in second_name
)
if names_say_swapped:
    processed_files.reverse()

# Load both CSV files into pandas DataFrames
file1_path, file2_path = processed_files
print("\nLoading CSV files:")
print(f"  File 1: {os.path.basename(file1_path)}")
print(f"  File 2: {os.path.basename(file2_path)}")

try:
    sub_raw = pd.read_csv(file1_path, low_memory=False)
    inv_raw = pd.read_csv(file2_path, low_memory=False)
except Exception as e:
    # Stop the cell with the original error attached as the cause
    raise RuntimeError(f"Error loading CSV files: {e}") from e

print("\nSuccessfully loaded:")
print(f"  sub_raw: {sub_raw.shape[0]} rows, {sub_raw.shape[1]} columns")
print(f"  inv_raw: {inv_raw.shape[0]} rows, {inv_raw.shape[1]} columns")

# Move files to archive (never overwrites a file that is already archived)
if MOVE_FILES:
    for file_path in processed_files:
        file_name = os.path.basename(file_path)
        archive_path = os.path.join(archive_csv_dir, file_name)

        if not os.path.exists(archive_path):
            # shutil.move also works when the archive is on another filesystem
            shutil.move(file_path, archive_path)
            print(f"Moved: {file_name} to archive")
        else:
            print(f"Already archived: {file_name}")

print("\nDataFrames available as: sub_raw, inv_raw")
print("\nProcessing complete!")
print('***************************************************')


  File 1:
 DishpatchSubscriptionData_NIklas_Sanitised - subscriptions.csv
  File 2:
 DishpatchInvoiceData_NIklas_Sanitised - invoices.csv

Loading CSV files:
  File 1: DishpatchSubscriptionData_NIklas_Sanitised - subscriptions.csv
  File 2: DishpatchInvoiceData_NIklas_Sanitised - invoices.csv

Successfully loaded:
  sub_raw: 20443 rows, 34 columns
  inv_raw: 33239 rows, 53 columns

DataFrames available as: sub_raw, inv_raw

Processing complete!
***************************************************


In [23]:
# DATA PREPROCESSING (customer_df)
def preprocess_data(input_df, source=None):
    """
    Clean and preprocess the subscription data.

    Steps, applied to a copy of `input_df` (the input is left untouched):
      1. Parse every column whose name contains '(UTC)' as a timezone-aware
         UTC datetime; unparseable values become NaT.
      2. Sort rows by 'Created (UTC)'.
      3. Keep only the columns of interest and rename them to snake_case.
      4. Turn 'senderShopifyCustomerId (metadata)' into the boolean
         `is_gifted_member` (True when the original value is not null).

    Args:
        input_df: raw subscriptions DataFrame as loaded from the CSV export.
        source: optional label (e.g. the CSV path) shown in the summary line.
            Previously the summary printed the notebook-global `file_path`,
            which holds whichever file the loading cell iterated last and so
            could name the wrong file.

    Returns:
        pandas.DataFrame: the cleaned subscriptions frame.

    Raises:
        KeyError: if one of the expected columns is missing from `input_df`.
    """
    df = input_df.copy()

    # Date conversion
    date_cols = [col for col in df.columns if '(UTC)' in col]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)

    df = df.sort_values(by='Created (UTC)')

    # Column selection and renaming (dict order defines the output column order)
    column_names = {
        'id': 'subscription_id',
        'Customer Name': 'customer_name',
        'Customer ID': 'customer_id',
        'Status': 'status',
        'Cancellation Reason': 'cancellation_reason',
        'Created (UTC)': 'created_utc',
        'Start (UTC)': 'start_utc',
        'Current Period Start (UTC)': 'current_period_start_utc',
        'Current Period End (UTC)': 'current_period_end_utc',
        'Trial Start (UTC)': 'trial_start_utc',
        'Trial End (UTC)': 'trial_end_utc',
        'Canceled At (UTC)': 'canceled_at_utc',
        'Ended At (UTC)': 'ended_at_utc',
        'senderShopifyCustomerId (metadata)': 'is_gifted_member'
    }
    df = df[list(column_names)].rename(columns=column_names)

    # Convert is_gifted_member to boolean
    df['is_gifted_member'] = df['is_gifted_member'].notna()

    # Reference date for analysis
    print(f"📅 Reference date (TODAY) : {today_date.strftime('%d-%m-%Y')}")
    origin = f" from {source}" if source is not None else ""
    print(f"{len(df)} entries loaded{origin}")
    print('***************************************************')

    return df

sub_df = preprocess_data(sub_raw, source=file1_path)

📅 Reference date (TODAY) : 30-05-2025
20443 entries loaded from both_csv_go_here/DishpatchInvoiceData_NIklas_Sanitised - invoices.csv
***************************************************


In [24]:
# REMOVE ALL MULTI-SUB customer_id from sub_df, put them in a new df multisub_df
def remove_multi_subscriptions(df):
    """
    Split a frame into single-subscription and multi-subscription customers.

    A customer is "multi-subscription" when its `customer_id` appears on more
    than one row. The input frame is not modified; a short summary is printed.

    Returns:
        tuple: (single_sub_df, multi_sub_df) -- rows of customers that appear
        exactly once, and rows of customers that appear several times. Both
        keep the original index and row order.
    """
    working = df.copy()

    # Customers whose id occurs on more than one row
    rows_per_customer = working['customer_id'].value_counts()
    multi_sub_customers = rows_per_customer.loc[lambda counts: counts > 1].index.tolist()

    # One mask drives both halves of the split
    belongs_to_multi = working['customer_id'].isin(multi_sub_customers)
    single_sub_df = working[~belongs_to_multi]
    multi_sub_df = working[belongs_to_multi]

    print(f"Removed {len(multi_sub_customers)} customers with multiple subscriptions.")
    print(f"Total single_sub_df: {len(single_sub_df)}, with {len(single_sub_df['customer_id'].unique())} unique customers")
    print(f"Total multi_sub_df: {len(multi_sub_df)}, with {len(multi_sub_df['customer_id'].unique())} unique customers")

    return single_sub_df, multi_sub_df


#sub_df, multisub_df = remove_multi_subscriptions(sub_df)

In [25]:
# DATA PREPROCESSING (invoices df)
def preprocess_data_invoice(input_df):
    """
    Clean and preprocess the invoice data.

    Works on a copy of `input_df`: parses every column whose name contains
    '(UTC)' as a UTC datetime (invalid values become NaT), sorts the rows by
    invoice date and then by customer id, keeps the columns of interest and
    renames a subset of them to snake_case.

    Returns:
        pandas.DataFrame: the cleaned invoices frame.

    Raises:
        KeyError: if one of the expected columns is missing from `input_df`.
    """
    # Columns retained in the output, in output order
    kept_columns = [
        'id', 'Customer Name', 'Customer', 'Amount Due', 'Amount Paid', 'Paid', 'Billing', 'Charge', 'Closed',
        'Date (UTC)', 'Description', 'Number', 'Finalized At (UTC)',
        'Paid At (UTC)', 'Minimum Line Item Period Start (UTC)', 'Maximum Line Item Period End (UTC)',
        'Period End (UTC)', 'Subscription', 'Total Discount Amount', 'Applied Coupons', 'Status'
    ]

    # Only these columns get a new name; the others keep their export name
    new_names = {
        'id': 'invoice_id',
        'Status': 'inv_status',
        'Customer': 'customer_id',
        'Customer Name': 'customer_name',
        'Date (UTC)': 'date_utc',
        'Description': 'description',
        'Paid': 'paid',
        'Paid At (UTC)': 'paid_at_utc',
        'Amount Paid': 'amount_paid',
        'Subscription': 'subscription_id',
    }

    invoices = input_df.copy()

    # Date conversion
    utc_columns = [name for name in invoices.columns if '(UTC)' in name]
    for name in utc_columns:
        invoices[name] = pd.to_datetime(invoices[name], errors='coerce', utc=True)

    # Sort by date, then by customer ID; select and rename columns
    return (
        invoices
        .sort_values(['Date (UTC)', 'Customer'])
        .loc[:, kept_columns]
        .rename(columns=new_names)
    )

# Build the cleaned invoices frame from the raw CSV export
inv_df = preprocess_data_invoice(inv_raw)

In [26]:
# Removing customers with more than HIGH_VOLUME_THRESHOLD (currently 6) rows (probably testing accounts)
def remove_high_volume_customers(df, threshold=HIGH_VOLUME_THRESHOLD):
    """
    Drop every row belonging to a customer that has more than `threshold` rows.

    Args:
        df: frame with a `customer_id` column; it is not modified.
        threshold: maximum number of rows a customer may have and still be
            kept (customers with strictly more rows are removed).

    Returns:
        pandas.DataFrame: a new frame without the high-volume customers,
        keeping the original index and row order.
    """
    rows_before = len(df)

    # Customers whose row count exceeds the threshold
    rows_per_customer = df['customer_id'].value_counts()
    flagged_customers = rows_per_customer.loc[lambda counts: counts > threshold].index

    is_flagged = df['customer_id'].isin(flagged_customers)
    kept = df.loc[~is_flagged].copy()

    print(
        f'{rows_before - len(kept)} subscriptions removed from '
        f'{len(flagged_customers)} customers with more than {threshold} subscriptions'
    )
    print('***************************************************')

    return kept


# Applied to the invoices frame, so the "subscriptions removed" figure in the
# printed summary counts invoice rows.
inv_df = remove_high_volume_customers(inv_df)

223 subscriptions removed from 23 customers with more than 6 subscriptions
***************************************************


In [27]:
# Optional filter: keep only invoices whose 'paid' flag is True.
# The filter line is currently commented out, so both prints show the same row count.
print(len(inv_df))
#inv_df = inv_df[inv_df['paid'] == True]
print(len(inv_df))


33016
33016


In [28]:
# NOTE: this value_counts() result is neither stored nor the cell's last
# expression, so it is computed and discarded.
inv_df['customer_id'].value_counts()
# Display every invoice of one example customer
inv_df[inv_df['customer_id'] == 'cus_Q6F5nUeXK8doS8']

Unnamed: 0,invoice_id,customer_name,customer_id,Amount Due,amount_paid,paid,Billing,Charge,Closed,date_utc,description,Number,Finalized At (UTC),paid_at_utc,Minimum Line Item Period Start (UTC),Maximum Line Item Period End (UTC),Period End (UTC),subscription_id,Total Discount Amount,Applied Coupons,inv_status
18684,in_1PG2bVCZ9aYYH5wi149OP50H,Customer917,cus_Q6F5nUeXK8doS8,0.0,0.0,True,charge_automatically,,True,2024-05-13 17:24:00+00:00,,F0E48C3E-14472,2024-05-13 17:24:00+00:00,2024-05-13 17:24:00+00:00,2024-05-13 17:24:00+00:00,2024-05-23 17:24:00+00:00,2024-05-13 17:24:00+00:00,sub_1PG2bVCZ9aYYH5wiDYhOIisE,0,,paid
18372,in_1PJfNQCZ9aYYH5wikrILiVWa,Customer917,cus_Q6F5nUeXK8doS8,69.0,69.0,True,charge_automatically,ch_3PJgK5CZ9aYYH5wi1BHnzZwq,True,2024-05-23 17:24:00+00:00,,F0E48C3E-14784,2024-05-23 18:25:00+00:00,2024-05-23 18:25:00+00:00,2024-05-23 17:24:00+00:00,2025-05-23 17:24:00+00:00,2024-05-23 17:24:00+00:00,sub_1PG2bVCZ9aYYH5wiDYhOIisE,0,,paid
12281,in_1QHoo0CZ9aYYH5wipsGTjQfU,Customer917,cus_Q6F5nUeXK8doS8,69.0,69.0,True,charge_automatically,ch_3QHoo2CZ9aYYH5wi1S3ibDFH,True,2024-11-05 15:36:00+00:00,Customer Early Renew,F0E48C3E-20875,2024-11-05 15:36:00+00:00,2024-11-05 15:36:00+00:00,2024-11-05 15:36:00+00:00,2024-11-05 15:36:00+00:00,2025-05-23 17:24:00+00:00,,0,,paid
12280,in_1QHooTCZ9aYYH5wiIhWCKOAv,Customer917,cus_Q6F5nUeXK8doS8,0.0,0.0,True,charge_automatically,,True,2024-11-05 15:37:00+00:00,,F0E48C3E-20876,2024-11-05 15:37:00+00:00,2024-11-05 15:37:00+00:00,2024-11-05 15:37:00+00:00,2025-11-05 15:37:00+00:00,2024-11-05 15:37:00+00:00,sub_1QHooTCZ9aYYH5wiIkZHzWV1,69,FVO3JKM7,paid
1710,in_1RBI9ECZ9aYYH5wiPV6RcosO,Customer917,cus_Q6F5nUeXK8doS8,69.0,69.0,True,charge_automatically,ch_3RBI9HCZ9aYYH5wi16Ke2ngK,True,2025-04-07 16:03:00+00:00,Customer Early Renew,F0E48C3E-31446,2025-04-07 16:03:00+00:00,2025-04-07 16:04:00+00:00,2025-04-07 16:03:00+00:00,2025-04-07 16:03:00+00:00,2025-11-05 15:37:00+00:00,,0,,paid
1709,in_1RBI9rCZ9aYYH5wiVehEQrMn,Customer917,cus_Q6F5nUeXK8doS8,0.0,0.0,True,charge_automatically,,True,2025-04-07 16:04:00+00:00,,F0E48C3E-31447,2025-04-07 16:04:00+00:00,2025-04-07 16:04:00+00:00,2025-04-07 16:04:00+00:00,2026-04-07 16:04:00+00:00,2025-04-07 16:04:00+00:00,sub_1RBI9rCZ9aYYH5wiNlG3pArt,69,FVO3JKM7,paid


In [29]:
# Display the subscriptions of the same example customer for comparison
sub_df[sub_df['customer_id'] == 'cus_Q6F5nUeXK8doS8']

Unnamed: 0,subscription_id,customer_name,customer_id,status,cancellation_reason,created_utc,start_utc,current_period_start_utc,current_period_end_utc,trial_start_utc,trial_end_utc,canceled_at_utc,ended_at_utc,is_gifted_member
10822,sub_1PG2bVCZ9aYYH5wiDYhOIisE,Customer917,cus_Q6F5nUeXK8doS8,canceled,cancellation_requested,2024-05-13 17:24:00+00:00,2024-05-13 17:24:00+00:00,2024-05-23 17:24:00+00:00,2025-05-23 17:24:00+00:00,2024-05-13 17:24:00+00:00,2024-05-23 17:24:00+00:00,2024-11-05 15:37:00+00:00,2024-11-05 15:37:00+00:00,False
6672,sub_1QHooTCZ9aYYH5wiIkZHzWV1,Customer917,cus_Q6F5nUeXK8doS8,canceled,cancellation_requested,2024-11-05 15:37:00+00:00,2024-11-05 15:37:00+00:00,2024-11-05 15:37:00+00:00,2025-11-05 15:37:00+00:00,NaT,NaT,2025-04-07 16:04:00+00:00,2025-04-07 16:04:00+00:00,False
971,sub_1RBI9rCZ9aYYH5wiNlG3pArt,Customer917,cus_Q6F5nUeXK8doS8,active,,2025-04-07 16:04:00+00:00,2025-04-07 16:04:00+00:00,2025-04-07 16:04:00+00:00,2026-04-07 16:04:00+00:00,NaT,NaT,NaT,NaT,False


In [40]:
# Merge inv_df and sub_df on 'subscription_id'
# Left join: every invoice row is kept. Columns present in both frames
# (customer_id, customer_name) get the '_inv' / '_sub' suffixes.
# NOTE(review): invoices with an empty subscription_id (e.g. the
# "Customer Early Renew" rows) presumably end up with empty *_sub columns --
# confirm that sub_df itself has no missing subscription_id.
merged_df = inv_df.merge(sub_df, on='subscription_id', suffixes=('_inv', '_sub'), how='left')

In [48]:
# Group by the invoice-side customer id and count invoices per customer
# (count() only counts rows whose invoice_id is not null).
invoice_counts = merged_df.groupby('customer_id_inv')['invoice_id'].count()
print(invoice_counts)

customer_id_inv
cus_OhZs6NtQ3RSc9j    1
cus_OhZsIHcfYXqNJx    1
cus_OhZtIUk5q419NR    1
cus_Oha922cf4488T3    1
cus_OhaH3sQTOqvuAH    1
                     ..
cus_SMYvL9t7buzWC9    1
cus_SMaCh3diB5702c    1
cus_SMab6nDMynXbYk    1
cus_SMas6YsqkS3FUn    1
cus_SMb0NMrlhzvojr    1
Name: invoice_id, Length: 21491, dtype: int64


customer_id_inv
cus_OhaWkzl9u5973o    3
cus_OhagJTEOs7LZLk    3
cus_OhaiLVc3wUwNWu    3
cus_OhartoZLR9vWnB    4
cus_Ohc4lazRKnDFZ3    3
                     ..
cus_Rd1v42e9nVT48M    4
cus_RdqL5EFDuF6gmh    4
cus_RdqV2AefC7WNOW    3
cus_Rg4og8ntkWmlXA    4
cus_Rh44ejQ8OJ7bDi    4
Name: invoice_id, Length: 2524, dtype: int64