In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_ind, f_oneway
import scipy.stats as stats

import sys
from pathlib import Path

# Add the project root (the parent of the current working directory) to the Python path so the `src` package can be imported
sys.path.append(str(Path.cwd().parent))

from src.data_analysis.df_dataattribute_analysis import categorize_columns
from src.data_sourcing.import_export_data import get_data, save_data

from collections import defaultdict
from typing import List, Dict, Tuple

# Load the dataset through the project's get_data helper (imported from
# src.data_sourcing.import_export_data).
# NOTE(review): presumably 'processed_data' names the folder/stage that holds
# the CSV — confirm against get_data's signature.
df = get_data('featured_engineered_testing.csv', 'processed_data')
# Preview the first four rows as the cell output.
df.head(4)


Unnamed: 0,ATL_OR_DR,CAMPAIGN_TYPE,CHANNEL,COST,FREE_TRIALS,HOLIDAY_FLAG,SEASON,MONTH,ATL_OR_DR_CAMPAIGN_TYPE,ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL,...,LOG_COST,LOG_FREE_TRIALS,LOG_COST_PER_FREE_TRIALS,LOG_COST_BY_FREE_TRIALS,AVG_COST_MONTH,MEDIAN_COST_MONTH,AVG_COST_CAMPAIGN_TYPE_CHANNEL,MEDIAN_COST_CAMPAIGN_TYPE_CHANNEL,AVG_FREE_TRIALS_MONTH,MEDIAN_FREE_TRIALS_MONTH
0,DR - Direct Response,Title,paid social,7784.31,86401.15,False,Summer,June,DR - Direct Response_Title,DR - Direct Response_Title_paid social,...,8.959994,11.366768,0.086265,20.326622,16887.563506,3643.265,14781.334567,3309.5,42865.334723,25054.485
1,DR - Direct Response,Title,app,2474.31,2956.74,False,Summer,August,DR - Direct Response_Title,DR - Direct Response_Title_app,...,7.814121,7.992181,0.608045,15.80556,14172.931604,2955.06,15454.45154,3477.08,43953.768382,28127.39
2,ATL - Above The Line,Title,paid social,10222.82,12513.08,False,Summer,July,ATL - Above The Line_Title,ATL - Above The Line_Title_paid social,...,9.232476,9.43461,0.597171,18.666908,13667.902515,2994.47,14781.334567,3309.5,45959.687637,27091.21
3,DR - Direct Response,Title,app,49631.87,17207.04,False,Winter,December,DR - Direct Response_Title,DR - Direct Response_Title_app,...,10.812409,9.753132,1.356967,20.565462,18937.983302,3240.025,15454.45154,3477.08,47495.480279,28640.06


In [2]:
# Average/Median FREE_TRIALS generated per CHANNEL in each MONTH.
# Build the grouping once and reuse it for the row-level features and the summary.
free_trials_by_channel_month = df.groupby(['CHANNEL', 'MONTH'])['FREE_TRIALS']

# transform() broadcasts each group's statistic back onto every row of that group,
# so the result aligns with df's index and can be stored as a column.
df['AVG_FREE_TRIALS_CHANNEL_MONTH'] = free_trials_by_channel_month.transform('mean')
df['MEDIAN_FREE_TRIALS_CHANNEL_MONTH'] = free_trials_by_channel_month.transform('median')

# One row per (CHANNEL, MONTH) pair. Kept as the cell's last expression so the
# notebook renders it; an expression earlier in the cell is evaluated and discarded.
free_trials_by_channel_month.agg(['mean', 'median']).reset_index()


In [3]:
# Average/Median FREE_TRIALS generated per ATL_OR_DR in each MONTH.
# Build the grouping once and reuse it for the row-level features and the summary.
free_trials_by_atl_or_dr_month = df.groupby(['ATL_OR_DR', 'MONTH'])['FREE_TRIALS']

# transform() broadcasts each group's statistic back onto every row of that group,
# so the result aligns with df's index and can be stored as a column.
df['AVG_FREE_TRIALS_ATL_OR_DR_MONTH'] = free_trials_by_atl_or_dr_month.transform('mean')
df['MEDIAN_FREE_TRIALS_ATL_OR_DR_MONTH'] = free_trials_by_atl_or_dr_month.transform('median')

# One row per (ATL_OR_DR, MONTH) pair. Kept as the cell's last expression so the
# notebook renders it; an expression earlier in the cell is evaluated and discarded.
free_trials_by_atl_or_dr_month.agg(['mean', 'median']).reset_index()


In [4]:
# Mean normalisation by month: each row's FREE_TRIALS divided by the mean
# FREE_TRIALS of the MONTH that row belongs to.
# This adjusts for seasonal patterns without assuming campaign continuity.

# Mean of FREE_TRIALS within each MONTH, broadcast back to one value per row.
monthly_avg = df.groupby("MONTH")["FREE_TRIALS"].transform("mean")

# Ratio of every observation to its own month's mean.
df["MEAN_NORM_BY_MONTH_FREE_TRIALS"] = df["FREE_TRIALS"].div(monthly_avg)

In [5]:
def calculate_mean_norm_by_month(months: List[str], free_trials: List[float]) -> List[float]:
    """Divide every FREE_TRIALS value by the mean of its month.

    Pure-Python counterpart of
    ``df["FREE_TRIALS"] / df.groupby("MONTH")["FREE_TRIALS"].transform("mean")``.

    Two passes over the data are conceptually unavoidable:

    1. accumulate per-month sums and counts (i.e. gather what is needed for
       each month's average);
    2. normalise each individual value by its month's average.

    The final average for a month is not known until all of that month's data
    has been seen, so there is no asymptotically faster way to get the exact
    per-month mean.

    Args:
        months: Month label for each observation.
        free_trials: FREE_TRIALS value for each observation, position-aligned
            with ``months``.

    Returns:
        A list with one normalised value per (month, value) pair, in input
        order. If the inputs differ in length, pairing stops at the shorter
        one (``zip`` semantics). Empty input yields an empty list.

    Notes:
        * When a month's average is exactly 0 the result follows float-division
          semantics instead of raising ``ZeroDivisionError``: ``0 / 0`` gives
          NaN and a non-zero value gives +/-inf.
        * A NaN value makes that month's sum (and therefore every normalised
          value of that month) NaN; pandas' ``mean`` skips NaN by default, so
          the two implementations can differ on data with missing values.
    """
    # Pass 1: per-month running sum and count. defaultdict avoids key checks.
    monthly_sums = defaultdict(float)
    monthly_counts = defaultdict(int)
    for month, trials in zip(months, free_trials):
        monthly_sums[month] += trials
        monthly_counts[month] += 1

    # Per-month averages, stored in a dict for O(1) lookup during normalisation.
    # Every key in monthly_sums was counted at least once, so counts are never 0.
    monthly_averages = {
        month: total / monthly_counts[month]
        for month, total in monthly_sums.items()
    }

    def _ratio(value: float, average: float) -> float:
        # Regular path (also covers a NaN average, which propagates as NaN).
        if average != 0:
            return value / average
        # Zero average: mirror IEEE-754 / pandas division rather than raising.
        if value == 0 or value != value:  # 0/0 or NaN/0
            return float("nan")
        return float("inf") if value > 0 else float("-inf")

    # Pass 2: normalise each observation by its own month's average.
    return [
        _ratio(trials, monthly_averages[month])
        for month, trials in zip(months, free_trials)
    ]

In [6]:
# Pull the two input columns out of the frame as plain Python lists.
months, free_trials = df['MONTH'].tolist(), df['FREE_TRIALS'].tolist()

# Run the pure-Python mean normalisation on those lists.
normalized_values = calculate_mean_norm_by_month(months=months, free_trials=free_trials)

In [7]:
# Store the pure-Python normalisation result on the frame as a scratch column.
# NOTE(review): presumably kept only to compare against
# MEAN_NORM_BY_MONTH_FREE_TRIALS — confirm, then drop it or give it a real name.
df['test'] = normalized_values

In [10]:
# Display the column index of df, including the columns assigned in the cells above.
df.columns

Index(['ATL_OR_DR', 'CAMPAIGN_TYPE', 'CHANNEL', 'COST', 'FREE_TRIALS',
       'HOLIDAY_FLAG', 'SEASON', 'MONTH', 'ATL_OR_DR_CAMPAIGN_TYPE',
       'ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL',
       'ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL_MONTH',
       'ATL_OR_DR_CAMPAIGN_TYPE_MONTH', 'ATL_OR_DR_CHANNEL',
       'ATL_OR_DR_CHANNEL_MONTH', 'ATL_OR_DR_MONTH', 'CAMPAIGN_TYPE_CHANNEL',
       'CAMPAIGN_TYPE_CHANNEL_MONTH', 'CAMPAIGN_TYPE_MONTH', 'CHANNEL_MONTH',
       'COST_PER_FREE_TRIALS', 'COST_BY_FREE_TRIALS', 'LOG_COST',
       'LOG_FREE_TRIALS', 'LOG_COST_PER_FREE_TRIALS',
       'LOG_COST_BY_FREE_TRIALS', 'AVG_COST_MONTH', 'MEDIAN_COST_MONTH',
       'AVG_COST_CAMPAIGN_TYPE_CHANNEL', 'MEDIAN_COST_CAMPAIGN_TYPE_CHANNEL',
       'AVG_FREE_TRIALS_MONTH', 'MEDIAN_FREE_TRIALS_MONTH',
       'AVG_FREE_TRIALS_CHANNEL_MONTH', 'MEDIAN_FREE_TRIALS_CHANNEL_MONTH',
       'AVG_FREE_TRIALS_ATL_OR_DR_MONTH', 'MEDIAN_FREE_TRIALS_ATL_OR_DR_MONTH',
       'MEAN_NORM_BY_MONTH_FREE_TRIALS', 'test'],
      dtype='object')

In [11]:
# Mean/median of COST_BY_FREE_TRIALS per MONTH.
# The output columns are named *_BY_MONTH, so the statistics must be grouped by
# MONTH; grouping by CAMPAIGN_TYPE would store per-campaign-type values under
# month-named columns.
# NOTE(review): if a per-(MONTH, CAMPAIGN_TYPE) breakdown is what is wanted,
# group by both keys and rename the columns accordingly.
# NOTE(review): the sample rows suggest COST_BY_FREE_TRIALS is the product
# COST * FREE_TRIALS and COST_PER_FREE_TRIALS is the ratio — confirm which
# column this feature should be based on.
cost_by_free_trials_by_month = df.groupby('MONTH')['COST_BY_FREE_TRIALS']

# transform() broadcasts each month's statistic back onto every row of that month.
df['AVG_COST_BY_FREE_TRIALS_BY_MONTH'] = cost_by_free_trials_by_month.transform('mean')
df['MEDIAN_COST_BY_FREE_TRIALS_BY_MONTH'] = cost_by_free_trials_by_month.transform('median')

# One row per MONTH. Kept as the cell's last expression so the notebook renders
# it; an expression earlier in the cell is evaluated and discarded.
cost_by_free_trials_by_month.agg(['mean', 'median']).reset_index()