In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_ind, f_oneway
import scipy.stats as stats

import sys
from pathlib import Path

# Add the parent of the current working directory to the Python path so the `src` package can be imported
sys.path.append(str(Path.cwd().parent))

from src.data_analysis.df_dataattribute_analysis import categorize_columns
from src.data_sourcing.import_export_data import get_data, save_data

from collections import defaultdict
from typing import List, Dict, Tuple

# Load the feature-engineered dataset through the project's get_data helper
# and preview the first four rows.
df = get_data(
    "featured_engineered_testing.csv",
    "processed_data",
)
df.head(4)


Unnamed: 0,ATL_OR_DR,CAMPAIGN_TYPE,CHANNEL,COST,FREE_TRIALS,HOLIDAY_FLAG,SEASON,MONTH,ATL_OR_DR_CAMPAIGN_TYPE,ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL,...,LOG_COST,LOG_FREE_TRIALS,LOG_COST_PER_FREE_TRIALS,LOG_COST_BY_FREE_TRIALS,AVG_COST_MONTH,MEDIAN_COST_MONTH,AVG_COST_CAMPAIGN_TYPE_CHANNEL,MEDIAN_COST_CAMPAIGN_TYPE_CHANNEL,AVG_FREE_TRIALS_MONTH,MEDIAN_FREE_TRIALS_MONTH
0,DR - Direct Response,Title,paid social,7784.31,86401.15,False,Summer,June,DR - Direct Response_Title,DR - Direct Response_Title_paid social,...,8.959994,11.366768,0.086265,20.326622,16887.563506,3643.265,14781.334567,3309.5,42865.334723,25054.485
1,DR - Direct Response,Title,app,2474.31,2956.74,False,Summer,August,DR - Direct Response_Title,DR - Direct Response_Title_app,...,7.814121,7.992181,0.608045,15.80556,14172.931604,2955.06,15454.45154,3477.08,43953.768382,28127.39
2,ATL - Above The Line,Title,paid social,10222.82,12513.08,False,Summer,July,ATL - Above The Line_Title,ATL - Above The Line_Title_paid social,...,9.232476,9.43461,0.597171,18.666908,13667.902515,2994.47,14781.334567,3309.5,45959.687637,27091.21
3,DR - Direct Response,Title,app,49631.87,17207.04,False,Winter,December,DR - Direct Response_Title,DR - Direct Response_Title_app,...,10.812409,9.753132,1.356967,20.565462,18937.983302,3240.025,15454.45154,3477.08,47495.480279,28640.06


In [2]:
# Average/median FREE_TRIALS generated per CHANNEL in each MONTH, broadcast
# back onto every row so each record carries the statistics of its own
# (CHANNEL, MONTH) group. The grouping is built once and reused for both
# statistics instead of being recomputed for every line.
free_trials_by_channel_month = df.groupby(['CHANNEL', 'MONTH'])['FREE_TRIALS']
df['AVG_FREE_TRIALS_CHANNEL_MONTH'] = free_trials_by_channel_month.transform('mean')
df['MEDIAN_FREE_TRIALS_CHANNEL_MONTH'] = free_trials_by_channel_month.transform('median')


In [3]:
# Average/median FREE_TRIALS generated per ATL_OR_DR in each MONTH, broadcast
# back onto every row so each record carries the statistics of its own
# (ATL_OR_DR, MONTH) group. The grouping is built once and reused for both
# statistics instead of being recomputed for every line.
free_trials_by_atl_dr_month = df.groupby(['ATL_OR_DR', 'MONTH'])['FREE_TRIALS']
df['AVG_FREE_TRIALS_ATL_OR_DR_MONTH'] = free_trials_by_atl_dr_month.transform('mean')
df['MEDIAN_FREE_TRIALS_ATL_OR_DR_MONTH'] = free_trials_by_atl_dr_month.transform('median')


In [4]:
# Mean normalization by month: each row's FREE_TRIALS divided by the mean
# FREE_TRIALS of all rows that share its MONTH.
# This helps adjust for seasonal patterns without assuming campaign continuity.

# Per-row monthly mean (transform keeps the result aligned with df's index).
monthly_avg = df.groupby("MONTH")["FREE_TRIALS"].transform("mean")

# Ratio of the observed value to its month's mean.
df["MEAN_NORM_BY_MONTH_FREE_TRIALS"] = df["FREE_TRIALS"].div(monthly_avg)

In [5]:
def calculate_mean_norm_by_month(months: List[str], free_trials: List[float]) -> List[float]:
    """Normalize each free-trials value by the mean of its month.

    For every position ``i`` the result is
    ``free_trials[i] / mean(free_trials[j] for j where months[j] == months[i])``.

    Args:
        months: Month label for each observation.
        free_trials: Free-trials value for each observation; must have the
            same length as ``months``.

    Returns:
        A list of normalized values in the same order as the inputs. When a
        month's mean is zero the ratio is undefined; following floating-point
        (and pandas) division semantics the result is ``nan`` for a zero
        value and ``+inf``/``-inf`` for a positive/negative value, instead of
        raising ``ZeroDivisionError``.

    Raises:
        ValueError: If ``months`` and ``free_trials`` differ in length
            (``zip`` would otherwise silently drop the surplus items).
    """
    if len(months) != len(free_trials):
        raise ValueError(
            "months and free_trials must have the same length "
            f"(got {len(months)} and {len(free_trials)})"
        )

    # Step 1: accumulate per-month sums and counts in a single pass.
    # defaultdict avoids explicit key-existence checks.
    monthly_sums = defaultdict(float)
    monthly_counts = defaultdict(int)
    for month, trials in zip(months, free_trials):
        monthly_sums[month] += trials
        monthly_counts[month] += 1

    # Step 2: per-month averages, kept in a dict for O(1) lookup below.
    # Every key in monthly_sums has a count of at least one.
    monthly_averages = {
        month: total / monthly_counts[month]
        for month, total in monthly_sums.items()
    }

    def _ratio(trials: float, average: float) -> float:
        # Plain Python floats raise on division by zero; return the IEEE-754
        # result instead so one all-zero month cannot abort the whole run.
        if average == 0:
            if trials == 0:
                return float("nan")
            return float("inf") if trials > 0 else float("-inf")
        return trials / average

    # Step 3: normalize every observation against its own month's average.
    return [
        _ratio(trials, monthly_averages[month])
        for month, trials in zip(months, free_trials)
    ]

In [6]:
# Extract the two input columns as plain Python lists, the form
# calculate_mean_norm_by_month is annotated to accept.
months = df["MONTH"].to_list()
free_trials = df["FREE_TRIALS"].to_list()

# Run the pure-Python mean normalization on the extracted columns.
normalized_values = calculate_mean_norm_by_month(
    months=months,
    free_trials=free_trials,
)

In [7]:
# Attach the pure-Python results as a scratch column named "test".
df["test"] = normalized_values

In [8]:
# Display the frame; the rendered output shows MEAN_NORM_BY_MONTH_FREE_TRIALS
# next to the "test" column (presumably to eyeball that the pandas and
# pure-Python normalizations agree — TODO confirm intent).
df

Unnamed: 0,ATL_OR_DR,CAMPAIGN_TYPE,CHANNEL,COST,FREE_TRIALS,HOLIDAY_FLAG,SEASON,MONTH,ATL_OR_DR_CAMPAIGN_TYPE,ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL,...,AVG_COST_CAMPAIGN_TYPE_CHANNEL,MEDIAN_COST_CAMPAIGN_TYPE_CHANNEL,AVG_FREE_TRIALS_MONTH,MEDIAN_FREE_TRIALS_MONTH,AVG_FREE_TRIALS_CHANNEL_MONTH,MEDIAN_FREE_TRIALS_CHANNEL_MONTH,AVG_FREE_TRIALS_ATL_OR_DR_MONTH,MEDIAN_FREE_TRIALS_ATL_OR_DR_MONTH,MEAN_NORM_BY_MONTH_FREE_TRIALS,test
0,DR - Direct Response,Title,paid social,7784.31,86401.15,False,Summer,June,DR - Direct Response_Title,DR - Direct Response_Title_paid social,...,14781.334567,3309.50,42865.334723,25054.485,38790.036043,25454.120,41278.932086,25513.350,2.015642,2.015642
1,DR - Direct Response,Title,app,2474.31,2956.74,False,Summer,August,DR - Direct Response_Title,DR - Direct Response_Title_app,...,15454.451540,3477.08,43953.768382,28127.390,48598.808952,27119.520,48221.595657,29560.060,0.067269,0.067269
2,ATL - Above The Line,Title,paid social,10222.82,12513.08,False,Summer,July,ATL - Above The Line_Title,ATL - Above The Line_Title_paid social,...,14781.334567,3309.50,45959.687637,27091.210,44656.932868,27553.295,45883.862370,27283.700,0.272262,0.272262
3,DR - Direct Response,Title,app,49631.87,17207.04,False,Winter,December,DR - Direct Response_Title,DR - Direct Response_Title_app,...,15454.451540,3477.08,47495.480279,28640.060,45587.740753,30984.380,46403.302137,29570.435,0.362288,0.362288
4,ATL - Above The Line,Title,paid social,2081.06,21758.33,False,Summer,August,ATL - Above The Line_Title,ATL - Above The Line_Title_paid social,...,14781.334567,3309.50,43953.768382,28127.390,49129.031701,28912.470,41227.896600,26736.615,0.495028,0.495028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14103,ATL - Above The Line,Brand,bvod,1478.78,16941.67,False,Winter,February,ATL - Above The Line_Brand,ATL - Above The Line_Brand_bvod,...,25653.190823,2983.70,41704.905263,26583.695,41627.441200,26426.850,41183.020972,26497.430,0.406227,0.406227
14104,ATL - Above The Line,Title,ooh,839.25,5727.03,False,Spring,April,ATL - Above The Line_Title,ATL - Above The Line_Title_ooh,...,12102.043469,2953.66,46600.296441,28029.530,40034.829608,32168.810,45219.071625,28975.695,0.122897,0.122897
14105,ATL - Above The Line,Title,bvod,193796.71,34824.51,False,Winter,December,ATL - Above The Line_Title,ATL - Above The Line_Title_bvod,...,17476.078974,3298.95,47495.480279,28640.060,42874.356757,23123.510,47938.062271,28199.725,0.733217,0.733217
14106,ATL - Above The Line,Brand,paid social,84690.65,25140.77,False,Winter,February,ATL - Above The Line_Brand,ATL - Above The Line_Brand_paid social,...,15413.423001,3011.83,41704.905263,26583.695,44259.174663,27498.420,41183.020972,26497.430,0.602825,0.602825
