In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat
import statsmodels.api as sm
import statsmodels.formula.api as smf#, ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import ttest_ind, f_oneway
import scipy.stats as stats
import sys
from pathlib import Path
# # Add src directory to the Python path
sys.path.append(str(Path.cwd().parent))
from src.data_analysis.df_dataattribute_analysis import categorize_columns
from src.data_sourcing.import_export_data import get_data, save_data
from src.data_analysis.df_ttest_anova_analysis import run_one_sample_ttest, run_univariate_tests
from src.data_analysis.df_multicollinearity_analysis import compute_vif
from src.data_analysis.df_correlation_analysis import compute_correlations

In [2]:
df = get_data('featured_engineered_testing.csv', 'processed_data')
df.head(5)

2025-01-26 17:53:39,115 - INFO - Loading data from: ..\data\processed\featured_engineered_testing.csv


Unnamed: 0,ATL_OR_DR,CAMPAIGN_TYPE,CHANNEL,COST,FREE_TRIALS,HOLIDAY_FLAG,MONTH,ATL_OR_DR_CAMPAIGN_TYPE,ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL,ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL_MONTH,...,LOG_COST,LOG_FREE_TRIALS,LOG_COST_PER_FREE_TRIALS,LOG_COST_BY_FREE_TRIALS,AVG_COST_MONTH,MEDIAN_COST_MONTH,AVG_COST_CAMPAIGN_TYPE_CHANNEL,MEDIAN_COST_CAMPAIGN_TYPE_CHANNEL,AVG_FREE_TRIALS_MONTH,MEDIAN_FREE_TRIALS_MONTH
0,DR - Direct Response,Title,paid social,7784.31,86401.15,False,June,DR - Direct Response_Title,DR - Direct Response_Title_paid social,DR - Direct Response_Title_paid social_June,...,8.959994,11.366768,0.086265,20.326622,16887.563506,3643.265,14781.334567,3309.5,42865.334723,25054.485
1,DR - Direct Response,Title,app,2474.31,2956.74,False,August,DR - Direct Response_Title,DR - Direct Response_Title_app,DR - Direct Response_Title_app_August,...,7.814121,7.992181,0.608045,15.80556,14172.931604,2955.06,15454.45154,3477.08,43953.768382,28127.39
2,ATL - Above The Line,Title,paid social,10222.82,12513.08,False,July,ATL - Above The Line_Title,ATL - Above The Line_Title_paid social,ATL - Above The Line_Title_paid social_July,...,9.232476,9.43461,0.597171,18.666908,13667.902515,2994.47,14781.334567,3309.5,45959.687637,27091.21
3,DR - Direct Response,Title,app,49631.87,17207.04,True,December,DR - Direct Response_Title,DR - Direct Response_Title_app,DR - Direct Response_Title_app_December,...,10.812409,9.753132,1.356967,20.565462,18937.983302,3240.025,15454.45154,3477.08,47495.480279,28640.06
4,ATL - Above The Line,Title,paid social,2081.06,21758.33,False,August,ATL - Above The Line_Title,ATL - Above The Line_Title_paid social,ATL - Above The Line_Title_paid social_August,...,7.641113,9.987798,0.091343,17.628385,14172.931604,2955.06,14781.334567,3309.5,43953.768382,28127.39


### Multi-collinearity Testing Analysis

In [3]:
df.columns

X_df= df.drop(columns=['FREE_TRIALS', 'ATL_OR_DR_CAMPAIGN_TYPE', 'ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL', 'ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL_MONTH',
                         'ATL_OR_DR_CAMPAIGN_TYPE_MONTH', 'ATL_OR_DR_CHANNEL_MONTH', 'CAMPAIGN_TYPE_CHANNEL_MONTH', 'CAMPAIGN_TYPE_MONTH', 'CHANNEL_MONTH',
                           'LOG_FREE_TRIALS', 'ATL_OR_DR_MONTH'], axis=1)
X_df.columns

Index(['ATL_OR_DR', 'CAMPAIGN_TYPE', 'CHANNEL', 'COST', 'HOLIDAY_FLAG',
       'MONTH', 'ATL_OR_DR_CHANNEL', 'CAMPAIGN_TYPE_CHANNEL',
       'COST_PER_FREE_TRIALS', 'COST_BY_FREE_TRIALS', 'LOG_COST',
       'LOG_COST_PER_FREE_TRIALS', 'LOG_COST_BY_FREE_TRIALS', 'AVG_COST_MONTH',
       'MEDIAN_COST_MONTH', 'AVG_COST_CAMPAIGN_TYPE_CHANNEL',
       'MEDIAN_COST_CAMPAIGN_TYPE_CHANNEL', 'AVG_FREE_TRIALS_MONTH',
       'MEDIAN_FREE_TRIALS_MONTH'],
      dtype='object')

In [4]:
hashmap = categorize_columns(df)
nums_df = df[hashmap['numerical']]
nums_df

Unnamed: 0,COST,FREE_TRIALS,COST_PER_FREE_TRIALS,COST_BY_FREE_TRIALS,LOG_COST,LOG_FREE_TRIALS,LOG_COST_PER_FREE_TRIALS,LOG_COST_BY_FREE_TRIALS,AVG_COST_MONTH,MEDIAN_COST_MONTH,AVG_COST_CAMPAIGN_TYPE_CHANNEL,MEDIAN_COST_CAMPAIGN_TYPE_CHANNEL,AVG_FREE_TRIALS_MONTH,MEDIAN_FREE_TRIALS_MONTH
0,7784.31,86401.15,0.090095,6.725733e+08,8.959994,11.366768,0.086265,20.326622,16887.563506,3643.265,14781.334567,3309.50,42865.334723,25054.485
1,2474.31,2956.74,0.836837,7.315891e+06,7.814121,7.992181,0.608045,15.805560,14172.931604,2955.060,15454.451540,3477.08,43953.768382,28127.390
2,10222.82,12513.08,0.816971,1.279190e+08,9.232476,9.434610,0.597171,18.666908,13667.902515,2994.470,14781.334567,3309.50,45959.687637,27091.210
3,49631.87,17207.04,2.884393,8.540176e+08,10.812409,9.753132,1.356967,20.565462,18937.983302,3240.025,15454.451540,3477.08,47495.480279,28640.060
4,2081.06,21758.33,0.095644,4.528039e+07,7.641113,9.987798,0.091343,17.628385,14172.931604,2955.060,14781.334567,3309.50,43953.768382,28127.390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14103,1478.78,16941.67,0.087287,2.505300e+07,7.299649,9.737591,0.083685,17.036504,13900.634352,3124.380,25653.190823,2983.70,41704.905263,26583.695
14104,839.25,5727.03,0.146542,4.806410e+06,6.733699,8.653127,0.136750,15.385461,14257.447211,2830.540,12102.043469,2953.66,46600.296441,28029.530
14105,193796.71,34824.51,5.564952,6.748875e+09,12.174570,10.458105,1.881745,22.632642,18937.983302,3240.025,17476.078974,3298.95,47495.480279,28640.060
14106,84690.65,25140.77,3.368658,2.129188e+09,11.346772,10.132286,1.474456,21.479007,13900.634352,3124.380,15413.423001,3011.83,41704.905263,26583.695


In [5]:
vif_result = compute_vif(
    data=nums_df,
)
vif_result

Unnamed: 0,Feature,VIF
0,LOG_COST_BY_FREE_TRIALS,46.504098
1,LOG_COST,39.579163
2,LOG_FREE_TRIALS,12.467018
3,MEDIAN_FREE_TRIALS_MONTH,5.822304
4,AVG_COST_MONTH,4.511645
5,LOG_COST_PER_FREE_TRIALS,4.100295
6,MEDIAN_COST_MONTH,3.987905
7,COST,3.941788
8,AVG_FREE_TRIALS_MONTH,3.038827
9,FREE_TRIALS,2.507845


### One Sample T-Test Aanalysis

In [None]:

categorical = ["ATL_OR_DR", "HOLIDAY_FLAG"]
quantitative = ["FREE_TRIALS", "LOG_FREE_TRIALS"]


# 2) Using a dictionary for different hypothesized means per column:
results_2 = run_one_sample_ttest(
    df=df,
    categorical_cols=categorical,
    quantitative_cols=quantitative,
    popmean={"FREE_TRIALS": 277769, "LOG_FREE_TRIALS": 10.22},  # different popmeans
    stat_type="median",
    alpha=0.05,  # more stringent significance level
    sort_results=True
)
results_2

### ANOVA Testing Analysis

In [None]:
hashmap = categorize_columns(df)
categorical = hashmap['categorical']
categorical.remove('ATL_OR_DR')
categorical.remove('HOLIDAY_FLAG')
categorical

In [5]:
quantitative = ["FREE_TRIALS", "LOG_FREE_TRIALS"]

anova_results_df = run_univariate_tests(
    df=df,
    categorical_cols=categorical,
    quantitative_cols=quantitative,
    test_type="anova"
)
anova_results_df

kruskal_results_df = run_univariate_tests(
    df=df,
    categorical_cols=categorical,
    quantitative_cols=quantitative,
    test_type="kruskal"
)


### Kruskal Wallis Testing Analysis

In [None]:
kruskal_results_df = run_univariate_tests(
    df=df,
    categorical_cols=categorical,
    quantitative_cols=quantitative,
    test_type="kruskal"
)
kruskal_results_df

### Pearson Correlation Testing Analysis

In [8]:
# 1) Pearson correlation (numeric only - only linear relationships)
pearson_results = compute_correlations(
    df=df,
    include_categorical=False,
    cat_cols=None  # Do not encode categorical variables
)
pearson_results[pearson_results['p-value'] < 0.05]

### Spearman Correlation Testing Analysis

In [None]:
# 2) Spearman correlation (with categorical one-hot encoding)
spearman_results = compute_correlations(
    df=df,
    include_categorical=True,  # Encode categorical variables
    drop_first=True,
    cat_cols=categorical
)
spearman_results

### Random Testing Analysis

In [11]:
# import pandas as pd
# import statsmodels.api as sm
# from statsmodels.formula.api import ols

# # Example: Assuming your data is in a DataFrame called df
# # Replace 'FREE_TRIALS' with your dependent variable
# # Replace other variable names with the actual column names from your dataset

# # Create the OLS formula
# formula = """
# FREE_TRIALS ~ HOLIDAY_FLAG
# + ATL_OR_DR + CAMPAIGN_TYPE + CHANNEL + MONTH
# + ATL_OR_DR_CAMPAIGN_TYPE + ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL
# + ATL_OR_DR_CAMPAIGN_TYPE_CHANNEL_MONTH + ATL_OR_DR_CAMPAIGN_TYPE_MONTH
# + ATL_OR_DR_CHANNEL + ATL_OR_DR_CHANNEL_MONTH + ATL_OR_DR_MONTH
# + CAMPAIGN_TYPE_CHANNEL + CAMPAIGN_TYPE_CHANNEL_MONTH + CAMPAIGN_TYPE_MONTH + CHANNEL_MONTH
# + COST + COST_PER_FREE_TRIALS + COST_BY_FREE_TRIALS
# + LOG_COST + LOG_COST_PER_FREE_TRIALS + LOG_COST_BY_FREE_TRIALS
# + AVG_COST_MONTH + MEDIAN_COST_MONTH
# + AVG_COST_CAMPAIGN_TYPE_CHANNEL + MEDIAN_COST_CAMPAIGN_TYPE_CHANNEL
# + AVG_FREE_TRIALS_MONTH + MEDIAN_FREE_TRIALS_MONTH
# """
# # formula = """
# # FREE_TRIALS ~ HOLIDAY_FLAG
# # + MONTH + CHANNEL + CAMPAIGN_TYPE + ATL_OR_DR
# # + COST + COST_PER_FREE_TRIALS + LOG_COST
# # + MONTH:CHANNEL + CAMPAIGN_TYPE:ATL_OR_DR
# # """
# #formula = "FREE_TRIALS ~ MONTH * ATL_OR_DR + CHANNEL"

# # Fit the OLS model
# # model = ols(formula, data=df).fit()

# import statsmodels.api as sm
# model = sm.GLM(df['FREE_TRIALS'], sm.add_constant(X), family=sm.families.Poisson()).fit()
# print(model.summary())


# # Print the summary of the regression model
# print(model.summary())


In [12]:
# pd.DataFrame(model.summary())

In [13]:
# # # Define a function to plot boxplots and detect outliers based on IQR
# # def plot_outliers(column, ax):
# #     ax.boxplot(data[column], vert=False, patch_artist=True, boxprops=dict(facecolor="lightblue"))
# #     ax.set_title(f'Boxplot of {column}')
# #     ax.set_xlabel(column)

# # # Create boxplots for COST and FREE_TRIALS
# # fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# # plot_outliers('COST', axes[0])
# # plot_outliers('FREE_TRIALS', axes[1])
# # plt.tight_layout()
# # plt.show()

# # Calculate outlier thresholds using IQR
# def calculate_outliers(column):
#     q1 = data[column].quantile(0.25)
#     q3 = data[column].quantile(0.75)
#     iqr = q3 - q1
#     lower_bound = q1 - 1.5 * iqr
#     upper_bound = q3 + 1.5 * iqr
#     outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
#     return lower_bound, upper_bound, outliers

# # Remove outliers based on IQR thresholds for LOG_COST and LOG_FREE_TRIALS
# log_cost_bounds = calculate_outliers('LOG_COST')
# log_free_trials_bounds = calculate_outliers('LOG_FREE_TRIALS')

# # Filter data to exclude outliers in LOG_COST and LOG_FREE_TRIALS
# filtered_log_data = data[
#     (data['LOG_COST'] >= log_cost_bounds[0]) & (data['LOG_COST'] <= log_cost_bounds[1]) &
#     (data['LOG_FREE_TRIALS'] >= log_free_trials_bounds[0]) & (data['LOG_FREE_TRIALS'] <= log_free_trials_bounds[1])
# ]

# # Display the size of the dataset before and after outlier removal based on log variables
# log_filtered_size = filtered_log_data.shape[0]

# log_filtered_size


In [14]:
# from sklearn.preprocessing import StandardScaler

# # Select numerical columns to standardize
# numerical_features = ['LOG_COST', 'LOG_COST_SQ']
# scaler = StandardScaler()

# # Standardize numerical features
# filtered_log_data_encoded[numerical_features] = scaler.fit_transform(
#     filtered_log_data_encoded[numerical_features]
# )

# # Re-run OLS regression with standardized features
# X_standardized = filtered_log_data_encoded[['LOG_COST', 'LOG_COST_SQ'] +
#                                            [col for col in filtered_log_data_encoded.columns
#                                             if col.startswith(('ATL_OR_DR_', 'CAMPAIGN_TYPE_', 'CHANNEL_'))]]
# X_standardized = sm.add_constant(X_standardized)  # Add constant for OLS

# y_standardized = filtered_log_data_encoded['LOG_FREE_TRIALS']

# # Fit OLS regression with standardized features
# ols_standardized_model = sm.OLS(y_standardized, X_standardized).fit()

# # Display the updated summary
# ols_standardized_summary = ols_standardized_model.summary()
# ols_standardized_summary