In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the campaign data and discard the REPORT_DATE, FREE_TRIALS and COST
# columns, which the models below do not use.
df = (
    pd.read_csv('../../data/log_data.csv')
    .drop(columns=['REPORT_DATE', 'FREE_TRIALS', 'COST'])
)
df

Unnamed: 0,ATL_OR_DR,CAMPAIGN_TYPE,CHANNEL,LOG_COST,LOG_FREE_TRIALS
0,DR - Direct Response,Title,paid social,8.959994,11.366768
1,DR - Direct Response,Title,app,7.814121,7.992181
2,ATL - Above The Line,Title,paid social,9.232476,9.434610
3,DR - Direct Response,Title,app,10.812409,9.753132
4,ATL - Above The Line,Title,paid social,7.641113,9.987798
...,...,...,...,...,...
14103,ATL - Above The Line,Brand,bvod,7.299649,9.737591
14104,ATL - Above The Line,Title,ooh,6.733699,8.653127
14105,ATL - Above The Line,Title,bvod,12.174570,10.458105
14106,ATL - Above The Line,Brand,paid social,11.346772,10.132286


In [57]:
# Baseline fit on the full frame as loaded (no outlier filtering yet).
data = df

# The formula interface dummy-codes the string columns itself (they appear as
# `[T.<level>]` terms in the summary), so no manual encoding step is needed.
formula = 'LOG_FREE_TRIALS ~ LOG_COST + ATL_OR_DR + CAMPAIGN_TYPE + CHANNEL'
ols_model_formula = smf.ols(formula, data=data).fit()

# Keep the summary as the cell's last expression so it is rendered.
ols_summary_formula = ols_model_formula.summary()
ols_summary_formula


0,1,2,3
Dep. Variable:,LOG_FREE_TRIALS,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.648
Date:,"Sun, 19 Jan 2025",Prob (F-statistic):,0.879
Time:,18:24:12,Log-Likelihood:,-19991.0
No. Observations:,14108,AIC:,40020.0
Df Residuals:,14087,BIC:,40180.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.1530,0.055,185.680,0.000,10.046,10.260
ATL_OR_DR[T.DR - Direct Response],0.0166,0.026,0.652,0.514,-0.033,0.067
CAMPAIGN_TYPE[T.Launch],-0.0109,0.042,-0.257,0.797,-0.094,0.072
CAMPAIGN_TYPE[T.Title],-0.0071,0.018,-0.383,0.701,-0.043,0.029
CHANNEL[T.bvod],0.0122,0.052,0.233,0.816,-0.091,0.115
CHANNEL[T.cinema],0.0147,0.056,0.264,0.792,-0.094,0.123
CHANNEL[T.ctv],0.0221,0.062,0.358,0.721,-0.099,0.143
CHANNEL[T.digital audio],0.0149,0.061,0.245,0.807,-0.104,0.134
CHANNEL[T.display],0.0651,0.041,1.582,0.114,-0.016,0.146

0,1,2,3
Omnibus:,0.54,Durbin-Watson:,2.007
Prob(Omnibus):,0.763,Jarque-Bera (JB):,0.567
Skew:,-0.009,Prob(JB):,0.753
Kurtosis:,2.975,Cond. No.,153.0


In [50]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(14108, 5)

In [58]:
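# NOTE: the block below is disabled. It plots 'COST' and 'FREE_TRIALS', which are
# dropped when the data is loaded, so it must be pointed at columns that still
# exist (e.g. 'LOG_COST', 'LOG_FREE_TRIALS') before it is re-enabled.
# # Define a function to plot boxplots and detect outliers based on IQR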
# def plot_outliers(column, ax):
#     ax.boxplot(data[column], vert=False, patch_artist=True, boxprops=dict(facecolor="lightblue"))
#     ax.set_title(f'Boxplot of {column}')
#     ax.set_xlabel(column)

# # Create boxplots for COST and FREE_TRIALS
# fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# plot_outliers('COST', axes[0])
# plot_outliers('FREE_TRIALS', axes[1])
# plt.tight_layout()
# plt.show()

# Calculate outlier thresholds using IQR
def calculate_outliers(column, frame=None, whisker=1.5):
    """Compute IQR-based outlier fences for one column.

    Parameters
    ----------
    column : str
        Name of the column to analyse.
    frame : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``data`` variable is
        used, which matches the original behaviour. Passing the frame explicitly
        avoids depending on whichever frame ``data`` happens to be bound to.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added above Q3 and subtracted below
        Q1 to obtain the fences.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound, outliers)`` where ``outliers`` holds the
        rows of the frame whose value lies strictly outside the two bounds.
    """
    if frame is None:
        # Backward-compatible fallback: existing calls pass only the column name.
        frame = data

    values = frame[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    # Strict inequalities: values sitting exactly on a fence are not outliers.
    outliers = frame[(values < lower_bound) | (values > upper_bound)]
    return lower_bound, upper_bound, outliers

# IQR fences for the two log-scaled columns. Each result is the full
# (lower_bound, upper_bound, outliers) tuple returned by calculate_outliers.
log_cost_bounds = calculate_outliers('LOG_COST')
log_free_trials_bounds = calculate_outliers('LOG_FREE_TRIALS')

# Keep only the rows that lie within both pairs of fences (bounds inclusive).
cost_low, cost_high = log_cost_bounds[:2]
trials_low, trials_high = log_free_trials_bounds[:2]
cost_in_range = data['LOG_COST'].between(cost_low, cost_high)
trials_in_range = data['LOG_FREE_TRIALS'].between(trials_low, trials_high)
filtered_log_data = data[cost_in_range & trials_in_range]

# Row count remaining after the filter; shown as the cell output.
log_filtered_size = len(filtered_log_data)

log_filtered_size


13888

In [63]:
# Persist the outlier-filtered frame; index=False leaves the row index out of the file.
filtered_log_data.to_csv(
    '../../data/filtered_data.csv',
    index=False,
)

In [59]:
# Refit the same OLS specification on the outlier-filtered rows.
# NOTE: this rebinds the notebook-level `data`, which calculate_outliers reads
# by default, so re-running the outlier cell after this one uses the filtered frame.
data = filtered_log_data

# Categorical columns are dummy-coded by the formula interface itself.
formula = 'LOG_FREE_TRIALS ~ LOG_COST + ATL_OR_DR + CAMPAIGN_TYPE + CHANNEL'
ols_model_formula = smf.ols(formula, data=data).fit()

# Keep the summary as the cell's last expression so it is rendered.
ols_summary_formula = ols_model_formula.summary()
ols_summary_formula


0,1,2,3
Dep. Variable:,LOG_FREE_TRIALS,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.6986
Date:,"Sun, 19 Jan 2025",Prob (F-statistic):,0.832
Time:,18:24:26,Log-Likelihood:,-19307.0
No. Observations:,13888,AIC:,38660.0
Df Residuals:,13867,BIC:,38810.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.1623,0.055,185.239,0.000,10.055,10.270
ATL_OR_DR[T.DR - Direct Response],0.0129,0.025,0.515,0.607,-0.036,0.062
CAMPAIGN_TYPE[T.Launch],-0.0167,0.041,-0.402,0.688,-0.098,0.065
CAMPAIGN_TYPE[T.Title],-0.0043,0.018,-0.239,0.811,-0.040,0.031
CHANNEL[T.bvod],0.0182,0.051,0.354,0.723,-0.083,0.119
CHANNEL[T.cinema],0.0265,0.054,0.487,0.626,-0.080,0.133
CHANNEL[T.ctv],0.0231,0.061,0.381,0.703,-0.096,0.142
CHANNEL[T.digital audio],0.0242,0.060,0.406,0.685,-0.093,0.141
CHANNEL[T.display],0.0735,0.040,1.818,0.069,-0.006,0.153

0,1,2,3
Omnibus:,71.528,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51.592
Skew:,0.002,Prob(JB):,6.26e-12
Kurtosis:,2.701,Cond. No.,153.0


In [61]:
# Switch back to the full frame as loaded (outliers included).
data = df

# Same right-hand side as the OLS fits; the formula interface dummy-codes the
# categorical columns.
formula = 'LOG_FREE_TRIALS ~ LOG_COST + ATL_OR_DR + CAMPAIGN_TYPE + CHANNEL'

# NOTE(review): the response is LOG_FREE_TRIALS, which holds non-integer values,
# whereas Poisson regression is a model for counts. The FREE_TRIALS column is
# dropped when the data is loaded, so this is not a fit on untransformed data.
# Confirm whether a count response was intended here.
poisson_model = smf.poisson(formula, data=data).fit()

# Keep the summary as the cell's last expression so it is rendered.
poisson_model_summary = poisson_model.summary()
poisson_model_summary

Optimization terminated successfully.
         Current function value: 2.136255
         Iterations 3


0,1,2,3
Dep. Variable:,LOG_FREE_TRIALS,No. Observations:,14108.0
Model:,Poisson,Df Residuals:,14087.0
Method:,MLE,Df Model:,20.0
Date:,"Sun, 19 Jan 2025",Pseudo R-squ.:,2.096e-05
Time:,18:24:51,Log-Likelihood:,-30138.0
converged:,True,LL-Null:,-30139.0
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.3178,0.017,135.344,0.000,2.284,2.351
ATL_OR_DR[T.DR - Direct Response],0.0016,0.008,0.204,0.839,-0.014,0.017
CAMPAIGN_TYPE[T.Launch],-0.0011,0.013,-0.080,0.936,-0.027,0.025
CAMPAIGN_TYPE[T.Title],-0.0007,0.006,-0.120,0.905,-0.012,0.011
CHANNEL[T.bvod],0.0012,0.016,0.073,0.942,-0.031,0.033
CHANNEL[T.cinema],0.0014,0.017,0.082,0.934,-0.033,0.036
CHANNEL[T.ctv],0.0022,0.019,0.112,0.911,-0.036,0.040
CHANNEL[T.digital audio],0.0015,0.019,0.076,0.939,-0.036,0.039
CHANNEL[T.display],0.0064,0.013,0.494,0.621,-0.019,0.032


In [None]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): `filtered_log_data_encoded` and its `LOG_COST_SQ` column are not
# created in any cell visible here — confirm the defining cell runs before this one.

# Numeric predictors to rescale.
numerical_features = ['LOG_COST', 'LOG_COST_SQ']
scaler = StandardScaler()

# Overwrite the numeric columns with their standardized values (in place).
scaled_values = scaler.fit_transform(filtered_log_data_encoded[numerical_features])
filtered_log_data_encoded[numerical_features] = scaled_values

# Design matrix: the two numeric predictors, then every column whose name starts
# with one of the categorical prefixes, plus a constant column for the intercept.
dummy_prefixes = ('ATL_OR_DR_', 'CAMPAIGN_TYPE_', 'CHANNEL_')
dummy_columns = [
    name for name in filtered_log_data_encoded.columns
    if name.startswith(dummy_prefixes)
]
X_standardized = sm.add_constant(
    filtered_log_data_encoded[['LOG_COST', 'LOG_COST_SQ'] + dummy_columns]
)

y_standardized = filtered_log_data_encoded['LOG_FREE_TRIALS']

# Ordinary least squares on the standardized design matrix.
ols_standardized_model = sm.OLS(y_standardized, X_standardized).fit()

# Keep the summary as the cell's last expression so it is rendered.
ols_standardized_summary = ols_standardized_model.summary()
ols_standardized_summary