In [1]:
import pickle
import pandas as pd
import statsmodels.api as sm


In [2]:

# Load the full crawl table, then keep only the rows flagged as having gtag:
# the analysis below makes sense only on websites that have gtag.
all_websites = pd.read_csv('../data/websites.csv')
has_gtag_mask = all_websites['has_gtag']
websites = all_websites[has_gtag_mask]


## Correlations


In [3]:
# Pairwise Pearson correlation between the Meta tracking flags and the
# site-category flags; the bare last expression renders the matrix.
correlation_columns = [
    'has_meta_pixel',
    'meta_form_data_collection',
    'is_health',
    'is_finance',
]
websites[correlation_columns].corr(method='pearson')

Unnamed: 0,has_meta_pixel,meta_form_data_collection,is_health,is_finance
has_meta_pixel,1.0,0.714266,0.019795,0.023872
meta_form_data_collection,0.714266,1.0,-0.080931,-0.070074
is_health,0.019795,-0.080931,1.0,-0.060358
is_finance,0.023872,-0.070074,-0.060358,1.0


In [4]:
# Response variable for the regression.
y = websites['google_form_data_collection']

# Predictor columns, cast to int in one step
# (presumably boolean flags in the CSV — confirm against the data schema).
predictor_columns = ['has_meta_pixel', 'is_health', 'is_finance']
X = websites[predictor_columns].astype(int)


## Regression Analysis

In [5]:
# statsmodels does not add an intercept on its own, so append a constant column
# to the design matrix first.
X = sm.add_constant(X)

# Specify the logistic regression, then fit it with the BFGS optimizer
# (capped at 1000 iterations).
logit_spec = sm.Logit(y, X)
logit_model = logit_spec.fit(method='bfgs', maxiter=1000)

# Show the fitted model's summary table (coefficients, standard errors, p-values).
model_summary = logit_model.summary()
print(model_summary)



Optimization terminated successfully.
         Current function value: 0.329075
         Iterations: 27
         Function evaluations: 30
         Gradient evaluations: 30
                                Logit Regression Results                               
Dep. Variable:     google_form_data_collection   No. Observations:                29137
Model:                                   Logit   Df Residuals:                    29133
Method:                                    MLE   Df Model:                            3
Date:                         Tue, 11 Mar 2025   Pseudo R-squ.:                 0.08253
Time:                                 17:30:23   Log-Likelihood:                -9588.3
converged:                                True   LL-Null:                       -10451.
Covariance Type:                     nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------

In [6]:
# Persist the fitted logistic regression model as a pickle file (not a CSV).
# The `with` block guarantees the file handle is flushed and closed even if
# pickling raises, instead of leaving an open handle for the garbage collector.
with open('regression_analysis_google_model.pkl', 'wb') as model_file:
    pickle.dump(logit_model, model_file)