In [1]:
import pandas as pd
import pickle
import numpy as np
from latex_formatter import LatexFormatter as lf
from io import StringIO

In [2]:
def read_logit_results(filename):
    """Load a pickled logistic regression result and return its coefficient table and pseudo-rsquared.

    The table is assembled directly from the fitted result's attributes instead of
    parsing the rendered ``summary()`` HTML. The rendered summary rounds every number
    for display, so exponentiating its coefficients into odds ratios would carry that
    rounding error into the reported values.

    Args:
        filename (str): The name of the pickle file to read the fitted model result from.

    Returns:
        tuple: ``(results_df, prsquared)``. ``results_df`` is a pandas DataFrame indexed
        by regressor name, using the same column labels as the summary's coefficient
        table (``coef``, ``std err``, ``z``, ``P>|z|``, ``[0.025``, ``0.975]``) but
        holding full-precision floats. ``prsquared`` is the result's ``prsquared``
        attribute.
    """
    # SECURITY: pickle.load can execute arbitrary code while unpickling; only open
    # files that were written by this project's own regression pipeline.
    with open(filename, 'rb') as fl:
        logit_model = pickle.load(fl)

    # np.asarray makes the code indifferent to whether the result exposes pandas
    # objects or plain ndarrays. conf_int() defaults to alpha=0.05, which matches
    # the '[0.025' / '0.975]' column labels below.
    confidence_bounds = np.asarray(logit_model.conf_int())

    # The summary labels the test statistic 'z' unless the result was fit with use_t.
    stat = 't' if getattr(logit_model, 'use_t', False) else 'z'

    results_df = pd.DataFrame(
        {
            'coef': np.asarray(logit_model.params),
            'std err': np.asarray(logit_model.bse),
            stat: np.asarray(logit_model.tvalues),
            f'P>|{stat}|': np.asarray(logit_model.pvalues),
            '[0.025': confidence_bounds[:, 0],
            '0.975]': confidence_bounds[:, 1],
        },
        # Same row labels the rendered summary table uses.
        index=list(logit_model.model.exog_names),
    )
    return results_df, logit_model.prsquared

# Load the pickled Google and Meta models in that order; each call yields the
# coefficient table together with that model's pseudo-rsquared.
(google_regression, google_pseudo_r2), (meta_regression, meta_pseudo_r2) = map(
    read_logit_results,
    (
        'regression_analysis/regression_analysis_google_model.pkl',
        'regression_analysis/regression_analysis_meta_model.pkl',
    ),
)


In [3]:
# Add odds-ratio columns to both coefficient tables (in place): exponentiate the
# coefficient and each confidence bound of the logistic regression.
for regression in (google_regression, meta_regression):
    for or_column, source_column in (
        ('OR', 'coef'),
        ('OR_LOWER_CI', '[0.025'),
        ('OR_HIGHER_CI', '0.975]'),
    ):
        regression[or_column] = np.exp(regression[source_column])

In [4]:
# Build the LaTeX source of the logistic-regression table as one f-string.
# Escaping inside the literal:
#   * `{{` / `}}` are f-string escapes that emit literal LaTeX braces;
#   * `\\` emits a single backslash, so `\\\\` emits the LaTeX row break `\\`.
# The first four body rows read from meta_regression and the last three from
# google_regression; each row shows the OR, the 'P>|z|' value, and the
# [OR_LOWER_CI, OR_HIGHER_CI] interval for one regressor. The caption embeds both
# pseudo-rsquared values formatted to four decimals.
# NOTE(review): lf.num(value, 3) presumably formats to 3 decimal places (the printed
# output shows 3 decimals) -- confirm against latex_formatter.
# NOTE(review): \gpixel, \mpixel and \dynamica are not defined here; presumably they
# are macros from the target LaTeX document -- confirm.
table_7 = f"""
    \\begin{{table}}[ht]
        \\centering
        \\small
        \\begin{{tabular}}{{p{{1pt}}lrrr}}
            \\toprule
            &\\textbf{{Feature}} &  \\multicolumn{{1}}{{c}}{{\\textbf{{OR}}}} &  \\multicolumn{{1}}{{c}}{{\\textbf{{$p$-value}}}} &  \\multicolumn{{1}}{{c}}{{\\textbf{{CI}}}} \\\\
            \\midrule
            \\multirow{{4}}{{*}} {{\\rotatebox{{90}}{{\\textbf{{Meta}}}}}} & Has \\gpixel &  {lf.num(meta_regression['OR']['has_gtag'], 3)} &  {lf.num(meta_regression['P>|z|']['has_gtag'], 3)} & [{lf.num(meta_regression['OR_LOWER_CI']['has_gtag'], 3)}, {lf.num(meta_regression['OR_HIGHER_CI']['has_gtag'], 3)}] \\\\
            & Google \\dynamica[u] &  {lf.num(meta_regression['OR']['google_form_data_collection'], 3)} &  {lf.num(meta_regression['P>|z|']['google_form_data_collection'], 3)} & [{lf.num(meta_regression['OR_LOWER_CI']['google_form_data_collection'], 3)}, {lf.num(meta_regression['OR_HIGHER_CI']['google_form_data_collection'], 3)}] \\\\
            & Is Health &  {lf.num(meta_regression['OR']['is_health'], 3)} &  {lf.num(meta_regression['P>|z|']['is_health'], 3)} & [{lf.num(meta_regression['OR_LOWER_CI']['is_health'], 3)}, {lf.num(meta_regression['OR_HIGHER_CI']['is_health'], 3)}] \\\\
            & Is Finance &  {lf.num(meta_regression['OR']['is_finance'], 3)} &  {lf.num(meta_regression['P>|z|']['is_finance'], 3)} & [{lf.num(meta_regression['OR_LOWER_CI']['is_finance'], 3)}, {lf.num(meta_regression['OR_HIGHER_CI']['is_finance'], 3)}] \\\\
            \\midrule
            \\multirow{{3}}{{*}} {{\\rotatebox{{90}}{{\\textbf{{Google}}}}}} & Has \\mpixel &  {lf.num(google_regression['OR']['has_meta_pixel'], 3)} &  {lf.num(google_regression['P>|z|']['has_meta_pixel'], 3)} & [{lf.num(google_regression['OR_LOWER_CI']['has_meta_pixel'], 3)}, {lf.num(google_regression['OR_HIGHER_CI']['has_meta_pixel'], 3)}] \\\\
            & Is Health &  {lf.num(google_regression['OR']['is_health'], 3)} &  {lf.num(google_regression['P>|z|']['is_health'], 3)} & [{lf.num(google_regression['OR_LOWER_CI']['is_health'], 3)}, {lf.num(google_regression['OR_HIGHER_CI']['is_health'], 3)}] \\\\
            & Is Finance &  {lf.num(google_regression['OR']['is_finance'], 3)} &  {lf.num(google_regression['P>|z|']['is_finance'], 3)} & [{lf.num(google_regression['OR_LOWER_CI']['is_finance'], 3)}, {lf.num(google_regression['OR_HIGHER_CI']['is_finance'], 3)}] \\\\
            \\bottomrule
        \\end{{tabular}}
        \\caption{{Odds Ratios (OR), $p$-values, and Confidence Intervals (CI) from our Logistic Regression Analyses: \\\\ (i) With \\mpixel \\dynamica as dependent variable; trained on all websites that have \\mpixel (Pseudo R-squared: {meta_pseudo_r2:.4f}). Results suggest that a website is more likely to have  \\mpixel \\dynamica when it has \\gpixel and \\gpixel \\dynamica, and less likely if it belongs to Health or Finance verticals.\\\\ (ii) With \\gpixel \\dynamica as dependent variable; trained on all websites that have \\gpixel (Pseudo R-squared: {google_pseudo_r2:.4f}). Results suggest that a website is more likely to have \\gpixel \\dynamica when it has a \\mpixel. \\\\ All features are boolean variables -- True if a website has the property. }}
        \\label{{tab:logistic_regression}}
    \\end{{table}}

"""
# print (rather than a bare expression) so the backslashes appear unescaped and the
# output can be pasted into a .tex file as-is.
print(table_7)


    \begin{table}[ht]
        \centering
        \small
        \begin{tabular}{p{1pt}lrrr}
            \toprule
            &\textbf{Feature} &  \multicolumn{1}{c}{\textbf{OR}} &  \multicolumn{1}{c}{\textbf{$p$-value}} &  \multicolumn{1}{c}{\textbf{CI}} \\
            \midrule
            \multirow{4}{*} {\rotatebox{90}{\textbf{Meta}}} & Has \gpixel &  1.903 &  0.000 & [1.443, 2.512] \\
            & Google \dynamica[u] &  1.699 &  0.000 & [1.533, 1.885] \\
            & Is Health &  0.206 &  0.000 & [0.180, 0.237] \\
            & Is Finance &  0.118 &  0.000 & [0.094, 0.147] \\
            \midrule
            \multirow{3}{*} {\rotatebox{90}{\textbf{Google}}} & Has \mpixel &  4.839 &  0.000 & [4.473, 5.233] \\
            & Is Health &  0.952 &  0.457 & [0.835, 1.084] \\
            & Is Finance &  1.086 &  0.376 & [0.905, 1.305] \\
            \bottomrule
        \end{tabular}
        \caption{Odds Ratios (OR), $p$-values, and Confidence Intervals (CI) from our Logistic Regression