## Regression

In [1]:
import os
os.chdir(os.path.dirname(os.getcwd()))


import pandas as pd
from functools import reduce
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

log = True
traffic_df = pd.read_csv ('data/traffic.csv')
attribute_df = pd.read_csv ('data/filtered_attrs.csv')

urls_to_remove = ['youtube.com', 'facebook.com']
for url in urls_to_remove:
    traffic_df = traffic_df[~traffic_df['url'].str.contains(url)]
    attribute_df = attribute_df[~attribute_df['url'].str.contains(url)]

# merge traffic data with backlinks data
url_df = attribute_df.copy() #pd.merge(traffic_df, attribute_df, on='url', how='inner')
url_df.dropna(inplace=True)

features_to_keep = ['backlinks']#, 'ref_pages''edu', 'gov', 'ugc']
drop_vars = ['source', 'url', 'linked_root_domains']
url_df.drop(columns=drop_vars, inplace=True)

# remove correlated features
correlation_matrix = url_df.corr()
correlated_features = set()
correlated_pairs = []
for i in range(len(correlation_matrix .columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.9:
            colname = correlation_matrix.columns[j]
            correlated_features.add(colname)
            correlated_pairs.append((correlation_matrix.columns[i], colname))
print(correlated_pairs)

correlated_features = [x for x in correlated_features if not x in features_to_keep]
print(correlated_features)
uncorrelated_df = url_df.drop(columns=list(correlated_features))

# log of features
uncorrelated_log_df = uncorrelated_df.clip(lower=0)

if log:
    for col in (uncorrelated_log_df.columns):
        if col == 'label':
            continue
        uncorrelated_log_df[col] = np.log(1+uncorrelated_log_df[col])

[('refpages', 'backlinks'), ('valid_pages', 'pages'), ('text', 'backlinks'), ('text', 'refpages'), ('image', 'backlinks'), ('image', 'refpages'), ('image', 'text'), ('nofollow', 'backlinks'), ('nofollow', 'refpages'), ('nofollow', 'text'), ('nofollow', 'image'), ('dofollow', 'backlinks'), ('dofollow', 'refpages'), ('dofollow', 'text'), ('dofollow', 'image'), ('dofollow', 'nofollow'), ('gov', 'backlinks'), ('gov', 'refpages'), ('gov', 'text'), ('gov', 'image'), ('gov', 'nofollow'), ('gov', 'dofollow'), ('edu', 'backlinks'), ('edu', 'refpages'), ('edu', 'text'), ('edu', 'image'), ('edu', 'nofollow'), ('edu', 'dofollow'), ('edu', 'gov'), ('html_pages', 'pages'), ('html_pages', 'valid_pages'), ('refclass_c', 'refdomains'), ('refips', 'refdomains'), ('refips', 'refclass_c')]
['refclass_c', 'dofollow', 'nofollow', 'valid_pages', 'text', 'gov', 'image', 'refpages', 'refdomains', 'pages']


In [2]:
from sklearn.model_selection import train_test_split

insignificant_features = { 
    'label': [],#'pages', 'backlinks', 'valid_pages', 'canonical', 'gov','edu', 'rss', 'alternate', 'html_pages', 'links_internal', 'links_external'],
    'rank': ['rss','alternate'],#'canonical', 'rss', 'alternate', 'html_pages', 'links_external', 'linked_root_domains'],
    'traffic': ['rss', 'sponsored', 'redirect', 'ugc','canonical','edu','alternate', 'label', 'refips', 'links_internal'],#'linked_root_domains', 'links_internal', 'refpages'],
    'traffic_top3': ['rss', 'linked_root_domains', 'links_internal', 'html_pages'],
    'traffic_top10': [],
    'cost':[],
    'positions':[],
}
reg_var = 'rank'

# train, test, labels, y_test = train_test_split(uncorrelated_log_df, traffic_df, test_size=0.1, random_state=63)
train = uncorrelated_log_df.copy()
test = uncorrelated_log_df.copy()
labels = traffic_df.copy()
y_test = traffic_df.copy()
y_train = np.log(1+labels[reg_var]) if log else labels[reg_var]
# y_train = labels[reg_var]
X_train = train.drop(columns=insignificant_features[reg_var]) 

features = X_train.columns.to_list()

# regressor = LinearRegression()  
# model = regressor.fit(X_train, y_train)
# print("Coef:", model.coef_)
# print("Constant:", model.intercept_)
# print("R2:", model.score(X_train, y_train))
model = sm.OLS
est = sm.OLS(y_train, X_train.drop(columns=['label', 'sponsored', 'refips']))
est2 = est.fit()

print(est2.summary())

                                 OLS Regression Results                                
Dep. Variable:                   rank   R-squared (uncentered):                   0.968
Model:                            OLS   Adj. R-squared (uncentered):              0.968
Method:                 Least Squares   F-statistic:                          1.556e+04
Date:                Fri, 30 Jun 2023   Prob (F-statistic):                        0.00
Time:                        16:51:10   Log-Likelihood:                         -3998.6
No. Observations:                4158   AIC:                                      8013.
Df Residuals:                    4150   BIC:                                      8064.
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

In [3]:
res = est2._results.summary()
f = open(reg_var + "_regression_base.txt", "w")
f.write(res.as_latex())
f.close()

## Split by reliability label

In [4]:
# fit model on reliable data
X_train_reliable = train[(train['label'] < 4)]
y_train_reliable = np.log(1+labels[reg_var][(train['label'] < 4)])
X_train_reliable.drop(columns=['label', 'refips', 'sponsored'] + insignificant_features[reg_var], inplace=True)
# X_train_reliable = X_train_reliable[features_to_keep]
est_reliable = sm.OLS(y_train_reliable, X_train_reliable)
est2_reliable = est_reliable.fit()
print(est2_reliable.summary())

res = est2_reliable._results.summary()
f = open("traffic_reg/"+reg_var+"_lable_regression_base_unrel.csv", "w")
f.write(res.as_csv())
f.close()

                                 OLS Regression Results                                
Dep. Variable:                   rank   R-squared (uncentered):                   0.937
Model:                            OLS   Adj. R-squared (uncentered):              0.936
Method:                 Least Squares   F-statistic:                              2300.
Date:                Fri, 30 Jun 2023   Prob (F-statistic):                        0.00
Time:                        16:51:10   Log-Likelihood:                         -1477.0
No. Observations:                1248   AIC:                                      2970.
Df Residuals:                    1240   BIC:                                      3011.
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_reliable.drop(columns=['label', 'refips', 'sponsored'] + insignificant_features[reg_var], inplace=True)


## Intervention

In [11]:
keep_only_intervention_features = True

if keep_only_intervention_features:
    test_features_to_keep = ['backlinks', 'links_internal', 'links_external']#, 'refpages']
else:
    test_features_to_keep = X_train.columns.to_list()
    test_features_to_keep.remove('label')

X_train_test = X_train[test_features_to_keep]
est_clean = sm.OLS(y_train, X_train_test)
est_clean_2 = est_clean.fit()
print(est_clean_2.summary())

                                 OLS Regression Results                                
Dep. Variable:                   rank   R-squared (uncentered):                   0.959
Model:                            OLS   Adj. R-squared (uncentered):              0.959
Method:                 Least Squares   F-statistic:                          3.281e+04
Date:                Fri, 30 Jun 2023   Prob (F-statistic):                        0.00
Time:                        16:59:10   Log-Likelihood:                         -4471.0
No. Observations:                4158   AIC:                                      8948.
Df Residuals:                    4155   BIC:                                      8967.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

In [12]:
def compute_diff(res_df, pre_col, post_col):
# clip negative values
    res_df[post_col] = res_df[post_col].clip(lower=0)
    # compute difference between t_clean and t_inter
    try:
        if log:
            res_df['diff'] = np.exp(res_df[post_col]) / np.exp(res_df[pre_col])
        else:
            res_df['diff'] = (res_df[post_col]) / (res_df[pre_col])
    except:
        res_df['diff'] = 1
    # group test_res by label
    return res_df.groupby('label').mean()['diff']


def pre_post_intervention_diff(path):
    weighted_df = pd.read_csv(path)[['url', 'pre_backlinks', 'pre_refpages', 'post_backlinks', 'post_refpages']]
    weighted_df['pre_backlinks'] = np.log(weighted_df['pre_backlinks']+1)
    weighted_df['post_backlinks'] = np.log(weighted_df['post_backlinks']+1)

    uncorrelated_log_df['url'] = attribute_df['url']
    weighted_df = pd.merge(weighted_df, uncorrelated_log_df, on='url', how='inner')

    link_relevancy_weighting_regression_df = weighted_df.copy()
    link_relevancy_weighting_regression_df['label'].replace({1:3, 6:5}, inplace=True)
    link_relevancy_weighting_regression_df = link_relevancy_weighting_regression_df[['url', 'label']]
    clean_attrs = list(test_clean.columns[1:])
    link_relevancy_weighting_regression_df['t_clean'] = est_clean_2.predict(weighted_df[['pre_backlinks'] + clean_attrs])
    link_relevancy_weighting_regression_df['t_inter'] = est_clean_2.predict(weighted_df[['post_backlinks'] + clean_attrs])
    return compute_diff(link_relevancy_weighting_regression_df, 't_clean', 't_inter')

test_clean = test.drop(columns=['label'])
test_clean = test_clean[test_features_to_keep]

In [13]:
res_df = pd.DataFrame(columns=['name',3,4,5])

experiments = {
    'negated_sample': 'results/final/link_scheme_negated_only.csv',
    'weighted_mean': 'results/final/backlink_relevancy_weighted_attributes_sampled_mean.csv',
    'weighted_max_sample': 'results/final/backlink_relevancy_weighted_attributes_sampled_max.csv',
    'weighted_max': 'results/final/backlink_relevancy_weighted_attributes_not_sampled_max.csv',
    'combined_sampled': 'results/final/backlink_relevancy_weighted_negated_combined.csv',
    'combined': 'results/final/backlink_relevancy_weighted_negated_combined_not_sampled.csv',
    # 'negated': 'results/final/link_scheme_negated_attributes.csv',
    'negated_0.2': 'results/final/link_scheme_negated_attributes_sampled.csv'
}

for name, path in experiments.items():
    res = pre_post_intervention_diff(path)
    res_df.loc[len(res_df)] = [name]+ res.values.tolist()

res_df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,name,3,4,5
0,negated_sample,0.588435,0.703959,0.834559
1,weighted_mean,0.868463,0.855223,0.861729
2,weighted_max_sample,0.927078,0.916969,0.942356
3,weighted_max,0.966209,0.964591,0.975491
4,combined_sampled,0.502415,0.586628,0.732273
5,combined,0.67345,0.771822,0.876109
6,negated_0.2,0.915206,0.940386,0.97127


In [14]:
for control in [0, 0.5, 1]:
    test_intervention = test_clean.copy()
    test_res = test_intervention.copy()
    test_intervention['backlinks'] = np.log(np.exp(test_intervention['backlinks']) * control)
    # test_intervention['refpages'] = test_intervention['refpages'] * 0

    test_res['t_clean'] = est_clean_2.predict(test_clean)
    test_res['t_inter'] = est_clean_2.predict(test_intervention)
    test_res['label'] = test['label']

    test_res['label'].replace({1:3, 6:5}, inplace=True)

    res = compute_diff(test_res, 't_clean', 't_inter')
    res_df.loc[len(res_df)] = ['control_' + str(control)]+ res.values.tolist()
res_df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,name,3,4,5
0,negated_sample,0.588435,0.703959,0.834559
1,weighted_mean,0.868463,0.855223,0.861729
2,weighted_max_sample,0.927078,0.916969,0.942356
3,weighted_max,0.966209,0.964591,0.975491
4,combined_sampled,0.502415,0.586628,0.732273
5,combined,0.67345,0.771822,0.876109
6,negated_0.2,0.915206,0.940386,0.97127
7,control_0,0.144644,0.131093,0.028625
8,control_0.5,0.829917,0.828454,0.828537
9,control_1,1.000088,1.0,1.0


In [15]:
col_order = ["combined", "negated_0.2", "weighted_max", "combined_sampled", "negated_sample", "weighted_max_sample", "weighted_mean", "control_0", "control_0.5", "control_1"]
res_df.set_index('name', inplace=True)
res_df = res_df.loc[col_order]
res_df['name'] = ['Combined', 'Link Scheme Removal', 'Relevancy Weighted (max)', 'Combined*', 'Link Scheme Removal*', 'Relevancy Weighted (max)*', 'Relevancy Weighted (mean)*', 'Control 100\%', 'Control 50\%', 'Control 0\%']
res_df.reset_index(drop=True, inplace=True)
res_df['metric'] = (1 - res_df[4]) + (1 - res_df[3]) - (1 - res_df[5]) * 2
res_df

Unnamed: 0,3,4,5,name,metric
0,0.67345,0.771822,0.876109,Combined,0.306946
1,0.915206,0.940386,0.97127,Link Scheme Removal,0.086948
2,0.966209,0.964591,0.975491,Relevancy Weighted (max),0.020182
3,0.502415,0.586628,0.732273,Combined*,0.375502
4,0.588435,0.703959,0.834559,Link Scheme Removal*,0.376725
5,0.927078,0.916969,0.942356,Relevancy Weighted (max)*,0.040666
6,0.868463,0.855223,0.861729,Relevancy Weighted (mean)*,-0.000227
7,0.144644,0.131093,0.028625,Control 100\%,-0.218488
8,0.829917,0.828454,0.828537,Control 50\%,-0.001298
9,1.000088,1.0,1.0,Control 0\%,-8.8e-05


In [16]:
res_df.to_csv('results/final/regression_'+reg_var+'.csv', index=False)