## Exploration of mitigation strategies for a small-scale webgraph
* Bias removal 
* Tuning interventions

### Bias Removal

In [1]:
import pandas as pd
from functools import reduce
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# --- Experiment switches ----------------------------------------------------
# The original note here read "no true true": presumably the two bias flags
# must not both be True. If they were, the if/elif that follows would only
# take the bias_transformed branch.
bias_transformed = bias_untransformed = False
log = True            # when True, log(1 + x) is applied to the feature columns further down
reg_var = 'traffic'   # traffic_df column selected alongside 'url' and 'rank' in the bias branches

# --- Output locations -------------------------------------------------------
bias_output_folder = '../results/transformed/'
output_folder = '../results/'

# --- Input locations --------------------------------------------------------
# NOTE(review): 'fitlered' looks like a typo for 'filtered'; left untouched
# because the string has to match the file name on disk -- confirm.
attribute_path = '../data/fitlered_attrs.csv'
bias_attribute_path = '../results/transformed_features.csv'

# --- Load -------------------------------------------------------------------
traffic_df = pd.read_csv('../data/traffic.csv')
rank_df = traffic_df.copy()   # independent copy; narrowed to url/rank only in the bias branches
attribute_df = pd.read_csv(attribute_path)

# Optional bias-removal variants. Both branches inner-join on 'url', so only
# URLs present on both sides of each merge survive.
target_cols = ['url', reg_var, 'rank']

if bias_transformed:
    # Attributes come from the transformed-feature file instead of the raw one.
    attribute_df = pd.read_csv(bias_attribute_path)
    traffic_mod = traffic_df[target_cols].merge(attribute_df, on='url', how='inner')
    traffic_df = traffic_mod[['url', reg_var]]
    rank_df = traffic_mod[['url', 'rank']]
    # What remains after removing the targets and the 'bias' column is the attribute table.
    attribute_df = traffic_mod.drop(columns=[reg_var, 'rank', 'bias'])
    output_folder = bias_output_folder
elif bias_untransformed:
    # Keep the raw attributes, but only for URLs listed in the transformed-feature file.
    bias_attribute_path = '../results/transformed_features.csv'
    bias_attribute_df = pd.read_csv(bias_attribute_path)
    attribute_df = attribute_df.merge(bias_attribute_df['url'], on='url', how='inner')
    traffic_mod = traffic_df[target_cols].merge(attribute_df, on='url', how='inner')
    traffic_df = traffic_mod[['url', reg_var]]
    rank_df = traffic_mod[['url', 'rank']]
    # Written next to the transformed results, distinguished by the 'orig_' file prefix.
    output_folder = bias_output_folder + 'orig_'
# Exclude rows whose url contains one of these domains from every table that
# is later passed to the regression (traffic, rank and attributes alike).
urls_to_remove = ['youtube.com', 'facebook.com']
for url in urls_to_remove:
    # regex=False: match the domain text literally; with the default regex
    #   matching, '.' would match any character (e.g. 'youtubeXcom').
    # na=False: a missing url counts as "no match" instead of producing NaN
    #   in the mask, which cannot be inverted with `~`.
    traffic_df = traffic_df[~traffic_df['url'].str.contains(url, regex=False, na=False)]
    attribute_df = attribute_df[~attribute_df['url'].str.contains(url, regex=False, na=False)]
    # rank_df previously kept these rows while the other two tables dropped them.
    rank_df = rank_df[~rank_df['url'].str.contains(url, regex=False, na=False)]

# Feature table for the correlation analysis: the attribute rows without any
# missing value, minus the identifier-like columns (each dropped only if present).
drop_vars = ['source', 'url', 'linked_root_domains']
url_df = attribute_df.dropna().drop(columns=drop_vars, errors='ignore')

# Columns that must survive the correlated-feature pruning step.
features_to_keep = ['backlinks']  # other candidates once listed: 'ref_pages', 'edu', 'gov', 'ugc'

# Remove highly correlated features.
# For every pair of columns whose absolute correlation exceeds the threshold,
# the column that appears EARLIER in the frame is flagged for removal, unless
# it is listed in features_to_keep.
corr_threshold = 0.9
correlation_matrix = url_df.corr()
corr_columns = list(correlation_matrix.columns)

# Walk the strict lower triangle (j < i) so each unordered pair is seen once.
correlated_pairs = [
    (corr_columns[i], corr_columns[j])
    for i in range(len(corr_columns))
    for j in range(i)
    if abs(correlation_matrix.iloc[i, j]) > corr_threshold
]
print(correlated_pairs)

# Build the drop list in column order rather than set-iteration order: string
# hashing is randomised per interpreter session, so iterating a set of names
# made the printed list differ from run to run.
flagged = {earlier for _, earlier in correlated_pairs}
correlated_features = [
    name for name in corr_columns
    if name in flagged and name not in features_to_keep
]
print(correlated_features)
uncorrelated_df = url_df.drop(columns=correlated_features)

# Log-scale the features: clamp negatives to 0 first, then (when `log` is set)
# replace every column except 'label' by log(1 + value).
uncorrelated_log_df = uncorrelated_df.clip(lower=0)

if log:
    columns_to_scale = [name for name in uncorrelated_log_df.columns if name != 'label']
    for name in columns_to_scale:
        uncorrelated_log_df[name] = np.log(uncorrelated_log_df[name] + 1)

[('refpages', 'backlinks'), ('valid_pages', 'pages'), ('text', 'backlinks'), ('text', 'refpages'), ('image', 'backlinks'), ('image', 'refpages'), ('image', 'text'), ('nofollow', 'backlinks'), ('nofollow', 'refpages'), ('nofollow', 'text'), ('nofollow', 'image'), ('dofollow', 'backlinks'), ('dofollow', 'refpages'), ('dofollow', 'text'), ('dofollow', 'image'), ('dofollow', 'nofollow'), ('gov', 'backlinks'), ('gov', 'refpages'), ('gov', 'text'), ('gov', 'image'), ('gov', 'nofollow'), ('gov', 'dofollow'), ('edu', 'backlinks'), ('edu', 'refpages'), ('edu', 'text'), ('edu', 'image'), ('edu', 'nofollow'), ('edu', 'dofollow'), ('edu', 'gov'), ('html_pages', 'pages'), ('html_pages', 'valid_pages'), ('refclass_c', 'refdomains'), ('refips', 'refdomains'), ('refips', 'refclass_c')]
['dofollow', 'refclass_c', 'refpages', 'pages', 'nofollow', 'image', 'refdomains', 'gov', 'valid_pages', 'text']


In [9]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import warnings
import pandas as pd
import sys
from importlib import reload

# SettingWithCopyWarning is importable from pandas.core.common only on older
# pandas; from pandas 1.5 on it is exposed in pandas.errors. Try the current
# location first and fall back, so the cell does not fail on import.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # This pandas build does not define the warning: nothing to silence.
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Make the project packages one directory up importable.
sys.path.insert(0,'..')
import interventions.backlink_relevance_combined as intervention
import regressions.intervention_eval as regression

# Re-import the project modules so edits made to them are picked up
# without restarting the kernel.
reload(intervention)
reload(regression)

traffic_results = regression.run_regression(traffic_df, attribute_df, reg_var = 'traffic')
traffic_results

Running interventions
Num link schemes:  167
Source domains with scores:  0
Num link schemes:  0
Source domains with scores:  0
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  167
Source domains with scores:  0
Num link schemes:  167
Source domains with scores:  1026
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  167
Source domains with scores:  1026
Num link schemes:  167
Source domains with scores:  1026
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  167
Source domains with scores:  1026
Finished running interventions
                                 OLS Regression Results                                
Dep. Variable:                traffic   R-squared (uncentered):                   0.907
Model:                            OLS   Adj. R-squared (uncentered):              0.907
Method:             

  result = getattr(ufunc, method)(*inputs, **kwargs)


ValueError: cannot set a row with mismatched columns

In [None]:
# Same call as for traffic, but with rank_df and 'rank' as the regression variable.
rank_results = regression.run_regression(rank_df, attribute_df, reg_var='rank')
rank_results

Running interventions
Num link schemes:  82
Source domains with scores:  0
Num link schemes:  0
Source domains with scores:  0
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  82
Source domains with scores:  0
Num link schemes:  82
Source domains with scores:  1026
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  82
Source domains with scores:  1026
Num link schemes:  82
Source domains with scores:  1026
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  0
Source domains with scores:  1026
Num link schemes:  82
Source domains with scores:  1026
Finished running interventions
                                 OLS Regression Results                                
Dep. Variable:                   rank   R-squared (uncentered):                   0.973
Model:                            OLS   Adj. R-squared (uncentered):              0.973
Method:                 Le

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,name,3,4,5,metric
0,\textbf{L}ink Scheme,0.874208,0.869552,0.933727,1.933224
1,\textbf{M}ultiplicity,0.777296,0.826582,0.868002,1.500488
2,\textbf{R}elevancy (max),0.971489,0.965055,0.976583,1.354928
3,L+M Combined,0.657431,0.70793,0.801244,1.596531
4,L+R Combined,0.862158,0.848302,0.907247,1.560811
5,R+M Combined,0.757811,0.795022,0.852533,1.516162
6,L+M+R Combined,0.646377,0.690424,0.782316,1.523308
7,\textbf{L}ink Scheme*,0.847135,0.771801,0.872587,1.495382
8,\textbf{M}ultiplicity*,0.784471,0.812301,0.876902,1.637828
9,\textbf{R}elevancy (max)*,0.951093,0.917925,0.946141,1.215966


In [122]:
# Columns reported for each target; 3, 4 and 5 are integer column labels.
summary_cols = ['name', 3, 4, 5, 'metric']
traffic_results = traffic_results[summary_cols]
rank_results = rank_results[summary_cols]

# Traffic columns on the left, rank columns on the right ('name' kept once).
# Infinite and missing entries are both replaced by 1.
results = (
    pd.concat([traffic_results, rank_results.drop(columns=['name'])], axis=1)
    .replace([np.inf, -np.inf], 1)
    .fillna(1)
)

# Persist a 2-decimal version; the displayed frame keeps full precision.
results.round(2).to_csv(output_folder + '_regression_results_multiplicity.csv', index=False)
results

Unnamed: 0,name,3,4,5,metric,3.1,4.1,5.1,metric.1
0,\textbf{L}ink Scheme,0.778868,0.755576,0.871379,1.809802,0.874208,0.869552,0.933727,1.933224
1,\textbf{M}ultiplicity,0.70729,0.756002,0.818743,1.480516,0.777296,0.826582,0.868002,1.500488
2,\textbf{R}elevancy (max),0.932214,0.91408,0.948151,1.482259,0.971489,0.965055,0.976583,1.354928
3,L+M Combined,0.526826,0.563178,0.698169,1.507461,0.657431,0.70793,0.801244,1.596531
4,L+R Combined,0.754221,0.709862,0.815434,1.45183,0.862158,0.848302,0.907247,1.560811
5,R+M Combined,0.66915,0.691748,0.792452,1.539647,0.757811,0.795022,0.852533,1.516162
6,L+M+R Combined,0.509665,0.53479,0.66337,1.419278,0.646377,0.690424,0.782316,1.523308
7,\textbf{L}ink Scheme*,0.798156,0.681314,0.809329,1.364999,0.847135,0.771801,0.872587,1.495382
8,\textbf{M}ultiplicity*,0.75536,0.789715,0.871108,1.764757,0.784471,0.812301,0.876902,1.637828
9,\textbf{R}elevancy (max)*,0.884986,0.811564,0.887335,1.346693,0.951093,0.917925,0.946141,1.215966
