In [36]:
import pandas as pd
from functools import reduce
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# --- Run configuration -------------------------------------------------
# Mode switches for the branch that follows this block: `bias_transformed`
# is tested first, `bias_untransformed` only when the former is False.
# (The original note here read "no true true"; its meaning is not evident
# from this cell -- TODO confirm with the author.)
bias_transformed = False
bias_untransformed = False
log = True           # gates the log(1 + x) feature transform further down
reg_var = 'traffic'  # column selected from traffic_df in the bias modes

# --- Paths ---------------------------------------------------------------
output_folder = '../results/'
bias_output_folder = '../results/transformed/'
bias_attribute_path = '../results/transformed_features.csv'
attribute_path = '../data/filtered_attrs.csv'

# --- Inputs --------------------------------------------------------------
traffic_df = pd.read_csv('../data/traffic.csv')
rank_df = traffic_df.copy()  # independent copy of the traffic table

# Attribute table, restricted to the rows whose `source` equals 'MBFC'.
attribute_df = pd.read_csv(attribute_path)
attribute_df = attribute_df[attribute_df['source'].eq('MBFC')]

# Optional bias modes. Both restrict the traffic/rank tables to the URLs that
# survive an inner join with the attribute table; they differ in which
# attribute table is used and where results are written.
# With both switches False nothing here runs.
if bias_transformed or bias_untransformed:
    if bias_transformed:
        # Replace the attributes with the contents of the transformed-feature file.
        attribute_df = pd.read_csv(bias_attribute_path)
    else:
        # Keep the current attributes, but only for URLs listed in the
        # transformed-feature file.
        bias_attribute_path = '../results/transformed_features.csv'
        bias_attribute_df = pd.read_csv(bias_attribute_path)
        attribute_df = pd.merge(attribute_df, bias_attribute_df['url'], on='url', how='inner')

    # Join traffic with attributes on `url`, then split the result back out.
    traffic_mod = pd.merge(traffic_df[['url', reg_var, 'rank']], attribute_df, on='url', how='inner')
    traffic_df = traffic_mod[['url', reg_var]]
    rank_df = traffic_mod[['url', 'rank']]

    if bias_transformed:
        # Attributes become the joined frame minus the target, rank and bias columns.
        attribute_df = traffic_mod.drop(columns=[reg_var, 'rank', 'bias'])
        output_folder = bias_output_folder
    else:
        output_folder = bias_output_folder + 'orig_'
# Drop every row whose `url` contains one of these domains as a substring.
urls_to_remove = ['youtube.com', 'facebook.com']
for url in urls_to_remove:
    # regex=False: match the domain literally. As a regular expression the
    # '.' would match any character (e.g. 'youtubeXcom').
    # na=False: a missing url counts as "no match", so the boolean mask never
    # contains NaN and the `~` negation cannot fail on it.
    traffic_df = traffic_df[~traffic_df['url'].str.contains(url, regex=False, na=False)]
    attribute_df = attribute_df[~attribute_df['url'].str.contains(url, regex=False, na=False)]

# Load the per-URL results table, keeping only complete rows.
prdf = pd.read_csv('../data/cc_link_scheme_removal_results.csv').dropna()

# Restrict both tables to the URLs they have in common and order both by URL.
prdf = prdf[prdf.url.isin(attribute_df.url.tolist())]
attribute_df = attribute_df[attribute_df.url.isin(prdf.url.tolist())]
prdf = prdf.sort_values('url')
attribute_df = attribute_df.sort_values('url')

# Working feature table: an independent copy of the attributes with
# incomplete rows removed.
url_df = attribute_df.copy()
url_df.dropna(inplace=True)

# Rows removed from url_df by dropna must leave prdf as well: the regression
# cell pairs the two tables by position (both are passed as `.values`), so
# they have to stay row-for-row aligned.
prdf = prdf[prdf.url.isin(url_df.url.tolist())]

# Previously the result of this comparison was computed and discarded, so a
# misalignment (e.g. duplicated URLs on one side) went unnoticed.
assert np.array_equal(prdf.url.values, url_df.url.values), (
    "prdf and url_df are not aligned row-for-row on 'url'"
)

# Features exempt from the correlation-based pruning that follows.
features_to_keep = ['backlinks']

# Columns removed from the working table before correlations are computed.
# A name that is not present in url_df is skipped silently.
drop_vars = ['source', 'url', 'linked_root_domains']
for var in drop_vars:
    if var not in url_df.columns:
        continue
    url_df.drop(columns=[var], inplace=True)

# Prune highly correlated features.
# Every pair in the strict lower triangle of the correlation matrix with
# |r| > 0.9 is recorded as (later column, earlier column); the EARLIER column
# of each pair becomes a removal candidate.
correlation_matrix = url_df.corr()
corr_columns = correlation_matrix.columns
abs_corr = correlation_matrix.abs()

correlated_pairs = [
    (corr_columns[row], corr_columns[col_idx])
    for row in range(len(corr_columns))
    for col_idx in range(row)
    if abs_corr.iloc[row, col_idx] > 0.9
]
print(correlated_pairs)

# Removal candidates, minus anything explicitly protected.
# NOTE(review): a protected feature is spared, but the later column it is
# correlated with is not removed in its place, so such a pair can stay in the
# data together -- confirm this is intended.
candidate_names = {earlier for _, earlier in correlated_pairs}
correlated_features = [name for name in candidate_names if name not in features_to_keep]
print(correlated_features)

uncorrelated_df = url_df.drop(columns=list(correlated_features))

# Floor every value at 0 so that log(1 + x) below is defined.
# NOTE(review): the floor is applied to every column, 'label' included, even
# though 'label' is excluded from the log step -- confirm that is intended.
uncorrelated_log_df = uncorrelated_df.clip(lower=0)

if log:
    # Replace each column except 'label' by log(1 + column).
    for col in uncorrelated_log_df.columns:
        if col != 'label':
            uncorrelated_log_df[col] = np.log(1 + uncorrelated_log_df[col])


# Keep only the traffic rows whose URL is still present in the attribute table.
traffic_df = traffic_df[traffic_df['url'].isin(attribute_df['url'].tolist())]

[('refpages', 'backlinks'), ('valid_pages', 'pages'), ('text', 'backlinks'), ('text', 'refpages'), ('image', 'backlinks'), ('image', 'refpages'), ('image', 'text'), ('nofollow', 'backlinks'), ('nofollow', 'refpages'), ('nofollow', 'text'), ('nofollow', 'image'), ('dofollow', 'backlinks'), ('dofollow', 'refpages'), ('dofollow', 'text'), ('dofollow', 'image'), ('dofollow', 'nofollow'), ('gov', 'backlinks'), ('gov', 'refpages'), ('gov', 'text'), ('gov', 'image'), ('gov', 'nofollow'), ('gov', 'dofollow'), ('edu', 'backlinks'), ('edu', 'refpages'), ('edu', 'text'), ('edu', 'image'), ('edu', 'nofollow'), ('edu', 'dofollow'), ('edu', 'gov'), ('html_pages', 'pages'), ('html_pages', 'valid_pages'), ('refclass_c', 'refdomains'), ('refips', 'refdomains'), ('refips', 'refclass_c')]
['valid_pages', 'refclass_c', 'refdomains', 'image', 'nofollow', 'dofollow', 'text', 'pages', 'gov', 'refpages']


In [47]:
from sklearn.model_selection import train_test_split

# Per-target lists of feature columns to leave out of the regression.
# The key is the regression target name (looked up with `reg_var`); an empty
# list means no feature is excluded for that target.
insignificant_features = {
    'label': [],
    'rank': [
        'rss',
        'sponsored',
        'alternate',
    ],
    'traffic': [
        'rss',
        'sponsored',
    ],
    'traffic_top3': [
        'rss',
        'linked_root_domains',
        'links_internal',
        'html_pages',
    ],
    'traffic_top10': [],
    'cost': [],
    'positions': [],
}

# if transformed:
#     insignificant_features['traffic'] = ['links_external', 'gov', 'edu', 'redirect', 'dofollow', 'valid_pages','backlinks', 'html_pages', 'image']

# train, test, labels, y_test = train_test_split(uncorrelated_log_df, traffic_df, test_size=0.1, random_state=63)
# Design matrices: `train` and `test` are both full copies of the transformed
# feature table, so no rows are held out here.
train = uncorrelated_log_df.copy()
test = uncorrelated_log_df.copy()

# Target table copies, plus the regression target itself: the natural log of
# the 'cc-orig-pr' column.
labels = prdf.copy()
y_test = prdf.copy()
y_train = np.log(prdf['cc-orig-pr'])

# Leave out the features listed for the chosen target.
# errors='ignore': a listed column may already be gone. For example
# 'linked_root_domains' (listed under 'traffic_top3') is removed from the
# working table before this point, and any listed column may have been pruned
# as highly correlated. Without it, drop() raises KeyError in those cases.
X_train = train.drop(columns=insignificant_features[reg_var], errors='ignore')

features = X_train.columns.to_list()

# regressor = LinearRegression()  
# model = regressor.fit(X_train, y_train)
# print("Coef:", model.coef_)
# print("Constant:", model.intercept_)
# print("R2:", model.score(X_train, y_train))
# Ordinary least squares of y_train on the remaining features; 'label' and
# 'refips' are left out of the regressors. Both sides are passed as plain
# arrays, so rows are paired by position and the summary labels the
# regressors generically rather than by column name.
# NOTE(review): no constant column is added, so the fit has no intercept and
# statsmodels reports the uncentered R-squared. Confirm this is intended; if
# an intercept is added (sm.add_constant), the prediction cell must pass the
# same extra column.
model = sm.OLS
est = model(
    endog=y_train.values,
    exog=X_train.drop(columns=['label', 'refips']).values,
)
est2 = est.fit()

print(est2.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.976
Model:                            OLS   Adj. R-squared (uncentered):              0.976
Method:                 Least Squares   F-statistic:                          1.243e+04
Date:                Tue, 13 Feb 2024   Prob (F-statistic):                        0.00
Time:                        14:23:00   Log-Likelihood:                         -6024.3
No. Observations:                2728   AIC:                                  1.207e+04
Df Residuals:                    2719   BIC:                                  1.212e+04
Df Model:                           9                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [48]:
# Mean squared error of the fitted model on the same rows it was fitted on
# (predictions use the regressor matrix that was passed to the fit).
ypred = est2.predict(X_train.drop(columns=['label', 'refips']).values)
np.square(ypred - y_train).mean()

4.8489665202995305