# Predicting Factuality of Reporting and Bias of News Media Sources
#### NAMES OF THE GROUP MEMBERS: Ayushi Choudhary, Tai-Hua Chung, Olivia Natasha, Pragyan Sharma, Jinjin Yu

In [302]:
%pylab inline
import pandas as pd
import os
from ipypublish import nb_setup
import numpy as np
import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


## Loading Data

In [343]:
#loading corpus
corpus = pd.read_csv('data/corpus.csv')

In [344]:
corpus.fact.value_counts()

HIGH     542
MIXED    268
LOW      256
Name: fact, dtype: int64

In [345]:
corpus.bias.value_counts()

center           263
left-center      209
left             168
right            157
extreme-right    156
right-center      92
extreme-left      21
Name: bias, dtype: int64

In [346]:
corpus.head()

Unnamed: 0,source_url,source_url_processed,URL,fact,bias
0,http://www.villagevoice.com/,villagevoice.com,http://mediabiasfactcheck.com/the-village-voice/,HIGH,left
1,https://insideclimatenews.org/,insideclimatenews.org,https://mediabiasfactcheck.com/insideclimate-n...,HIGH,left-center
2,http://www.fury.news/,fury.news,http://mediabiasfactcheck.com/fury-news/,LOW,extreme-right
3,http://now8news.com/,now8news.com,http://mediabiasfactcheck.com/now8news/,LOW,center
4,http://constitution.com/,constitution.com,http://mediabiasfactcheck.com/the-constitution/,LOW,extreme-right


**Run this for an individual file**

In [366]:
# Single-file variant: read one feature table and strip its own label columns
# in the same expression, leaving the identifier and feature columns.
features = pd.read_csv('data/features/body.csv').drop(columns=['fact', 'bias'])

**Run this for multiple files**

In [347]:
import glob #library to read multiple files

# Load every feature CSV in the folder and combine them column-wise into one table.
path = 'data/features/comb_top5'
# Sorted so the column order of the combined table does not depend on the
# order in which the operating system happens to list the files.
all_files = sorted(glob.glob(os.path.join(path, '*.csv')))
if not all_files:
    raise FileNotFoundError('No feature CSV files found in ' + path)

li = []
source = None
for filepath in all_files:
    df = pd.read_csv(filepath, index_col=None, header=0)

    # The frames are concatenated by row position below, so every file must list
    # its first column (assumed to be source_url_processed - TODO confirm) in the same order.
    if source is not None and not source.equals(df.iloc[:, 0]):
        raise ValueError('Row order of ' + filepath + ' differs from the previous feature files')
    source = df.iloc[:, 0]

    # File name without directory or extension, used to keep column names unique
    # across files. os.path handles both "/" and "\\" separators.
    fname = os.path.splitext(os.path.basename(filepath))[0]

    # Keep only the feature columns: skip the first column and the last two
    # (presumably the fact/bias labels, as in body.csv - TODO confirm).
    li.append(df.iloc[:, 1:-2].add_prefix(fname + '_'))

# The identifier column goes last, as before.
li.append(source)
features = pd.concat(li, axis=1, ignore_index=False)

In [367]:
features.head()

Unnamed: 0,source_url_processed,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f131,f132,f133,f134,f135,f136,f137,f138,f139,f140
0,villagevoice.com,5.356412,0.000582,0.001631,0.000369,1.6e-05,0.001176,0.000187,0.001575,0.000444,...,1.266089,1.031037,1.190084,0.208078,0.316082,0.13902,0.181814,0.267129,0.051228,0.258858
1,insideclimatenews.org,5.356014,0.001911,0.001304,0.000118,6.1e-05,0.003447,0.000156,0.002677,0.000148,...,1.93303,1.375038,0.36496,0.284998,0.685641,0.083882,0.040443,0.350957,0.045444,0.116543
2,fury.news,5.273517,0.00147,0.003578,0.000384,0.000209,0.00179,0.000698,0.002967,0.001184,...,0.885676,0.779502,0.569813,0.215883,0.302485,0.34376,0.305787,0.554208,0.075992,0.186568
3,now8news.com,5.404849,0.001432,0.001522,0.0,0.0,0.000489,0.0,0.002332,0.0,...,1.448575,0.476627,0.796585,0.12075,0.425525,0.023552,0.59562,1.109764,0.049361,0.079153
4,constitution.com,5.311548,0.001383,0.002403,0.000216,5.3e-05,0.001761,0.0003,0.002713,0.000188,...,1.391083,1.341072,0.415024,0.111452,0.392811,0.135954,0.178916,0.500542,0.054368,0.189897


In [368]:
# Join the corpus rows with the feature rows on the shared source identifier;
# the inner join keeps only sources present in both tables.
merged_df = pd.merge(corpus, features, how='inner', on='source_url_processed')

**Only run this to merge the newly added features**

In [389]:
# Append the two scraped indicator columns (has_termsf0, has_about_usf0) to the merged table.
# NOTE(review): concat with axis=1 aligns rows on the index, not on source_url_processed,
# so this assumes both CSVs are in the same row order as merged_df - TODO confirm.
about_us = pd.read_csv('data/features/new_features/has_about_us.csv')
terms_of_use = pd.read_csv('data/features/new_features/has_terms_of_use.csv')
merged_df = pd.concat([merged_df,terms_of_use.has_termsf0, about_us.has_about_usf0], axis=1, ignore_index=False)

**Dropping unused columns and encoding the labels**

In [369]:
# Drop the URL columns and encode the two label columns as ordinal integers.

# errors='ignore' keeps the cell re-runnable after the columns are already gone.
merged_df = merged_df.drop(columns=['source_url', 'URL'], errors='ignore')

# The replacement is restricted to the label columns: a frame-wide replace would also
# rewrite any other cell that happens to equal one of these strings.
# Values that are already encoded are left untouched, so re-running is harmless;
# astype(int) fails loudly if an unexpected label is present.
merged_df['fact'] = merged_df['fact'].replace({'HIGH': 2, 'MIXED': 1, 'LOW': 0}).astype(int)
merged_df['bias'] = merged_df['bias'].replace(
    {'extreme-right': 0, 'right': 1, 'right-center': 2, 'center': 3,
     'left-center': 4, 'left': 5, 'extreme-left': 6}).astype(int)

In [370]:
merged_df.head()

Unnamed: 0,source_url_processed,fact,bias,f0,f1,f2,f3,f4,f5,f6,...,f131,f132,f133,f134,f135,f136,f137,f138,f139,f140
0,villagevoice.com,2,5,5.356412,0.000582,0.001631,0.000369,1.6e-05,0.001176,0.000187,...,1.266089,1.031037,1.190084,0.208078,0.316082,0.13902,0.181814,0.267129,0.051228,0.258858
1,insideclimatenews.org,2,4,5.356014,0.001911,0.001304,0.000118,6.1e-05,0.003447,0.000156,...,1.93303,1.375038,0.36496,0.284998,0.685641,0.083882,0.040443,0.350957,0.045444,0.116543
2,fury.news,0,0,5.273517,0.00147,0.003578,0.000384,0.000209,0.00179,0.000698,...,0.885676,0.779502,0.569813,0.215883,0.302485,0.34376,0.305787,0.554208,0.075992,0.186568
3,now8news.com,0,3,5.404849,0.001432,0.001522,0.0,0.0,0.000489,0.0,...,1.448575,0.476627,0.796585,0.12075,0.425525,0.023552,0.59562,1.109764,0.049361,0.079153
4,constitution.com,0,0,5.311548,0.001383,0.002403,0.000216,5.3e-05,0.001761,0.0003,...,1.391083,1.341072,0.415024,0.111452,0.392811,0.135954,0.178916,0.500542,0.054368,0.189897


In [371]:
merged_df.shape

(1066, 144)

In [372]:
merged_df.fact.value_counts().sort_index()

0    256
1    268
2    542
Name: fact, dtype: int64

In [373]:
merged_df.bias.value_counts().sort_index()

0    156
1    157
2     92
3    263
4    209
5    168
6     21
Name: bias, dtype: int64

## Dimension Reduction  - PCA

In [17]:
#For a specific file
#merged_df = pd.read_csv('data/features/body.csv')

In [374]:
# Split the merged table into the source identifiers, the two label vectors
# and the feature matrix (everything that is neither identifier nor label).
source_url_processed = merged_df['source_url_processed']
Y1 = merged_df['fact']
Y2 = merged_df['bias']
X = merged_df.drop(['source_url_processed', 'fact', 'bias'], axis=1)

In [375]:
#NORMALIZE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# Standardise the feature columns and wrap the result back into a DataFrame
# that carries the original column names.
Xs = pd.DataFrame(scale(X), columns=X.columns)
print(Xs.shape)
Xs.head()

(1066, 141)


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f131,f132,f133,f134,f135,f136,f137,f138,f139,f140
0,0.057173,-0.527042,-0.283778,0.216815,-0.374495,-0.587672,-0.298482,-0.734595,0.077078,-0.246495,...,-0.661424,-0.363023,1.241183,0.093056,-0.695434,-0.225201,0.022414,-0.424115,-0.182847,0.410037
1,0.053047,0.188272,-0.475914,-0.516027,-0.146224,0.553024,-0.365944,-0.129089,-0.319511,1.132348,...,0.151888,0.409582,-0.673321,0.435834,-0.117133,-0.389815,-0.597655,-0.073743,-0.287388,-0.224168
2,-0.804032,-0.049219,0.862446,0.260871,0.593902,-0.279257,0.803487,0.030073,1.066928,-0.041215,...,-1.125325,-0.927956,-0.198009,0.127836,-0.716712,0.386047,0.566177,0.775769,0.264694,0.087888
3,0.560397,-0.069737,-0.347713,-0.859347,-0.452465,-0.932738,-0.70124,-0.31867,-0.517512,-0.462832,...,-0.438888,-1.608194,0.328163,-0.296109,-0.524173,-0.569931,1.837421,3.097781,-0.216605,-0.390791
4,-0.408917,-0.095974,0.171205,-0.230327,-0.18583,-0.293571,-0.054456,-0.10972,-0.265256,-0.371709,...,-0.508998,0.333295,-0.557159,-0.337544,-0.575366,-0.234353,0.009702,0.551462,-0.126103,0.102725


In [376]:
#REDUCED DATA
from sklearn import decomposition

# Project the standardised features onto the first 20 principal components.
# random_state is pinned because PCA's default solver selection can fall back to
# the randomized SVD for data of this size, which would make the scores (and every
# downstream model result) vary from run to run.
pca = decomposition.PCA(n_components=20, random_state=0)
pca.fit(Xs)
R = pca.transform(Xs)
print(R.shape)

(1066, 20)


In [377]:
#LOADINGS MATRIX L
# Transposed so that rows correspond to the original features and columns to the
# principal components; the shape and the feature names are printed for reference.
L = pca.components_.T
print(L.shape)
print(X.columns)
L

(141, 20)
Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
       ...
       'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139',
       'f140'],
      dtype='object', length=141)


array([[-0.06634065,  0.17191595,  0.09558903, ..., -0.04125147,
         0.0802725 ,  0.06287761],
       [-0.00574339, -0.01266322,  0.14141967, ..., -0.17301286,
        -0.01246951,  0.07975825],
       [ 0.00429122, -0.16676031, -0.00903793, ...,  0.03081497,
        -0.02499066, -0.07887178],
       ..., 
       [-0.0401252 , -0.07861004, -0.0211476 , ...,  0.03226636,
         0.00557862,  0.14851699],
       [-0.00045452, -0.01119932, -0.01790573, ...,  0.11467374,
        -0.06615179,  0.14760274],
       [-0.07839052,  0.01225349, -0.02298516, ...,  0.27545386,
        -0.04943142, -0.05532074]])

In [378]:
#CHECK THAT DECOMPOSITION IS CORRECT
# Column-wise sum of the residual between the PCA scores R and the manual projection
# Xs.dot(L); every entry should be ~0. The DataFrame's own .sum(axis=0) is used so the
# result does not depend on the builtin `sum` being shadowed by numpy's (as %pylab does).
(R - Xs.dot(L)).sum(axis=0)

0     9.289331e-13
1    -7.494326e-13
2    -5.541357e-13
3     3.978749e-13
4    -3.993169e-13
5     1.544276e-12
6     1.467333e-13
7     6.973788e-13
8    -9.145865e-13
9     1.662324e-13
10    1.463074e-12
11   -1.511812e-13
12    5.198062e-13
13    1.865373e-13
14    1.035572e-12
15   -1.428269e-13
16   -1.994157e-13
17    2.203974e-13
18   -5.649240e-13
19   -1.152348e-12
dtype: float64

In [379]:
#EXPLAINED VARIANCE
pca.explained_variance_ratio_

array([ 0.1105016 ,  0.07761039,  0.0528884 ,  0.04584536,  0.04024659,
        0.0342182 ,  0.02964961,  0.02513232,  0.02434382,  0.02264096,
        0.02022735,  0.01970224,  0.01863306,  0.01579273,  0.01473892,
        0.01417867,  0.01387213,  0.01276873,  0.01251716,  0.01164248])

In [380]:
# Label the score columns PC1..PCk from the actual width of R, so the cell keeps
# working when the number of PCA components is changed.
principalDf = pd.DataFrame(data=R,
                           columns=['PC%d' % k for k in range(1, R.shape[1] + 1)])
# Identifier first, then the component scores, then the two label columns.
finalDf = pd.concat([source_url_processed, principalDf, Y1, Y2], axis=1)

In [381]:
# Rebuild the score table with default integer column labels and place it
# between the source identifiers and the two label columns.
principalDf = pd.DataFrame(R)
finalDf = pd.concat(objs=[source_url_processed, principalDf, Y1, Y2], axis='columns')

In [382]:
finalDf.head(2)

Unnamed: 0,source_url_processed,0,1,2,3,4,5,6,7,8,...,12,13,14,15,16,17,18,19,fact,bias
0,villagevoice.com,-0.434868,-0.244455,-1.434269,-0.862653,-2.616887,-1.430359,-0.030051,-0.507447,0.225103,...,-1.013,0.067354,0.062923,-0.959288,0.64006,1.068211,-0.059945,0.77214,2,5
1,insideclimatenews.org,3.519641,-0.757469,1.378165,-2.146577,-0.603021,-0.129587,0.118731,1.073598,0.308076,...,0.210378,-0.101454,-0.021804,0.011623,-0.266414,-0.039805,0.12196,0.418141,2,4


# Model Building

In [383]:
import sklearn as sk
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [384]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

In [385]:
# A scikit-learn submodule is only reachable as an attribute of `sk` once something has
# imported it, so LogisticRegression is imported explicitly instead of relying on
# sk.linear_model having been loaded as a side effect of another import.
from sklearn.linear_model import LogisticRegression

# Candidate classifiers that Auto_modeling iterates over.
# The stochastic ensemble models get a fixed seed so repeated runs give the same scores.
clfs = [SVC(),
        LogisticRegression(),
        RandomForestClassifier(n_estimators=50, random_state=0),
        AdaBoostClassifier(random_state=0),
        #SMOTEEN (placeholder - not implemented)
        KNeighborsClassifier(n_neighbors=5),
        #TENSORFLOW (placeholder - not implemented)
        xgb.XGBClassifier(max_depth=3, n_estimators=200, learning_rate=0.05)
       ]

In [386]:
def Auto_modeling(data, target='fact', n_splits=5, classifiers=None):
    """Cross-validate a list of classifiers and tabulate their average scores.

    Each classifier is fitted and evaluated on every fold of a K-fold split;
    per-classifier averages are printed, followed by the classifier with the
    highest average accuracy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'source_url_processed', 'fact' and 'bias';
        every remaining column is used as a feature.
    target : str, default 'fact'
        Name of the label column to predict.
    n_splits : int, default 5
        Number of cross-validation folds.
    classifiers : iterable of estimators, optional
        Objects exposing fit/predict. Defaults to the notebook-level `clfs` list.

    Returns
    -------
    pandas.DataFrame
        Rows 'accuracy' and 'f1' (macro-averaged), one column per classifier,
        labelled with the classifier's class name.
    """
    models = clfs if classifiers is None else classifiers

    # Identifier and both label columns are excluded from the feature matrix,
    # whichever label is being predicted.
    X = np.array(data.drop(columns=['source_url_processed', 'fact', 'bias']))
    y = np.array(data[target])

    # Folds are contiguous and unshuffled, exactly as before. random_state was dropped
    # because it has no effect without shuffling and recent scikit-learn versions
    # reject that combination with a ValueError.
    kf = KFold(n_splits=n_splits)

    classifier = []
    accuracy = []
    f1 = []

    # Best classifier seen so far
    maxACC = -1
    bestCL = ''

    for clf in models:
        # Class name, e.g. 'SVC'; independent of how the estimator's repr is formatted.
        name = type(clf).__name__

        fold_acc = []
        fold_f1 = []
        for train_index, test_index in kf.split(X):
            clf.fit(X[train_index], y[train_index])
            ypred = clf.predict(X[test_index])

            fold_acc.append(accuracy_score(y[test_index], ypred))
            fold_f1.append(f1_score(y[test_index], ypred, average='macro'))

        # Average over the folds actually run instead of a hard-coded 5.
        avg_acc = float(np.mean(fold_acc))
        avg_f1 = float(np.mean(fold_f1))

        print(name)
        print('avg. Accuracy:' + str(avg_acc))
        print('avg. f1:' + str(avg_f1))
        print('------------------')
        print('\n')

        classifier.append(name)
        accuracy.append(avg_acc)
        f1.append(avg_f1)

        if avg_acc > maxACC:
            bestCL = name
            maxACC = avg_acc

    print ('Best Classifier:')
    print ('      ' + str(bestCL))
    print('\n')
    print ('Accuracy:' + str(maxACC))

    df = pd.DataFrame(data={"classifier":classifier, "accuracy":accuracy, "f1":f1}).set_index('classifier').T

    return df

In [387]:
df = Auto_modeling(finalDf) # pass finalDf to model on the PCA components, or merged_df to model on the raw features

SVC
avg. Accuracy:0.609753850204
avg. f1:0.512640292878
------------------


LogisticRegression
avg. Accuracy:0.616317844763
avg. f1:0.492053761239
------------------


RandomForestClassifier
avg. Accuracy:0.614417972006
avg. f1:0.519448300527
------------------


AdaBoostClassifier
avg. Accuracy:0.566583300426
avg. f1:0.497486956457
------------------


KNeighborsClassifier
avg. Accuracy:0.599412048616
avg. f1:0.540552728873
------------------


XGBClassifier
avg. Accuracy:0.609740687113
avg. f1:0.52506961368
------------------


Best Classifier:
      LogisticRegression


Accuracy:0.616317844763


In [388]:
df

classifier,SVC,LogisticRegression,RandomForestClassifier,AdaBoostClassifier,KNeighborsClassifier,XGBClassifier
accuracy,0.609754,0.616318,0.614418,0.566583,0.599412,0.609741
f1,0.51264,0.492054,0.519448,0.497487,0.540553,0.52507


## Code for Creating new features

In [7]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import re

In [8]:
corpus.head(2)

Unnamed: 0,source_url,source_url_processed,URL,fact,bias
0,http://www.villagevoice.com/,villagevoice.com,http://mediabiasfactcheck.com/the-village-voice/,HIGH,left
1,https://insideclimatenews.org/,insideclimatenews.org,https://mediabiasfactcheck.com/insideclimate-n...,HIGH,left-center


In [21]:
has_terms_of_use =[]

In [22]:
# Compiled once instead of on every request.
TERMS_PATTERN = re.compile('(terms of use)|(terms of service)', re.IGNORECASE)


def _mentions_terms(url):
    """Return 1 if the page body at `url` mentions terms of use/service, 0 if it
    does not, and -1 if the page could not be fetched or parsed."""
    try:
        # NOTE(review): verify=False disables TLS certificate validation. It is kept
        # from the original scrape; confirm this is acceptable for these sources.
        response = requests.get(url, verify=False, timeout=60)
        soup = BeautifulSoup(response.text, 'html.parser')
        if soup.body is None:
            # No <body> to search: report "unknown", same as any other failure.
            return -1
        return int(len(soup.body.find_all(string=TERMS_PATTERN)) > 0)
    except Exception:
        # Best-effort scrape: any request/parsing failure is recorded as -1.
        # Exception (not a bare except) so Ctrl-C can still interrupt the run.
        return -1


# Built in one pass, in corpus order. Rebinding the list (rather than appending to it)
# means re-running the cell does not duplicate entries.
has_terms_of_use = [_mentions_terms(url) for url in corpus.source_url.values]

In [32]:
len(has_terms_of_use)

1066

In [34]:
# Write the scraped flags (one row per corpus source, in corpus order) to the working directory.
# NOTE(review): this produces a single column labelled 0 in ./has_terms_of_use.csv, while the
# merge cell reads a `has_termsf0` column from data/features/new_features/has_terms_of_use.csv -
# presumably the file was renamed/moved by hand; TODO confirm.
ht = pd.DataFrame(has_terms_of_use)
ht.to_csv("has_terms_of_use.csv")