# Data Preprocessing

In [123]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [3]:
stop_words = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than'}

In [35]:
# Read data
politifact_real = pd.read_csv('./FakeNewsNet-master/dataset/politifact_real.csv')
politifact_fake = pd.read_csv('./FakeNewsNet-master/dataset/politifact_fake.csv')
gossipcop_real = pd.read_csv('./FakeNewsNet-master/dataset/gossipcop_real.csv')
gossipcop_fake = pd.read_csv('./FakeNewsNet-master/dataset/gossipcop_fake.csv')
len(politifact_real), len(politifact_fake), len(gossipcop_real), len(gossipcop_fake)

(624, 432, 16817, 5323)

In [36]:
politifact_real = politifact_real.sample(432)
gossipcop_real = gossipcop_real.sample(5323)

In [37]:
politifact_real['label'] = 1
politifact_fake['label'] = 0
gossipcop_real['label'] = 1
gossipcop_fake['label'] = 0

In [38]:
politifact = pd.concat((politifact_fake, politifact_real), axis=0)
gossipcop = pd.concat((gossipcop_real, gossipcop_fake), axis=0)

In [39]:
len(politifact), len(gossipcop)

(864, 10646)

In [40]:
politifact = politifact[['title', 'label']]
gossipcop = gossipcop[['title', 'label']]

In [41]:
# Get rid of the stop words 
def clean_stopwords(sentences):
    """
    input: array of sentences
    """
    word_list = re.findall(r'\w+', sentences)
    ans = ''
    for word in word_list:
        if word in stop_words:
            pass
        ans += word + ' '
    return ans

In [42]:
politifact['title'] = politifact['title'].map(lambda x: clean_stopwords(x))
gossipcop['title'] = gossipcop['title'].map(lambda x: clean_stopwords(x))

In [46]:
# Balancing the data
print(politifact['label'].value_counts())
print(gossipcop['label'].value_counts())

1    432
0    432
Name: label, dtype: int64
1    5323
0    5323
Name: label, dtype: int64


# Convert the text data to binary encodings

In [303]:
def train_test_processing(df):
    """
    input: dataframe with label
    """
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
    tokenizer = Tokenizer(num_words=100, lower=False)
    
    tokenizer.fit_on_texts(X_train['title'])
    
    X_train = tokenizer.texts_to_sequences(X_train['title'])
    
    X_test = tokenizer.texts_to_sequences(X_test['title'])
  
    vocab_size = len(tokenizer.word_index)+1
    
    X_train = pad_sequences(X_train, padding='post', maxlen=300) 
    
    X_test = pad_sequences(X_test, padding='post', maxlen=300) 
    
    return X_train, X_test, y_train, y_test

In [304]:
X_train, X_test, y_train, y_test = train_test_processing(politifact)

# Model Training

In [311]:
def evaluate(clf, X):
    X_train, X_test, y_train, y_test = train_test_processing(X)
    # clf.fit(X_train, y_train)
    train_predict = clf.predict(X_train)
    test_predict = clf.predict(X_test)
    f1_score_train = f1_score(train_predict, y_train)
    f1_score_test = f1_score(test_predict, y_test)
    
    print(f'Training F1: {f1_score_train}')
    print(f'Testing F1: {f1_score_test}')

In [306]:
svm_clf = SVC(C=1.0, random_state=42)
forest_clf = RandomForestClassifier(random_state=42)

In [310]:
evaluate(forest_clf, politifact)

Training F1: 0.9319526627218936
Testing F1: 0.6608695652173914


In [312]:
evaluate(forest_clf, gossipcop)

Training F1: 0.468807994289793
Testing F1: 0.47578589634664403


# Propensity Score Matching

### 1. Calculate Propensity Score

For every word, we need to build a logistic regression with other words features.



In [309]:
X_train

array([[ 5, 57, 44, ...,  0,  0,  0],
       [61, 45, 14, ...,  0,  0,  0],
       [ 6,  0,  0, ...,  0,  0,  0],
       ...,
       [ 6, 52, 20, ...,  0,  0,  0],
       [ 4,  9,  3, ...,  0,  0,  0],
       [ 9,  8,  0, ...,  0,  0,  0]], dtype=int32)