# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, f1_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [2]:
stop_words = {'a', 'ourselves', 'hers', 'between', 'yourself', 
              'but', 'again', 'there', 'about', 'once', 'during', 
              'out', 'very', 'having', 'with', 'they', 'own', 'an', 
              'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 
              'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 
              'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 
              'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 
              'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 
              'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 
              'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 
              'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 
              'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 
              'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 
              'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 
              'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 
              'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than', 's', 'i', 't'}

# Read data
politifact_real = pd.read_csv('./FakeNewsNet-master/dataset/politifact_real.csv')
politifact_fake = pd.read_csv('./FakeNewsNet-master/dataset/politifact_fake.csv')
gossipcop_real = pd.read_csv('./FakeNewsNet-master/dataset/gossipcop_real.csv')
gossipcop_fake = pd.read_csv('./FakeNewsNet-master/dataset/gossipcop_fake.csv')
len(politifact_real), len(politifact_fake), len(gossipcop_real), len(gossipcop_fake)

politifact_real = politifact_real.sample(432)
gossipcop_real = gossipcop_real.sample(5323)

politifact_real['label'] = 1
politifact_fake['label'] = 0
gossipcop_real['label'] = 1
gossipcop_fake['label'] = 0

politifact = pd.concat((politifact_fake, politifact_real), axis=0)
gossipcop = pd.concat((gossipcop_real, gossipcop_fake), axis=0)

len(politifact), len(gossipcop)

politifact = politifact[['title', 'label']]
gossipcop = gossipcop[['title', 'label']]

# Get rid of the stop words 
def clean_stopwords(sentences):
    """
    input: array of sentences
    """
    word_list = re.findall(r'\w+', sentences)
    ans = ''
    for word in word_list:
        if word in stop_words:
            pass
        ans += word + ' '
    return ans

politifact['title'] = politifact['title'].map(lambda x: clean_stopwords(x))
gossipcop['title'] = gossipcop['title'].map(lambda x: clean_stopwords(x))

In [3]:
# Balancing the data
print(politifact['label'].value_counts())
print(gossipcop['label'].value_counts())

1    432
0    432
Name: label, dtype: int64
1    5323
0    5323
Name: label, dtype: int64


# Convert the text data to binary encodings

In [7]:
# def train_test_processing(df):
#     """
#     input: dataframe with label
#     """
#     X, y = df.iloc[:, :-1], df.iloc[:, -1]
    
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
#     tokenizer = Tokenizer(num_words=100, lower=False)
    
#     tokenizer.fit_on_texts(X_train['title'])
    
#     X_train = tokenizer.texts_to_sequences(X_train['title'])
    
#     X_test = tokenizer.texts_to_sequences(X_test['title'])
  
#     vocab_size = len(tokenizer.word_index)+1
    
#     X_train = pad_sequences(X_train, padding='post', maxlen=100) 
    
#     X_test = pad_sequences(X_test, padding='post', maxlen=100) 
    
#     return X_train, X_test, y_train, y_test

def train_test_processing(df):
    """
    input: dataframe with label
    """
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
    tokenizer = Tokenizer(num_words=100, lower=False)
    
    tokenizer.fit_on_texts(X_train['title'])
    
    X_train = tokenizer.texts_to_matrix(X_train['title'])
    
    X_test = tokenizer.texts_to_matrix(X_test['title'])

    vocab_size = len(tokenizer.word_index)+1
    
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = train_test_processing(politifact)

# Model Training

In [388]:
def evaluate(clf, X):
    X_train, X_test, y_train, y_test = train_test_processing(X)
    # clf.fit(X_train, y_train)
    train_predict = clf.predict(X_train)
    test_predict = clf.predict(X_test)
    f1_score_train = f1_score(train_predict, y_train)
    f1_score_test = f1_score(test_predict, y_test)
    
    print(f'Training F1: {f1_score_train}')
    print(f'Testing F1: {f1_score_test}')

In [386]:
svm_clf = SVC(C=1.0, random_state=42)
forest_clf = RandomForestClassifier(random_state=42)

In [387]:
evaluate(forest_clf, politifact)

Training F1: 0.9301634472511144
Testing F1: 0.7542372881355932


In [389]:
evaluate(forest_clf, gossipcop)

Training F1: 0.4494163424124513
Testing F1: 0.4607762180016515


# Propensity Score Matching

### 1. Calculate Propensity Score

For every word, we need to build a logistic regression with other words features.



In [3]:
from models.propensity_score import propensity_score

p_score = propensity_score()
p_score.fit(politifact)

the 0
[]
to 0
[]
of 0
[]
in 0
[]
trump 0
[]
s 0
[]
on 0
[]
and 0
[]
for 0
[]
obama 0
[]
a 0
[]
breaking 0
[]
president 0
[]
is 0
[]
by 0
[]
with 0
[]
news 0
[]
just 0
[]
from 0
[]
clinton 0
[]
at 0
[]
says 0
[]
transcript 0
[]
after 0
[]
donald 0
[]
new 0
[]
it 0
[]
mccain 0
[]
be 0
[]
t 0
[]
you 0
[]
we 0
[]
as 0
[]
this 0
[]
he 0
[]
was 0
[]
debate 0
[]
remarks 0
[]
has 0
[]
his 0
[]
hillary 0
[]


KeyboardInterrupt: 