# `FakeNewsNET` dataset

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
%load_ext autoreload
%autoreload 2

## Loading data

In [4]:
data_path = '/Users/julienseguy/code/EddyEdzwan/StopFAIke/raw_data/fakenewsnet_dataset'

dir_pol_real = os.path.join(data_path, 'politifact', 'real')
dir_pol_fake = os.path.join(data_path, 'politifact', 'fake')
dir_gos_real = os.path.join(data_path, 'gossipcop', 'real')
dir_gos_fake = os.path.join(data_path, 'gossipcop', 'fake')

In [5]:
from StopFAIke.data import get_data

pol_real_df = get_data(dir_pol_real)
pol_fake_df = get_data(dir_pol_fake)
gos_real_df = get_data(dir_gos_real)
gos_fake_df = get_data(dir_gos_fake)

print('-'*80)
print(f"pol_real_df shape: {pol_real_df.shape}")
print(f"pol_fake_df shape: {pol_fake_df.shape}")
print(f"gos_real_df shape: {gos_real_df.shape}")
print(f"gos_fake_df shape: {gos_fake_df.shape}")
print('-'*80)

--------------------------------------------------------------------------------
pol_real_df shape: (559, 6)
pol_fake_df shape: (403, 6)
gos_real_df shape: (6311, 6)
gos_fake_df shape: (4896, 6)
--------------------------------------------------------------------------------


In [6]:
pol_real_df.head()

Unnamed: 0,title,text,authors,num_images,domain,url
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...
3,,,[],0,politico,http://www.politico.com/news/stories/0309/2034...
4,,,[],0,fec,http://docquery.fec.gov/pdf/613/20180415910815...


### Add labels & info

In [7]:
pol_real_df['category'] = 0 #True
pol_fake_df['category'] = 1 #Fake
gos_real_df['category'] = 0
gos_fake_df['category'] = 1

pol_real_df['news_type'] = 'political'
pol_fake_df['news_type'] = 'political'
gos_real_df['news_type'] = 'gossip'
gos_fake_df['news_type'] = 'gossip'

In [8]:
pol_real_df.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
3,,,[],0,politico,http://www.politico.com/news/stories/0309/2034...,0,political
4,,,[],0,fec,http://docquery.fec.gov/pdf/613/20180415910815...,0,political


### Merging

In [17]:
data = pd.concat([pol_real_df, pol_fake_df, gos_real_df, gos_fake_df]).reset_index(drop=True)

print('-'*80)
print(f"data shape: {data.shape}")
print('-'*80)
print(f"ratio #true: {len(data[data['category']==0])/len(data)*100:.2f}%")
print(f"ratio #fake: {len(data[data['category']==1])/len(data)*100:.2f}%")
print('-'*80)
print(f"ratio #true - political: {len(data[(data['category']==0) & (data['news_type']=='political')])/len(data)*100:.2f}%")
print(f"ratio #fake - political: {len(data[(data['category']==1) & (data['news_type']=='political')])/len(data)*100:.2f}%")
print('-'*80)
print(f"ratio #true - gossip: {len(data[(data['category']==0) & (data['news_type']=='gossip')])/len(data)*100:.2f}%")
print(f"ratio #fake - gossip: {len(data[(data['category']==1) & (data['news_type']=='gossip')])/len(data)*100:.2f}%")
print('-'*80)

--------------------------------------------------------------------------------
data shape: (12169, 8)
--------------------------------------------------------------------------------
ratio #true: 56.45%
ratio #fake: 43.55%
--------------------------------------------------------------------------------
ratio #true - political: 4.59%
ratio #fake - political: 3.31%
--------------------------------------------------------------------------------
ratio #true - gossip: 51.86%
ratio #fake - gossip: 40.23%
--------------------------------------------------------------------------------


## Preprocessing

### Missing values

In [18]:
def get_missing(df):
    missing_values = df.isnull().sum().sort_values(ascending = False)
    ratio = missing_values/len(data)*100
    return pd.DataFrame({'missing_values': missing_values, 'ratio': round(ratio)}).head(10)

In [19]:
get_missing(data)

Unnamed: 0,missing_values,ratio
title,0,0.0
text,0,0.0
authors,0,0.0
num_images,0,0.0
domain,0,0.0
url,0,0.0
category,0,0.0
news_type,0,0.0


In [43]:
data.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
5,Pastors To Protest IRS Rules on Political Advo...,"On Sept. 28, pastors from 20 states will give ...",[],4,pewforum,http://www.pewforum.org/2008/09/19/pastors-to-...,0,political
7,“Dictionary” on President Obama’s Health Care ...,WASHINGTON – The Republican National Committee...,"['Written On September', 'Republican National ...",32,archive,https://web.archive.org/web/20091003005639/htt...,0,political


In [44]:
indices = data[data['title'] == ''].index
len(indices)

61

In [47]:
indices = data[data['text'] == ''].index
len(indices)

0

In [48]:
indices = data[data['authors'] == '[]'].index
len(indices)

3722

In [49]:
from StopFAIke.data import remove_missing_values

data = remove_missing_values(data, 'text', '')

print('-'*80)
print(f"data shape: {data.shape}")
print('-'*80)

--------------------------------------------------------------------------------
data shape: (11763, 8)
--------------------------------------------------------------------------------


In [51]:
data.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
5,Pastors To Protest IRS Rules on Political Advo...,"On Sept. 28, pastors from 20 states will give ...",[],4,pewforum,http://www.pewforum.org/2008/09/19/pastors-to-...,0,political
7,“Dictionary” on President Obama’s Health Care ...,WASHINGTON – The Republican National Committee...,"['Written On September', 'Republican National ...",32,archive,https://web.archive.org/web/20091003005639/htt...,0,political


### Duplicates

In [52]:
def count_duplicate(df):
    return df.duplicated().sum()

In [54]:
print('-'*80)
print(f"duplicates: {count_duplicate(data)}")
print('-'*80)

--------------------------------------------------------------------------------
duplicates: 514
--------------------------------------------------------------------------------


In [55]:
data.drop_duplicates(inplace=True)

print('-'*80)
print(f"data shape (wo duplicates): {data.shape}")
print('-'*80)

--------------------------------------------------------------------------------
data shape (wo duplicates): (11249, 8)
--------------------------------------------------------------------------------


### Cleaning - punctuation / lowercased / tokenize / stop_words / lemmatize

In [61]:
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

def clean(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma = WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized

In [62]:
data['clean_text'] = data['text'].apply(clean)

In [64]:
data['clean_text'].head()

0    [hanabusa, lead, case, nearly, vote, counted, ...
1    [remark, senator, barack, obama, apostolic, ch...
2    [one, every, four, pennsylvania, household, af...
5    [sept, pastor, state, give, politically, based...
7    [washington, republican, national, committee, ...
Name: clean_text, dtype: object

## Training

In [65]:
X = data['clean_text'].copy()
y = data['category'].copy()

print('-'*80)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print('-'*80)

--------------------------------------------------------------------------------
X shape: (11249,)
y shape: (11249,)
--------------------------------------------------------------------------------


In [67]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print('-'*80)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print('-'*80)
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
print('-'*80)

--------------------------------------------------------------------------------
X_train shape: (7874,)
y_train shape: (7874,)
--------------------------------------------------------------------------------
X_test shape: (3375,)
y_test shape: (3375,)
--------------------------------------------------------------------------------


### Tokenizer

In [68]:
from tensorflow.keras.preprocessing.text import Tokenizer

# This initializes a Keras utilities that does all the tokenization for you
tokenizer = Tokenizer()

# The tokenization learns a dictionnary that maps a token (integer) to each word
# It can be done only on the train set - we are not supposed to know the test set !
# This tokenization also lower your words, apply some filters, and so on - you can check the doc if you want
tokenizer.fit_on_texts(X_train)
    
# We apply the tokenization to the train and test set
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

print('-'*80)
print(f"X_train_token shape: {len(X_train_token)}")
print(f"X_test_token shape: {len(X_test_token)}")
print('-'*80)

--------------------------------------------------------------------------------
X_train_token shape: 7874
X_test_token shape: 3375
--------------------------------------------------------------------------------


In [69]:
vocab_size = len(tokenizer.word_index)

print('-'*80)
print(f"vocab_size: {vocab_size}")
print('-'*80)

--------------------------------------------------------------------------------
vocab_size: 62967
--------------------------------------------------------------------------------


### Padding

In [75]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train_token, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_token, dtype='float32', padding='post')
# X_test_pad = pad_sequences(X_test_token, maxlen=X_train_pad.shape[1], dtype='float32', padding='post')

print('-'*80)
print(f"X_train_pad shape: {X_train_pad.shape}")
print(f"y_train shape: {len(y_train)}")
print('-'*80)
print(f"X_test_pad shape: {X_test_pad.shape}")
print(f"y_test shape: {len(y_test)}")
print('-'*80)

--------------------------------------------------------------------------------
X_train_pad shape: (7874, 9994)
y_train shape: 7874
--------------------------------------------------------------------------------
X_test_pad shape: (3375, 9870)
y_test shape: 3375
--------------------------------------------------------------------------------


### Tensorflow - embedding layer included

In [76]:
def init_model():

    embedding_size = 5

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(
        input_dim = vocab_size+1,
        input_length = 9994, # Max_sentence_length (optional, for model summary)
        output_dim = embedding_size,# 100
        mask_zero = True, # Included masking layer :)
    ))

    model.add(tf.keras.layers.LSTM(20, activation='tanh'))
    model.add(tf.keras.layers.Dense(5, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model

In [77]:
init_model().summary()

2021-08-19 20:51:15.533747: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 9994, 5)           314840    
_________________________________________________________________
lstm (LSTM)                  (None, 20)                2080      
_________________________________________________________________
dense (Dense)                (None, 5)                 105       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6         
Total params: 317,031
Trainable params: 317,031
Non-trainable params: 0
_________________________________________________________________


In [None]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model = init_model()

history = model.fit(X_train_pad, 
                  y_train,
                  validation_split=0.2,
                  batch_size=64,
                  epochs=20, 
                  callbacks=[es],
                  verbose=1)

2021-08-19 20:51:32.659788: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
18/99 [====>.........................] - ETA: 11:22 - loss: 0.5882 - accuracy: 0.6858