# `FakeNewsNET` dataset

In [87]:
import os
import numpy as np
import pandas as pd

In [88]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading data

In [89]:
# Location of the raw FakeNewsNet dump, relative to the notebook's working
# directory (the same `../raw_data` folder the CSV export in this notebook uses).
# A relative path keeps the notebook runnable on machines other than the author's.
data_path = os.path.join('..', 'raw_data', 'fakenewsnet_dataset')

# One folder per (source, label) pair.
dir_pol_real = os.path.join(data_path, 'politifact', 'real')
dir_pol_fake = os.path.join(data_path, 'politifact', 'fake')
dir_gos_real = os.path.join(data_path, 'gossipcop', 'real')
dir_gos_fake = os.path.join(data_path, 'gossipcop', 'fake')

In [90]:
from StopFAIke.data import get_data

# Load the four (source, label) folders, in the same order as before,
# each into its own DataFrame.
pol_real_df, pol_fake_df, gos_real_df, gos_fake_df = (
    get_data(folder)
    for folder in (dir_pol_real, dir_pol_fake, dir_gos_real, dir_gos_fake)
)

# Report the size of every frame between two separator lines.
print(
    '-'*80,
    f"pol_real_df shape: {pol_real_df.shape}",
    f"pol_fake_df shape: {pol_fake_df.shape}",
    f"gos_real_df shape: {gos_real_df.shape}",
    f"gos_fake_df shape: {gos_fake_df.shape}",
    '-'*80,
    sep='\n',
)

--------------------------------------------------------------------------------
pol_real_df shape: (559, 6)
pol_fake_df shape: (403, 6)
gos_real_df shape: (11106, 6)
gos_fake_df shape: (4898, 6)
--------------------------------------------------------------------------------


In [91]:
pol_real_df.head()

Unnamed: 0,title,text,authors,num_images,domain,url
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...
3,,,[],0,politico,http://www.politico.com/news/stories/0309/2034...
4,,,[],0,fec,http://docquery.fec.gov/pdf/613/20180415910815...


### Add labels & info

In [92]:
# Tag every frame with its target label (0 = true, 1 = fake) and its news type.
# The frames are modified in place, `category` first and `news_type` second.
for frame, label, kind in (
    (pol_real_df, 0, 'political'),
    (pol_fake_df, 1, 'political'),
    (gos_real_df, 0, 'gossip'),
    (gos_fake_df, 1, 'gossip'),
):
    frame['category'] = label
    frame['news_type'] = kind

In [93]:
pol_real_df.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
3,,,[],0,politico,http://www.politico.com/news/stories/0309/2034...,0,political
4,,,[],0,fec,http://docquery.fec.gov/pdf/613/20180415910815...,0,political


### Merging

In [94]:
# Stack the four labelled frames into one dataset with a fresh 0..n-1 index.
data = pd.concat(
    [pol_real_df, pol_fake_df, gos_real_df, gos_fake_df],
    ignore_index=True,
)

# Boolean masks reused by the ratio report below.
n_rows = len(data)
is_true = data['category'] == 0
is_fake = data['category'] == 1
is_political = data['news_type'] == 'political'
is_gossip = data['news_type'] == 'gossip'

# Every ratio is expressed relative to the WHOLE dataset, not to its own news type.
print('-'*80)
print(f"data shape: {data.shape}")
print('-'*80)
print(f"ratio #true: {int(is_true.sum())/n_rows*100:.2f}%")
print(f"ratio #fake: {int(is_fake.sum())/n_rows*100:.2f}%")
print('-'*80)
print(f"ratio #true - political: {int((is_true & is_political).sum())/n_rows*100:.2f}%")
print(f"ratio #fake - political: {int((is_fake & is_political).sum())/n_rows*100:.2f}%")
print('-'*80)
print(f"ratio #true - gossip: {int((is_true & is_gossip).sum())/n_rows*100:.2f}%")
print(f"ratio #fake - gossip: {int((is_fake & is_gossip).sum())/n_rows*100:.2f}%")
print('-'*80)

--------------------------------------------------------------------------------
data shape: (16966, 8)
--------------------------------------------------------------------------------
ratio #true: 68.76%
ratio #fake: 31.24%
--------------------------------------------------------------------------------
ratio #true - political: 3.29%
ratio #fake - political: 2.38%
--------------------------------------------------------------------------------
ratio #true - gossip: 65.46%
ratio #fake - gossip: 28.87%
--------------------------------------------------------------------------------


## Preprocessing

### Missing values

In [95]:
def get_missing(df):
    """Summarise the null values of ``df`` per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column (at most the 10 columns with the most nulls,
        sorted in descending order) with two columns:
        ``missing_values`` (null count) and ``ratio`` (percentage of the
        rows of ``df`` that are null, rounded to the nearest integer).
    """
    missing_values = df.isnull().sum().sort_values(ascending=False)
    # Use the length of the frame passed in, not of a notebook-level global,
    # so the percentage is correct for any DataFrame.
    ratio = missing_values / len(df) * 100
    return pd.DataFrame({'missing_values': missing_values, 'ratio': round(ratio)}).head(10)

In [96]:
get_missing(data)

Unnamed: 0,missing_values,ratio
title,0,0.0
text,0,0.0
authors,0,0.0
num_images,0,0.0
domain,0,0.0
url,0,0.0
category,0,0.0
news_type,0,0.0


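No **missing values** are detected by the `.isnull()` method. However, the preview of `pol_real_df` above shows rows with an empty `title` and `text`, so missing entries are looked for as empty strings below.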

In [97]:
data.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
3,,,[],0,politico,http://www.politico.com/news/stories/0309/2034...,0,political
4,,,[],0,fec,http://docquery.fec.gov/pdf/613/20180415910815...,0,political


In [98]:
data[data['text'] == 'NaN'].shape

(0, 8)

In [99]:
# Index labels of the rows whose title is an empty string, and how many there are.
indices = data.loc[data['title'].eq('')].index
len(indices)

198

In [100]:
# Index labels of the rows whose text is an empty string, and how many there are.
indices = data.loc[data['text'].eq('')].index
len(indices)

505

In [101]:
# Index labels of the rows whose authors field is the literal string '[]', and how many there are.
indices = data.loc[data['authors'].eq('[]')].index
len(indices)

5668

In [102]:
from StopFAIke.data import remove_missing_values

# Run the project helper once per text column, 'title' first and then 'text'.
# NOTE(review): presumably drops the rows whose column equals '' - confirm in StopFAIke.data.
for column in ('title', 'text'):
    data = remove_missing_values(data, column, '')

print('-'*80, f"data shape: {data.shape}", '-'*80, sep='\n')

--------------------------------------------------------------------------------
data shape: (16380, 8)
--------------------------------------------------------------------------------


In [103]:
data.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
5,Pastors To Protest IRS Rules on Political Advo...,"On Sept. 28, pastors from 20 states will give ...",[],4,pewforum,http://www.pewforum.org/2008/09/19/pastors-to-...,0,political
7,“Dictionary” on President Obama’s Health Care ...,WASHINGTON – The Republican National Committee...,"['Written On September', 'Republican National ...",32,archive,https://web.archive.org/web/20091003005639/htt...,0,political


### Duplicates

In [104]:
def count_duplicate(df):
    """Return how many rows of ``df`` repeat an earlier row on every column."""
    duplicate_mask = df.duplicated()
    return duplicate_mask.sum()

In [105]:
# Number of fully duplicated rows, framed by two separator lines.
print('-'*80, f"duplicates: {count_duplicate(data)}", '-'*80, sep='\n')

--------------------------------------------------------------------------------
duplicates: 795
--------------------------------------------------------------------------------


In [106]:
# Rebind instead of mutating in place: pandas discourages `inplace=True`, and
# mutating a frame derived from an earlier filtering step can trigger
# SettingWithCopyWarning. The resulting rows are the same.
data = data.drop_duplicates()

print('-'*80)
print(f"data shape (wo duplicates): {data.shape}")
print('-'*80)

--------------------------------------------------------------------------------
data shape (wo duplicates): (15585, 8)
--------------------------------------------------------------------------------


### Save to `CSV` file

In [107]:
# Persist the cleaned, de-duplicated dataset; index=False keeps the row index out of the file.
data.to_csv('../raw_data/FakesNewsNET.csv', index=False)

### Loading the `CSV` file

In [108]:
# Reload the dataset from the CSV written above; `data` is rebound to the freshly parsed frame.
data = pd.read_csv('../raw_data/FakesNewsNET.csv')

In [109]:
data.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
3,Pastors To Protest IRS Rules on Political Advo...,"On Sept. 28, pastors from 20 states will give ...",[],4,pewforum,http://www.pewforum.org/2008/09/19/pastors-to-...,0,political
4,“Dictionary” on President Obama’s Health Care ...,WASHINGTON – The Republican National Committee...,"['Written On September', 'Republican National ...",32,archive,https://web.archive.org/web/20091003005639/htt...,0,political


In [24]:
data.shape

(15666, 8)

### Cleaning - punctuation / lowercased / tokenize / stop_words / lemmatize

In [25]:
from StopFAIke.data import clean

# Run the project's `clean` helper on every article body and store the result in a new column.
# NOTE(review): judging by the section title and the preview printed by this cell, `clean`
# returns a list of lowercased, lemmatized tokens without stop words - confirm in StopFAIke.data.
data['clean_text'] = data['text'].apply(clean)
data['clean_text'].head()

0    [hanabusa, lead, case, nearly, vote, counted, ...
1    [remark, senator, barack, obama, apostolic, ch...
2    [one, every, four, pennsylvania, household, af...
3    [sept, pastor, state, give, politically, based...
4    [washington, republican, national, committee, ...
Name: clean_text, dtype: object

## Training

In [26]:
# Work on a random subset to keep training fast. The rows are sampled ONCE and
# both columns are taken from that same sample, so every text keeps its own label.
# Sampling `clean_text` and `category` separately would draw different rows for
# each and pair texts with unrelated labels.
# To train on the full dataset, replace `sampled` with `data`.
n_samples = min(1000, len(data))  # never request more rows than are available
sampled = data.sample(n_samples, random_state=42)  # fixed seed -> reproducible subset

X = sampled['clean_text'].copy()
y = sampled['category'].copy()

print('-'*80)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print('-'*80)

--------------------------------------------------------------------------------
X shape: (1000,)
y shape: (1000,)
--------------------------------------------------------------------------------


In [None]:
y.value_counts()

In [None]:
len(X.iloc[8])

In [None]:
X.iloc[8]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sizes of the training split, then of the test split.
print(
    '-'*80,
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    '-'*80,
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    '-'*80,
    sep='\n',
)

### Tokenizer

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Keras utility that maps every word to an integer index (default settings).
tokenizer = Tokenizer()

# Learn the word -> index dictionary on the training split only:
# the test split must stay unseen.
tokenizer.fit_on_texts(X_train)

# Encode both splits with the dictionary learned above.
X_train_token, X_test_token = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Number of encoded documents in each split.
print(
    '-'*80,
    f"X_train_token shape: {len(X_train_token)}",
    f"X_test_token shape: {len(X_test_token)}",
    '-'*80,
    sep='\n',
)

In [None]:
# Number of distinct words in the dictionary the tokenizer learned from the training split.
vocab_size = len(tokenizer.word_index)

print('-'*80, f"vocab_size: {vocab_size}", '-'*80, sep='\n')

### Padding

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every encoded article to a fixed length of 1000 tokens, padding at the end
# ('post'). The same length is applied to both splits.
# NOTE(review): 1000 has to match `input_length` in init_model; token ids are integers,
# so dtype='float32' looks unnecessary - confirm before changing.
X_train_pad, X_test_pad = (
    pad_sequences(tokens, maxlen=1000, dtype='float32', padding='post')
    for tokens in (X_train_token, X_test_token)
)

# Shapes of the padded matrices next to the number of labels of each split.
print(
    '-'*80,
    f"X_train_pad shape: {X_train_pad.shape}",
    f"y_train shape: {len(y_train)}",
    '-'*80,
    f"X_test_pad shape: {X_test_pad.shape}",
    f"y_test shape: {len(y_test)}",
    '-'*80,
    sep='\n',
)

### TensorFlow - embedding layer included

In [None]:
def init_model():
    """Build and compile the binary text classifier used in this notebook.

    Architecture: Embedding -> LSTM(20) -> Dense(5, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy, RMSprop and the accuracy metric.

    Reads the notebook-level names ``tf`` and ``vocab_size``.

    Returns
    -------
    The compiled ``tf.keras`` Sequential model (not yet trained).
    """
    embedding_size = 5

    layers = [
        tf.keras.layers.Embedding(
            # +1 because index 0 is not a word: it is the padding value masked below
            input_dim=vocab_size + 1,
            # Maximum sentence length (optional, used for the model summary)
            input_length=1000,
            output_dim=embedding_size,
            # Built-in masking: time steps equal to 0 are ignored by the next layers
            mask_zero=True,
        ),
        tf.keras.layers.LSTM(20, activation='tanh'),
        tf.keras.layers.Dense(5, activation='relu'),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.models.Sequential(layers)

    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
init_model().summary()

In [None]:
# Fresh, untrained model for this run.
model = init_model()

# Stop once the validation loss has not improved for 10 epochs and roll back to the best weights.
# NOTE(review): with epochs=15 a patience of 10 can only trigger in the last few epochs.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# 20% of the training data is set aside by Keras as the validation set.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=15,
    validation_split=0.2,
    callbacks=[es],
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt

def plot_loss(history, title=None):
    """Plot the training and validation curves of a fitted Keras model.

    Draws two side-by-side panels, loss on the left and accuracy on the right,
    both with the y-axis clamped to [0, 1], then shows the figure.

    Parameters
    ----------
    history : object with a ``history`` dict holding the keys
        'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    title : str, optional
        Figure-level title; omitted when falsy.
    """
    fig, axes = plt.subplots(1, 2, figsize=(13, 4))

    # (train key, validation key, panel title, y-axis label) for each panel
    panels = (
        ('loss', 'val_loss', 'Model loss', 'Loss'),
        ('accuracy', 'val_accuracy', 'ACC', 'ACC'),
    )

    for ax, (train_key, val_key, panel_title, y_label) in zip(axes, panels):
        ax.plot(history.history[train_key])
        ax.plot(history.history[val_key])
        ax.set_title(panel_title)
        ax.set_ylabel(y_label)
        ax.set_xlabel('Epoch')
        ax.set_ylim(ymin=0, ymax=1)
        ax.legend(['Train', 'Validation'], loc='best')

    if title:
        fig.suptitle(title)
    plt.show()

In [None]:
plot_loss(history, title=None)

In [None]:
# `evaluate` returns the loss followed by the compiled metrics; the model was
# compiled with metrics=['accuracy'], so results[1] is the test accuracy
# (not a MAPE, which is a regression error).
results = model.evaluate(X_test_pad, y_test)

print('-'*80)
print(f"test accuracy: {results[1]:.3f}")
print('-'*80)