# `FakeNewsNET` dataset

In [1]:
import os
import numpy as np
import pandas as pd

In [None]:
%load_ext autoreload
%autoreload 2

## Loading data

In [2]:
# Dataset root, expressed relative to the notebook's working directory (the
# same '../raw_data' convention used for the CSV export in this notebook)
# instead of an absolute path that only exists on one machine.
data_path = os.path.join('..', 'raw_data', 'fakenewsnet_dataset')

# One folder per (source, label) pair.
dir_pol_real = os.path.join(data_path, 'politifact', 'real')
dir_pol_fake = os.path.join(data_path, 'politifact', 'fake')
dir_gos_real = os.path.join(data_path, 'gossipcop', 'real')
dir_gos_fake = os.path.join(data_path, 'gossipcop', 'fake')

In [3]:
from StopFAIke.data import get_data

# Load one DataFrame per (source, label) folder, in a fixed order.
pol_real_df, pol_fake_df, gos_real_df, gos_fake_df = [
    get_data(folder)
    for folder in (dir_pol_real, dir_pol_fake, dir_gos_real, dir_gos_fake)
]

# Shape report, one line per frame, framed by separator rules.
print(
    '-'*80,
    f"pol_real_df shape: {pol_real_df.shape}",
    f"pol_fake_df shape: {pol_fake_df.shape}",
    f"gos_real_df shape: {gos_real_df.shape}",
    f"gos_fake_df shape: {gos_fake_df.shape}",
    '-'*80,
    sep='\n',
)

--------------------------------------------------------------------------------
pol_real_df shape: (559, 6)
pol_fake_df shape: (403, 6)
gos_real_df shape: (15173, 6)
gos_fake_df shape: (4898, 6)
--------------------------------------------------------------------------------


In [4]:
pol_real_df.head()

Unnamed: 0,title,text,authors,num_images,domain,url
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...
3,,,[],0,politico,http://www.politico.com/news/stories/0309/2034...
4,,,[],0,fec,http://docquery.fec.gov/pdf/613/20180415910815...


## Labeling 

In [5]:
# category: 0 = true news, 1 = fake news; news_type: originating collection.
for _frame, _category, _news_type in (
    (pol_real_df, 0, 'political'),
    (pol_fake_df, 1, 'political'),
    (gos_real_df, 0, 'gossip'),
    (gos_fake_df, 1, 'gossip'),
):
    _frame['category'] = _category
    _frame['news_type'] = _news_type

In [6]:
pol_real_df.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
3,,,[],0,politico,http://www.politico.com/news/stories/0309/2034...,0,political
4,,,[],0,fec,http://docquery.fec.gov/pdf/613/20180415910815...,0,political


## Merging

In [7]:
# Stack the four labelled frames into one with a fresh 0..n-1 index.
data = pd.concat(
    [pol_real_df, pol_fake_df, gos_real_df, gos_fake_df],
    ignore_index=True,
)

# Boolean masks shared by every ratio below; all ratios are shares of the
# whole merged dataset.
is_true = data['category'] == 0
is_fake = data['category'] == 1
is_political = data['news_type'] == 'political'
is_gossip = data['news_type'] == 'gossip'
n_rows = len(data)

print(
    '-'*80,
    f"data shape: {data.shape}",
    '-'*80,
    f"ratio #true: {int(is_true.sum())/n_rows*100:.2f}%",
    f"ratio #fake: {int(is_fake.sum())/n_rows*100:.2f}%",
    '-'*80,
    f"ratio #true - political: {int((is_true & is_political).sum())/n_rows*100:.2f}%",
    f"ratio #fake - political: {int((is_fake & is_political).sum())/n_rows*100:.2f}%",
    '-'*80,
    f"ratio #true - gossip: {int((is_true & is_gossip).sum())/n_rows*100:.2f}%",
    f"ratio #fake - gossip: {int((is_fake & is_gossip).sum())/n_rows*100:.2f}%",
    '-'*80,
    sep='\n',
)

--------------------------------------------------------------------------------
data shape: (21033, 8)
--------------------------------------------------------------------------------
ratio #true: 74.80%
ratio #fake: 25.20%
--------------------------------------------------------------------------------
ratio #true - political: 2.66%
ratio #fake - political: 1.92%
--------------------------------------------------------------------------------
ratio #true - gossip: 72.14%
ratio #fake - gossip: 23.29%
--------------------------------------------------------------------------------


## Preprocessing

### Missing values

In [8]:
def get_missing(df):
    """Summarise null values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name and sorted by descending null count, limited
        to the first 10 rows, with two columns:
        ``missing_values`` (number of nulls as reported by ``isnull``) and
        ``ratio`` (that count as a percentage of ``len(df)``, rounded to a
        whole percent).
    """
    missing_values = df.isnull().sum().sort_values(ascending=False)
    # Normalise by the frame that was passed in, not by the notebook-level
    # `data` global, so the ratio is correct for any DataFrame.
    ratio = missing_values / len(df) * 100
    return pd.DataFrame({'missing_values': missing_values, 'ratio': ratio.round()}).head(10)

In [9]:
get_missing(data)

Unnamed: 0,missing_values,ratio
title,0,0.0
text,0,0.0
authors,0,0.0
num_images,0,0.0
domain,0,0.0
url,0,0.0
category,0,0.0
news_type,0,0.0


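No **missing values** are detected by the `.isnull()` method, but it only catches `NaN`/`None`: empty strings (`''`) and empty author lists (`'[]'`) act as placeholders for missing data and are checked separately below.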

In [10]:
data.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
3,,,[],0,politico,http://www.politico.com/news/stories/0309/2034...,0,political
4,,,[],0,fec,http://docquery.fec.gov/pdf/613/20180415910815...,0,political


In [11]:
# Look for rows whose text is the literal string 'NaN'.
data.loc[data['text'].eq('NaN')].shape

(0, 8)

In [12]:
# Index labels of rows whose title is an empty string.
indices = data.index[data['title'].eq('')]
len(indices)

236

In [13]:
# Index labels of rows whose text is an empty string.
indices = data.index[data['text'].eq('')]
len(indices)

600

In [14]:
# Index labels of rows whose authors field is the string '[]'.
indices = data.index[data['authors'].eq('[]')]
len(indices)

6952

In [15]:
from StopFAIke.data import remove_missing_values

# Apply the project helper to 'title' and then 'text' with '' as the
# placeholder value (presumably it filters out the matching rows — see the
# helper's definition in StopFAIke.data).
for _column in ('title', 'text'):
    data = remove_missing_values(data, _column, '')

print('-'*80, f"data shape: {data.shape}", '-'*80, sep='\n')

--------------------------------------------------------------------------------
data shape: (20339, 8)
--------------------------------------------------------------------------------


In [16]:
data.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
5,Pastors To Protest IRS Rules on Political Advo...,"On Sept. 28, pastors from 20 states will give ...",[],4,pewforum,http://www.pewforum.org/2008/09/19/pastors-to-...,0,political
7,“Dictionary” on President Obama’s Health Care ...,WASHINGTON – The Republican National Committee...,"['Written On September', 'Republican National ...",32,archive,https://web.archive.org/web/20091003005639/htt...,0,political


### Duplicates

In [17]:
def count_duplicate(df):
    """Return how many rows of ``df`` repeat an earlier, fully identical row."""
    repeated_rows = df.duplicated()
    return repeated_rows.sum()

In [18]:
# Number of fully duplicated rows, framed by separator rules.
print('-'*80, f"duplicates: {count_duplicate(data)}", '-'*80, sep='\n')

--------------------------------------------------------------------------------
duplicates: 1058
--------------------------------------------------------------------------------


In [19]:
# Rebind to the de-duplicated frame instead of mutating in place: `data` comes
# out of a filtering helper, and in-place edits on such a frame can trigger
# pandas copy/view warnings. Rebinding also keeps the cell chainable.
data = data.drop_duplicates()

print('-'*80)
print(f"data shape (wo duplicates): {data.shape}")
print('-'*80)

--------------------------------------------------------------------------------
data shape (wo duplicates): (19281, 8)
--------------------------------------------------------------------------------


###  Save to `CSV` file

In [20]:
# Persist the cleaned frame without writing the index as a column.
data.to_csv(path_or_buf='../raw_data/FakesNewsNET.csv', index=False)

### Loading the `CSV` file

In [21]:
# Read the exported CSV back into `data`.
data = pd.read_csv(filepath_or_buffer='../raw_data/FakesNewsNET.csv')

In [22]:
data.head()

Unnamed: 0,title,text,authors,num_images,domain,url,category,news_type
0,Djou wins special election for Congress,Hanabusa leads Case with nearly all the votes ...,['Honolulu Star-Bulletin'],40,archive,https://web.archive.org/web/20100523122054/htt...,0,political
1,Change We Can Believe In,Remarks of Senator Barack Obama: Apostolic Chu...,[],33,archive,https://web.archive.org/web/20080618171108/htt...,0,political
2,One in Four,One out of every four Pennsylvania households ...,['Congressman Joe Pitts'],2,medium,https://medium.com/@RepJoePitts/one-in-four-66...,0,political
3,Pastors To Protest IRS Rules on Political Advo...,"On Sept. 28, pastors from 20 states will give ...",[],4,pewforum,http://www.pewforum.org/2008/09/19/pastors-to-...,0,political
4,“Dictionary” on President Obama’s Health Care ...,WASHINGTON – The Republican National Committee...,"['Written On September', 'Republican National ...",32,archive,https://web.archive.org/web/20091003005639/htt...,0,political


In [24]:
# NOTE(review): the recorded output of this cell is (15666, 8), yet the frame
# written to this CSV earlier in the notebook had 19281 rows, and the
# execution counter jumps from 22 to 24. Confirm with Restart Kernel -> Run All
# that the saved file and this shape agree (a deleted or out-of-order cell may
# have altered `data` or the CSV).
data.shape

(15666, 8)