In [1]:
import pandas as pd
import warnings
import string
import nltk
import re

from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords

warnings.filterwarnings("ignore", message='.*looks like a URL.*', category=UserWarning, module='bs4')


In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexandravoda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In this notebook I'm going to clean the dataset by removing duplicates, empty articles, html tags, stop words and special symbols. I also tried to remove some of the bias I have identified in the exploratory phase. After data cleaning, I lemmatize and tokenize the text content in order to be ready for training.

In [3]:
df_fake = pd.read_csv('../../data/raw/Fake.csv')
df_true = pd.read_csv('../../data/raw/True.csv')

df_fake['fake'] = 1
df_true['fake'] = 0

data = pd.concat([df_true, df_fake], ignore_index=True, sort=False)
data.head()

Unnamed: 0,title,text,subject,date,fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


Because some of the articles contain html tags, I'll use Beautifulsoup to extract only the text from the existing content.

In [4]:
data['text'] = data['text'].apply(lambda x: BeautifulSoup(x).get_text())

I'll remove text duplicates and entries containing empty texts.

In [5]:
print("Initial dataset size: ", data.shape[0])
data.drop_duplicates(subset=['text'], inplace=True)
print("Dataset size after removing duplicates: ", data.shape[0])
data = data[data['text'].str.len() > 0]
print("Dataset size after removing empty articles: ", data.shape[0])

Initial dataset size:  44898
Dataset size after removing duplicates:  38645
Dataset size after removing empty articles:  38644


I'm going to concatenate the title and the article content in order to get a single piece of text. I'll also remove the subject and date fields. This will result in a dataframe containing just the text and its category (fake/true) 

In [6]:
data['text'] = data['title'] + " " + data['text']
data = data[['text', 'fake']]
data.head()

Unnamed: 0,text,fake
0,"As U.S. budget fight looms, Republicans flip t...",0
1,U.S. military to accept transgender recruits o...,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,0
3,FBI Russia probe helped by Australian diplomat...,0
4,Trump wants Postal Service to charge 'much mor...,0


Because "Reuters" is specified as an information source in the majority of true articles, I'll delete its occurences in order to remove the bias form the training set.

In [7]:
data['text'] = data['text'].apply(lambda x: 
            x.replace("WASHINGTON \(Reuters\)", "").replace("(Reuters)", "").replace("Reuters", ""))

In [None]:
# Removing urls?
# data['text'].apply(lambda x: re.compile(r'https?://\S+|www\.\S+').sub(r'',x))

Next, I'll remove stop words, punctation marks and other special symbols, because they do not contain useful information that should be used in training. \
Before lemmatization, I need to tokenize the articles contents. This implies splitting the blocks of texts into individual words.\
\
When preparing data for natural language processing tasks, one common preprocessing step is converting the tokens to lowercase. However, for some tasks, keeping the capital letters can make sense. In our case I prefer skipping the lowercase step, because I expect fake news articles to contain more upper-cased words. As we can see below, fake articles have an average of 126.5 uppercase letters, whereas true articles have an average of 89.5 uppercase letters.

In [9]:
# Number of uppercase letters in fake articles
data[data['fake'] == 1]['text'].apply(lambda x: sum(1 for c in x if c.isupper())).describe()

count    17453.000000
mean       126.543116
std         97.254926
min          0.000000
25%         79.000000
50%        103.000000
75%        143.000000
max       2832.000000
Name: text, dtype: float64

In [10]:
# Number of uppercase letters in true articles
data[data['fake'] == 0]['text'].apply(lambda x: sum(1 for c in x if c.isupper())).describe()

count    21191.000000
mean        89.539144
std         61.260820
min          4.000000
25%         43.000000
50%         78.000000
75%        117.000000
max       1294.000000
Name: text, dtype: float64

In [11]:
stop = set(stopwords.words('english') + list(string.punctuation))
data['text'] = data['text'].apply(lambda x: [token for token in word_tokenize(x) if token not in stop])

Now we can continue with the lemmatization step that will convert tokens obtained at the previous step to their base form, removing their inflectional endings.

In [12]:
lemmatizer = nltk.WordNetLemmatizer()
data['text'] = data['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [13]:
data

Unnamed: 0,text,fake
0,"[As, U.S., budget, fight, loom, Republicans, f...",0
1,"[U.S., military, accept, transgender, recruit,...",0
2,"[Senior, U.S., Republican, senator, 'Let, Mr.,...",0
3,"[FBI, Russia, probe, helped, Australian, diplo...",0
4,"[Trump, want, Postal, Service, charge, 'much, ...",0
...,...,...
44115,"[The, White, House, The, Theatrics, ‘, Gun, Co...",1
44116,"[Activists, Terrorists, How, Media, Controls, ...",1
44117,"[BOILER, ROOM, –, No, Surrender, No, Retreat, ...",1
44118,"[Federal, Showdown, Looms, Oregon, After, BLM,...",1


Save the preprocessed data in a separate file that will be later used for training.

In [14]:
data.to_csv('../../data/processed/data.csv')