In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re

In [13]:
# Code to download nltk packages
"""
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()
"""

'\nimport nltk\nimport ssl\n\ntry:\n    _create_unverified_https_context = ssl._create_unverified_context\nexcept AttributeError:\n    pass\nelse:\n    ssl._create_default_https_context = _create_unverified_https_context\n\nnltk.download()\n'

## Text Cleaning (preprocessing)

In [3]:
#Dataset to clean
dataset = pd.read_csv('fake_or_real_news.csv')

textColumn = 'text' #Change this depending on dataset

dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


## Remove Special Characters
Removes HTML Tags which may be an issue when copy and pasting from an online article. Also removes commas, quotation marks etc.

In [4]:
#Remove tags and Special characters
#Using Regular Expressions

# Removing HTML Tags
dataset[textColumn] = dataset[textColumn].apply(lambda x: re.sub(r'<.*?>', '', x))

# Removing Special Characters
dataset[textColumn] = dataset[textColumn].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,Daniel Greenfield a Shillman Journalism Fellow...,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,US Secretary of State John F Kerry said Monday...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,Kaydee King KaydeeKing November 9 2016 The le...,FAKE
4,875,The Battle of New York: Why This Primary Matters,Its primary day in New York and frontrunners H...,REAL


## Convert to Lower Case

In [5]:
#Convert to lower case
dataset[textColumn] = dataset[textColumn].apply(lambda x: x.lower())

dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,daniel greenfield a shillman journalism fellow...,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,us secretary of state john f kerry said monday...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,kaydee king kaydeeking november 9 2016 the le...,FAKE
4,875,The Battle of New York: Why This Primary Matters,its primary day in new york and frontrunners h...,REAL


## Stop Word Removal
Removes stop words that dont add much to the text, 'a' 'in' 'as' 'is' etc

In [8]:
#Remove Stop words
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
dataset[textColumn] = dataset[textColumn].apply(lambda x: [word for word in x.split() if word not in stop_words])

AttributeError: 'list' object has no attribute 'split'

In [7]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"[n, e, l, , g, r, e, e, n, f, e, l, , , h, ...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,"[g, g, l, e, , p, n, e, r, e, , g, g, , l, ...",FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,"[u, , e, c, r, e, r, , f, , e, , j, h, n, ...",REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"[ , k, e, e, , k, n, g, , k, e, e, k, n, g, ...",FAKE
4,875,The Battle of New York: Why This Primary Matters,"[ , p, r, r, , , n, , n, e, w, , r, k, , ...",REAL


## Lemmatization
Dictionary based Stemming, results in more correct meanings and spellings

In [7]:
#Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
dataset[textColumn] = dataset[textColumn].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [8]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"[daniel, greenfield, shillman, journalism, fel...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,"[google, pinterest, digg, linkedin, reddit, st...",FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,"[u, secretary, state, john, f, kerry, said, mo...",REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"[kaydee, king, kaydeeking, november, 9, 2016, ...",FAKE
4,875,The Battle of New York: Why This Primary Matters,"[primary, day, new, york, frontrunners, hillar...",REAL


In [9]:
## Export preprocessed dataset ##
#dataset.to_csv('Preprocessed_Fake_News.csv', index= True)

## Vectorizing
Join words back to sentences as vectorizer works with strings rather than tokens. It tokenizes text itself.

*Might not necessarily be done as a part of cleaning our data, but this is the next step before feeding the data to our model*

In [21]:
#Vectorizing
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

vectorizedData = vectorizer.fit_transform(dataset[textColumn].apply(lambda x: ' '.join(x)))