# Data Cleaning

In [1]:
# pip install text hero
import pandas as pd
import texthero as hero

ModuleNotFoundError: No module named 'texthero'

In [2]:
# Load Datasets
fake_df = pd.read_csv('resources/fake.csv')
real_df = pd.read_csv('resources/true.csv')

In [3]:
fake_df

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23445,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23446,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23447,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23448,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [4]:
real_df

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [5]:
# Join datasets with on target variable
fake_df["news_outcome"] = 1
real_df["news_outcome"] = 0

news_df = pd.concat([fake_df, real_df])
news_df.count()

title           44867
text            44867
subject         44867
date            44867
news_outcome    44867
dtype: int64

In [6]:
# Remove null values
news_df = news_df.dropna()
news_df.count()

title           44867
text            44867
subject         44867
date            44867
news_outcome    44867
dtype: int64

In [7]:
# Drop duplicates
news_df = news_df.drop_duplicates(keep="first")
news_df.count()

title           44658
text            44658
subject         44658
date            44658
news_outcome    44658
dtype: int64

## Text Cleaning

In [8]:
# Clean 'title' feature with TextHero

# Remove punctuation
news_df['title'] = hero.remove_punctuation(news_df['title'])
# Convert to lowercase
news_df['title'] = hero.lowercase(news_df['title'])
# Remove URLs
news_df['title'] = hero.remove_urls(news_df['title'])
# Convert foreign symbols
news_df['title'] = hero.remove_diacritics(news_df['title'])
# Remove brackets
news_df['title'] = hero.remove_brackets(news_df['title'])
news_df['title'] = hero.remove_angle_brackets(news_df['title'])
news_df['title'] = hero.remove_curly_brackets(news_df['title'])
news_df['title'] = hero.remove_round_brackets(news_df['title'])
news_df['title'] = hero.remove_square_brackets(news_df['title'])
# Remove stopwords (e.g. and, then)
news_df['title'] = hero.remove_stopwords(news_df['title'])
# Remove excess spaces
news_df['title'] = hero.remove_whitespace(news_df['title'])

In [9]:
news_df['title'].head()

0    donald trump sends embarrassing new year' eve ...
1    drunk bragging trump staffer started russian c...
2    sheriff david clarke becomes internet joke thr...
3    trump obsessed even obama' name coded website ...
4    pope francis called donald trump christmas speech
Name: title, dtype: object

In [10]:
## Clean 'text' feature with TextHero

# Remove punctuation
news_df['text'] = hero.remove_punctuation(news_df['text'])
# Convert to lowercase
news_df['text'] = hero.lowercase(news_df['text'])
# Remove URLs
news_df['text'] = hero.remove_urls(news_df['text'])
# Remove foreign symbols
news_df['text'] = hero.remove_diacritics(news_df['text'])
# Remove brackets
news_df['text'] = hero.remove_brackets(news_df['text'])
news_df['text'] = hero.remove_angle_brackets(news_df['text'])
news_df['text'] = hero.remove_curly_brackets(news_df['text'])
news_df['text'] = hero.remove_round_brackets(news_df['text'])
news_df['text'] = hero.remove_square_brackets(news_df['text'])
# Remove stopwords (e.g. and, then)
news_df['text'] = hero.remove_stopwords(news_df['text'])
# Remove excess spaces
news_df['text'] = hero.remove_whitespace(news_df['text'])

In [11]:
news_df["text"].head()

0    donald trump wish americans happy new year lea...
1    house intelligence committee chairman devin nu...
2    friday revealed former milwaukee sheriff david...
3    christmas day donald trump announced would bac...
4    pope francis used annual christmas day message...
Name: text, dtype: object

In [12]:
# Further punctuation cleansing for title
news_df["title"] = news_df["title"].str.replace('[^\w\s]','')

  news_df["title"] = news_df["title"].str.replace('[^\w\s]','')


In [13]:
# Further punctuation cleansing for text
news_df["text"] = news_df["text"].str.replace('[^\w\s]','')

  news_df["text"] = news_df["text"].str.replace('[^\w\s]','')


## Lemmatisation

In [14]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [15]:
# Define lemmatise function to loop through dataframe
def lemmatise(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [16]:
# Lemmatise title feature
news_df["title"] = news_df["title"].apply(lemmatise)
news_df["title"].head()

0    donald trump send embarrassing new year eve me...
1    drunk bragging trump staffer start russian col...
2    sheriff david clarke become internet joke thre...
3     trump obsess even obama name coded website image
4      pope francis call donald trump christmas speech
Name: title, dtype: object

In [17]:
# Lemmatise text feature
news_df["text"] = news_df["text"].apply(lemmatise)
news_df["text"].head()

0    donald trump wish americans happy new year lea...
1    house intelligence committee chairman devin nu...
2    friday reveal former milwaukee sheriff david c...
3    christmas day donald trump announce would back...
4    pope francis use annual christmas day message ...
Name: text, dtype: object

## Tokenisation

In [19]:
# Tokenise title feature
news_df['title'] = hero.tokenize(news_df['title'])

In [20]:
# Tokenise text feature
news_df['text'] = hero.tokenize(news_df['text'])

In [21]:
news_df.head()

Unnamed: 0,title,text,subject,date,news_outcome
0,"[donald, trump, send, embarrassing, new, year,...","[donald, trump, wish, americans, happy, new, y...",News,"December 31, 2017",1
1,"[drunk, bragging, trump, staffer, start, russi...","[house, intelligence, committee, chairman, dev...",News,"December 31, 2017",1
2,"[sheriff, david, clarke, become, internet, jok...","[friday, reveal, former, milwaukee, sheriff, d...",News,"December 30, 2017",1
3,"[trump, obsess, even, obama, name, coded, webs...","[christmas, day, donald, trump, announce, woul...",News,"December 29, 2017",1
4,"[pope, francis, call, donald, trump, christmas...","[pope, francis, use, annual, christmas, day, m...",News,"December 25, 2017",1


In [25]:
# Convert date from string to date data type
news_df['date'] = pd.to_datetime(news_df['date'])

In [26]:
news_df.head()

Unnamed: 0,title,text,subject,date,news_outcome
0,"[donald, trump, send, embarrassing, new, year,...","[donald, trump, wish, americans, happy, new, y...",News,2017-12-31,1
1,"[drunk, bragging, trump, staffer, start, russi...","[house, intelligence, committee, chairman, dev...",News,2017-12-31,1
2,"[sheriff, david, clarke, become, internet, jok...","[friday, reveal, former, milwaukee, sheriff, d...",News,2017-12-30,1
3,"[trump, obsess, even, obama, name, coded, webs...","[christmas, day, donald, trump, announce, woul...",News,2017-12-29,1
4,"[pope, francis, call, donald, trump, christmas...","[pope, francis, use, annual, christmas, day, m...",News,2017-12-25,1


In [27]:
# Export cleaned CSV file
news_df.to_csv('resources/news.csv', index=False)

## SQL Table Creation

DROP TABLE IF EXISTS news;

CREATE TABLE news (
	title VARCHAR(60000),
	text VARCHAR(60000),
	subject VARCHAR(250),
	date DATE,
	news_outcome INT
)


One row removed within excel after export as the document 
appears error free in jupyter notebook.
Error Type: Non numerical value in 'news_outcome' feature 
but doesnt show up in pd dataframe when tested.