# Fake News Machine Learning Prediction Model

## Imports

In [18]:
# Generic
import pandas as pd
import re
# tqdm.auto is the import intended for notebooks
from tqdm.auto import tqdm

# Create new `pandas` methods which use `tqdm` progress, e.g. the
# `progress_apply` call used in the stemming step
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

# Natural Language Processing
import nltk
# Download the stop-word corpus that the stemming step filters against
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/thomas/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/thomas/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/thomas/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/thomas/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/thomas/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/thomas/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading packag

# Loading and filtering data

In [20]:
# Read the labelled training split and the unlabelled test split from disk
train = pd.read_csv('./fake-news/train.csv')
test = pd.read_csv('./fake-news/test.csv')

# Report, per column, how many values are missing in each split
print("Empty Training data:", train.isnull().sum(), sep="\n")
print("Empty Testing data:", test.isnull().sum(), sep="\n")

Empty Training data:
id           0
title      558
author    1957
text        39
label        0
dtype: int64
Empty Testing data:
id          0
title     122
author    503
text        7
dtype: int64


In [21]:
# Some titles, authors and texts are missing and need a placeholder value.
# Every affected column holds text, so missing entries become empty strings.
train, test = (frame.fillna("") for frame in (train, test))

In [22]:
# Inspecting the data: preview the first rows of both splits.
# A notebook cell only renders the value of its LAST expression, so a bare
# `test.head()` in the middle of the cell is silently discarded. The test
# preview is therefore shown explicitly with `display` (provided by IPython),
# while the train preview remains the cell's output value.
display(test.head())
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [23]:
# To make an accurate prediction the model should see every relevant signal.
# Author, title and body text can each hint at fake news, so all three are merged
# into one "content" string per article, laid out as "<author>: <title>\n<text>".
test['content'] = test['author'].str.cat(test['title'], sep=': ').str.cat(test['text'], sep='\n')
train['content'] = train['author'].str.cat(train['title'], sep=': ').str.cat(train['text'], sep='\n')

**Stemming** <br>
To determine which words are important in the fake news articles, we first have to *stem* them,
i.e. reduce each word to its root so that different forms of the same word are treated as one.
Example:
* waited,waiting,waits -> wait

To do this we use the Python package **N**atural **L**anguage **T**ool**k**it (NLTK).

In [None]:
# A Porter stemmer reduces each word to its root form (e.g. "waiting" -> "wait")
port_stem = PorterStemmer()

# Load the English stop-word list ONCE and keep it as a set.
# Calling `stopwords.words('english')` inside the per-word filter re-reads the word list
# for every single word and then scans it linearly, which makes stemming a corpus of
# full articles prohibitively slow. A set gives constant-time membership tests.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

# Matches every character that is not an ASCII letter (digits, punctuation, symbols, ...)
_NON_LETTER = re.compile('[^a-zA-Z]')


def stemContent(content):
    """Clean an article string and reduce its words to their Porter stems.

    Steps, in order:
      1. replace every non-letter character with a space,
      2. lowercase the text,
      3. split it on whitespace,
      4. drop English stop words (common words such as "for", "an", "but"
         that carry no value for the classification task),
      5. stem each remaining word with ``port_stem``,
      6. join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text to process.

    Returns
    -------
    str
        Space-separated stemmed words; an empty string if nothing remains.
    """
    words = _NON_LETTER.sub(' ', content).lower().split()
    stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOP_WORDS]
    return ' '.join(stems)

# Apply the stemming function to each element of the training `content` column.
# `progress_apply` is one of the tqdm-aware pandas methods made available by `tqdm.pandas()`.
# NOTE(review): the column is overwritten in place, so re-running this cell would stem
# already-stemmed text — confirm whether the raw content should be kept separately.
# NOTE(review): only `train` is stemmed here; presumably `test` is handled elsewhere — verify.

train['content'] = train['content'].progress_apply(stemContent)
print(train['content'])

    

: 

KeyboardInterrupt: 