In [76]:
import re
import pandas as pd
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# nltk.download('stopwords') ----> udkommenter denne linje, hvis du ikke har nltk installeret
# nltk.download('punkt') ----> udkommenter denne linje, hvis du ikke har nltk installeret
import matplotlib.pyplot as plt

In [77]:
url = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'

data = pd.read_csv(url)

print("the shape of the data is: ", data.shape)

print(data.head())

the shape of the data is:  (250, 16)
   Unnamed: 0   id                domain        type  \
0           0  141               awm.com  unreliable   
1           1  256     beforeitsnews.com        fake   
2           2  700           cnnnext.com  unreliable   
3           3  768               awm.com  unreliable   
4           4  791  bipartisanreport.com   clickbait   

                                                 url  \
0  http://awm.com/church-congregation-brings-gift...   
1  http://beforeitsnews.com/awakening-start-here/...   
2  http://www.cnnnext.com/video/18526/never-hike-...   
3  http://awm.com/elusive-alien-of-the-sea-caught...   
4  http://bipartisanreport.com/2018/01/21/trumps-...   

                                             content  \
0  Sometimes the power of Christmas will make you...   
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...   
2  Never Hike Alone: A Friday the 13th Fan Film U...   
3  When a rare shark was caught, scientists were ...   
4  Donald

In [78]:
def cleanText(text):
    # lower case
    text = text.lower()

    # should not contain multiple spaces, tabs or newlines
    text = re.sub(r'\s+', ' ', text)

    # date and time stuff
    # text = re.sub(r'\b(?:the )?(\d{1,2})(?:st|nd|rd|th)?\s*(?:of\s+)?(January|February|March|April|May|June|July|August|September|October|November|December)\b', '<DATE>', text)
    # text = re.sub(r'r"([a-zA-Z]{3}\s\d{1,2}\s\d{4})"', '<DATE>', text, flags=re.IGNORECASE)
    # text = re.sub(r'\b(?:the )?(\d{1,2})(?:st|nd|rd|th)?(?: of)?(?: (?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?))(?: (\d{4}|\d{2}))?\b', '<DATE>', text)
    # text = re.sub(r'\b(?:the )?(\d{1,2})(?:st|nd|rd|th)?(?: of)?(?: (?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?|Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?))(?: (\d{4}|\d{2}))?\b', '<DATE>', text)

    # replace dates with <DATE>
    #  january 18, 2018. jan 18, 2018. 2018-01-18
    date_pattern = r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2}(?:,\s+|\s+)\d{4}\b|\b\d{4}-\d{2}-\d{2}\b'

    # date_pattern = r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},\s+\d{4}\b|\b\d{4}-\d{2}-\d{2}\b'
    text = re.sub(date_pattern, '<DATE>', text)
    # nov. 5
    date_pattern2 = r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\.\s+\d{1,2}\b'
    text = re.sub(date_pattern2, '<DATE>', text)

    # text = re.sub(r'\b(?:\d{1,2}[-/th|st|nd|rd\s]*)?(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)(?:\s*(?:\d{1,2}[-/th|st|nd|rd\s]*))?(?:\s*(?:\d{4}|\d{2}))?\b', '<DATE>', text)

    # replace numbers with <NUM>
    text = re.sub(r'\d+', '<NUM>', text)

    # replace urls with <URL>
    text = re.sub(r'(http|https)://[^\s]*', '<URL>', text)

    # replace emails with <EMAIL>
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '<EMAIL>', text)

    # remove various punctuations
    text = re.sub(r'[^\w\s]', '', text)

    return text

In [79]:
data['content'] = data['content'].apply(cleanText)

data.to_csv('cleaned_data.csv', index=False)

# tokenize the text using nltk

tokens = data['content'].apply(nltk.word_tokenize)

print(tokens.head())
print(tokens.shape)

0    [sometimes, the, power, of, christmas, will, m...
1    [awakening, of, NUM, strands, of, dna, reconne...
2    [never, hike, alone, a, friday, the, NUMth, fa...
3    [when, a, rare, shark, was, caught, scientists...
4    [donald, trump, has, the, unnerving, ability, ...
Name: content, dtype: object
(250,)


In [80]:
# remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = []

for list in tokens:
    filtered_words = []
    for w in list:
        if w not in stop_words:
            filtered_words.append(w)
    filtered_tokens.append(filtered_words)
    

print(tokens.head())
print(filtered_tokens)

0    [sometimes, the, power, of, christmas, will, m...
1    [awakening, of, NUM, strands, of, dna, reconne...
2    [never, hike, alone, a, friday, the, NUMth, fa...
3    [when, a, rare, shark, was, caught, scientists...
4    [donald, trump, has, the, unnerving, ability, ...
Name: content, dtype: object


In [81]:
# computing vocabularies and their length

vocabulary = []
vocabulary_with_stopwords = []

for list in tokens:
    for w in list:
        if w not in vocabulary_with_stopwords:
            vocabulary_with_stopwords.append(w)


for list in filtered_tokens:
    for w in list:
        if w not in vocabulary:
            vocabulary.append(w)


print("Vocabulary with stopwords included:\n")
print(vocabulary_with_stopwords)
print("Length of vocabulary with stopwords included: " + str(len(vocabulary_with_stopwords)) + "\n")
print("Vocabulary with stopwords excluded:\n")
print(vocabulary)
print("Length of vocabulary with stopwords excluded: " + str(len(vocabulary)))
print("Reduction in vocabulary length: " + str(len(vocabulary_with_stopwords) - len(vocabulary)))

Vocabulary with stopwords included:

Length of vocabulary with stopwords included: 16602

Vocabulary with stopwords excluded:

Length of vocabulary with stopwords excluded: 16470
Reduction in vocabulary length: 132


In [83]:
# stemming data

ps = PorterStemmer()
stemmed_filtered_tokens = []

for list in filtered_tokens:
    stemmed_words = []
    for w in list:
        if w not in stemmed_words:
            stemmed_words.append(ps.stem(w))
    stemmed_filtered_tokens.append(stemmed_words)

# making vocabulary from stemmed words
vocabulary_stemmed = []

for list in stemmed_filtered_tokens:
    for w in list:
        if w not in vocabulary_stemmed:
            vocabulary_stemmed.append(w)

print(vocabulary_stemmed)
print("Length of stemmed vocabulary: " + str(len(vocabulary_stemmed)))
print("Reduction in vocabulary length: " + str(len(vocabulary) - len(vocabulary_stemmed)))

['sometim', 'power', 'christma', 'make', 'wild', 'wonder', 'thing', 'need', 'believ', 'holi', 'triniti', 'posit', 'good', 'other', 'simpl', 'act', 'give', 'without', 'receiv', 'lost', 'mani', 'us', 'day', 'worri', 'money', 'success', 'hold', 'back', 'one', 'congreg', 'ohio', 'move', 'action', 'sermon', 'given', 'church', 'eve', 'pastor', 'grand', 'lake', 'unit', 'methodist', 'celina', 'gave', 'emot', 'import', 'understand', 'messag', 'jesu', 'religi', 'peopl', 'help', 'sure', 'suffer', 'get', 'enjoy', 'life', 'littl', 'bit', 'realli', 'generos', 'look', 'like', 'live', 'long', 'time', 'ago', 'gener', 'fashion', 'would', 'focu', 'potenc', 'take', 'end', 'decid', 'offer', 'bowl', 'pass', 'around', 'room', 'everyon', 'pitch', 'could', 'word', 'still', 'ring', 'ear', 'member', 'drove', 'local', 'waffl', 'hous', 'visit', 'ladi', 'work', 'night', 'shift', 'great', 'choic', 'famili', 'clearli', 'paid', 'bill', 'understood', 'sacrific', 'made', 'want', 'donat', 'entir', 'split', 'amongst', 'to