In [14]:
import numpy as np
import pandas as pd
import plotly.express as px
import spacy
from spacy.matcher import Matcher

**Dataset**

In [2]:
# Load the labelled news articles from the project's Excel export.
data = pd.read_excel(io='../data/news_dataset.xlsx')

In [3]:
# Peek at the first five rows to check the columns that were loaded.
data.head(n=5)

Unnamed: 0,text,label
0,No comment is expected from Barack Obama Membe...,fake
1,Did they post their votes for Hillary already?,fake
2,"Now, most of the demonstrators gathered last ...",fake
3,A dozen politically active pastors came here f...,real
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",fake


## Data Preprocessing

**Strategy:**
1. Tokenize news.
2. Lower case the tokens.
3. Remove stop words.
4. Use lemmatization on the tokens.
5. Replace numbers with a `<NUMBER>` token and time expressions with a `<TIME>` token.
6. Remove special characters from the news.
7. Remove rare tokens.
8. Vectorize the news using TF-IDF or word embeddings to represent it in a format suitable for a machine learning model.

In [4]:
# Minimal spaCy demo: load the small English pipeline, run it on one
# sentence, and list the text of every token it produced.
nlp = spacy.load(name="en_core_web_sm")
text = "Tokenize this sentence using Spacy."
doc = nlp(text)
[tok.text for tok in doc]

['Tokenize', 'this', 'sentence', 'using', 'Spacy', '.']

In [7]:
# Pick a short article (row label 1) to walk through the preprocessing steps.
sample_news = data.loc[1, 'text']

In [8]:
# Step 1: run the pipeline on the sample and show the raw token strings.
doc = nlp(sample_news)
[tok.text for tok in doc]

['Did', 'they', 'post', 'their', 'votes', 'for', 'Hillary', 'already', '?']

In [9]:
# Step 2: lower-case every token's text.
[tok.text.lower() for tok in doc]

['did', 'they', 'post', 'their', 'votes', 'for', 'hillary', 'already', '?']

In [10]:
# Step 3: keep only the tokens that spaCy does not flag as stop words.
[tok.text.lower() for tok in doc if not tok.is_stop]

['post', 'votes', 'hillary', '?']

In [11]:
# Fetch the lemmatizer component from the pipeline and show its `model`
# attribute (printed explicitly: a bare `None` would render no cell output).
lemmatizer = nlp.get_pipe(name="lemmatizer")
print(lemmatizer.model)

None


In [12]:
# Step 4: replace each remaining (non-stop-word) token with its lower-cased lemma.
[tok.lemma_.lower() for tok in doc if not tok.is_stop]

['post', 'vote', 'hillary', '?']

In [29]:
# Prototype of the combined normalisation on a longer article (row label 3).
sample_news = data.loc[3, 'text']
doc = nlp(sample_news)

# Placeholder tokens substituted for number-like and time-entity tokens.
num_token, time_token = '<NUMBER>', '<TIME>'

# Map every token to its normalised form. Number-like tokens are checked
# first, then TIME entities; stop words become "" so that they can be
# filtered out when the text is reassembled below.
news = [
    num_token if token.like_num
    else time_token if token.ent_type_ == 'TIME'
    else "" if token.is_stop
    else token.lemma_.lower()
    for token in doc
]

# Re-join the surviving tokens into a single space-separated string.
' '.join(filter(None, news))

'dozen politically active pastor come private dinner friday <TIME> hear conversion story unique context presidential politic : louisiana gov. bobby jindal travel hinduism protestant christianity , ultimately , call â€œevangelical catholic.â€\x9d \n\n <NUMBER> <TIME> , jindal , <NUMBER> , recall talk girl high school want â€œsave soul , â€\x9d read bible closet parent feel stir watch movie senior year depict jesus cross . \n\n â€œi strike , strike hard , â€\x9d jindal tell pastor . â€œthis son god , die sins.â€\x9d \n\n jindalâ€ ™ s session christian clergy , lead congregation early presidential battleground state iowa south carolina , - - scene effort louisiana governor find political base help propel tier republican candidate seek run white house <NUMBER> . \n\n know gop circle mastery policy issue health care , jindal , rhodes scholar graduate ivy leagueâ€ ™ s brown university , obvious pool activist supporter help drive excitement outside home state . harness religious experience wa

In [None]:
def preprocessing(news, pipeline=None, num_token='<NUMBER>', time_token='<TIME>'):
    """Normalise one news article into a space-separated string of tokens.

    Applies the per-document steps of the preprocessing strategy:
    tokenisation, stop-word removal, lower-cased lemmatisation, replacement
    of number-like tokens and TIME entities with placeholder tokens, and
    removal of punctuation / whitespace tokens.

    Removing rare tokens and vectorising need corpus-level statistics, so
    they are intentionally not performed here.

    Parameters
    ----------
    news : str
        Raw text of a single article.
    pipeline : callable, optional
        Callable that turns a string into an iterable of token objects
        exposing `is_space`, `is_punct`, `like_num`, `ent_type_`, `is_stop`
        and `lemma_` (e.g. a loaded spaCy pipeline). Defaults to the
        notebook-level `nlp` object.
    num_token : str, optional
        Placeholder emitted for number-like tokens.
    time_token : str, optional
        Placeholder emitted for tokens inside a TIME entity.

    Returns
    -------
    str
        The normalised article. An empty string is returned when `news` is
        not a string (e.g. None/NaN from a missing cell) or is blank.
    """
    # Missing spreadsheet cells arrive as None/NaN rather than str; treat
    # them, and blank strings, as "no text" instead of failing in the pipeline.
    if not isinstance(news, str) or not news.strip():
        return ''

    if pipeline is None:
        pipeline = nlp  # notebook-level spaCy pipeline

    tokens = []
    for token in pipeline(news):
        # Drop punctuation and whitespace-only tokens (line breaks, etc.).
        if token.is_space or token.is_punct:
            continue
        # Numbers are tested before stop words so that number words which
        # are also flagged as stop words still map to the number placeholder.
        if token.like_num:
            tokens.append(num_token)
        elif token.ent_type_ == 'TIME':
            tokens.append(time_token)
        elif not token.is_stop:
            lemma = token.lemma_.lower()
            if lemma:
                tokens.append(lemma)

    return ' '.join(tokens)