In [63]:
import numpy as np
import pandas as pd
import plotly.express as px
import spacy

**Dataset**

In [2]:
# Load the news dataset; the path is relative to the notebook's working directory.
data = pd.read_excel('../data/news_dataset_small.xlsx')

In [57]:
# Preview the first rows to see what the text and label columns look like.
data.head()

Unnamed: 0,text,label
0,No comment is expected from Barack Obama Membe...,fake
1,Did they post their votes for Hillary already?,fake
2,"Now, most of the demonstrators gathered last ...",fake
3,A dozen politically active pastors came here f...,real
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",fake


In [4]:
# (number of rows, number of columns)
data.shape

(14999, 2)

In [5]:
# Per-column dtype and non-null counts, to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14999 non-null  object
 1   label   14999 non-null  object
dtypes: object(2)
memory usage: 234.5+ KB


## Data Preprocessing

**Strategy:**
1. Tokenize news.
2. Lower case the tokens.
3. Remove stop words.
4. Use lemmatization on the tokens.
5. Remove punctuation from the news.
6. Vectorize the news using TF-IDF or word embeddings to represent it in a format suitable for a machine learning model.

In [84]:
# Functions for preprocessing
# The spaCy pipeline is loaded once here; preprocess_news reads it as a global.
nlp = spacy.load("en_core_web_sm")

def preprocess_news(news, lemmatize=False):
    """Clean raw news texts into space-joined, lower-cased tokens.

    Each text is processed with the module-level spaCy pipeline ``nlp``.
    Stop words, punctuation and whitespace tokens are dropped; the remaining
    tokens are lower-cased and joined with single spaces.

    Parameters
    ----------
    news : str or iterable of str
        Texts to clean. A single string is treated as one document rather
        than being iterated character by character.
    lemmatize : bool, default False
        If False (the default, matching the original behaviour) only the
        tokenizer runs and each token's surface text is kept. If True the
        texts are sent through ``nlp.pipe`` so that ``token.lemma_`` is
        available, and the lemma is emitted instead. This runs more pipeline
        components than the tokenizer alone, so expect it to take longer.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order.
    """
    # A bare string is itself iterable; wrap it so it counts as one document.
    if isinstance(news, str):
        news = [news]

    if lemmatize:
        # Lemmas come from the tagging/lemmatizer components; the dependency
        # parser and NER are not used below, so they are skipped.
        docs = nlp.pipe(news, disable=["parser", "ner"])
    else:
        # is_stop / is_punct / is_space are lexical flags, so the tokenizer
        # alone is enough when lemmas are not requested.
        docs = nlp.tokenizer.pipe(news)

    preprocessed_news = []
    for doc in docs:
        kept_words = [
            (token.lemma_ if lemmatize else token.text).lower()
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)
        ]
        preprocessed_news.append(' '.join(kept_words))

    return preprocessed_news

def preprocess_label(label):
    """Encode a news label as an integer class.

    Returns 1 when ``label`` equals the exact string ``"fake"`` and 0 for
    every other value. The comparison is case-sensitive and does not strip
    whitespace.
    """
    if label == "fake":
        return 1
    return 0

In [85]:
# Build the cleaned frame in a single constructor call on data's own index.
# The text list is placed positionally and the label Series aligns on the same
# index, so each row keeps its own text/label pair even if `data` does not
# have a default 0..n-1 index.
preprocessed_data = pd.DataFrame(
    {
        'text': preprocess_news(data['text']),
        'label': data['label'].apply(preprocess_label),
    },
    index=data.index,
)

In [86]:
# Spot-check: cleaned text of the row labelled 0.
preprocessed_data['text'][0]

'comment expected barack obama members fyf911 fukyoflag blacklivesmatter movements called lynching hanging white people cops encouraged radio tuesday night turn tide kill white people cops send message killing black people america f***yoflag organizers called sunshine radio blog hosted texas called sunshine s f***ing opinion radio snapshot fyf911 @lolatwhitefear twitter page 9:53 p.m. shows urging supporters fyf911 tonight continue dismantle illusion white snapshot twitter radio invite fyf911the radio aired 10:00 p.m. eastern standard time callers clearly lynching killing white people 2:39 minute clip radio heard provided breitbart texas like referred hannibal received death threats result interrupting fyf911 conference calls unidentified black man said mother f**kers s start f***ing like bunch ni**ers takin s roll said cause roll gangs seven black mother f**ckers white person lynch ass let s turn tables conspired cops started losing people state emergency speculated things happen big 

In [87]:
# Spot-check: encoded label of the row labelled 0 (1 = "fake", 0 = anything else).
preprocessed_data['label'][0]

1