# Data Preprocessing

## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

!pip install spacy
!pip install pytextrank
import spacy
import pytextrank
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('textrank')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Collecting pytextrank
  Downloading pytextrank-3.3.0-py3-none-any.whl (26 kB)
Collecting GitPython>=3.1 (from pytextrank)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting icecream>=2.1 (from pytextrank)
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython>=3.1->pytextrank)
  Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorama>=0.3.9 (from icecream>=2.1->pytextrank)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting executing>=0.3.1 (from icecream>=2.1->pytextrank)
  Downloading executing-2.0.1-py2.py3-none-any.whl (24 kB)
Collecting asttokens>=2.0.1 (from icecream>=2.1->pytextrank)
  Downloading asttokens-2.4.1-py2.py3-none-any.whl (27 kB)


<pytextrank.base.BaseTextRankFactory at 0x79906fee02e0>

## Data Collection

In [2]:
# Load the raw articles from the CSV file in the working directory
dataset = pd.read_csv(filepath_or_buffer='data.csv')

## Data understanding and exploration

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32470 entries, 0 to 32469
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     32470 non-null  int64 
 1   title          32470 non-null  object
 2   text           32470 non-null  object
 3   fake or true?  32470 non-null  object
dtypes: int64(1), object(3)
memory usage: 1014.8+ KB


In [4]:
# Summary statistics; with default arguments only the numeric column is described
dataset.describe()

Unnamed: 0.1,Unnamed: 0
count,32470.0
mean,10146.625008
std,7127.254084
min,0.0
25%,4058.25
50%,8117.5
75%,16234.75
max,24352.0


In [5]:
# Display the frame; pandas abbreviates the output to the first and last rows
dataset

Unnamed: 0.1,Unnamed: 0,title,text,fake or true?
0,0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,fake
1,1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",fake
2,2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,true
3,3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",fake
4,4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,fake
...,...,...,...,...
32465,8112,Opposition leader says Brexit must not be used...,"BRUSSELS (Reuters) - Jeremy Corbyn, leader of ...",true
32466,8113,‘Pro-Life’ Scott Walker Just Signed Two Bills...,Republicans always talk a big game about being...,fake
32467,8114,LIBERAL HUMOR? FLORIDA BAR POSTS SIGN Suggesti...,It s a federal offense to threaten a president...,fake
32468,8115,New York protesters camp out at Goldman Sachs ...,NEW YORK (Reuters) - Dozens of protesters gath...,true


## Data cleaning

In [6]:
# Drop the 'Unnamed: 0' column (presumably an index that was saved into the
# CSV - confirm against the data source). The labels are passed as a list
# rather than an unordered set, and errors='ignore' keeps the cell safe to
# re-run once the column is already gone.
dataset = dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# For every column (identified by its position) report how many entries are
# missing and what percentage of all rows that represents.
total_rows = len(dataset)
for i, column_name in enumerate(dataset.columns):
    missing_data = dataset[column_name].isna().sum()
    perc = missing_data / total_rows * 100
    print('>%d,  missing entries: %d, percentage %.2f' % (i, missing_data, perc))

>0,  missing entries: 0, percentage 0.00
>1,  missing entries: 0, percentage 0.00
>2,  missing entries: 0, percentage 0.00


In [8]:
# English stopwords from NLTK, stored in a set for fast membership checks
stop_words = set(stopwords.words("english"))
def remove_stopwords(s):
    """Return `s` with English stopwords removed.

    The string is split with NLTK's `word_tokenize`; every token that is not
    a stopword is kept (in its original casing) and the kept tokens are
    re-joined with single spaces.

    The stopword comparison is case-insensitive: the NLTK list is lowercase,
    so a capitalised token such as "The" would otherwise slip through.
    NOTE(review): this also covers tokens that an earlier lemmatization step
    may have re-capitalised - confirm against the lemmatizer's output.
    """
    words = word_tokenize(s)
    kept = [word for word in words if word.lower() not in stop_words]
    return ' '.join(kept)

def lemmatization(text):
    """Run `text` through the spaCy pipeline and return its lemmas joined by single spaces."""
    doc = nlp(text)
    return ' '.join(tok.lemma_ for tok in doc)

def transform(s):
    """Normalise a raw text string.

    Steps, in order: lower-case and trim, replace URLs with a space, delete
    common punctuation, replace HTML-like tags with a space, replace every
    remaining non-alphanumeric character with a space, collapse whitespace
    runs, lemmatize, and finally drop stopwords.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The cleaned text, with tokens separated by single spaces.
    """
    # The patterns are raw strings so that regex escapes are not interpreted
    # as (invalid) Python string escapes, which newer Python versions warn about.
    s = s.lower().strip()
    s = re.sub(r'http\S+', ' ', s)
    # These characters are deleted outright (not replaced by a space), so
    # "u.s." becomes "us".
    s = re.sub(r'[,\.!?:()"]', '', s)
    s = re.sub(r'<.*?>', ' ', s)
    s = re.sub(r'[^a-zA-Z0-9]', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    s = lemmatization(s)
    s = remove_stopwords(s)
    return s

In [9]:
# Run both free-text columns through the same cleaning pipeline
for text_column in ('text', 'title'):
    dataset[text_column] = dataset[text_column].apply(transform)

## Data Transformation

In [10]:
# Give the label column a code-friendly name. Reassignment is used instead of
# inplace=True, matching how the column drop earlier in the notebook rebinds
# `dataset`.
dataset = dataset.rename(columns={'fake or true?': 'categorical'})

In [11]:
# Count how many rows carry each value of the 'categorical' column
dataset['categorical'].value_counts()

categorical
true    17530
fake    14940
Name: count, dtype: int64