# Data Preprocessing
In this notebook, we will be doing the following:
- Importing the data
- Cleaning the data
- Creating synthetic features
- Exporting the data

In [1]:
# Importing python modules
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Importing the fake-news dataset
df_train = pd.read_csv('Datasets/fake-news/train.csv')
df_test = pd.read_csv('Datasets/fake-news/test.csv')

In [3]:
# Dropping the 'title' column and NaN values from the dataset
df_train = df_train.drop(['title'], axis=1)
df_train = df_train.dropna()
df_test = df_test.drop(['title'], axis=1)
df_test = df_test.dropna()

In [5]:
# Cleaning the text
# In this step, we will remove all the punctuations, numbers and stopwords from the text.
# We will also convert all the text to lowercase and apply stemming to the text.
# We will do this for the training and testing dataset separately.

# Training dataset
corpus_train = []
for i in range(0, len(df_train)):
    review = re.sub('[^a-zA-Z]', ' ', df_train['text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus_train.append(review)   

KeyError: 6

In [None]:
# adding the cleaned text to the train and test datasets
df_train['text'] = corpus_train
# Exporting the cleaned datasets
df_train.to_csv('Data-Preprocessing/train_cleaned.csv', index=False)

In [None]:
# Testing dataset
corpus_test = []
for i in range(0, len(df_test)):
    review = re.sub('[^a-zA-Z]', ' ', df_test['text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus_test.append(review)

In [None]:
# adding the cleaned text to the train and test datasets
df_test['text'] = corpus_test
# Exporting the cleaned datasets
df_test.to_csv('Data-Preprocessing/test_cleaned.csv', index=False)

In [None]:
# Creating the Bag of Words model for the corpus
# In this step, we will create a Bag of Words model for the corpus.
# We will also limit the number of features to 1500.

cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus['text']).toarray()
y = df_train.iloc[:, 4].values