In [1]:
import numpy as np
import pandas as pd
# Load the raw dataset. Only the 'title', 'text' and 'label' columns are used;
# every other column is discarded by the name-based reindex at the end of this
# block, so the result no longer depends on the order of the columns in the CSV.
df = pd.read_csv('news.csv')

# Merge headline and body into the single text field used as model input.
df['news'] = df['title'] + ' ' + df['text']

# Encode the target as an integer: REAL -> 1, FAKE -> 0.
convert_to_binary = {'REAL':1,'FAKE':0}
label_codes = df['label'].map(convert_to_binary)

# Series.map turns any value missing from the mapping into NaN. Fail loudly here
# instead of letting NaN labels flow into the later encoding/splitting steps.
if label_codes.isna().any():
    unexpected_labels = sorted(df.loc[label_codes.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label value(s) in news.csv: {unexpected_labels}")
df['label'] = label_codes

# Keep exactly the two columns the rest of the notebook uses, selected by name.
df = df.reindex(columns=['news','label'])

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

# Stop words: common words that carry little of an article's overall meaning.
stop_words = stopwords.words('english')
stop_words.extend(['the','it','in'])
# Set copy of the list for constant-time membership tests inside clean_article;
# the `stop_words` list itself is left in place for any other code that reads it.
stop_word_set = frozenset(stop_words)

# Lemmatization reduces words to their basic form, e.g. dogs->dog, running->run.
WNL = WordNetLemmatizer()

# Matches every character that is neither a word character nor whitespace.
punctuation_pattern = re.compile(r'[^\w\s]')


def clean_article(article):
    """Return a normalised version of one article.

    Steps, in order: strip punctuation, tokenize, lower-case every token,
    drop stop words, lemmatize, and join the tokens back with single spaces.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    str
        The cleaned, space-separated tokens.

    Raises
    ------
    TypeError
        If `article` is not a string (for example a NaN produced by a missing
        title or text), because the regex substitution requires a string.
    """
    # NOTE(review): punctuation is removed *before* stop-word filtering, so a
    # contraction such as "don't" becomes "dont" and will only be filtered if
    # that apostrophe-free form is itself in the stop-word list -- confirm this
    # is intended.
    article = punctuation_pattern.sub('', article)
    tokens = (token.lower() for token in nltk.word_tokenize(article))
    kept_tokens = (token for token in tokens if token not in stop_word_set)
    # 'us' is passed through untouched -- presumably because the lemmatizer
    # would otherwise treat it as a plural and strip the trailing 's'.
    return " ".join(
        token if token == 'us' else WNL.lemmatize(token)
        for token in kept_tokens
    )


# Clean every article in one pass; Series.map avoids the row-by-row
# iterrows() / df.loc round trip.
df['news'] = df['news'].map(clean_article)
    
# Model inputs are the cleaned article strings; targets are the 0/1 labels.
df_input = df['news']
df_output = df['label']
labels = df_output.to_numpy()

# Fit a TF-IDF vocabulary on all articles and encode each article as one row
# of the resulting document-term matrix.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(df_input)

# Expand to a dense array of shape (n_articles, vocabulary size).
# NOTE(review): this can be very memory-hungry for a large vocabulary -- confirm
# it fits in memory for the full dataset.
x = tf_idf_matrix.toarray()

In [2]:
# Fix the random seeds so that repeated runs give consistent results.
# (`os` is Python's standard-library module; it is used here only to set an
# environment variable.)

import os
import random

# Request deterministic TensorFlow op implementations. The variable is set
# before TensorFlow is imported so that it is already in place whenever
# TensorFlow reads it.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

from numpy.random import seed
from tensorflow.random import set_seed

random.seed(0)  # Python's built-in random module
seed(0)         # NumPy's global random state
set_seed(0)     # TensorFlow's global seed

In [3]:
# One-hot encoding the labels (go from binary 0 or 1 to [1,0] and [0,1] respectively).

from keras.utils import to_categorical

# Always encode from the original 0/1 label column rather than from `labels`
# itself, so that re-running this cell cannot one-hot encode labels that have
# already been encoded.
binary_labels = df_output.values
print("example point's binary outcome:", binary_labels[100])
# num_classes is pinned so the output always has two columns, even if only one
# class happens to be present in the data.
labels = to_categorical(binary_labels, num_classes=2)
print("same example point's one hot encoded vector:", labels[100])

example point's binary outcome: 1
same example point's one hot encoded vector: [0. 1.]


In [4]:
# Split the data into a training set (60%), a validation set (20%) and a test set (20%).

from sklearn.model_selection import train_test_split

# Explicit seed for both splits: the partition is then the same on every run of
# this cell, regardless of what has consumed NumPy's global random state before.
SPLIT_SEED = 0

# First hold out 20% of all samples as the test set ...
x_train, x_test, y_train, y_test = train_test_split(
    x, labels, test_size=0.2, random_state=SPLIT_SEED
)
# ... then take 25% of the remaining 80% (= 20% of the total) as the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=SPLIT_SEED
)

In [5]:
# Standardise the features: remove the mean and scale to unit variance.
# The transformation is learned from the training data only and then applied,
# unchanged, to the training, validation and test sets.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)

x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

In [7]:
# Write each prepared split to its own file via np.save, using the names below.
for file_stem, split_array in (
    ('X_train', x_train),
    ('Y_train', y_train),
    ('X_val', x_val),
    ('Y_val', y_val),
    ('X_test', x_test),
    ('Y_test', y_test),
):
    np.save(file_stem, split_array)