In [2]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf

from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers




# Dataset

The dataset is sourced from [Kaggle](kaggle.com). <br>
It is the [fake-and-real-news-dataset](https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset?select=True.csv) </br>

Since the dataset is split into two parts Fake.csv and True.csv , we combine the datasets into one and add a new column to indicate whether it is true or false.


In [3]:
fake_df = pd.read_csv('Fake.csv')
true_df = pd.read_csv('True.csv')

fake_df['label'] = 0
true_df['label'] = 1

combined_df = pd.concat([fake_df, true_df], ignore_index=True)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,WATCH: Female Cop Halts Sex Offender’s Violen...,"Earlier in January, 31-year-old Michael Cox wa...",News,"February 1, 2016",0
1,"Senate confirms two FERC commissioners, restor...",WASHINGTON (Reuters) - The U.S. Senate on Thur...,politicsNews,"August 4, 2017",1
2,AWESOME! Conservative Artist Crashes Anti-Trum...,Our favorite conservative street artist Sabo c...,politics,"Nov 13, 2017",0
3,Clarence Thomas On The Bench Without Scalia I...,Supreme Court Justice Clarence Thomas is the e...,News,"February 22, 2016",0
4,HUH? NYT EDITOR Blames “Republican Rage Machin...,Talk about projecting! On Fareed Zakaria s CNN...,politics,"Jun 18, 2017",0


In [4]:
combined_df = combined_df[['text', 'label']]
combined_df.head()

Unnamed: 0,text,label
0,"Earlier in January, 31-year-old Michael Cox wa...",0
1,WASHINGTON (Reuters) - The U.S. Senate on Thur...,1
2,Our favorite conservative street artist Sabo c...,0
3,Supreme Court Justice Clarence Thomas is the e...,0
4,Talk about projecting! On Fareed Zakaria s CNN...,0


In [5]:
combined_df.shape

(44898, 2)

In [6]:
# train_split, val_split = train_test_split(combined_df, train_size=0.8, random_state=1)

train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

train_split, test_split = train_test_split(combined_df, random_state=1, test_size=1 - train_ratio)
val_split, test_split = train_test_split(test_split, test_size=test_ratio/(test_ratio + validation_ratio),random_state=1)


In [7]:
train_reviews = train_split['text']
y_train = train_split['label']

val_reviews = val_split['text']
y_val = val_split['label']

test_reviews = test_split['text']
y_test = test_split['label']

In [8]:
collections.Counter(y_train)

Counter({0: 17675, 1: 15998})

# Preprocessing the data

Preprocessing the data with some simple filters to remove numbers and special characters and convert all words to lower case. <br>
The majority of the preprocessing section is handled by the Word Vectors which are coded later on. 

In [9]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words=20000,
                                               filters='0123456789!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                                               lower=True)

In [10]:
tokenizer.fit_on_texts(train_reviews)

In [11]:
# converting the input to a sequence of integers instead of a bag of words
# each word is represneted by a different integer

X_train = tokenizer.texts_to_sequences(train_reviews)
print(X_train[0])

[107, 67, 37, 8, 35, 333, 77, 492, 198, 2, 2640, 1, 3393, 1720, 2634, 233, 26, 84, 2, 239, 2645, 1, 599, 9, 621, 285, 109, 1764, 4, 4215, 4272, 1, 91, 72, 12, 9, 207, 1643, 39, 22, 319, 7, 198, 21, 2, 8451, 16, 31, 76, 808, 18240, 7, 1086, 2, 226, 1, 35, 1294, 7, 11, 1, 817, 1487, 7, 34, 23, 1, 1281, 2249, 6, 1, 1417, 2, 2645, 1, 599, 3, 1, 1820, 10, 285, 134, 99, 5, 34, 113, 1098, 90, 7, 47, 91, 72, 398, 1634, 3559, 12, 582, 35, 1098, 1294, 26, 233, 113, 896, 26, 84, 47]


In [12]:
[tokenizer.index_word[x] for x in X_train[0][:5]] , X_train[0][:5]


(['washington', 'reuters', 'u', 's', 'president'], [107, 67, 37, 8, 35])

# Normalizing the Inputs

After the input text has been converted into a sequence of numbers, we further normalize the text so that all articles have a uniform length, which, after some hit-and-trial was found to be around 250. 

In [13]:
MAX_LEN = 250
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_LEN)

In [14]:
print(X_train[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0   107
    67    37     8    35   333    77   492   198     2  2640     1  3393
  1720  2634   233    26    84     2   239  2645   

In [15]:
X_val = tokenizer.texts_to_sequences(val_reviews)
X_val = keras.preprocessing.sequence.pad_sequences(X_val, maxlen=MAX_LEN)

In [16]:
X_test = tokenizer.texts_to_sequences(test_reviews)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_LEN)

In [17]:
print(tokenizer.word_index['awesome'])

5027


In [18]:
num_tokens = len(tokenizer.word_index) + 1
embedding_dim = 300

# The Model

The model to classify the text is relatively simple. </br>
It first consists of the Embedding Layer to create the Word Vectors.

There are two approaches to word vectors - use a already created pre-existing word vector to classify the entities in your dataset or start fresh with a random (empty) embedding matrix and let the model simultaenously come up with its own vectors while fitting the training data.

Since the pre-existing word vectors were quite big and cumbersome to deal with, I have only used the fresh / empty embedding approach here. </br>
Earlier experiments showed that there wasn't a significant difference in using pre-trained word vectors or this approach </br>

The model itself comprises of a AveragePooling layer that takes the average of all the embedding word vectors and converts it into a single vector before sending it further into the network. </br>
Further, we only have 2 hidden layers in the Neural Network of 128 and 64 dimensions respectively.
This was mainly derived from a hit and trial approach, also taking into account that the embedding matrix had 300 features

The output layer only has a single neuron which gives a decimal value (around 0.9+ if it predicts it's Real or around 0.05 if it says it's Fake)

Added dropout layers to significantly reduce overfitting on the training data and terminating the model early since this was a observed earlier. </br>
Also added Early Stopping (with patience = 3, which could be reduced) to stop the process in case it detects the Valuation and Train Accuracy diverging (Val accuracy decreases but Train accuracy increases), indicating overfitting.

In [19]:
tf.random.set_seed(0)

model = keras.Sequential()

model.add(layers.Embedding(input_dim=num_tokens, 
                           output_dim=embedding_dim, 
                           input_length=MAX_LEN))


model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(128, activation='relu', kernel_initializer=tf.keras.initializers.random_normal(seed=1)))
model.add(layers.Dense(64, activation='relu', kernel_initializer=tf.keras.initializers.random_normal(seed=1)))
model.add(layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.random_normal(seed=1)))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(X_train, y_train, epochs=20, batch_size=512, validation_data=(X_val, y_val), callbacks=[es_callback])



Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


# Results

The model has a very impressive train data accuracy of 99.97% and an even impressive Valuation accuracy of 99.1 % 
The model can very confidently classify news as fake or real.

In [20]:
def sentiment(reviews):
  seqs = tokenizer.texts_to_sequences(reviews)
  seqs = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=MAX_LEN)
  return model.predict(seqs)


In [21]:
fake_news = 'Scientists have discovered a massive underground city beneath the Sahara Desert, believed to be inhabited by a lost civilization that possesses advanced technology far beyond modern capabilities. This hidden metropolis, which spans hundreds of miles, is said to be powered by a mysterious energy source that can cure all known diseases and generate limitless clean energy. Archaeologists claim that this civilization has been in contact with extraterrestrial beings, who have guided them in developing technologies that defy the laws of physics. Government officials are allegedly keeping this discovery a secret to prevent global panic and protect the powerful technologies from falling into the wrong hands.'
real_news = "The government should publish advice for its departments on engaging with young people, including on TikTok, a group of MPs has said.\n\nThe culture, media and sport committee has been looking into countering disinformation online.\n\nIts call comes despite TikTok currently being banned on government devices due to data security concerns.\n\nAccurate information needs to be communicated in a \"relatable\" way, the MPs say.\n\nThe committee says that countering misinformation is particularly important for young people, who are increasingly turning away from traditional media and towards social media for their information.\n\nIt advises meeting young people \"where they are\" - with 15 to 24 year olds spending around an hour per day on TikTok, according to media regulator Ofcom.\n\nThe report says: \"The Government must have a clear strategy for communicating with young people and adapting to the development of new apps and platforms which appeal to this audience.\"\n\nSome MPs do still use TikTok, despite the fact it is blocked on the Parliament Wi-Fi network.\n\nDefence Secretary Grant Shapps has almost 20,000 followers, though he says he does not have the app on his personal phone.\n\n\"Grant knows that TikTok can be a valuable tool for communicating with his constituents,\" a source close to Mr Shapps told the BBC in September 2023.\n\nThe Ministry of Defence also operates a separate account which has about 17,500 followers.\n\nTikTok is under pressure in many countries over its links to the Chinese state - links it has always denied - with law-makers in the US recently passing legislation saying it should be sold or banned.\n\nIt has though endorsed the committee's findings.\n\n\"We welcome this report's recommendation that the Government should engage with the public on whatever platform they choose to use\", it said in a statement.\n\nThe government has responded to the committee report by saying it makes efforts to \"reach people directly on the platforms they spend the most time on.\"\n\nIt adds that the Online Safety Act, which came into law last year, \"will also help tackle the root cause of disinformation\" by requiring social media companies \"to swiftly remove illegal misinformation and disinformation as soon as they become aware of it.\"\n\nBut the law was criticised at the time by fact-checking service Full Fact, which said it did not go far enough \"to address the way that platforms treat harmful misinformation and disinformation.\"\n\nThe MPs took evidence from over 60 different people prior to publishing their report, including disinformation experts and journalists.\n\nAmong these were BBC journalists Rebecca Skippage and Marianna Spring.\n\nAlso interviewed were financial journalist Martin Lewis, Channel 4 journalist Georgina Lee and the chief executive of Full Fact Will Moy"

In [22]:
print(sentiment([fake_news, real_news]))

[[0.0015998]
 [0.9991107]]


These are the results of the model sample news articles obtained online. </br>

The real news was obtained from a BBC news article on Fake News :) </br>
The fake news was generated by ChatGPT

In [23]:
model.evaluate(X_test, y_test)



[0.018902253359556198, 0.995100200176239]

In [24]:
y_pred = model.predict(X_test)



In [27]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = (y_pred >= 0.5).astype(int)

print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2298
           1       0.99      0.99      0.99      2192

    accuracy                           1.00      4490
   macro avg       1.00      1.00      1.00      4490
weighted avg       1.00      1.00      1.00      4490



In [28]:
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[2287   11]
 [  11 2181]]


In [29]:
from tensorflow.keras.models import load_model

# Saving the model
model.save('fake_news_classifier.h5')



  saving_api.save_model(


In [30]:
import joblib

# Save the tokenizer using joblib
joblib.dump(tokenizer, 'tokenizer.joblib')


['tokenizer.joblib']