# Fake News Detection Model using TensorFlow in Python

In [1]:
import pandas as pd
import numpy as np
import json
import random
import csv

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

import pprint
import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# tf.disable_eager_execution()
data = pd.read_csv('news.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# Preprocessing of data

In [2]:
data = data.drop(['Unnamed: 0'],axis=1)
data.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# Encoding the labels

In [3]:

le = preprocessing.LabelEncoder()
le.fit(data['label'])
data['label'] = le.transform(data['label']) # 0 is fake, 1 real

In [4]:
embedding_dim = 50
max_length = 54
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 3000
test_portion = .1

# Tokenization

In [5]:
title = []
text = []
labels = []
for x in range(training_size):
    title.append(data['title'][x])
    text.append(data['text'][x])
    labels.append(data['label'][x])

# Applying Tokenization
tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(title)
word_index1 = tokenizer1.word_index # for every token assign one unique number
vocab_size1 =len(word_index1)
sequences1=tokenizer1.texts_to_sequences(title) # convert text to sequence
padded1 = pad_sequences(sequences1,padding=padding_type, truncating=trunc_type)
split = int(test_portion * training_size)
training_sequences1=padded1[split: training_size]
test_sequences1 = padded1[0:split]
test_labels = labels[0:split]
training_labels = labels[split:training_size]

# Generating Word Embeddings

In [6]:
embedding_index = {}
with open('glove.6B.50d.txt',encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs


# generating embeddings
embeddings_matrix = np.zeros((vocab_size1+1,embedding_dim))
for word, i in word_index1.items():
    print(word,i)
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

the 1
to 2
in 3
of 4
trump 5
for 6
a 7
on 8
and 9
is 10
clinton 11
hillary 12
with 13
by 14
donald 15
new 16
as 17
obama 18
at 19
election 20
from 21
why 22
us 23
about 24
gop 25
how 26
after 27
will 28
video 29
2016 30
are 31
over 32
be 33
what 34
it 35
– 36
this 37
that 38
debate 39
not 40
has 41
his 42
fbi 43
says 44
campaign 45
news 46
war 47
you 48
state 49
comment 50
russia 51
up 52
out 53
sanders 54
u 55
have 56
s 57
can 58
iran 59
an 60
police 61
against 62
no 63
more 64
could 65
who 66
isis 67
deal 68
america 69
world 70
republicans 71
cruz 72
president 73
just 74
now 75
house 76
he 77
trump’s 78
white 79
email 80
media 81
but 82
first 83
if 84
presidential 85
all 86
— 87
was 88
american 89
court 90
republican 91
emails 92
win 93
time 94
paul 95
bernie 96
day 97
we 98
plan 99
one 100
rubio 101
black 102
report 103
congress 104
real 105
party 106
her 107
bill 108
than 109
political 110
attack 111
their 112
anti 113
vote 114
democratic 115
supreme 116
wins 117
do 118
senate 119


# Creating model architecture

In [7]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size1+1, embedding_dim,
                              input_length=max_length, weights=[
                                  embeddings_matrix],
                              trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()



In [9]:
num_epochs = 50

training_padded = np.array(training_sequences1)
training_labels = np.array(training_labels)
testing_padded = np.array(test_sequences1)
testing_labels = np.array(test_labels)

history = model.fit(training_padded, training_labels, 
                    epochs=num_epochs,
                    validation_data=(testing_padded,
                                     testing_labels), 
                    verbose=2)

Epoch 1/10
85/85 - 1s - 10ms/step - accuracy: 0.9822 - loss: 0.0456 - val_accuracy: 0.7533 - val_loss: 1.0533
Epoch 2/10
85/85 - 1s - 14ms/step - accuracy: 0.9789 - loss: 0.0542 - val_accuracy: 0.7433 - val_loss: 0.9751
Epoch 3/10
85/85 - 1s - 8ms/step - accuracy: 0.9793 - loss: 0.0580 - val_accuracy: 0.7333 - val_loss: 1.0654
Epoch 4/10
85/85 - 1s - 9ms/step - accuracy: 0.9770 - loss: 0.0635 - val_accuracy: 0.7433 - val_loss: 0.9366
Epoch 5/10
85/85 - 1s - 9ms/step - accuracy: 0.9774 - loss: 0.0670 - val_accuracy: 0.7333 - val_loss: 0.9170
Epoch 6/10
85/85 - 1s - 10ms/step - accuracy: 0.9822 - loss: 0.0493 - val_accuracy: 0.7600 - val_loss: 1.0487
Epoch 7/10
85/85 - 1s - 10ms/step - accuracy: 0.9767 - loss: 0.0553 - val_accuracy: 0.7667 - val_loss: 0.9702
Epoch 8/10
85/85 - 1s - 11ms/step - accuracy: 0.9800 - loss: 0.0562 - val_accuracy: 0.7567 - val_loss: 0.9883
Epoch 9/10
85/85 - 1s - 14ms/step - accuracy: 0.9741 - loss: 0.0657 - val_accuracy: 0.7767 - val_loss: 1.0513
Epoch 10/10
8

In [12]:
# sample text to check if fake or not
#X = "Karry to go to France in gesture of sympathy"
#X = "Strong Solar Storm, Tech Risks Today | S0 News Oct.26.2016 [VIDEO]"
X = "How women lead differently"
sequences = tokenizer1.texts_to_sequences([X])[0]
sequences = pad_sequences([sequences],maxlen=54,padding=padding_type,truncating = trunc_type)
if(model.predict(sequences, verbose=0)[0][0] >= 0.5):
    print("This news is True")
else:
    print("This news is false")

This news is True
