## step 1  - Importing Libraries and dataset

In [None]:
import pandas as pd
df = pd.read_csv('../TextFiles/news.csv')
df.head(10)

## step 2 - Preprocessing Dataset

In [13]:
print(len(df))
df = df.dropna()
df = df.drop(["Unnamed: 0"], axis = 1) #this column is of no use...
print(len(df))

6335
6335


In [11]:
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


## Generating Word Embeddings
#### Data Encoding
It converts the categorical column (label in out case) into numerical values.

In [18]:
# encoding the labels
from sklearn import preprocessing

labelEncoder = preprocessing.LabelEncoder()
labelEncoder.fit(df['label'])
#just the matter of converting the fake to 0 and Real to 1
df['label'] = labelEncoder.transform(df['label']) 
# df['label']
df.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [19]:
# some variables for model training
embedding_dim = 50
max_length = 54
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 3000
test_portion = .1


### Tokenization 
This process divides a large piece of continuous text into distinct units or tokens basically. Here we use columns separately for a temporal basis as a pipeline just for good accuracy.

In [28]:
title = []
text = []
labels = []
for x in range(training_size):
    title.append(df['title'][x])
    text.append(df['text'][x])
    labels.append(df['label'][x])
# print(f"{title[:3]} {text[:3]} {labels[:3]}")

In [52]:
#Applying Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(title) #Updates internal vocabulary based on a list of texts.
print(title[0])
print(title[1])

word_index1 = tokenizer1.word_index #mapping words to indices
# print(word_index1)
vocab_size1 = len(word_index1)

sequences1 = tokenizer1.texts_to_sequences(title)

print(sequences1[0])
print(sequences1[1])

from tensorflow.keras.preprocessing.sequence import pad_sequences
padded1 = pad_sequences(sequences1,  
                        padding=padding_type, 
                        truncating=trunc_type)
print(padded1)
split = int(test_portion * training_size)
test_sequences1 = padded1[0:split]
training_sequences1 = padded1[split:training_size]
test_labels = labels[0:split]
training_labels = labels[split:training_size]

You Can Smell Hillary’s Fear
Watch The Exact Moment Paul Ryan Committed Political Suicide At A Trump Rally (VIDEO)
[48, 58, 1930, 182, 577]
[183, 1, 3170, 1109, 95, 184, 1931, 110, 445, 19, 7, 5, 225, 29]
[[  48   58 1930 ...    0    0    0]
 [ 183    1 3170 ...    0    0    0]
 [ 446    2  243 ...    0    0    0]
 ...
 [   1 7547  455 ...    0    0    0]
 [1115  321  614 ...    0    0    0]
 [   5 2516   42 ...    0    0    0]]


### Generating Word Embedding
It allows words with similar meanings to have a similar representation. Here each individual word is represented as real-valued vectors in a predefined vector space.

In [45]:
import numpy as np

embeddings_index = {}
with open('../TextFiles/glove.6B.50d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(embeddings_index['with'])

[ 0.25616   0.43694  -0.11889   0.20345   0.41959   0.85863  -0.60344
 -0.31835  -0.6718    0.003984 -0.075159  0.11043  -0.73534   0.27436
  0.054015 -0.23828  -0.13767   0.011573 -0.46623  -0.55233   0.083317
  0.55938   0.51903  -0.27065  -0.28211  -1.3918    0.17498   0.26586
  0.061449 -0.273     3.9032    0.38169  -0.056009 -0.004425  0.24033
  0.30675  -0.12638   0.33436   0.075485 -0.036218  0.13691   0.37762
 -0.12159  -0.13808   0.19505   0.22793  -0.17304  -0.07573  -0.25868
 -0.39339 ]


In [46]:
print(embeddings_index['the'])

[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]


In [47]:
print(embeddings_index['from'])

[ 0.41037   0.11342   0.051524 -0.53833  -0.12913   0.22247  -0.9494
 -0.18963  -0.36623  -0.067011  0.19356  -0.33044   0.11615  -0.58585
  0.36106   0.12555  -0.3581   -0.023201 -1.2319    0.23383   0.71256
  0.14824   0.50874  -0.12313  -0.20353  -1.82      0.22291   0.020291
 -0.081743 -0.27481   3.7343   -0.01874  -0.084522 -0.30364   0.27959
  0.043328 -0.24621   0.015373  0.49751   0.15108  -0.01619   0.40132
  0.23067  -0.10743  -0.36625  -0.051135  0.041474 -0.36064  -0.19616
 -0.81066 ]


In [48]:
# Generating embeddings
embeddings_matrix = np.zeros((vocab_size1+1, embedding_dim))
for word, i in word_index1.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector
print(embeddings_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.41800001  0.24968    -0.41242    ... -0.18411    -0.11514
  -0.78580999]
 [ 0.68046999 -0.039263    0.30186    ... -0.073297   -0.064699
  -0.26043999]
 ...
 [-0.39359999  0.025387   -0.34753999 ... -1.22019994 -0.93142998
  -0.29789999]
 [ 0.072997    0.033069    0.95832002 ...  0.28909001  0.15396
   0.48139   ]
 [ 0.14022    -1.12510002  0.62690002 ...  0.79251999  0.44918001
   0.42815   ]]


### Creating Model Architecture
Now it’s time to introduce TensorFlow to create the model.  Here we use the TensorFlow embedding technique with Keras Embedding Layer where we map original input data into some set of real-valued dimensions.

In [50]:
import tensorflow.compat.v1 as tf
model = tf.keras.Sequential([
	tf.keras.layers.Embedding(vocab_size1+1, embedding_dim,
							input_length=max_length, weights=[
								embeddings_matrix],
							trainable=False),
	tf.keras.layers.Dropout(0.2),
	tf.keras.layers.Conv1D(64, 5, activation='relu'),
	tf.keras.layers.MaxPooling1D(pool_size=4),
	tf.keras.layers.LSTM(64),
	tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',
			optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 54, 50)            377600    
                                                                 
 dropout (Dropout)           (None, 54, 50)            0         
                                                                 
 conv1d (Conv1D)             (None, 50, 64)            16064     
                                                                 
 max_pooling1d (MaxPooling1  (None, 12, 64)            0         
 D)                                                              
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                        

In [53]:
num_epochs = 50

training_padded = np.array(training_sequences1)
training_labels = np.array(training_labels)
testing_padded = np.array(test_sequences1)
testing_labels = np.array(test_labels)

history = model.fit(training_padded, training_labels,
					epochs=num_epochs,
					validation_data=(testing_padded,
									testing_labels),
					verbose=2)


Epoch 1/50
85/85 - 6s - loss: 0.6591 - accuracy: 0.6033 - val_loss: 0.5866 - val_accuracy: 0.6767 - 6s/epoch - 67ms/step
Epoch 2/50
85/85 - 1s - loss: 0.5783 - accuracy: 0.6952 - val_loss: 0.5424 - val_accuracy: 0.6700 - 1s/epoch - 13ms/step
Epoch 3/50
85/85 - 1s - loss: 0.5347 - accuracy: 0.7300 - val_loss: 0.5324 - val_accuracy: 0.7267 - 1s/epoch - 13ms/step
Epoch 4/50
85/85 - 1s - loss: 0.4888 - accuracy: 0.7648 - val_loss: 0.5456 - val_accuracy: 0.7167 - 1s/epoch - 13ms/step
Epoch 5/50
85/85 - 1s - loss: 0.4465 - accuracy: 0.7948 - val_loss: 0.5509 - val_accuracy: 0.7233 - 1s/epoch - 13ms/step
Epoch 6/50
85/85 - 1s - loss: 0.3930 - accuracy: 0.8244 - val_loss: 0.5196 - val_accuracy: 0.7467 - 1s/epoch - 12ms/step
Epoch 7/50
85/85 - 1s - loss: 0.3592 - accuracy: 0.8478 - val_loss: 0.4811 - val_accuracy: 0.7633 - 1s/epoch - 13ms/step
Epoch 8/50
85/85 - 1s - loss: 0.3189 - accuracy: 0.8659 - val_loss: 0.5693 - val_accuracy: 0.7633 - 1s/epoch - 13ms/step
Epoch 9/50
85/85 - 1s - loss: 0.

In [55]:
# sample text to check if fake or not
X = "Karry to go to France in gesture of sympathy"
  
# detection
sequences = tokenizer1.texts_to_sequences([X])[0]
sequences = pad_sequences([sequences], maxlen=54,
                          padding=padding_type, 
                          truncating=trunc_type)
if(model.predict(sequences, verbose=0)[0][0] >= 0.5):
    print("This news is True")
else:
    print("This news is false")

This news is True
