In [9]:
import pandas as pd
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
import numpy as np

In [91]:
df = pd.read_csv('datasets/fake news dataset.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1


In [92]:
df.shape

(5200, 5)

In [93]:
df.isna().sum()

id          0
title     122
author    503
text        7
label       0
dtype: int64

In [94]:
df = df.dropna()
df.shape

(4575, 5)

In [95]:
y= df['label']

In [96]:
y.value_counts()

1    2362
0    2213
Name: label, dtype: int64

In [97]:
import tensorflow as tf
tf.__version__

'2.2.0'

In [98]:
df['title'] = df['title'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,id,title,author,text,label
0,20800,"specter of trump loosens tongues, if not purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
2,20802,#nodapl: native american leaders vow to stay a...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,"tim tebow will attempt another comeback, this ...",Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,keiser report: meme wars (e995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
6,20806,pelosi calls for fbi investigation to find out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",1


In [73]:
from spacy.lang.en import STOP_WORDS
import spacy
from string import punctuation
nlp = spacy.load('en_core_web_lg')

In [74]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [99]:
df['title'] = df['title'].apply(lambda x: "".join([i for i in x if i not in punctuation]))
df.head()

Unnamed: 0,id,title,author,text,label
0,20800,specter of trump loosens tongues if not purse ...,David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
2,20802,nodapl native american leaders vow to stay all...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,tim tebow will attempt another comeback this t...,Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,keiser report meme wars e995,Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
6,20806,pelosi calls for fbi investigation to find out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",1


In [100]:
df['title'] = df['title'].apply(lambda x: " ".join([i for i in x.split() if i not in STOP_WORDS]))
df.head()

Unnamed: 0,id,title,author,text,label
0,20800,specter trump loosens tongues purse strings si...,David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
2,20802,nodapl native american leaders vow stay winter...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,tim tebow attempt comeback time baseball new y...,Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,keiser report meme wars e995,Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
6,20806,pelosi calls fbi investigation find ’what russ...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",1


In [103]:
text = df['title'].tolist()
text

['specter trump loosens tongues purse strings silicon valley new york times',
 'nodapl native american leaders vow stay winter file lawsuit police',
 'tim tebow attempt comeback time baseball new york times',
 'keiser report meme wars e995',
 'pelosi calls fbi investigation find ’what russians donald trump’ breitbart',
 'weekly featured profile – randy shannon',
 '184 generals admirals endorse trump commanderinchief',
 '“working class hero” john brennon',
 'rise mandatory vaccinations means end medical freedom',
 'communists terrorize small business',
 'computer programmer comes forward admits paid rig voting booths trump right • usa newsflash',
 'thieves chunk change 221 pounds berlin museum new york times',
 'new england patriots’ owner sore nfl payback sight new york times',
 'college republicans yaf sue berkeley ann coulter event breitbart',
 'trump melts accuses postal service stealing election clinton',
 'visiting madagascar leave red swimsuits lemur recipes home new york times',

In [104]:
texts = []
for line in text:
    sequences =[]
    for word in nlp(line):
        if word.lemma_ == '-PRON-':
            sequences.append(str(word))
        else:
            sequences.append(word.lemma_)
            
    sequences = " ".join(sequences)
    texts.append(sequences)
texts

['specter trump loosen tongue purse string silicon valley new york times',
 'nodapl native american leader vow stay winter file lawsuit police',
 'tim tebow attempt comeback time baseball new york times',
 'keiser report meme war e995',
 "pelosi call fbi investigation find ' what russians donald trump ' breitbart",
 'weekly featured profile – randy shannon',
 '184 general admiral endorse trump commanderinchief',
 '" work class hero " john brennon',
 'rise mandatory vaccination mean end medical freedom',
 'communist terrorize small business',
 'computer programmer come forward admit pay rig voting booth trump right • usa newsflash',
 'thief chunk change 221 pound berlin museum new york times',
 "new england patriots ' owner sore nfl payback sight new york times",
 'college republicans yaf sue berkeley ann coulter event breitbart',
 'trump melt accuse postal service steal election clinton',
 'visit madagascar leave red swimsuit lemur recipe home new york times',
 'reese ’s peanut butter 

In [163]:
import re
cleaned_texts = []
for sent in texts:
    clean = []
    for words in sent.split():
        if words not in punctuation:
            if words not in re.findall('[0-9]+', sent):
                if words not in ['•','’s','…','–','’']:
                    clean.append(words)
            
    clean = " ".join(clean)
    cleaned_texts.append(clean)
cleaned_texts

['specter trump loosen tongue purse string silicon valley new york times',
 'nodapl native american leader vow stay winter file lawsuit police',
 'tim tebow attempt comeback time baseball new york times',
 'keiser report meme war e995',
 'pelosi call fbi investigation find what russians donald trump breitbart',
 'weekly featured profile randy shannon',
 'general admiral endorse trump commanderinchief',
 'work class hero john brennon',
 'rise mandatory vaccination mean end medical freedom',
 'communist terrorize small business',
 'computer programmer come forward admit pay rig voting booth trump right usa newsflash',
 'thief chunk change pound berlin museum new york times',
 'new england patriots owner sore nfl payback sight new york times',
 'college republicans yaf sue berkeley ann coulter event breitbart',
 'trump melt accuse postal service steal election clinton',
 'visit madagascar leave red swimsuit lemur recipe home new york times',
 'reese peanut butter cup cheap toxic chemical'

In [164]:
token = Tokenizer()
token.fit_on_texts(cleaned_texts)

In [165]:
token.word_index

{'new': 1,
 'york': 2,
 'times': 3,
 'trump': 4,
 'breitbart': 5,
 'hillary': 6,
 'clinton': 7,
 'donald': 8,
 'election': 9,
 'obama': 10,
 'not': 11,
 'say': 12,
 'video': 13,
 'report': 14,
 'fbi': 15,
 'news': 16,
 'war': 17,
 'email': 18,
 'president': 19,
 'russia': 20,
 'day': 21,
 'world': 22,
 'police': 23,
 'find': 24,
 'white': 25,
 'man': 26,
 'america': 27,
 'state': 28,
 'comment': 29,
 'house': 30,
 'watch': 31,
 'be': 32,
 'time': 33,
 'do': 34,
 'woman': 35,
 'attack': 36,
 'vote': 37,
 'kill': 38,
 'investigation': 39,
 'plan': 40,
 'syria': 41,
 'people': 42,
 'black': 43,
 'win': 44,
 'campaign': 45,
 'year': 46,
 'break': 47,
 'go': 48,
 'voter': 49,
 'comey': 50,
 'china': 51,
 'medium': 52,
 'take': 53,
 '—': 54,
 'call': 55,
 'russian': 56,
 'big': 57,
 'i': 58,
 'want': 59,
 'fire': 60,
 'bill': 61,
 'get': 62,
 'million': 63,
 'court': 64,
 'know': 65,
 'wikileaks': 66,
 'it': 67,
 'lose': 68,
 'american': 69,
 'leader': 70,
 'right': 71,
 'protest': 72,
 'chi

In [166]:
vocab_size = len(token.word_index) + 1
vocab_size

8625

In [167]:
encoded_text = token.texts_to_sequences(cleaned_texts)
encoded_text

[[3786, 4, 3787, 3788, 3789, 3790, 904, 785, 1, 2, 3],
 [1231, 2536, 69, 70, 394, 524, 905, 430, 632, 23],
 [1054, 2537, 786, 1866, 33, 1468, 1, 2, 3],
 [3791, 14, 1469, 17, 3792],
 [705, 55, 15, 39, 24, 291, 574, 8, 4, 5],
 [2538, 3793, 1867, 3794, 3795],
 [328, 1232, 633, 4, 1868],
 [140, 395, 1470, 74, 3796],
 [141, 1869, 1870, 906, 92, 706, 575],
 [1471, 1871, 787, 231],
 [1472, 3797, 77, 707, 361, 150, 178, 468, 3798, 4, 71, 525, 3799],
 [907, 3800, 142, 2539, 708, 788, 1, 2, 3],
 [1, 1055, 1872, 908, 3801, 469, 2540, 1473, 1, 2, 3],
 [133, 134, 3802, 396, 397, 789, 709, 576, 5],
 [4, 1056, 192, 3803, 577, 329, 9, 7],
 [362, 3804, 83, 578, 3805, 3806, 1873, 207, 1, 2, 3],
 [3807, 3808, 1874, 1474, 1875, 2541, 634],
 [19, 10, 1876, 8, 4, 143, 25, 30],
 [179, 526, 292, 4, 44, 470],
 [789, 709, 3809, 3810, 1877, 1475, 363, 710, 44, 1878, 364],
 [398, 1233, 3811, 293, 2542, 51, 3812, 1, 2, 3],
 [2543, 3813, 399, 2544, 1234, 579, 6, 1879, 579],
 [61, 3814, 151, 1476, 2545, 8, 4, 93, 40

In [168]:
#getting maximum length of array indexes of encoded_text

max_ = []
def get_max_len(x):
    for i in x[:]:
        max_.append(len(i))
    
    length = np.max(max_)
    return length

max_length = get_max_len(encoded_text)
max_length

30

In [169]:
maxlen = max_length +2
maxlen

32

In [171]:
X = pad_sequences(encoded_text, maxlen, padding='pre')
X

array([[  0,   0,   0, ...,   1,   2,   3],
       [  0,   0,   0, ..., 430, 632,  23],
       [  0,   0,   0, ...,   1,   2,   3],
       ...,
       [  0,   0,   0, ...,   1,   2,   3],
       [  0,   0,   0, ...,   1,   2,   3],
       [  0,   0,   0, ...,   1,   2,   3]])

In [176]:
X.shape

(4575, 32)

In [180]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=2)

In [172]:
vec_size = 40

In [174]:
model = Sequential()
model.add(Embedding(vocab_size, vec_size, input_length = maxlen))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 40)            345000    
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               112800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 458,001
Trainable params: 458,001
Non-trainable params: 0
_________________________________________________________________


In [181]:
model.fit(x_train, y_train, validation_data = (x_test,y_test), epochs = 10, batch_size =15)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2709f631730>