In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
plain_sentences = []
sentences = []
labels = []
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
print(len(stopwords))

153


In [7]:
with open("./Fake.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader) #skipping the columns
    for row in reader:
        labels.append(0)
        sentence = row[0].lower()
        plain_sentences.append(sentence)
        for word in stopwords:
            token = " " + word + " "
            sentence = sentence.replace(token, " ")
        sentences.append(sentence)

In [8]:
print(len(labels))
print(len(sentences))
print(sentences[0])
print(plain_sentences[0])

44898
44898
as u.s. budget fight looms, republicans flip fiscal script
as u.s. budget fight looms, republicans flip their fiscal script


In [9]:
training_portion = .8
train_size = int(len(sentences) * training_portion)

train_sentences = sentences[:train_size]
train_labels = labels[:train_size]

validation_sentences = sentences[train_size:]
validation_labels = labels[train_size:]

print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

35918
35918
35918
8980
8980


In [10]:
#parameters

vocab_size = 1000
embedding_dim = 16
max_length = 60
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [11]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences( train_sequences, padding=padding_type, maxlen=max_length, truncating=trunc_type)

In [12]:
print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

10
60
8
60
9
60


In [13]:
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences, padding=padding_type, maxlen=max_length,  truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

8980
(8980, 60)


In [14]:


training_label_seq = np.array(train_labels)
validation_label_seq = np.array(validation_labels)

print(training_label_seq[0])
print(training_label_seq[1])
print(training_label_seq[2])
print(training_label_seq.shape)

print(validation_label_seq[0])
print(validation_label_seq[1])
print(validation_label_seq[2])
print(validation_label_seq.shape)

1
1
1
(35918,)
0
0
0
(8980,)


In [15]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 60, 16)            16000     
_________________________________________________________________
bidirectional (Bidirectional (None, 60, 128)           41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 24)                1560      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 100,273
Trainable params: 100,273
Non-trainable params: 0
_________________________________________________________________


In [17]:

num_epochs = 10
history = model.fit(train_padded, training_label_seq, 
                    epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Epoch 1/10
1123/1123 - 19s - loss: 0.0853 - accuracy: 0.9678 - val_loss: 0.3211 - val_accuracy: 0.8895
Epoch 2/10
1123/1123 - 19s - loss: 0.0799 - accuracy: 0.9689 - val_loss: 0.2131 - val_accuracy: 0.9192
Epoch 3/10
1123/1123 - 19s - loss: 0.0750 - accuracy: 0.9712 - val_loss: 0.2588 - val_accuracy: 0.9121
Epoch 4/10
1123/1123 - 19s - loss: 0.0712 - accuracy: 0.9732 - val_loss: 0.2711 - val_accuracy: 0.8999
Epoch 5/10
1123/1123 - 19s - loss: 0.0670 - accuracy: 0.9741 - val_loss: 0.3696 - val_accuracy: 0.8763
Epoch 6/10
1123/1123 - 19s - loss: 0.0633 - accuracy: 0.9764 - val_loss: 0.2761 - val_accuracy: 0.9068
Epoch 7/10
1123/1123 - 19s - loss: 0.0586 - accuracy: 0.9782 - val_loss: 0.3237 - val_accuracy: 0.9050
Epoch 8/10
1123/1123 - 19s - loss: 0.0547 - accuracy: 0.9787 - val_loss: 0.3322 - val_accuracy: 0.9047
Epoch 9/10
1123/1123 - 19s - loss: 0.0505 - accuracy: 0.9808 - val_loss: 0.4476 - val_accuracy: 0.8997
Epoch 10/10
1123/1123 - 19s - loss: 0.0467 - accuracy: 0.9820 - val_loss:

In [18]:
model.save('true_fasle.h5')
import os
print(str(os.path.getsize('./true_fasle.h5')/1000000) + 'MB')

1.282MB
