In [1]:
import random
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [2]:
text_df = pd.read_csv("fake_or_real_news.csv")
text = list(text_df.text.values)
joined_text = " ".join(text)

with open("joined_text.txt", "w", encoding="utf-8") as f:
    f.write(joined_text)

In [3]:
partial_text = joined_text[:1000000]

In [4]:
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial_text.lower())

In [5]:
unique_tokens = np.unique(tokens)
unique_token_index = {token: index for index, token in enumerate(unique_tokens)}

In [6]:
n_words = 10
input_words = []
next_word = []

for i in range(len(tokens) - n_words):
    input_words.append(tokens[i:i + n_words])
    next_word.append(tokens[i + n_words])

In [7]:
X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)  # for each sample, n input words and then a boolean for each possible next word
y = np.zeros((len(next_word), len(unique_tokens)), dtype=bool)  # for each sample a boolean for each possible next word

In [8]:
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i, j, unique_token_index[word]] = 1
    y[i, unique_token_index[next_word[i]]] = 1

In [9]:
model = Sequential()
model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

  super().__init__(**kwargs)


In [10]:
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X, y, batch_size=128, epochs=10, shuffle=True).history

Epoch 1/10
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 163ms/step - accuracy: 0.0575 - loss: 7.3556
Epoch 2/10
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 155ms/step - accuracy: 0.0988 - loss: 6.7412
Epoch 3/10
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 161ms/step - accuracy: 0.1205 - loss: 6.4755
Epoch 4/10
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 151ms/step - accuracy: 0.1337 - loss: 6.2833
Epoch 5/10
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 154ms/step - accuracy: 0.1483 - loss: 6.0831
Epoch 6/10
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 158ms/step - accuracy: 0.1617 - loss: 5.8931
Epoch 7/10
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 159ms/step - accuracy: 0.1779 - loss: 5.7204
Epoch 8/10
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 159ms/step - accuracy: 0.1962 - loss:

In [11]:
history = model.fit(X, y, batch_size=128, epochs=5, shuffle=True).history

Epoch 1/5
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 156ms/step - accuracy: 0.2394 - loss: 5.0289
Epoch 2/5
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 153ms/step - accuracy: 0.2553 - loss: 4.8816
Epoch 3/5
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 153ms/step - accuracy: 0.2696 - loss: 4.7457
Epoch 4/5
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 154ms/step - accuracy: 0.2841 - loss: 4.6212
Epoch 5/5
[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 153ms/step - accuracy: 0.2991 - loss: 4.4968


In [12]:
model.save("text_gen_model2.h5")
with open("history2.p", "wb") as f:
    pickle.dump(history, f)



In [13]:
model = load_model("text_gen_model2.h5")
history = pickle.load(open("history2.p", "rb"))



In [14]:
def predict_next_word(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        X[0, i, unique_token_index[word]] = 1
        
    predictions = model.predict(X)[0]
    return np.argpartition(predictions, -n_best)[-n_best:]

In [15]:
possible = predict_next_word("I will have to look into this thing because I", 5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step


In [16]:
for idx in possible:
    print(unique_tokens[idx])

went
hope
give
am
m


In [17]:
def generate_text(input_text, n_words, creativity=3):
    word_sequence = input_text.split()
    current = 0
    for _ in range(n_words):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
        try:
            choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
        except:
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        current += 1
    return " ".join(word_sequence)

In [18]:
generate_text("I will have to look into this thing because I", 100, 10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22

'I will have to look into this thing because I hope but i think here can and she could be going on and in him it after a vote should just know you are more about those with make your head where this point these chief needs also already was afraid they can do become a real effect against donald donald it is still able but he would ve always see back how is that the country s set for new should help us relief filled the peace research trying and change us van check out of texas hispanics saw state in which 2016 in congress house paul kasich was'

In [19]:
generate_text("The president of the United States announced yesterday that he", 100, 10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21

'The president of the United States announced yesterday that he gets a look against russia i care it s not going the people as it has just already see the republican said you want it so the fed body is into you see there does it change this point you need that donald how those at her that have changed both the hour that do that only it would only see this very different story attention back back and in both september her agency as much shot it was clear this suspect that mr cruz is going at to should be an administration at government wants at nsa level these'

In [20]:
for idx in predict_next_word("The president will most likely not be there to help", 5):
    print(unique_tokens[idx])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
his
in
their
her
the
