In [44]:
import random
import pickle
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import RMSprop

#lstm: recurrent neural network layer (long short term memory for stock price prediction, next word prediction)

In [4]:
text_df = pd.read_csv("fake_or_real_news.csv")#text_df

In [7]:
# we don't need fake or not
text = list(text_df.text.values)
joined_text = " ".join(text)

In [8]:
partial_text = joined_text[:10000]

In [9]:
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial_text.lower())#tokens: a list of individual words

In [12]:
unique_tokens = np.unique(tokens)#remove duplicates
#a dictionary to map index to token
unique_token_index = {token: idx for idx, token in enumerate(unique_tokens)}#unique_token_index#mapped each word to an index

In [32]:
# how many words to look at to predict the next word, eg. 1 is too few, 1000 is too much
n_words = 10
input_words = []#x, 
next_words = []#y, next word is going to be the 11th word, repeat it until our training data is obtained

for i in range(len(tokens) - n_words):#not all words, leave some for test
    input_words.append(tokens[i:i + n_words])
    next_words.append(tokens[i + n_words])

In [33]:
# now we turn them all into x and y
# for each sample(amt of input_words, for each unique token we have either 0 or 1(it is next word), i.e. next word or not)
X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype = bool)
y = np.zeros((len(next_words), len(unique_tokens)), dtype=bool)

In [34]:
# fill up the structure with actual values
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i, j, unique_token_index[word]] = 1
    y[i, unique_token_index[next_words[i]]] = 1

In [40]:
# now we train model

model = Sequential()
model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences = True))#128 neurons
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

  super().__init__(**kwargs)


In [41]:
model.compile(loss="categorical_crossentropy", optimizer=RMSprop(learning_rate=0.01), metrics=["accuracy"])
model.fit(X, y, batch_size=128, epochs=10, shuffle = True)

Epoch 1/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 119ms/step - accuracy: 0.0555 - loss: 6.1545
Epoch 2/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 111ms/step - accuracy: 0.0618 - loss: 5.8440
Epoch 3/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 105ms/step - accuracy: 0.0618 - loss: 5.8044
Epoch 4/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 120ms/step - accuracy: 0.0618 - loss: 5.7850
Epoch 5/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 119ms/step - accuracy: 0.0618 - loss: 5.7507
Epoch 6/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step - accuracy: 0.0624 - loss: 5.7032
Epoch 7/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 125ms/step - accuracy: 0.0612 - loss: 5.6571
Epoch 8/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 109ms/step - accuracy: 0.0578 - loss: 5.6125
Epoch 9/10
[1m14/14[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x21f5fcc02f0>

In [45]:
# how we use model to predict next word
# firstly save model
model.save("mymodel.h5")



In [46]:
model = load_model("mymodel.h5")



In [47]:
def predict_next_word(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        X[0, i, unique_token_index[word]] = 1

    predictions = model.predict(X)[0]
    return np.argpartition(predictions, -n_best)[-n_best:]

In [53]:
possible = predict_next_word("He will have to look into this thing and he", 5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step


In [54]:
possible

array([286,  36,   4, 588, 513])

In [56]:
#To get actual words
print([unique_tokens[idx] for idx in possible])#prediction is garbage due to not enough data

[np.str_('his'), np.str_('and'), np.str_('a'), np.str_('the'), np.str_('ryan')]


In [59]:
# Now we generate text based on this garbage prediction

def generate_text(input_text, text_length, creativity=3):
    word_sequence = input_text.split()
    current = 0
    for _ in range(text_length):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current + n_words])
        try:
            choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
        except:
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        current += 1
    return " ".join(word_sequence)

In [60]:
generate_text("He will have to look into this thing and he", 100, 5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44

'He will have to look into this thing and he a him with a for him are for for with and s with the act to investigation the assault and investigation the act the war act is investigation the of the investigation is the of the while assault on a while war while the investigation are investigation of the fbi investigation of s the fbi of and has fbi are while for investigation has assault of a act for for for with of the the fbi is and investigation is is is is investigation of to be him of of that the fbi the of the war on and fbi'