<a href="https://colab.research.google.com/github/FOwen123/Text-Generation-Model/blob/main/DeepLearning_Midterm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tensorflow

In [10]:
import random
import pickle
import heapq

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation, Embedding, Dropout
from tensorflow.keras.optimizers import RMSprop

# Pre Processing

In [3]:
text_df = pd.read_csv("fake_or_real_news.csv")
text = list(text_df.text.values)
joined_text = " ".join(text)

with open("joined_text.txt", "w", encoding="utf-8") as f:
    f.write(joined_text)

In [4]:
partial_text = joined_text[:1000000]

In [5]:
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial_text.lower())

In [6]:
unique_tokens = np.unique(tokens)
unique_token_index = {token: index for index, token in enumerate(unique_tokens)}

In [7]:
n_words = 10
input_words = []
next_word = []

for i in range(len(tokens) - n_words):
    input_words.append(tokens[i:i + n_words])
    next_word.append(tokens[i + n_words])

In [8]:
# 1. Create X as a 2D array of integers
# Shape: (num_samples, n_words)
X = np.zeros((len(input_words), n_words), dtype=np.int32)

# 2. Create y as a 1D array of integers
# Shape: (num_samples,)
y = np.zeros((len(input_words),), dtype=np.int32)

In [9]:
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i, j] = unique_token_index[word]
    y[i] = unique_token_index[next_word[i]]

#Model Training

In [11]:
embedding_dim = 100 # Embedding dimension

model = Sequential()

# 1. The 'Embedding' layer
# Takes integer inputs (vocab size 15000
# Turns them into 100-dimension vectors.
model.add(Embedding(input_dim=len(unique_tokens),
                    output_dim=embedding_dim,
                    input_length=n_words))

# 2. The LSTM layer
# [batch_size, n_words, embedding_dim]
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens), activation="softmax"))



In [13]:
optimizer = RMSprop(learning_rate=0.001)
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit(X, y, batch_size=128, epochs=10, shuffle=True).history

Epoch 1/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 179ms/step - accuracy: 0.0556 - loss: 7.3470
Epoch 2/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 176ms/step - accuracy: 0.0550 - loss: 7.3101
Epoch 3/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 169ms/step - accuracy: 0.0552 - loss: 7.2907
Epoch 4/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 163ms/step - accuracy: 0.0562 - loss: 7.2359
Epoch 5/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 165ms/step - accuracy: 0.0759 - loss: 7.0156
Epoch 6/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 166ms/step - accuracy: 0.0889 - loss: 6.8763
Epoch 7/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 164ms/step - accuracy: 0.0986 - loss: 6.7574
Epoch 8/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 164ms/step - accuracy: 0.1025 - loss:

In [15]:
model.save("text_gen_model.h5")
with open("history_model.p", "wb") as f:
  pickle.dump(history, f)



In [16]:
model = load_model("text_gen_model.h5")
history = pickle.load(open("history_model.p", "rb"))



In [17]:
def predict_next_word(input_text, n_best):
  """
  Prepares input text as integer indices for the embedding model
  and predicts the next word.
  """
  input_text = input_text.lower()
  # 1. Create X as a 2D array of integers
  X = np.zeros((1, n_words), dtype=np.int32)

  # 2. Tokenize and fill the array with integer indices
  words = input_text.split()
  for i, word in enumerate(words):
      # Stop if we exceed the sequence length
      if i >= n_words:
          break
      X[0, i] = unique_token_index.get(word, 0)

  # 3. Predict using the integer-based input
  predictions = model.predict(X)[0]

  # 4. Return the indices of the 'n_best' most likely words
  return np.argpartition(predictions, -n_best)[-n_best:]

In [18]:
possible = predict_next_word("I will have to look into this thing because I", 5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step


In [19]:
for idx in possible:
  print(unique_tokens[idx])

re
m
ve
had
have


In [20]:
def generate_text(input_text, text_length, creativity=3):
  word_sequence = input_text.split()
  current = 0
  for _ in range(text_length):
    # Simple sliding window
    sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
    try:
        # Get the indices of the top 'creativity' predictions
        # Similar to top-k sampling in LLMs
        possible_indices = predict_next_word(sub_sequence, creativity)

        # Randomly choose one of those indices
        chosen_index = random.choice(possible_indices)

        # Get the actual word from the index
        choice = unique_tokens[chosen_index]
    except Exception as e:
        print(f"Error during prediction: {e}. Choosing random word.")
        choice = random.choice(unique_tokens)
    word_sequence.append(choice)
    current += 1
  return " ".join(word_sequence)

In [21]:
for idx in predict_next_word("The president will most likely not be there to help", 5):
    print(unique_tokens[idx])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
this
in
to
a
the


In [23]:
generate_text("The president of the Asia University announced yesterday that he", 100, 10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46

'The president of the Asia University announced yesterday that he is also a way of this way to keep clinton and that she said that he has a country on an years is an states to do that is a the new of new and one but i had the republican house i are that it to the same republican and the same and campaign to be the country that has have a country on clinton s president in this year to her of his campaign to a campaign is the way of an years for an years of all obama to keep it that i had not be in'

# Summary

This project details the implementation of a **causal language model** based on a **Recurrent Neural Network (RNN)** architecture. The model utilizes an `Embedding` layer followed by stacked `LSTM` layers to process fixed-length (10-word) sequences from a news corpus. Its objective is to predict a probability distribution for the subsequent word, from which a selection is made using a **Top-k sampling** strategy.

This foundational approach serves as a basis for modern Large Language Models (LLMs), which employ an evolved architecture—the **Transformer**—to achieve similar goals. Whereas this project's LSTM processes information sequentially, Transformers utilize a **self-attention** mechanism to weigh the influence of all tokens in parallel. This mechanism enables a far more sophisticated capture of complex, long-range contextual dependencies, which, when combined with internet-scale training data and advanced sampling (like **Top-p sampling**), allows modern LLMs to generate text of significantly higher coherence and quality.