In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.client import device_lib
from tensorflow.keras.models import load_model

2025-07-10 02:18:11.128446: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-10 02:18:11.268904: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752128291.321592     742 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752128291.339266     742 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752128291.471372     742 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [3]:
sequence_length = 5  # Number of context words the model sees per training sample

# --- Load the corpus ---------------------------------------------------------
# Every joke is cast to str, joined with single spaces and lower-cased into one
# long string, so the windows built below can span the end of one joke and the
# start of the next.
df = pd.read_csv("jokes.csv")
text = " ".join(df['Joke'].astype(str).tolist()).lower()

# --- Fit the vocabulary ------------------------------------------------------
# The tokenizer only lives in memory. The generation cell pairs it with a model
# loaded from disk, so it has to be refit on the same text for the word ids to
# line up with that saved model.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # +1 leaves room for index 0 (used as padding)
print("Vocab Size:", total_words)

# --- Build (context, next word) windows --------------------------------------
tokens = tokenizer.texts_to_sequences([text])[0]
window_size = sequence_length + 1  # context words plus the word to predict
if len(tokens) < window_size:
    raise ValueError(
        f"Corpus has {len(tokens)} tokens; at least {window_size} are needed "
        "to build one training window."
    )

# One row per position: row k holds tokens[k : k + window_size]. This is the
# same array the explicit Python loop over every position would produce, but
# built in a single vectorised step. `.copy()` turns the read-only strided view
# into an ordinary contiguous, writable array.
input_sequences = np.lib.stride_tricks.sliding_window_view(
    np.asarray(tokens), window_size
).copy()

print("Total sequences:", len(input_sequences))

# --- Split into X/y ----------------------------------------------------------
# No padding is required: every window already has exactly window_size tokens.
X = input_sequences[:, :-1]  # the first sequence_length words of each window
y = input_sequences[:, -1]   # the word that follows them

Vocab Size: 70649
Total sequences: 4082135


In [4]:
# Next-word model: word ids -> 100-d embeddings -> 128-unit LSTM -> softmax
# over the whole vocabulary.
# The fixed context length is declared with an explicit Input layer; the
# Embedding `input_length` argument is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(sequence_length,), dtype="int32"),
    Embedding(total_words, 100),
    LSTM(128),
    Dense(total_words, activation="softmax")
])

# The sparse loss is used because y holds integer word ids rather than one-hot rows.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Write the model to disk after every epoch so that an interrupted run (each
# epoch takes several minutes) still leaves a usable file behind.
epoch_checkpoint = tf.keras.callbacks.ModelCheckpoint("joke_words_2.keras")
model.fit(X, y, batch_size=128, epochs=30, callbacks=[epoch_checkpoint])

# Final save of the fully trained model (same path as the checkpoints).
model.save("joke_words_2.keras")

I0000 00:00:1752034852.297762    1227 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060, pci bus id: 0000:01:00.0, compute capability: 8.9


Epoch 1/30
[1m    1/31892[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:15:31[0m 1s/step - loss: 11.1656

I0000 00:00:1752034854.088616    1363 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m455s[0m 14ms/step - loss: 6.2750
Epoch 2/30
[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m461s[0m 14ms/step - loss: 5.3006
Epoch 3/30
[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m455s[0m 14ms/step - loss: 5.0755
Epoch 4/30
[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m457s[0m 14ms/step - loss: 4.9323
Epoch 5/30
[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m457s[0m 14ms/step - loss: 4.8311
Epoch 6/30
[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m458s[0m 14ms/step - loss: 4.7466
Epoch 7/30
[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m458s[0m 14ms/step - loss: 4.6869
Epoch 8/30
[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m459s[0m 14ms/step - loss: 4.6405
Epoch 9/30
[1m31892/31892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m460s[0m 14ms/step - loss: 4.5997
Epoch 10/30
[1m31892/31892[0m [32m━━

In [7]:
# Reload the network from the file written by `model.save(...)` in the training
# cell, rather than reusing the in-memory `model` object.
trained_model_path = "joke_words_2.keras"
final_model = load_model(trained_model_path)

def _sample_next_index(probabilities, temperature, rng=None):
    """Pick one index from a probability vector using temperature scaling.

    Args:
        probabilities: 1-D array-like of class probabilities.
        temperature: Softness of the sampling. Values below 1 sharpen the
            distribution, values above 1 flatten it, and exactly 0 selects
            the most likely index (greedy decoding).
        rng: Optional ``numpy.random.Generator``. When omitted, the global
            ``np.random`` state is used.

    Returns:
        The chosen index as a plain ``int``.
    """
    # float64 keeps the exponentials below from underflowing as early as the
    # float32 values a model typically returns.
    probs = np.asarray(probabilities, dtype=np.float64)

    if temperature == 0:
        # Limit of temperature -> 0; also avoids dividing by zero below.
        return int(np.argmax(probs))

    scaled = np.log(probs + 1e-8) / temperature
    # Subtracting the maximum leaves the normalised result unchanged but
    # guarantees at least one exp() equals 1, so the sum can never be 0.
    scaled -= scaled.max()
    weights = np.exp(scaled)
    weights /= weights.sum()

    chooser = np.random if rng is None else rng
    return int(chooser.choice(len(weights), p=weights))


def generate_text(seed_text, next_words=5, temperature=1.0, rng=None):
    """Extend ``seed_text`` word by word using the trained next-word model.

    Relies on the notebook globals ``tokenizer``, ``pad_sequences``,
    ``sequence_length`` and ``final_model``.

    Args:
        seed_text: Prompt to continue; it is lower-cased before use.
        next_words: Maximum number of words to append.
        temperature: Sampling temperature; 0 means always take the most
            likely word.
        rng: Optional ``numpy.random.Generator`` for reproducible sampling.
            When omitted, the global ``np.random`` state is used.

    Returns:
        The lower-cased seed followed by the generated words, separated by
        single spaces. Generation stops early when the sampled index has no
        word attached, or when the sampled word already appears among the
        last five words of the text so far.
    """
    result = seed_text.lower()
    for _ in range(next_words):
        # Encode the text so far and left-pad it to the model's input length.
        token_list = tokenizer.texts_to_sequences([result])[0]
        token_list = pad_sequences([token_list], maxlen=sequence_length, padding='pre')
        predictions = final_model.predict(token_list, verbose=0)[0]

        next_index = _sample_next_index(predictions, temperature, rng)
        next_word = tokenizer.index_word.get(next_index, '')

        # Termination logic: no word for this index, or a recent repeat.
        if not next_word.strip() or next_word in result.split()[-5:]:
            break

        result += ' ' + next_word
    return result


# Example use
your_line = "Knock Knock"
# Lower-case the prompt and keep at most its first `sequence_length` words.
seed = your_line.lower().split()[:sequence_length]
prompt = " ".join(seed)
print(generate_text(prompt, temperature=0.7))

knock knock who's there i was on
