In [60]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import seaborn as sns

In [61]:
df=pd.read_csv("shortjokes.csv")

**Preprocessing**

In [62]:
df1=df
df1.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [63]:
df1.shape

(231657, 2)

In [64]:
desired_rows = 1000  #since the dataset is very large we take only 1000 random rows of it
sampled_df = df1.sample(n=desired_rows, random_state=42)

In [65]:
sampled_df.shape

(1000, 2)

In [66]:
joke_paragraph = '\n'.join(sampled_df['Joke'])#converting every joke in a paragraph based on new line

In [67]:
lines = joke_paragraph.split('\n')
for line in lines[:10]:
    print(line)

What do all battered women have in common? They don't listen.
Who invented the North America? TEACHER: Sarah, go to the map and find North America. SARAH: Here it is. TEACHER: Correct. Now class, who discovered America?CLASS: Sarah!
I feel like this election ended up being a good thing for Hilary Clinton. At least now she knows what it feels like to get fucked by the president.
What do you call a pile of kittens? A Meowntain
I feel bad for people named John Smith. They probably didn't get the gmail account they wanted.
What would you have if your car's  motor was in flames? A fire engine.
When accused by a woman a man's first instinct is to deny. We're not lying, we're just buying time to remember what you're talking about...
I set my phone to airplane mode I lost it two weeks ago and everyone has a different opinion on what happened to it
There's a thin line distinguishing "Heroes" from "Herpes"
What did batman tell robin before they got in the car? Get in the car.


In [68]:
import nltk
from nltk.tokenize import word_tokenize


In [69]:
tokenizer = Tokenizer()

In [70]:
tokenizer.fit_on_texts([joke_paragraph])

In [71]:
len(tokenizer.word_index)

3981

In [72]:
input_sequences = []
for sentence in joke_paragraph.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

In [73]:
input_sequences

[[11, 13],
 [11, 13, 46],
 [11, 13, 46, 1529],
 [11, 13, 46, 1529, 135],
 [11, 13, 46, 1529, 135, 19],
 [11, 13, 46, 1529, 135, 19, 6],
 [11, 13, 46, 1529, 135, 19, 6, 267],
 [11, 13, 46, 1529, 135, 19, 6, 267, 26],
 [11, 13, 46, 1529, 135, 19, 6, 267, 26, 43],
 [11, 13, 46, 1529, 135, 19, 6, 267, 26, 43, 507],
 [49, 1530],
 [49, 1530, 2],
 [49, 1530, 2, 645],
 [49, 1530, 2, 645, 268],
 [49, 1530, 2, 645, 268, 303],
 [49, 1530, 2, 645, 268, 303, 508],
 [49, 1530, 2, 645, 268, 303, 508, 83],
 [49, 1530, 2, 645, 268, 303, 508, 83, 4],
 [49, 1530, 2, 645, 268, 303, 508, 83, 4, 2],
 [49, 1530, 2, 645, 268, 303, 508, 83, 4, 2, 1531],
 [49, 1530, 2, 645, 268, 303, 508, 83, 4, 2, 1531, 7],
 [49, 1530, 2, 645, 268, 303, 508, 83, 4, 2, 1531, 7, 214],
 [49, 1530, 2, 645, 268, 303, 508, 83, 4, 2, 1531, 7, 214, 645],
 [49, 1530, 2, 645, 268, 303, 508, 83, 4, 2, 1531, 7, 214, 645, 268],
 [49, 1530, 2, 645, 268, 303, 508, 83, 4, 2, 1531, 7, 214, 645, 268, 508],
 [49, 1530, 2, 645, 268, 303, 508, 83,

In [74]:
len(input_sequences)

16612

In [75]:
max_len = max([len(x) for x in input_sequences])

In [76]:
max_len

44

In [77]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [78]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,   11,   13],
       [   0,    0,    0, ...,   11,   13,   46],
       [   0,    0,    0, ...,   13,   46, 1529],
       ...,
       [   0,    0,    0, ...,    4, 3980,   35],
       [   0,    0,    0, ..., 3980,   35, 3981],
       [   0,    0,    0, ...,   35, 3981,  853]], dtype=int32)

**Model** **Building**

In [79]:
X = padded_input_sequences[:,:-1]
y = padded_input_sequences[:,-1]

In [80]:
X.shape

(16612, 43)

In [81]:
y.shape

(16612,)

In [82]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=15279)

In [83]:
y.shape

(16612, 15279)

In [84]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [85]:
model = Sequential()
model.add(Embedding(input_dim=15279, output_dim=2000))
model.add(LSTM(500))
model.add(Dense(15279, activation='softmax'))

In [86]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [87]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 2000)        30558000  
                                                                 
 lstm_2 (LSTM)               (None, 500)               5002000   
                                                                 
 dense_2 (Dense)             (None, 15279)             7654779   
                                                                 
Total params: 43214779 (164.85 MB)
Trainable params: 43214779 (164.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [88]:
model.fit(X,y,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7a86713f03a0>

Prediction

In [94]:
text = "what do"
for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=56, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)

what do you
what do you call
what do you call a
what do you call a man
what do you call a man with
what do you call a man with no
what do you call a man with no arms
what do you call a man with no arms or
what do you call a man with no arms or legs
what do you call a man with no arms or legs being
