In [1]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from cleantext import clean

## Reading Dataset and Cleaning it

In [2]:
df=pd.read_csv('jokes.csv')

In [3]:
df.head()

Unnamed: 0,ID,Question,Answer
0,1,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.
1,2,What's the best anti diarrheal prescription?,Mycheexarphlexin
2,3,What do you call a person who is outside a doo...,Matt
3,4,Which Star Trek character is a member of the m...,Jean-Luc Pickacard
4,5,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe


In [4]:
df.drop(columns=['ID'],inplace = True)

In [5]:
foo_nltk=__import__('nltk')
foo_nltk.download('stopwords')
foo_nltk.download('punkt')
foo_nltk.download('wordnet')
foo_nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kunwe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kunwe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kunwe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kunwe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
def clean_text(text):
    text = text.lower() #convert all the chracters into small letters
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}'+=|.!?,]", "", text)
    text = text.replace("[", "")
    text = text.replace("]", "")
    return clean(text,no_emoji=True)

In [7]:
from nltk import RegexpTokenizer
tokaniser=RegexpTokenizer(r"\w+")
df['cleaned_Q'] = [tokaniser.tokenize(clean_text(sentence)) for sentence in df['Question']]
df['cleaned_A'] = [tokaniser.tokenize(clean_text(sentence)) for sentence in df['Answer']]
df.head()

Unnamed: 0,Question,Answer,cleaned_Q,cleaned_A
0,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.,"[did, you, hear, about, the, native, american,...","[he, nearly, drown, in, his, own, tea, pee]"
1,What's the best anti diarrheal prescription?,Mycheexarphlexin,"[what, is, the, best, anti, diarrheal, prescri...",[mycheexarphlexin]
2,What do you call a person who is outside a doo...,Matt,"[what, do, you, call, a, person, who, is, outs...",[matt]
3,Which Star Trek character is a member of the m...,Jean-Luc Pickacard,"[which, star, trek, character, is, a, member, ...","[jeanluc, pickacard]"
4,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe,"[what, is, the, difference, between, a, bullet...","[a, bullet, does, not, miss, harambe]"


In [8]:
df = df[df['cleaned_A'].str.len() < 20]

In [9]:
corpus =df[['cleaned_Q']].apply(lambda words:[" ".join(word) for word in words]).to_numpy()
corpus2 =df[['cleaned_A']].apply(lambda words:[" ".join(word) for word in words]).to_numpy()
corpus = list(corpus.flatten())
corpus2 = list(corpus2.flatten())

## Creating n-gram tokens and giving them padding according to the max length of input

In [10]:
tokenizer=Tokenizer()
def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to a token sequence 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        input_sequences.append(token_list)
    return input_sequences, total_words

In [11]:
inp_sequence,total_words = get_sequence_of_tokens(corpus)
inp_sequence[:5],total_words

([[8, 4, 26, 17, 2, 610, 215, 43, 32, 1759, 2365, 4114, 14, 430],
  [1, 6, 2, 63, 4712, 9275, 4713],
  [1, 5, 4, 11, 3, 131, 29, 6, 683, 3, 443, 10, 60, 76, 280, 4714, 143],
  [111, 331, 2085, 1124, 6, 3, 1011, 14, 2, 1079, 1507],
  [1, 6, 2, 21, 20, 3, 2216, 10, 3, 611]],
 17667)

In [12]:
len(inp_sequence),len(corpus)

(36716, 36716)

In [13]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors = input_sequences[:,:-1],input_sequences[:,-1]
    return predictors, max_sequence_len

In [14]:
X, input_len=generate_padded_sequences(inp_sequence)
X[:10], input_len

((array([[   0,    0,    0, ..., 2365, 4114,   14],
         [   0,    0,    0, ...,   63, 4712, 9275],
         [   0,    0,    0, ...,   76,  280, 4714],
         ...,
         [   0,    0,    0, ...,   22, 1919,   12],
         [   0,    0,    0, ...,   12,    3,  348],
         [   0,    0,    0, ...,    3,  822, 8635]]),
  array([ 430, 4713,  143, ...,  697, 3573,  499])),
 23)

In [15]:
out_sequence,total_w = get_sequence_of_tokens(corpus2)
out_sequence[:5],total_w

([[18, 7561, 1568, 11, 30, 446, 441, 965],
  [19932],
  [3859],
  [12593, 19933],
  [2, 1969, 20, 13, 747, 1852]],
 28542)

In [16]:
input_len

23

In [17]:
y, out_lens=generate_padded_sequences(out_sequence)
y[:10], out_lens

((array([[    0,     0,     0, ...,    30,   446,   441],
         [    0,     0,     0, ...,     0,     0,     0],
         [    0,     0,     0, ...,     0,     0,     0],
         ...,
         [    0,     0,     0, ...,    22,   115,    50],
         [    0,     0,     0, ...,     0,     2, 28540],
         [    0,     0,     0, ...,     0,    29,   489]]),
  array([  965, 19932,  3859, ...,  2217, 28541, 13266])),
 19)

In [18]:
tokenized_answers = tokenizer.texts_to_sequences([str(i) for i in df['cleaned_A']])
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = tf.keras.preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=out_lens , padding='pre' )
decoder_output_data = np.array( padded_answers )

In [19]:
decoder_output_data.shape,X[0].shape,y[0].shape

((36716, 19), (36716, 22), (36716, 18))

In [20]:
earlyStopper=tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=1, mode='auto',restore_best_weights=True)

## Makeing a simple LSTM RNN Model for prediction

In [21]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(total_w, 10,input_length=input_len-1))
model.add(tf.keras.layers.LSTM(50,return_sequences=True))
model.add(tf.keras.layers.LSTM(50))
model.add(tf.keras.layers.Dense(50))
model.add(tf.keras.layers.Dense(out_lens-1,activation='relu'))
opt_adam = tf.keras.optimizers.Adam(learning_rate=0.001)

In [22]:
model.compile(optimizer=opt_adam,loss='categorical_crossentropy',metrics=['accuracy'])

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 22, 10)            285420    
                                                                 
 lstm (LSTM)                 (None, 22, 50)            12200     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 dense_1 (Dense)             (None, 18)                918       
                                                                 
Total params: 321,288
Trainable params: 321,288
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.fit(X[0],y[0],epochs=30,validation_split=0.2,verbose=2,callbacks=[earlyStopper])

Epoch 1/30
918/918 - 15s - loss: 20862.7168 - accuracy: 0.3814 - val_loss: 16817.6660 - val_accuracy: 0.3637 - 15s/epoch - 16ms/step
Epoch 2/30
918/918 - 11s - loss: 14250.7266 - accuracy: 0.3825 - val_loss: 15139.0488 - val_accuracy: 0.3637 - 11s/epoch - 12ms/step
Epoch 3/30
918/918 - 11s - loss: 13786.5684 - accuracy: 0.3821 - val_loss: 14469.5439 - val_accuracy: 0.3660 - 11s/epoch - 12ms/step
Epoch 4/30
918/918 - 11s - loss: 13123.4336 - accuracy: 0.3829 - val_loss: 14558.5537 - val_accuracy: 0.3568 - 11s/epoch - 12ms/step
Epoch 5/30
918/918 - 11s - loss: 12865.0518 - accuracy: 0.3884 - val_loss: 14564.6572 - val_accuracy: 0.3637 - 11s/epoch - 12ms/step
Epoch 6/30
918/918 - 11s - loss: 12720.3496 - accuracy: 0.3906 - val_loss: 14772.6445 - val_accuracy: 0.3632 - 11s/epoch - 12ms/step
Epoch 7/30
918/918 - 11s - loss: 12559.2959 - accuracy: 0.3876 - val_loss: 14712.1406 - val_accuracy: 0.3637 - 11s/epoch - 12ms/step
Epoch 8/30
918/918 - 12s - loss: 12395.6709 - accuracy: 0.3891 - val_

<keras.callbacks.History at 0x19b3b7296d0>

In [25]:
def generate_text(seed_text, model, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list],maxlen=max_sequence_len-1, padding='pre')
    predict_x=model.predict(token_list)[0]
    output_word = ""
    for i in predict_x:
        for word,index in tokenizer.word_index.items():
            if index == int(i):
                output_word +=" "+ word
                break
    seed_text += " -> "+output_word
    return seed_text.title()

In [41]:
rand=np.random.randint(0,high=df['cleaned_Q'].shape[0])
generate_text(clean_text(df['Question'][rand]),model,input_len)

'What Pokemon Can You Find At Auschwitz ->  The A'

## The loss was very high and low accuracy for the given data