In [1]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
fbrom keras.preprocessing.sequence import pad_sequences
from cleantext import clean

## Reading data and removing unnecessary column

In [2]:
df=pd.read_csv('jokes.csv')
df.head(5)

Unnamed: 0,ID,Question,Answer
0,1,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.
1,2,What's the best anti diarrheal prescription?,Mycheexarphlexin
2,3,What do you call a person who is outside a doo...,Matt
3,4,Which Star Trek character is a member of the m...,Jean-Luc Pickacard
4,5,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe


In [3]:
df.drop(columns=['ID'],inplace = True)

# Cleaning Data

In [4]:
def clean_text(text):
    text = text.lower() #convert all the chracters into small letters
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}'+=|.!?,]", "", text)
    text = text.replace("[", "")
    text = text.replace("]", "")
    return clean(text,no_emoji=True)

In [5]:
from nltk import RegexpTokenizer
tokaniser=RegexpTokenizer(r"\w+")
df['cleaned_Q'] = [tokaniser.tokenize(clean_text(sentence)) for sentence in df['Question']]
df['cleaned_A'] = [tokaniser.tokenize(clean_text(sentence)) for sentence in df['Answer']]
df.head()

Unnamed: 0,Question,Answer,cleaned_Q,cleaned_A
0,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.,"[did, you, hear, about, the, native, american,...","[he, nearly, drown, in, his, own, tea, pee]"
1,What's the best anti diarrheal prescription?,Mycheexarphlexin,"[what, is, the, best, anti, diarrheal, prescri...",[mycheexarphlexin]
2,What do you call a person who is outside a doo...,Matt,"[what, do, you, call, a, person, who, is, outs...",[matt]
3,Which Star Trek character is a member of the m...,Jean-Luc Pickacard,"[which, star, trek, character, is, a, member, ...","[jeanluc, pickacard]"
4,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe,"[what, is, the, difference, between, a, bullet...","[a, bullet, does, not, miss, harambe]"


In [6]:
len(df.index)

38269

In [7]:
df = df[df['cleaned_A'].str.len() < 20]

In [8]:
df['cleaned_Q'] =df[['cleaned_Q']].apply(lambda words:[" ".join(word) for word in words]).to_numpy()
df['cleaned_A'] =df[['cleaned_A']].apply(lambda words:[" ".join(word) for word in words]).to_numpy()

In [9]:
range_val = 3000

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([str(i) for i in df['cleaned_Q'].sample(n=range_val,random_state=121)] +  [str(i) for i in df['cleaned_A'].sample(n=range_val,random_state=121)])
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))

VOCAB SIZE : 7166


## Converting 

In [11]:
from gensim.models import Word2Vec
import re

vocab = []
for word in tokenizer.word_index:
    vocab.append( word )

def tokenize( sentences ):
    tokens_list = []
    vocabulary = []
    for sentence in sentences:
        sentence = sentence.lower()
        sentence = re.sub( '[^a-zA-Z]', ' ', sentence )
        tokens = sentence.split()
        vocabulary += tokens
        tokens_list.append( tokens )
    return tokens_list , vocabulary


# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences( [str(i) for i in df['cleaned_Q'].sample(n=range_val,random_state=121)] )
maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
padded_questions = tf.keras.preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
encoder_input_data = np.array( padded_questions )
print( encoder_input_data.shape , maxlen_questions )

# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences( [str(i) for i in df['cleaned_A'].sample(n=range_val,random_state=121)] )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = tf.keras.preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
decoder_input_data = np.array( padded_answers )
print( decoder_input_data.shape , maxlen_answers )

# decoder_output_data
tokenized_answers = tokenizer.texts_to_sequences( [str(i) for i in df['cleaned_A'].sample(n=range_val,random_state=121)] )
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = tf.keras.preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
onehot_answers = tf.keras.utils.to_categorical( padded_answers,VOCAB_SIZE)
decoder_output_data = np.array( onehot_answers)
print( decoder_output_data.shape )

(3000, 21) 21
(3000, 19) 19
(3000, 19, 7166)


In [18]:
encoder_inputs = tf.keras.layers.Input(shape=( maxlen_questions , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True ,dropout=0.33)( encoder_embedding)
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( maxlen_answers ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 100 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True ,dropout=0.33)
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )

In [19]:
model.compile(optimizer="adam", loss='categorical_crossentropy',metrics=['accuracy'])

In [20]:
earlyStopper=tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, verbose=1, mode='auto',restore_best_weights=True)

In [21]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 19)]         0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 21, 200)      1433200     ['input_5[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, 19, 100)      716600      ['input_6[0][0]']                
                                                                                            

In [22]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data,validation_split=0.2, epochs=150,verbose=2,callbacks=[earlyStopper])

Epoch 1/150
75/75 - 14s - loss: 2.0194 - accuracy: 0.1791 - val_loss: 1.7603 - val_accuracy: 0.1899 - 14s/epoch - 180ms/step
Epoch 2/150
75/75 - 3s - loss: 1.7080 - accuracy: 0.1834 - val_loss: 1.7431 - val_accuracy: 0.1899 - 3s/epoch - 45ms/step
Epoch 3/150
75/75 - 3s - loss: 1.6499 - accuracy: 0.1834 - val_loss: 1.7648 - val_accuracy: 0.1899 - 3s/epoch - 46ms/step
Epoch 4/150
75/75 - 4s - loss: 1.6150 - accuracy: 0.1828 - val_loss: 1.7747 - val_accuracy: 0.1892 - 4s/epoch - 51ms/step
Epoch 5/150
75/75 - 4s - loss: 1.5753 - accuracy: 0.1859 - val_loss: 1.7963 - val_accuracy: 0.1933 - 4s/epoch - 53ms/step
Epoch 6/150
75/75 - 5s - loss: 1.5375 - accuracy: 0.1972 - val_loss: 1.8052 - val_accuracy: 0.1924 - 5s/epoch - 61ms/step
Epoch 7/150
75/75 - 4s - loss: 1.5049 - accuracy: 0.2034 - val_loss: 1.8221 - val_accuracy: 0.1965 - 4s/epoch - 56ms/step
Epoch 8/150
75/75 - 4s - loss: 1.4714 - accuracy: 0.2147 - val_loss: 1.8216 - val_accuracy: 0.2019 - 4s/epoch - 59ms/step
Epoch 9/150
75/75 - 4

<keras.callbacks.History at 0x1e91881fb50>

In [23]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [28]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return tf.keras.preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

In [33]:
rand=np.random.randint(0,high=df['cleaned_Q'].shape[0])
tokenized_questions = tokenizer.texts_to_sequences([clean_text(df['Question'][rand])])
maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
padded_questions = tf.keras.preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
encoder_input_data = np.array( padded_questions )
print(encoder_input_data.shape)
tokenized_answers = tokenizer.texts_to_sequences( [clean_text(df['Question'][rand])] )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = tf.keras.preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
decoder_input_data = np.array( padded_answers )

model.predict([encoder_input_data,decoder_input_data])

(1, 6)


array([[[1.78382799e-01, 2.84035038e-02, 2.84499303e-02, ...,
         2.62624735e-06, 2.50836661e-06, 2.68936310e-06],
        [2.64580101e-01, 3.90688330e-02, 3.23100984e-02, ...,
         1.84343344e-06, 1.69792975e-06, 1.70128646e-06],
        [8.17695409e-02, 2.10882686e-02, 1.59607586e-02, ...,
         1.10525334e-05, 1.05808858e-05, 1.01583955e-05],
        [3.15135241e-01, 4.20886911e-02, 2.93629933e-02, ...,
         1.66654024e-06, 1.53514850e-06, 1.48302649e-06],
        [4.67917353e-01, 4.68271561e-02, 3.15175317e-02, ...,
         5.31806336e-07, 4.86323586e-07, 4.74760867e-07],
        [2.70807028e-01, 3.91846485e-02, 2.71630455e-02, ...,
         2.49449454e-06, 2.35553648e-06, 2.30308251e-06]]], dtype=float32)

In [None]:
model.predict