In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Location of the conversation dataset, relative to this notebook.
conversation_csv = '../datasets/Conversation.csv'
df = pd.read_csv(conversation_csv)

In [12]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(3725, 2)

In [14]:
# Preview the raw question/answer pairs before any cleaning.
df

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [16]:
# Look at the 'answer' column on its own (double brackets keep it a DataFrame).
df[['answer']]

Unnamed: 0,answer
0,i'm fine. how about yourself?
1,i'm pretty good. thanks for asking.
2,no problem. so how have you been?
3,i've been great. what about you?
4,i've been good. i'm in school right now.
...,...
3720,are you right-handed?
3721,yes. all my life.
3722,you're wearing out your right hand. stop using...
3723,but i do all my writing with my right hand.


In [20]:
import re
def clean_sequence(sentence):
    r"""Strip punctuation from a sentence, keeping word characters and whitespace.

    Parameters
    ----------
    sentence : str, None or float NaN
        Text to clean. Missing values (``None`` or a float NaN, which is how
        pandas represents an empty CSV cell) are treated as empty text instead
        of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        ``sentence`` with every character that is neither a word character
        (``\w``) nor whitespace (``\s``) removed. Nothing is inserted in place
        of a removed character, so ``"right-handed"`` becomes ``"righthanded"``.
    """
    # NaN is the only float that compares unequal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''
    return re.sub(r'[^\w\s]', '', sentence)

In [24]:
# Remove punctuation from every answer, element by element.
df['answer'] = df['answer'].map(clean_sequence)

In [26]:
# Remove punctuation from every question, element by element.
df['question'] = df['question'].map(clean_sequence)

In [28]:
# Preview the frame again to confirm punctuation was removed from both columns.
df

Unnamed: 0,question,answer
0,hi how are you doing,im fine how about yourself
1,im fine how about yourself,im pretty good thanks for asking
2,im pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,ive been great what about you
4,ive been great what about you,ive been good im in school right now
...,...,...
3720,thats a good question maybe its not old age,are you righthanded
3721,are you righthanded,yes all my life
3722,yes all my life,youre wearing out your right hand stop using i...
3723,youre wearing out your right hand stop using i...,but i do all my writing with my right hand


In [81]:
# Corpus used for the language model: the cleaned answers as a flat array.
doc = df.loc[:, 'answer'].values

In [86]:
# One entry per answer row.
doc.shape

(3725,)

### Converting document text into input Sequences

In [50]:
import tensorflow
from tensorflow import keras
from tensorflow.keras.utils import pad_sequences,to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer

In [88]:
# Build the word -> integer index vocabulary from the cleaned answers.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(doc)

In [90]:
# Number of distinct words the tokenizer learned from `doc`.
len(tokenizer.word_index)

2459

In [114]:
# Encode all answers in one call, then expand each encoded sentence into every
# prefix of length >= 2 (e.g. [a, b, c] -> [a, b], [a, b, c]).
encoded_sentences = tokenizer.texts_to_sequences(doc)
input_sequences = [
    tokens[:end]
    for tokens in encoded_sentences
    for end in range(2, len(tokens) + 1)
]

In [116]:
# Inspect the generated prefixes.
# NOTE(review): this displays the entire list; a slice such as
# input_sequences[:10] would keep the notebook output short.
input_sequences

[[32, 546],
 [32, 546, 38],
 [32, 546, 38, 33],
 [32, 546, 38, 33, 547],
 [32, 167],
 [32, 167, 45],
 [32, 167, 45, 211],
 [32, 167, 45, 211, 22],
 [32, 167, 45, 211, 22, 469],
 [23, 168],
 [23, 168, 17],
 [23, 168, 17, 38],
 [23, 168, 17, 38, 13],
 [23, 168, 17, 38, 13, 2],
 [23, 168, 17, 38, 13, 2, 98],
 [99, 98],
 [99, 98, 108],
 [99, 98, 108, 9],
 [99, 98, 108, 9, 33],
 [99, 98, 108, 9, 33, 2],
 [99, 98],
 [99, 98, 45],
 [99, 98, 45, 32],
 [99, 98, 45, 32, 15],
 [99, 98, 45, 32, 15, 103],
 [99, 98, 45, 32, 15, 103, 66],
 [99, 98, 45, 32, 15, 103, 66, 96],
 [9, 103],
 [9, 103, 8],
 [9, 103, 8, 2],
 [9, 103, 8, 2, 39],
 [9, 103, 8, 2, 39, 4],
 [1, 39],
 [1, 39, 4],
 [1, 39, 4, 779],
 [8, 2],
 [8, 2, 21],
 [8, 2, 21, 6],
 [8, 2, 21, 6, 52],
 [18, 78],
 [18, 78, 18],
 [18, 78, 18, 5],
 [18, 78, 18, 5, 53],
 [18, 78, 18, 5, 53, 161],
 [18, 78, 18, 5, 53, 161, 1448],
 [45, 630],
 [45, 630, 50],
 [45, 630, 50, 103],
 [130, 2],
 [130, 2, 140],
 [130, 2, 140, 85],
 [32, 187],
 [32, 187, 41]

## Padding Input Sequences

In [119]:
from tensorflow.keras.utils import pad_sequences

In [129]:
# Length of the longest prefix; used as the common padded length.
max_len = max(map(len, input_sequences))

In [133]:
# Left-pad with zeros so each sequence's final token ends up in the last column.
padded_input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len,
    padding='pre',
)

In [135]:
# Inspect the padded array (numpy abbreviates large arrays when displayed).
padded_input_sequences

array([[  0,   0,   0, ...,   0,  32, 546],
       [  0,   0,   0, ...,  32, 546,  38],
       [  0,   0,   0, ..., 546,  38,  33],
       ...,
       [  0,   0,   0, ...,  43,   8, 534],
       [  0,   0,   0, ...,   8, 534,   3],
       [  0,   0,   0, ..., 534,   3, 148]])

In [137]:
# (number of prefix samples, max_len)
padded_input_sequences.shape

(20572, 19)

### Splitting into training and testing dataset

In [170]:
from sklearn.model_selection import train_test_split

In [164]:
# All tokens but the last form the model input; the last token is the word to predict.
X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]

### Converting target column into vector

In [185]:
from tensorflow.keras.utils import to_categorical

In [193]:
# One-hot encode the next-word targets. The integer labels are read straight from
# `padded_input_sequences` rather than from `y` itself, so re-running this cell
# never feeds an already one-hot `y` back into to_categorical.
y = to_categorical(
    padded_input_sequences[:, -1],
    num_classes=len(tokenizer.word_index) + 1,  # +1 because id 0 is the padding value
)

In [195]:
# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [200]:
# (training samples, input tokens per sample)
X_train.shape

(16457, 18)

In [206]:
# (samples, number of classes) after one-hot encoding.
y.shape

(20572, 2460)

## Model Building

In [204]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding

In [225]:
# Derive the layer sizes from the prepared data instead of hard-coding 2460 / 18,
# so the model stays in sync if the corpus (and with it the vocabulary or the
# longest sentence) changes.
vocab_size = len(tokenizer.word_index) + 1  # +1 because id 0 is the padding value
context_len = X.shape[1]                    # tokens fed to the model per sample

model = Sequential()
# An explicit Input replaces Embedding's `input_length` argument, which Keras 3
# deprecates, and lets model.summary() report concrete shapes before training.
model.add(keras.Input(shape=(context_len,)))
model.add(Embedding(vocab_size, 100))
model.add(LSTM(200))
model.add(Dense(vocab_size, activation='softmax'))



In [227]:
# Print the layer-by-layer overview of the model.
model.summary()

In [230]:
# One-hot targets over the vocabulary, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [232]:
# Train for 100 epochs, evaluating on the held-out split after each one.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
)

Epoch 1/100
[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 20ms/step - accuracy: 0.0328 - loss: 6.5836 - val_accuracy: 0.0598 - val_loss: 6.1675
Epoch 2/100
[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - accuracy: 0.0637 - loss: 5.8333 - val_accuracy: 0.0765 - val_loss: 5.9133
Epoch 3/100
[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.0985 - loss: 5.3484 - val_accuracy: 0.0979 - val_loss: 5.7417
Epoch 4/100
[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.1228 - loss: 4.9894 - val_accuracy: 0.1220 - val_loss: 5.6299
Epoch 5/100
[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.1537 - loss: 4.6257 - val_accuracy: 0.1256 - val_loss: 5.6280
Epoch 6/100
[1m515/515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.1808 - loss: 4.2939 - val_accuracy: 0.1397 - val_loss: 5.6203
Epoch 7/100

<keras.src.callbacks.history.History at 0x1f8e3e9d5e0>

In [274]:
# NOTE(review): `tokenized_input` is only assigned inside the generation loop
# further down; no earlier assignment is visible in this notebook, so on a fresh
# kernel (Restart & Run All) this cell would raise NameError. Consider moving it
# below the loop or removing it.
tokenized_input

[[9]]

In [304]:
# Seed text for generation; the loop below appends each predicted word to it.
input_test='name'

In [306]:
# Generate up to 5 more words, feeding the growing text back into the model.
# The model was trained on inputs of max_len - 1 tokens (the last column of the
# padded sequences was split off as the target), so pad to that length here.
model_input_len = max_len - 1

for _ in range(5):
    tokenized_input = tokenizer.texts_to_sequences([input_test])
    tokenized_input = pad_sequences(tokenized_input, maxlen=model_input_len, padding='pre')
    output = int(np.argmax(model.predict(tokenized_input)))

    # index_word is the tokenizer's reverse mapping (id -> word); a direct lookup
    # replaces scanning the whole word_index dictionary on every step.
    word = tokenizer.index_word.get(output)
    if word is None:
        # The model predicted the padding id (0), which maps to no word. The text
        # would not change, so further iterations would only repeat this step.
        break

    # Join with a single space (the previous trailing space produced double spaces).
    input_test = f"{input_test.rstrip()} {word}"
    print(input_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
name it 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
name it  was 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
name it  was  love 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
name it  was  love  at 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
name it  was  love  at  noon 
