In [104]:
import numpy as np
import pandas as pd
import tensorflow as tf
import unicodedata
import re
import string

In [105]:
question  =[]
answer = []
with open("dialogs.txt",'r') as f :
    for line in f :
        line  =  line.split('\t')
        question.append(line[0])
        answer.append(line[1])
print(len(question) == len(answer))

True


In [106]:
question[:5]

['hi, how are you doing?',
 "i'm fine. how about yourself?",
 "i'm pretty good. thanks for asking.",
 'no problem. so how have you been?',
 "i've been great. what about you?"]

In [107]:
answer[:5]

["i'm fine. how about yourself?\n",
 "i'm pretty good. thanks for asking.\n",
 'no problem. so how have you been?\n',
 "i've been great. what about you?\n",
 "i've been good. i'm in school right now.\n"]

In [108]:
answer = [ i.replace("\n","") for i in answer]

In [109]:
data = pd.DataFrame({"question" : question ,"answer":answer})
data.head()

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [110]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')

In [111]:
def clean_text(text):
    text = unicode_to_ascii(text.lower().strip())
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\r", "", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub("(\\W)"," ",text)
    text = re.sub('\S*\d\S*\s*','', text)
    return text

In [112]:
data["question"][0]

'hi, how are you doing?'

In [113]:
data["question"] = data.question.apply(clean_text)
data["question"][0]

'hi how are you doing'

In [114]:
data["question"] = [ i.replace("sos","") for i in question]
data["answer"]= [ i.replace("eos","") for i in answer]

In [115]:
data

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [116]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, GRU, Reshape,Dropout,Input
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model


In [117]:
all_texts = list(data["question"]) + list(data["answer"])

In [118]:
# Initialize Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)

In [119]:
# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(data["question"])
response_sequences = tokenizer.texts_to_sequences(data["answer"])

In [120]:
# Padding sequences to ensure uniform input length
max_len = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in response_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding="post")
response_sequences = pad_sequences(response_sequences, maxlen=max_len, padding="post")

In [121]:
# Vocabulary size (needed for embedding layer)
vocab_size = len(tokenizer.word_index) + 1

In [122]:
print("Vocabulary Size:", vocab_size)
print("Sample Input Sequence:", input_sequences[0])
print("Sample Response Sequence:", response_sequences[0])

Vocabulary Size: 2520
Sample Input Sequence: [1522   36   14    2  174    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]
Sample Response Sequence: [ 31 614  36  33 562   0   0   0   0   0   0   0   0   0   0   0   0   0
   0]


In [123]:
# Encoder
encoder_inputs = Input(shape=(max_len,))
encoder_embedding = Embedding(vocab_size, 256)(encoder_inputs)
encoder_lstm = LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

In [124]:
# Decoder
decoder_inputs = Input(shape=(max_len,))
decoder_embedding = Embedding(vocab_size, 256)(decoder_inputs)
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
# dense
decoder_dense = Dense(vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [125]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy")

In [None]:
# Train the model
# Convert response_sequences to one-hot encoding
response_sequences_onehot = tf.keras.utils.to_categorical(response_sequences, num_classes=vocab_size)

model.fit([input_sequences, input_sequences], response_sequences_onehot, batch_size=32, epochs=10)

TypeError: TensorFlowTrainer.fit() got an unexpected keyword argument 'metrices'

In [None]:
def generate_response(user_input):
    input_seq = tokenizer.texts_to_sequences([user_input])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding="post")

    predicted_seq = model.predict([input_seq, input_seq])
    predicted_words = [tokenizer.index_word.get(np.argmax(word), "") for word in predicted_seq[0]]

    return " ".join(predicted_words)

# Chat with the bot
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        break
    print("Bot:", generate_response(user_input))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Bot: you it you going you just with but you          
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
Bot: the movie two in and the in  r          
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
Bot: you it you going you just with but you          
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Bot: you it you going you just with but you          


In [None]:
# hi, how are you doing?	i'm fine. how about yourself?
# 1	i'm fine. how about yourself?	i'm pretty good. thanks for asking.
# 2	i'm pretty good. thanks for asking.	no problem. so how have you been?
# 3	no problem. so how have you been?	i've been great. what about you?
# 4	i've been great. what about you?	i've been good. i'm in school right now

In [None]:
# # Save model and tokenizer
# model.save("seq2seq_chatbot.h5")
# import pickle
# with open("tokenizer.pkl", "wb") as f:
#     pickle.dump(tokenizer, f)



In [None]:
# from tensorflow.keras.models import load_model
# import pickle

# # Load model and tokenizer
# model = load_model("seq2seq_chatbot.h5")
# with open("tokenizer.pkl", "rb") as f:
#     tokenizer = pickle.load(f)

JOKER BOT

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("shortjokes.csv")
data.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [3]:
data.shape
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231657 entries, 0 to 231656
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      231657 non-null  int64 
 1   Joke    231657 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.5+ MB


In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["Joke"])

In [6]:
tokenizer.word_index

{'a': 1,
 'the': 2,
 'i': 3,
 'to': 4,
 'you': 5,
 'and': 6,
 'in': 7,
 'of': 8,
 'my': 9,
 'what': 10,
 'is': 11,
 'it': 12,
 'do': 13,
 'me': 14,
 'on': 15,
 'was': 16,
 'for': 17,
 'that': 18,
 'with': 19,
 'have': 20,
 'why': 21,
 'he': 22,
 'your': 23,
 'when': 24,
 'did': 25,
 'are': 26,
 'like': 27,
 'how': 28,
 'they': 29,
 'if': 30,
 'so': 31,
 'just': 32,
 'at': 33,
 'but': 34,
 'an': 35,
 "i'm": 36,
 'because': 37,
 'one': 38,
 'get': 39,
 'his': 40,
 'be': 41,
 'out': 42,
 'about': 43,
 "don't": 44,
 'call': 45,
 'up': 46,
 'this': 47,
 "it's": 48,
 'not': 49,
 'her': 50,
 'who': 51,
 'can': 52,
 'all': 53,
 'no': 54,
 "what's": 55,
 'say': 56,
 'she': 57,
 'know': 58,
 'people': 59,
 'does': 60,
 'into': 61,
 'from': 62,
 'man': 63,
 'there': 64,
 'said': 65,
 'got': 66,
 'as': 67,
 'we': 68,
 'had': 69,
 'between': 70,
 'them': 71,
 'joke': 72,
 'time': 73,
 'has': 74,
 'go': 75,
 'their': 76,
 'by': 77,
 'would': 78,
 'two': 79,
 'make': 80,
 "can't": 81,
 'him': 82,
 't

In [7]:
input_data = tokenizer.texts_to_sequences(data["Joke"])

In [8]:
max_len = 93

In [9]:
input_data_sequences = pad_sequences(input_data, maxlen=max_len, padding="post")

In [10]:
vocab_size = len(tokenizer.word_index) +1
vocab_size

70649

In [11]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, GRU, Reshape,Dropout,Input
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model

In [None]:
# Encoder
incoder_in = Input(shape=(max_len,))
incoder_embd = Embedding(vocab_size,256)(incoder_in)
incoder_lstm = LSTM(512,return_state=True)
incoder_out ,state_h,state_c= incoder_lstm(incoder_embd)

In [17]:
# Decoder
dcoder_in = Input(shape=(max_len,))
dcoder_embd = Embedding(vocab_size,256)(dcoder_in)
dcoder_lstm = LSTM(512,return_state=True,return_sequences=True)
dcoder_out ,_,_ = dcoder_lstm(dcoder_embd,initial_state=[state_h,state_c])

# dense

dcoder_dense = Dense(vocab_size,activation="softmax")
dcoder_out = dcoder_dense(dcoder_out)