In [6]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Conv1D, MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [7]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
def load_doc(jsonFile):
    with open(jsonFile) as file:
        Json_data = json.loads(file.read())
    return Json_data

In [9]:
data = load_doc("dataset\intents.json")

In order to avoid writing all functions from scratch, we can frame the JSON file with a Pandas DataFrame with the function below:



In [10]:
def frame_data(feat_1,feat_2,is_pattern):
    is_pattern = is_pattern
    df = pd.DataFrame(columns=[feat_1,feat_2])
    for intent in data['intents']:
        if is_pattern:
            for pattern in intent['patterns']:
                w = pattern
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
        else:
            for response in intent['responses']:
                w = response
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
    return df

In [11]:
# users intents 
df1 = frame_data('questions','labels',True)
df1

  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_

Unnamed: 0,questions,labels
0,Hi there,start_conversation
1,Is anyone there?,start_conversation
2,Hey,start_conversation
3,Hola,start_conversation
4,Hello,start_conversation
...,...,...
82,What are the match formats in tennis,match_format
83,Do we just keep playing till we get tired,match_format
84,How many match sets are there in tennis,match_format
85,When do we stop playing?,match_format


In [12]:
df1.labels.value_counts(sort=False)

start_conversation     7
what_are_you           4
end_conversation       5
thanks                 5
options                8
bot_scope              5
top_players            6
general_rules          5
kits                  10
court_lines            3
first_server           4
serving_rules          6
tennis_games           4
tie_break              5
scoring                5
match_format           5
Name: labels, dtype: int64

Tokenization

In [13]:
# Bot response
df2 = frame_data('response','labels',False)
df2

  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_

Unnamed: 0,response,labels
0,"Hello, I'm sure you love tennis",start_conversation
1,Happy to have you here,start_conversation
2,Good to see you again,start_conversation
3,"Hi there, how can I help?",start_conversation
4,"Hi, I'm BoTennis",what_are_you
5,I'm BoTennis,what_are_you
6,Call me BoTennis,what_are_you
7,Have a lovely tennis!,end_conversation
8,Have a nice tennis playing time,end_conversation
9,Enjoy the game!,end_conversation


In [14]:
lemmatizer = WordNetLemmatizer()

vocab = Counter()
labels = []
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

Removing Stop-Words

In [15]:
def remove_stop_words(tokenizer,df,feature):
    doc_without_stopwords = []
    for entry in df[feature]:
        tokens = tokenizer(entry)
        joblib.dump(tokens,'tokens.pkl')
        doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return 

Vocabulary Building

In [16]:
def create_vocab(tokenizer,df,feature):
    for entry in df[feature]:
        tokens = tokenizer(entry)   
        vocab.update(tokens)
    joblib.dump(vocab,'vocab.pkl')
    return 

In [18]:
from nltk.corpus import wordnet

In [19]:
create_vocab(tokenizer,df1,'questions')
remove_stop_words(tokenizer,df1,'questions')

In [20]:
print(vocab.most_common(20))

[('tennis', 36), ('the', 24), ('what', 23), ('you', 22), ('are', 19), ('do', 16), ('in', 11), ('rule', 11), ('how', 10), ('can', 7), ('know', 7), ('about', 7), ('is', 6), ('player', 6), ('list', 6), ('service', 6), ('score', 6), ('who', 5), ('me', 5), ('kit', 5)]


In [21]:
vocab_size = len(vocab)
vocab_size

118

In [22]:
df1

Unnamed: 0,questions,labels
0,hi there,start_conversation
1,is anyone there,start_conversation
2,hey,start_conversation
3,hola,start_conversation
4,hello,start_conversation
...,...,...
82,what are the match format in tennis,match_format
83,do we just keep playing till we get tired,match_format
84,how many match set are there in tennis,match_format
85,when do we stop playing,match_format


In [23]:
test_list = list(df1.groupby(by='labels',as_index=False).first()['questions'])
test_list

['what do you know about tennis',
 'each line on the tennis court mean whats',
 'bye',
 'who serf first',
 'what are the general rule of tennis',
 'what are the tennis kit need to get started',
 'what are the match format in tennis',
 'how can you help me',
 'how is score counted',
 'what are the service rule',
 'hi there',
 'do you know tennis game type',
 'thanks',
 'what will happen if game end in same score',
 'who are the top player in tennis',
 'what is your name']

In [24]:
test_index = []
for i,_ in enumerate(test_list):
    idx = df1[df1.questions == test_list[i]].index[0]
    test_index.append(idx)
test_index

[29, 55, 11, 58, 40, 45, 82, 21, 77, 62, 0, 68, 16, 72, 34, 7]

In [25]:
train_index = [i for i in df1.index if i not in test_index]

In [26]:
' '.join(list(vocab.keys()))

'hi there is anyone hey hola hello good day what your name are you who pls bye see later goodbye nice chatting to till next time thanks thank thats helpful awesome for helping me how can help do provide be support offered know about tennis tell really lot exactly mean by guide through the top player in list best general rule of playing kit need get started wear beginner full will each line on court whats why so many where base serf first decide serve anybody service stand anywhere game type different explain happen if end same score tiebreak doe work winner when tie break counted scoring board count match format we just keep tired set stop played'

In [27]:
def encoder(df,feature):
#     text = ' '.join(list(vocab.keys()))
    t = Tokenizer()
    entries = [entry for entry in df[feature]]
    t.fit_on_texts(entries)
    joblib.dump(t,'tokenizer_t.pkl')
    vocab_size = len(t.word_index) + 1
    entries = [entry for entry in df[feature]]
    max_length = max([len(s.split()) for s in entries])
    encoded = t.texts_to_sequences(entries)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded, vocab_size

In [28]:
X,vocab_size = encoder(df1,'questions')

In [29]:
df_encoded = pd.DataFrame(X)

In [30]:
df_encoded['labels'] = df1.labels
df_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,labels
0,44,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
1,13,65,23,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
2,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
3,67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
4,68,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation


In [31]:
for i in range(0,2):
    dt = [0]*16
    dt.append('confused')
    dt = [dt]
    pd.DataFrame(dt).rename(columns = {16:'labels'})
    df_encoded = df_encoded.append(pd.DataFrame(dt).rename(columns = {16:'labels'}),ignore_index=True)

  df_encoded = df_encoded.append(pd.DataFrame(dt).rename(columns = {16:'labels'}),ignore_index=True)
  df_encoded = df_encoded.append(pd.DataFrame(dt).rename(columns = {16:'labels'}),ignore_index=True)


In [32]:
df_encoded.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,labels
84,9,40,63,64,5,23,7,1,0,0,0,0,0,0,0,0,match_format
85,61,6,43,117,36,0,0,0,0,0,0,0,0,0,0,0,match_format
86,9,40,64,10,25,118,0,0,0,0,0,0,0,0,0,0,match_format
87,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,confused
88,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,confused


In [33]:
train_index.append(87)

In [34]:
test_index.append(88)

In [35]:
from sklearn.preprocessing import LabelEncoder
lable_enc = LabelEncoder()

In [36]:
labl = lable_enc.fit_transform(df_encoded.labels)
labl

array([11, 11, 11, 11, 11, 11, 11, 16, 16, 16, 16,  3,  3,  3,  3,  3, 13,
       13, 13, 13, 13,  8,  8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,
       15, 15, 15, 15, 15, 15,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  2,  2,  2,  4,  4,  4,  4, 10, 10, 10, 10, 10, 10,
       12, 12, 12, 12, 14, 14, 14, 14, 14,  9,  9,  9,  9,  9,  7,  7,  7,
        7,  7,  1,  1])

In [37]:
mapper = {}
for index,key in enumerate(df_encoded.labels):
    if key not in mapper.keys():
        mapper[key] = labl[index]
mapper

{'start_conversation': 11,
 'what_are_you': 16,
 'end_conversation': 3,
 'thanks': 13,
 'options': 8,
 'bot_scope': 0,
 'top_players': 15,
 'general_rules': 5,
 'kits': 6,
 'court_lines': 2,
 'first_server': 4,
 'serving_rules': 10,
 'tennis_games': 12,
 'tie_break': 14,
 'scoring': 9,
 'match_format': 7,
 'confused': 1}

In [None]:
df2.head()

In [38]:
df2.labels = df2.labels.map(mapper).astype({'labels': 'int32'})
df2.head()

Unnamed: 0,response,labels
0,"Hello, I'm sure you love tennis",11
1,Happy to have you here,11
2,Good to see you again,11
3,"Hi there, how can I help?",11
4,"Hi, I'm BoTennis",16


In [39]:
df2.to_csv('response.csv',index=False)

In [40]:
train = df_encoded.loc[train_index]
test = df_encoded.loc[test_index]

In [41]:
X_train = train.drop(columns=['labels'],axis=1)
y_train = train.labels
X_test = test.drop(columns=['labels'],axis=1)
y_test = test.labels

In [42]:
y_train =pd.get_dummies(y_train).values
y_test =pd.get_dummies(y_test).values

In [43]:
y_train[0].shape,y_test[0].shape

((17,), (17,))

In [44]:
X_train.shape

(72, 16)

In [45]:
max_length = X_train.shape[1]
# output = len(df3.labels.unique())
output = 17

In [46]:
max_length

16

In [47]:
early_stopping = EarlyStopping(monitor='val_loss',patience=10)
checkpoint = ModelCheckpoint("model-v1.h5",
                             monitor="val_loss",
                             mode="min",
                             save_best_only = True,
                             verbose=1)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 3, verbose = 1, min_delta = 0.0001)
callbacks = [early_stopping,checkpoint,reduce_lr]
    
    

In [48]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size,300, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
    model.add(MaxPooling1D(pool_size=8))
    model.add(Flatten())
#     model.add(Dense(32, activation='relu'))
    model.add(Dense(17, activation='softmax'))
    
    
    # compile network
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss = 'categorical_crossentropy',
              # optimizer = Adam(lr=0.001),
              optimizer = 'adam',
              metrics = ['accuracy'])
    
    # summarize defined model
    model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [49]:
# define model
model = define_model(vocab_size, max_length)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 300)           35700     
                                                                 
 conv1d (Conv1D)             (None, 13, 64)            76864     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1, 64)            0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 17)                1105      
                                                                 
Total params: 113,669
Trainable params: 113,669
Non-trainable params: 0
__________________________________________________

In [50]:
history = model.fit(X_train, y_train, epochs=500, verbose=1,validation_data=(X_test,y_test),callbacks=callbacks)

Epoch 1/500
Epoch 1: val_loss improved from inf to 2.81052, saving model to model-v1.h5
Epoch 2/500
Epoch 2: val_loss improved from 2.81052 to 2.78479, saving model to model-v1.h5
Epoch 3/500
Epoch 3: val_loss improved from 2.78479 to 2.75629, saving model to model-v1.h5
Epoch 4/500
Epoch 4: val_loss improved from 2.75629 to 2.72853, saving model to model-v1.h5
Epoch 5/500
Epoch 5: val_loss improved from 2.72853 to 2.70007, saving model to model-v1.h5
Epoch 6/500
Epoch 6: val_loss improved from 2.70007 to 2.66868, saving model to model-v1.h5
Epoch 7/500
Epoch 7: val_loss improved from 2.66868 to 2.63370, saving model to model-v1.h5
Epoch 8/500
Epoch 8: val_loss improved from 2.63370 to 2.59689, saving model to model-v1.h5
Epoch 9/500
Epoch 9: val_loss improved from 2.59689 to 2.55894, saving model to model-v1.h5
Epoch 10/500
Epoch 10: val_loss improved from 2.55894 to 2.51623, saving model to model-v1.h5
Epoch 11/500
Epoch 11: val_loss improved from 2.51623 to 2.47320, saving model to 

In [51]:
def get_text():
    input_text  = ['what are you']
    df_input = pd.DataFrame(input_text,columns=['questions'])
    df_input
    return df_input 

In [52]:
#load artifacts 
from tensorflow.keras.models import load_model
model = load_model('model-v1.h5')
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

In [53]:
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

In [54]:
def remove_stop_words_for_input(tokenizer,df,feature):
    doc_without_stopwords = []
    entry = df[feature][0]
    tokens = tokenizer(entry)
    doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return df

In [55]:
def encode_input_text(tokenizer_t,df,feature):
    t = tokenizer_t
    entry = entry = [df[feature][0]]
    encoded = t.texts_to_sequences(entry)
    padded = pad_sequences(encoded, maxlen=16, padding='post')
    return padded

In [56]:
def get_pred(model,encoded_input):
    pred = np.argmax(model.predict(encoded_input))
    return pred

In [57]:
def bot_precausion(df_input,pred):
    words = df_input.questions[0].split()
    if len([w for w in words if w in vocab])==0 :
        pred = 1
    return pred

In [58]:
def get_response(df2,pred):
    upper_bound = df2.groupby('labels').get_group(pred).shape[0]
    r = np.random.randint(0,upper_bound)
    responses = list(df2.groupby('labels').get_group(pred).response)
    return responses[r]

In [59]:
def bot_response(response,):
    print(response)

In [60]:
df_input = get_text()

#load artifacts 
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

df_input = remove_stop_words_for_input(tokenizer,df_input,'questions')
encoded_input = encode_input_text(tokenizer_t,df_input,'questions')

pred = get_pred(model,encoded_input)
pred = bot_precausion(df_input,pred)

response = get_response(df2,pred)
bot_response(response)

Hi, I'm  BoTennis
