In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Conv1D, MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def load_doc(jsonFile):
    with open(jsonFile) as file:
        Json_data = json.loads(file.read())
    return Json_data

In [4]:
data = load_doc("dataset\intents.json")

In order to avoid writing all functions from scratch, we can frame the JSON file with a Pandas DataFrame with the function below:



In [5]:
def frame_data(feat_1,feat_2,is_pattern):
    is_pattern = is_pattern
    df = pd.DataFrame(columns=[feat_1,feat_2])
    for intent in data['intents']:
        if is_pattern:
            for pattern in intent['patterns']:
                w = pattern
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
        else:
            for response in intent['responses']:
                w = response
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
    return df

In [6]:
# users intents 
df1 = frame_data('questions','labels',True)
df1

  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_

Unnamed: 0,questions,labels
0,Hi there,start_conversation
1,Is anyone there?,start_conversation
2,Hey,start_conversation
3,Hola,start_conversation
4,Hello,start_conversation
...,...,...
58,Where does Christain Union holds service?,meeting
59,When does CU holds service?,meeting
60,What year was CU formed,history
61,What year was the Christian Union established?,history


In [7]:
df1.labels.value_counts(sort=False)

start_conversation    7
what_are_you          4
end_conversation      5
thanks                6
options               8
freshers_welcome      5
theme                 4
buzz                  2
buzz_rep              2
court_lines           3
first_server          4
fresher_resp          2
gifts                 2
date_venue            2
meeting               4
history               3
Name: labels, dtype: int64

Tokenization

In [8]:
# Bot response
df2 = frame_data('response','labels',False)
df2

  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_to_append,ignore_index=True)
  df = df.append(df_

Unnamed: 0,response,labels
0,"Hello, good to see you here.\nI'm Nora, how ca...",start_conversation
1,"Happy to have you here.\nI'm Nora, how can I h...",start_conversation
2,"Hi there.\nI'm Nora, how can I help you?",start_conversation
3,"Hi, I'm Nora.",what_are_you
4,I'm Nora.,what_are_you
5,My name is Nora.,what_are_you
6,Have a lovely day.,end_conversation
7,See you in fellowship.,end_conversation
8,Have a nice day.,end_conversation
9,See you some other time.,end_conversation


In [9]:
lemmatizer = WordNetLemmatizer()

vocab = Counter()
labels = []
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

Removing Stop-Words

In [10]:
def remove_stop_words(tokenizer,df,feature):
    doc_without_stopwords = []
    for entry in df[feature]:
        tokens = tokenizer(entry)
        joblib.dump(tokens,'tokens.pkl')
        doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return 

Vocabulary Building

In [11]:
def create_vocab(tokenizer,df,feature):
    for entry in df[feature]:
        tokens = tokenizer(entry)   
        vocab.update(tokens)
    joblib.dump(vocab,'vocab.pkl')
    return 

In [12]:
from nltk.corpus import wordnet

In [13]:
create_vocab(tokenizer,df1,'questions')
remove_stop_words(tokenizer,df1,'questions')

In [14]:
print(vocab.most_common(20))

[('what', 19), ('you', 18), ('do', 13), ('fresher', 12), ('is', 11), ('welcome', 11), ('can', 7), ('the', 7), ('about', 6), ('to', 5), ('nora', 5), ('me', 5), ('kamdela', 5), ('mandeke', 5), ('doe', 5), ('on', 5), ('where', 5), ('help', 4), ('how', 4), ('know', 4)]


In [15]:
vocab_size = len(vocab)
vocab_size

107

In [16]:
df1

Unnamed: 0,questions,labels
0,hi there,start_conversation
1,is anyone there,start_conversation
2,hey,start_conversation
3,hola,start_conversation
4,hello,start_conversation
...,...,...
58,where doe christain union hold service,meeting
59,when doe cu hold service,meeting
60,what year wa cu formed,history
61,what year wa the christian union established,history


In [17]:
test_list = list(df1.groupby(by='labels',as_index=False).first()['questions'])
test_list

['have been hearing buzz on campus lately',
 'yes am talking about fresher welcome what is fresher welcome',
 'each line on the tennis court mean whats',
 'that is great but when and where is fresher welcome holding',
 'bye',
 'who serf first',
 'am fresher am invited to kamdela mandeke',
 'what do you know about fresher welcome',
 'yes do',
 'what year wa cu formed',
 'where doe fellowship meet',
 'how can you help me',
 'hi there',
 'thanks nora',
 'what is kamdela mandeke',
 'what is your name']

In [18]:
test_index = []
for i,_ in enumerate(test_list):
    idx = df1[df1.questions == test_list[i]].index[0]
    test_index.append(idx)
test_index

[39, 41, 43, 54, 11, 46, 50, 30, 52, 60, 56, 22, 0, 16, 35, 7]

In [19]:
train_index = [i for i in df1.index if i not in test_index]

In [20]:
' '.join(list(vocab.keys()))

'hi there is anyone hey hola hello good day what your name are you who pls bye see later goodbye nice chatting to till next time thanks nora thank thats helpful awesome for helping me very much help how can do provide be support offer know of about fresher welcome tell give information need kamdela mandeke doe mean the meaning have been hearing buzz on campus lately news yes am talking each line tennis court whats why so many where base serf first decide serve anybody invited that great but when and holding fellowship meet cu hold christain union service year wa formed christian established did start'

In [21]:
def encoder(df,feature):
#     text = ' '.join(list(vocab.keys()))
    t = Tokenizer()
    entries = [entry for entry in df[feature]]
    t.fit_on_texts(entries)
    joblib.dump(t,'tokenizer_t.pkl')
    vocab_size = len(t.word_index) + 1
    entries = [entry for entry in df[feature]]
    max_length = max([len(s.split()) for s in entries])
    encoded = t.texts_to_sequences(entries)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded, vocab_size

In [22]:
X,vocab_size = encoder(df1,'questions')

In [23]:
df_encoded = pd.DataFrame(X)

In [24]:
df_encoded['labels'] = df1.labels
df_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,labels
0,37,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
1,5,57,28,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
2,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
3,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
4,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation


In [25]:
for i in range(0,2):
    dt = [0]*16
    dt.append('confused')
    dt = [dt]
    pd.DataFrame(dt).rename(columns = {16:'labels'})
    df_encoded = df_encoded.append(pd.DataFrame(dt).rename(columns = {16:'labels'}),ignore_index=True)

  df_encoded = df_encoded.append(pd.DataFrame(dt).rename(columns = {16:'labels'}),ignore_index=True)
  df_encoded = df_encoded.append(pd.DataFrame(dt).rename(columns = {16:'labels'}),ignore_index=True)


In [26]:
df_encoded.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,labels
60,1,55,56,27,103,0,0,0,0,0,0,0,0,0,0,0,history
61,1,55,56,8,104,53,105,0,0,0,0,0,0,0,0,0,history
62,26,106,27,107,0,0,0,0,0,0,0,0,0,0,0,0,history
63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,confused
64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,confused


In [27]:
train_index.append(63)

In [28]:
test_index.append(64)

In [29]:
from sklearn.preprocessing import LabelEncoder
lable_enc = LabelEncoder()

In [30]:
labl = lable_enc.fit_transform(df_encoded.labels)
labl

array([13, 13, 13, 13, 13, 13, 13, 16, 16, 16, 16,  5,  5,  5,  5,  5, 14,
       14, 14, 14, 14, 14, 12, 12, 12, 12, 12, 12, 12, 12,  8,  8,  8,  8,
        8, 15, 15, 15, 15,  0,  0,  1,  1,  3,  3,  3,  6,  6,  6,  6,  7,
        7,  9,  9,  4,  4, 11, 11, 11, 11, 10, 10, 10,  2,  2])

In [31]:
mapper = {}
for index,key in enumerate(df_encoded.labels):
    if key not in mapper.keys():
        mapper[key] = labl[index]
mapper

{'start_conversation': 13,
 'what_are_you': 16,
 'end_conversation': 5,
 'thanks': 14,
 'options': 12,
 'freshers_welcome': 8,
 'theme': 15,
 'buzz': 0,
 'buzz_rep': 1,
 'court_lines': 3,
 'first_server': 6,
 'fresher_resp': 7,
 'gifts': 9,
 'date_venue': 4,
 'meeting': 11,
 'history': 10,
 'confused': 2}

In [32]:
df2.head()

Unnamed: 0,response,labels
0,"Hello, good to see you here.\nI'm Nora, how ca...",start_conversation
1,"Happy to have you here.\nI'm Nora, how can I h...",start_conversation
2,"Hi there.\nI'm Nora, how can I help you?",start_conversation
3,"Hi, I'm Nora.",what_are_you
4,I'm Nora.,what_are_you


In [33]:
df2.labels = df2.labels.map(mapper).astype({'labels': 'int32'})
df2.head()

Unnamed: 0,response,labels
0,"Hello, good to see you here.\nI'm Nora, how ca...",13
1,"Happy to have you here.\nI'm Nora, how can I h...",13
2,"Hi there.\nI'm Nora, how can I help you?",13
3,"Hi, I'm Nora.",16
4,I'm Nora.,16


In [34]:
df2.to_csv('response.csv',index=False)

In [35]:
train = df_encoded.loc[train_index]
test = df_encoded.loc[test_index]

In [36]:
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,labels
1,5,57,28,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
2,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
3,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
4,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
5,61,62,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
6,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,start_conversation
8,1,30,2,0,0,0,0,0,0,0,0,0,0,0,0,0,what_are_you
9,31,30,2,0,0,0,0,0,0,0,0,0,0,0,0,0,what_are_you
10,29,38,63,0,0,0,0,0,0,0,0,0,0,0,0,0,what_are_you
12,64,2,65,0,0,0,0,0,0,0,0,0,0,0,0,0,end_conversation


In [37]:
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,labels
39,45,46,47,85,16,48,86,0,0,0,0,0,0,0,0,0,buzz
41,33,22,88,9,4,6,1,5,4,6,0,0,0,0,0,0,buzz_rep
43,89,23,16,8,24,25,21,90,0,0,0,0,0,0,0,0,court_lines
54,97,5,98,99,26,100,17,5,4,6,51,0,0,0,0,0,date_venue
11,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,end_conversation
46,31,95,34,0,0,0,0,0,0,0,0,0,0,0,0,0,first_server
50,22,4,22,50,10,13,14,0,0,0,0,0,0,0,0,0,fresher_resp
30,1,3,2,20,9,4,6,0,0,0,0,0,0,0,0,0,freshers_welcome
52,33,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,gifts
60,1,55,56,27,103,0,0,0,0,0,0,0,0,0,0,0,history


In [38]:
X_train = train.drop(columns=['labels'],axis=1)
y_train = train.labels
X_test = test.drop(columns=['labels'],axis=1)
y_test = test.labels

In [39]:
y_train

1     start_conversation
2     start_conversation
3     start_conversation
4     start_conversation
5     start_conversation
6     start_conversation
8           what_are_you
9           what_are_you
10          what_are_you
12      end_conversation
13      end_conversation
14      end_conversation
15      end_conversation
17                thanks
18                thanks
19                thanks
20                thanks
21                thanks
23               options
24               options
25               options
26               options
27               options
28               options
29               options
31      freshers_welcome
32      freshers_welcome
33      freshers_welcome
34      freshers_welcome
36                 theme
37                 theme
38                 theme
40                  buzz
42              buzz_rep
44           court_lines
45           court_lines
47          first_server
48          first_server
49          first_server
51          fresher_resp


In [40]:
y_test

39                  buzz
41              buzz_rep
43           court_lines
54            date_venue
11      end_conversation
46          first_server
50          fresher_resp
30      freshers_welcome
52                 gifts
60               history
56               meeting
22               options
0     start_conversation
16                thanks
35                 theme
7           what_are_you
64              confused
Name: labels, dtype: object

In [41]:
y_train =pd.get_dummies(y_train).values
y_test =pd.get_dummies(y_test).values

In [42]:
train.labels.unique()

array(['start_conversation', 'what_are_you', 'end_conversation', 'thanks',
       'options', 'freshers_welcome', 'theme', 'buzz', 'buzz_rep',
       'court_lines', 'first_server', 'fresher_resp', 'gifts',
       'date_venue', 'meeting', 'history', 'confused'], dtype=object)

In [43]:
test.labels.unique()

array(['buzz', 'buzz_rep', 'court_lines', 'date_venue',
       'end_conversation', 'first_server', 'fresher_resp',
       'freshers_welcome', 'gifts', 'history', 'meeting', 'options',
       'start_conversation', 'thanks', 'theme', 'what_are_you',
       'confused'], dtype=object)

In [44]:
y_test

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [45]:
y_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [46]:
y_train[0].shape,y_test[0].shape

((17,), (17,))

In [47]:
X_train.shape

(48, 16)

In [48]:
max_length = X_train.shape[1]
# output = len(df3.labels.unique())
output = 17

In [49]:
max_length

16

In [50]:
early_stopping = EarlyStopping(monitor='val_loss',patience=10)
checkpoint = ModelCheckpoint("model-v1.h5",
                             monitor="val_loss",
                             mode="min",
                             save_best_only = True,
                             verbose=1)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 3, verbose = 1, min_delta = 0.0001)
callbacks = [early_stopping,checkpoint,reduce_lr]
    
    

In [51]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size,300, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
    model.add(MaxPooling1D(pool_size=8))
    model.add(Flatten())
#     model.add(Dense(32, activation='relu'))
    model.add(Dense(17, activation='softmax'))
    
    
    # compile network
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss = 'categorical_crossentropy',
              # optimizer = Adam(lr=0.001),
              optimizer = 'adam',
              metrics = ['accuracy'])
    
    # summarize defined model
    model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [52]:
# define model
model = define_model(vocab_size, max_length)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 300)           32400     
                                                                 
 conv1d (Conv1D)             (None, 13, 64)            76864     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1, 64)            0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 17)                1105      
                                                                 
Total params: 110,369
Trainable params: 110,369
Non-trainable params: 0
__________________________________________________

In [53]:
history = model.fit(X_train, y_train, epochs=500, verbose=1,validation_data=(X_test,y_test),callbacks=callbacks)

Epoch 1/500
Epoch 1: val_loss improved from inf to 2.80604, saving model to model-v1.h5
Epoch 2/500
Epoch 2: val_loss improved from 2.80604 to 2.77971, saving model to model-v1.h5
Epoch 3/500
Epoch 3: val_loss improved from 2.77971 to 2.75617, saving model to model-v1.h5
Epoch 4/500
Epoch 4: val_loss improved from 2.75617 to 2.73465, saving model to model-v1.h5
Epoch 5/500
Epoch 5: val_loss improved from 2.73465 to 2.71299, saving model to model-v1.h5
Epoch 6/500
Epoch 6: val_loss improved from 2.71299 to 2.69125, saving model to model-v1.h5
Epoch 7/500
Epoch 7: val_loss improved from 2.69125 to 2.66843, saving model to model-v1.h5
Epoch 8/500
Epoch 8: val_loss improved from 2.66843 to 2.64385, saving model to model-v1.h5
Epoch 9/500
Epoch 9: val_loss improved from 2.64385 to 2.61846, saving model to model-v1.h5
Epoch 10/500
Epoch 10: val_loss improved from 2.61846 to 2.59231, saving model to model-v1.h5
Epoch 11/500
Epoch 11: val_loss improved from 2.59231 to 2.56560, saving model to 

In [54]:
def get_text():
    input_text  = ['what are you']
    df_input = pd.DataFrame(input_text,columns=['questions'])
    df_input
    return df_input 

In [55]:
#load artifacts 
from tensorflow.keras.models import load_model
model = load_model('model-v1.h5')
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

In [56]:
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

In [57]:
def remove_stop_words_for_input(tokenizer,df,feature):
    doc_without_stopwords = []
    entry = df[feature][0]
    tokens = tokenizer(entry)
    doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return df

In [58]:
def encode_input_text(tokenizer_t,df,feature):
    t = tokenizer_t
    entry = entry = [df[feature][0]]
    encoded = t.texts_to_sequences(entry)
    padded = pad_sequences(encoded, maxlen=16, padding='post')
    return padded

In [59]:
def get_pred(model,encoded_input):
    pred = np.argmax(model.predict(encoded_input))
    return pred

In [60]:
def bot_precausion(df_input,pred):
    words = df_input.questions[0].split()
    if len([w for w in words if w in vocab])==0 :
        pred = 1
    return pred

In [61]:
def get_response(df2,pred):
    upper_bound = df2.groupby('labels').get_group(pred).shape[0]
    r = np.random.randint(0,upper_bound)
    responses = list(df2.groupby('labels').get_group(pred).response)
    return responses[r]

In [62]:
def bot_response(response,):
    print(response)

In [63]:
df_input = get_text()

#load artifacts 
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

df_input = remove_stop_words_for_input(tokenizer,df_input,'questions')
encoded_input = encode_input_text(tokenizer_t,df_input,'questions')

pred = get_pred(model,encoded_input)
pred = bot_precausion(df_input,pred)

response = get_response(df2,pred)
bot_response(response)

My name is Nora.
