In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [None]:
# Fetch the WordNet corpus; WordNetLemmatizer (used by tokenizer() below) reads it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
!rm questions.json

In [None]:
# NOTE(review): Colab-only import; this cell cannot run outside Google Colab.
# Prompts for a file upload; the run below saved it as questions.json.
from google.colab import files
uploaded = files.upload()

Saving questions.json to questions.json


In [None]:
# Sanity check: questions.json should now be listed in the working directory.
!ls

questions.json	sample_data


In [None]:
# Load the intent definitions. The context manager closes the file handle,
# which the previous bare open() never did.
with open('questions.json', "r") as f:
    ds = json.load(f)
def dff(c1, c2, bol, data=None):
    """Flatten the intents JSON into a two-column DataFrame.

    Each entry of ``data['questions']`` must have a ``'tag'`` plus a list under
    ``'patterns'`` (user utterances) and/or ``'responses'`` (bot replies).

    Args:
        c1: Name of the text column in the result.
        c2: Name of the tag column in the result.
        bol: ``'user'`` to emit one row per pattern; any other value emits one
            row per response.
        data: Parsed intents dict. Defaults to the notebook-level ``ds`` so
            existing three-argument calls keep working.

    Returns:
        DataFrame with columns ``[c1, c2]`` and a fresh 0..n-1 index.
    """
    source = ds if data is None else data
    field = 'patterns' if bol == 'user' else 'responses'
    # Collect all rows first and build the frame once: row-by-row
    # DataFrame.append was quadratic and no longer exists in pandas >= 2.0.
    records = [
        (text, intent['tag'])
        for intent in source['questions']
        for text in intent[field]
    ]
    return pd.DataFrame(records, columns=[c1, c2])

In [None]:
# User-side training examples: one row per pattern, labelled with its intent tag.
df1 = dff('user','labels','user')
df1

Unnamed: 0,user,labels
0,Hi there,start_conversation
1,Is anyone there?,start_conversation
2,Hey,start_conversation
3,Hola,start_conversation
4,Hello,start_conversation
...,...,...
79,What's the test format?,format
80,Do we just keep answering till we get tired?,format
81,How many questions are there in the test?,format
82,When do we attend the test?,format


In [None]:
# Bot-side replies: one row per response, labelled with its intent tag.
df2 = dff('bot','labels','bot')
df2

Unnamed: 0,bot,labels
0,"Hello, nice to meet you",start_conversation
1,Happy to have you here,start_conversation
2,Good to see you again,start_conversation
3,"Hi there, how can I help?",start_conversation
4,"Hi, I'm your CounselBot",what_are_you
5,I'm CounselBot,what_are_you
6,Call me CounselBot,what_are_you
7,Have a lovely time!,end_conversation
8,Have a wonderful day,end_conversation
9,Suite yourself,end_conversation


In [None]:
# Shared lemmatizer used by tokenizer().
lem = WordNetLemmatizer()
# Global token counts; new_words() updates this in place and pickles it.
words = Counter()
# NOTE(review): `labels` is not referenced anywhere else in the visible notebook.
labels = []
def tokenizer(x):
    """Split ``x`` on whitespace and return its normalised tokens.

    Per token: strip ASCII punctuation, discard anything that is not purely
    alphabetic, lower-case and lemmatize with the notebook-level ``lem``, then
    discard results of length 1.
    """
    strip_punct = re.compile('[%s]' % re.escape(string.punctuation))
    cleaned = []
    for raw in x.split():
        word = strip_punct.sub('', raw)
        if not word.isalpha():
            continue
        lemma = lem.lemmatize(word.lower())
        if len(lemma) > 1:
            cleaned.append(lemma.lower())
    return cleaned

def no_stopwords(tokenizer, df, c):
    """Overwrite column ``c`` of ``df`` in place with its normalised text.

    Every value is passed through ``tokenizer`` and the resulting tokens are
    re-joined with single spaces. Despite the name, nothing here filters
    stop-words; the only filtering is whatever ``tokenizer`` does.

    Side effect: the tokens of the *last* row are pickled to ``tokens.pkl``.
    The previous version rewrote that file on every row, so only the last row
    ever survived; it is now written once with the same final content.
    NOTE(review): if the intent was to persist all rows' tokens, this needs a
    different artifact.

    Args:
        tokenizer: Callable mapping a string to a list of tokens.
        df: DataFrame to modify in place.
        c: Name of the text column.

    Returns:
        None.
    """
    cleaned = []
    tokens = None
    for text in df[c]:
        tokens = tokenizer(text)
        cleaned.append(' '.join(tokens))
    if tokens is not None:  # empty column: nothing to persist, as before
        joblib.dump(tokens, 'tokens.pkl')
    df[c] = cleaned
    return

def new_words(tokenizer, df, c):
    """Count the tokens of ``df[c]`` into the global ``words`` counter.

    The counter is updated in place (so repeated calls accumulate) and then
    pickled to ``words.pkl``. Returns None.
    """
    for token_list in map(tokenizer, df[c]):
        words.update(token_list)
    joblib.dump(words, 'words.pkl')
    return

In [None]:
# Order matters: count the vocabulary first, then overwrite df1['user'] in place
# with the normalised text. Re-running this cell adds the counts again to the
# global `words` counter, so run it once per kernel.
new_words(tokenizer,df1,'user')
no_stopwords(tokenizer,df1,'user')

In [None]:
# Hold out one example per intent: the first pattern of every label group.
first_per_label = df1.groupby(by='labels', as_index=False).first()
test_list = first_per_label['user'].tolist()
test_list

['what are the general sub stream in science',
 'what are the general sub stream in art',
 'what are the job oppurtunities after school',
 'what will happen if make the wrong choice',
 'what are the job oppurtunities in civil service',
 'what are the general sub stream in commerce',
 'bye',
 'what are the branch in engineering',
 'whats the test format',
 'can get further guidance after my result',
 'how can you help me',
 'what will my result look like',
 'hi there',
 'thanks',
 'who are the top choice of every student',
 'what is your name']

In [None]:
# Row label in df1 of the first occurrence of each held-out sentence.
test_index = [df1.index[df1.user == sentence][0] for sentence in test_list]
test_index

[40, 45, 29, 69, 55, 50, 11, 60, 79, 74, 21, 65, 0, 16, 34, 7]

In [None]:
# Everything that was not held out is used for training.
held_out = set(test_index)
train_index = [row for row in df1.index if row not in held_out]
# Show the collected vocabulary in first-seen order.
' '.join(words)


'hi there is anyone hey hola hello good day what your name are you who pls bye see later goodbye nice chatting to till next time thanks thank thats helpful awesome for helping me how can help do provide be support offered know about the job oppurtunities after school tell various career choice really lot exactly choosing right mean by guide through selection process top of every student list most commonly chosen path common general sub stream in science art commerce civil service branch engineering will my result look like when get give detail happen if make wrong youve made it too late realise decide have two conflicting rectify further guidance talk an expert more insight required whats test format we just keep answering tired many question attend taken'

In [None]:
def encoder(df, c):
    """Fit a Keras ``Tokenizer`` on ``df[c]`` and return padded id sequences.

    The fitted tokenizer is pickled to ``tokenizer_t.pkl``.

    Returns:
        (padded, wordlen, max_length) where ``padded`` is the post-padded
        sequence matrix, ``wordlen`` is ``len(word_index) + 1`` and
        ``max_length`` is the largest whitespace-token count of any text.
    """
    texts = list(df[c])
    fitted = Tokenizer()
    fitted.fit_on_texts(texts)
    joblib.dump(fitted, 'tokenizer_t.pkl')
    wordlen = len(fitted.word_index) + 1
    max_length = max(len(sentence.split()) for sentence in texts)
    sequences = fitted.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded, wordlen, max_length

In [None]:
# X: post-padded id sequences; wordlen: len(word_index) + 1; max_length: longest pattern in tokens.
X,wordlen,max_length = encoder(df1,'user')


In [None]:
# Encoded features (integer-named columns) with the intent tag as the last column.
dfen = pd.DataFrame(X).assign(labels=df1.labels)
dfen

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,labels
0,44,35,0,0,0,0,0,0,0,0,0,0,start_conversation
1,36,64,35,0,0,0,0,0,0,0,0,0,start_conversation
2,65,0,0,0,0,0,0,0,0,0,0,0,start_conversation
3,66,0,0,0,0,0,0,0,0,0,0,0,start_conversation
4,67,0,0,0,0,0,0,0,0,0,0,0,start_conversation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,121,1,34,122,0,0,0,0,0,0,0,0,format
80,6,43,123,124,125,49,43,32,126,0,0,0,format
81,11,63,127,5,35,4,1,34,0,0,0,0,format
82,28,6,43,128,1,34,0,0,0,0,0,0,format


In [None]:
# Append two all-padding rows labelled 'confused'; the index bookkeeping further
# down puts one in the training split and one in the test split.
# Built under its own name: the previous version reused `ds`, overwriting the
# loaded intents JSON that dff() reads, and relied on DataFrame.append, which
# no longer exists in pandas >= 2.0.
confused_rows = pd.DataFrame([[0] * max_length + ['confused'] for _ in range(2)])
confused_rows = confused_rows.rename(columns={max_length: 'labels'})
dfen = pd.concat([dfen, confused_rows], ignore_index=True)


In [None]:
# Inspect the frame after the 'confused' rows were appended.
dfen

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,labels
0,44,35,0,0,0,0,0,0,0,0,0,0,start_conversation
1,36,64,35,0,0,0,0,0,0,0,0,0,start_conversation
2,65,0,0,0,0,0,0,0,0,0,0,0,start_conversation
3,66,0,0,0,0,0,0,0,0,0,0,0,start_conversation
4,67,0,0,0,0,0,0,0,0,0,0,0,start_conversation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,11,63,127,5,35,4,1,34,0,0,0,0,format
82,28,6,43,128,1,34,0,0,0,0,0,0,format
83,11,63,34,13,39,129,0,0,0,0,0,0,format
84,0,0,0,0,0,0,0,0,0,0,0,0,confused


In [None]:
# The two 'confused' rows are the last two rows of dfen: the first goes to the
# training split, the second to the test split.
n_rows = dfen.shape[0]
train_index.append(n_rows - 2)
test_index.append(n_rows - 1)
print(train_index)
print(test_index)

[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 35, 36, 37, 38, 39, 41, 42, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 61, 62, 63, 64, 66, 67, 68, 70, 71, 72, 73, 75, 76, 77, 78, 80, 81, 82, 83, 84]
[40, 45, 29, 69, 55, 50, 11, 60, 79, 74, 21, 65, 0, 16, 34, 7, 85]


In [None]:

# NOTE(review): this import lives mid-notebook; consider moving it to the import cell.
from sklearn.preprocessing import LabelEncoder
# Integer-encode the intent tags of dfen; `l` holds one class id per row.
leobj = LabelEncoder()
l = leobj.fit_transform(dfen.labels)
l

array([13, 13, 13, 13, 13, 13, 13, 16, 16, 16, 16,  7,  7,  7,  7,  7, 14,
       14, 14, 14, 14, 11, 11, 11, 11, 11, 11, 11, 11,  2,  2,  2,  2,  2,
       15, 15, 15, 15, 15, 15,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  5,
        5,  5,  5,  5,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8, 12, 12, 12,
       12,  3,  3,  3,  3,  3, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  6,
        6])

In [None]:

# tag -> class id, taken from the first row in which each tag appears.
mapper = {}
for tag, class_id in zip(dfen.labels, l):
    mapper.setdefault(tag, class_id)
mapper

{'Science': 0,
 'arts': 1,
 'bot_scope': 2,
 'choice': 3,
 'civil': 4,
 'commerce': 5,
 'confused': 6,
 'end_conversation': 7,
 'engineering': 8,
 'format': 9,
 'guidance': 10,
 'options': 11,
 'results': 12,
 'start_conversation': 13,
 'thanks': 14,
 'top_choices': 15,
 'what_are_you': 16}

In [None]:
# Replace the reply tags with their integer class ids (in place; run once per kernel).
encoded_labels = df2['labels'].map(mapper)
df2['labels'] = encoded_labels.astype('int32')

In [None]:
# Replies with their labels now mapped to integer class ids.
df2

Unnamed: 0,bot,labels
0,"Hello, nice to meet you",13
1,Happy to have you here,13
2,Good to see you again,13
3,"Hi there, how can I help?",13
4,"Hi, I'm your CounselBot",16
5,I'm CounselBot,16
6,Call me CounselBot,16
7,Have a lovely time!,7
8,Have a wonderful day,7
9,Suite yourself,7


In [None]:
# Save the replies (labelled by integer class id) to bot.csv without the index column.
df2.to_csv('bot.csv',index=False)


In [None]:

# Select rows by index label; both index lists already include one 'confused' row each.
train = dfen.loc[train_index]
test = dfen.loc[test_index]

In [None]:
# Features are every column except the tag; targets are the tag column.
X_train, y_train = train.drop(columns='labels'), train['labels']
X_test, y_test = test.drop(columns='labels'), test['labels']


In [None]:

# One-hot encode both splits against the SAME ordered category list. Encoding
# each split on its own only lines up when both happen to contain every class;
# a class missing from one split would silently shift or drop columns.
# Sorted order matches what get_dummies/LabelEncoder produce for string tags.
label_order = sorted(dfen['labels'].unique())
y_train = pd.get_dummies(pd.Categorical(y_train, categories=label_order)).values
y_test = pd.get_dummies(pd.Categorical(y_test, categories=label_order)).values

In [None]:
# Both one-hot widths must match (one column per class).
y_train[0].shape,y_test[0].shape

((17,), (17,))

In [None]:
# (training rows, padded sequence length)
X_train.shape

(69, 12)

In [None]:
# Sequence length fed to the network.
max_length = X_train.shape[1]
# Size the softmax layer from the one-hot targets it is trained against. The
# previous count of distinct labels in the reply table (df2) only matched when
# every class happened to have at least one reply.
output = y_train.shape[1]
print(max_length, output)

12 17


In [None]:

# Training callbacks, all driven by validation loss:
#   - stop after 10 epochs without improvement,
#   - keep only the best weights in botmodel.h5,
#   - multiply the learning rate by 0.2 after 3 stagnant epochs.
total_call = [
    EarlyStopping(monitor='val_loss', patience=10),
    ModelCheckpoint(
        "botmodel.h5",
        monitor="val_loss",
        mode="min",
        save_best_only=True,
        verbose=1,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=3,
        verbose=1,
        min_delta=0.0001,
    ),
]
early_stopping, cp, rlr = total_call

In [None]:

def define_model(wordlen, max_length, output,
                 embedding_dim=300, filters=64, kernel_size=4, pool_size=8):
    """Build and compile the Embedding -> Conv1D -> MaxPooling1D -> Dense classifier.

    Args:
        wordlen: Embedding vocabulary size (tokenizer word count + 1).
        max_length: Length of the padded input sequences.
        output: Number of classes (width of the softmax layer).
        embedding_dim: Size of each embedding vector.
        filters: Number of Conv1D filters.
        kernel_size: Conv1D window length.
        pool_size: MaxPooling1D window length.

    The keyword defaults reproduce the original fixed architecture.

    Returns:
        The compiled Keras ``Sequential`` model (its summary is printed).

    Raises:
        ValueError: If ``max_length`` is too short for the convolution output
            to fill a single pooling window.
    """
    # 'valid' convolution leaves max_length - kernel_size + 1 steps; pooling
    # needs at least pool_size of them. Fail with a readable message instead
    # of a shape error from inside Keras.
    conv_steps = max_length - kernel_size + 1
    if conv_steps < pool_size:
        raise ValueError(
            "max_length=%d leaves %d convolution steps, fewer than pool_size=%d"
            % (max_length, conv_steps, pool_size)
        )

    model = Sequential()
    model.add(Embedding(wordlen, embedding_dim, input_length=max_length))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Flatten())
    model.add(Dense(output, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()
    return model

In [None]:
# Build the classifier from the vocabulary size, sequence length and class count computed above.
model = define_model(wordlen, max_length,output)


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 12, 300)           39000     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 9, 64)             76864     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1, 64)             0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 17)                1105      
Total params: 116,969
Trainable params: 116,969
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train with the callbacks defined above.
# NOTE(review): the held-out test rows double as the validation set, so early
# stopping and checkpoint selection are tuned on the same data later used to
# judge the model.
# NOTE(review): the saved log reports "did not improve" from epoch 1, which
# suggests the callback objects were reused from an earlier fit; re-create
# them before re-running this cell.
mod_sum = model.fit(X_train, y_train, epochs=500, verbose=1,
                    validation_data=(X_test,y_test),callbacks=total_call)

Epoch 1/500
Epoch 00001: val_loss did not improve from 0.69712
Epoch 2/500
Epoch 00002: val_loss did not improve from 0.69712
Epoch 3/500
Epoch 00003: val_loss did not improve from 0.69712
Epoch 4/500
Epoch 00004: val_loss did not improve from 0.69712
Epoch 5/500
Epoch 00005: val_loss did not improve from 0.69712
Epoch 6/500
Epoch 00006: val_loss did not improve from 0.69712
Epoch 7/500
Epoch 00007: val_loss did not improve from 0.69712
Epoch 8/500
Epoch 00008: val_loss did not improve from 0.69712
Epoch 9/500
Epoch 00009: val_loss did not improve from 0.69712
Epoch 10/500
Epoch 00010: val_loss did not improve from 0.69712
Epoch 11/500
Epoch 00011: val_loss did not improve from 0.69712
Epoch 12/500
Epoch 00012: val_loss did not improve from 0.69712
Epoch 13/500
Epoch 00013: val_loss did not improve from 0.69712
Epoch 14/500
Epoch 00014: val_loss did not improve from 0.69712
Epoch 15/500
Epoch 00015: val_loss did not improve from 0.69712
Epoch 16/500
Epoch 00016: val_loss did not improv

In [None]:
# Predicted class id for every held-out row.
[np.argmax(i) for i in model.predict(X_test)][:]


[0, 1, 4, 3, 4, 5, 7, 8, 9, 12, 11, 12, 13, 14, 15, 16, 6]

In [None]:
# True class id for every held-out row, for comparison with the predictions above.
[np.argmax(i) for i in y_test][:]

[0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 6]

USER TESTING

In [None]:
def get_text():
    """Read one line from the user and lower-case it.

    Returns:
        The int ``0`` when the line begins with "start my test"; otherwise a
        one-row DataFrame whose single column ``'user'`` holds the text.
    """
    message = input().lower()
    if message.startswith("start my test"):
        return 0
    return pd.DataFrame({'user': [message]})

In [None]:
# Reload the artifacts written during training: best checkpoint, fitted
# tokenizer and vocabulary counter.
# NOTE(review): mid-notebook import; the cells below also still rely on `lem`
# and `df2` from earlier cells, so this section is not runnable on its own.
from tensorflow.keras.models import load_model
model = load_model('botmodel.h5')
tok = joblib.load('tokenizer_t.pkl')
words_ = joblib.load('words.pkl')


In [None]:

def tokenizer(x):
    """Split ``x`` on whitespace and return its normalised tokens.

    NOTE(review): this re-defines (and shadows) the ``tokenizer`` from the
    preprocessing section with the same behaviour; keep a single definition.

    Per token: strip ASCII punctuation, discard anything that is not purely
    alphabetic, lower-case and lemmatize with the notebook-level ``lem``, then
    discard results of length 1.
    """
    strip_punct = re.compile('[%s]' % re.escape(string.punctuation))
    cleaned = []
    for raw in x.split():
        word = strip_punct.sub('', raw)
        if not word.isalpha():
            continue
        lemma = lem.lemmatize(word.lower())
        if len(lemma) > 1:
            cleaned.append(lemma.lower())
    return cleaned


def no_stop_inp(tokenizer, df, c):
    """Normalise the text of a one-row input frame.

    The value at label 0 of column ``c`` is tokenised, re-joined with spaces
    and written back as the whole column; the same (mutated) frame is returned.
    """
    normalised = ' '.join(tokenizer(df[c][0]))
    df[c] = [normalised]
    return df


def inpenc(tok, df, c, maxlen=None):
    """Encode the single user sentence in ``df[c]`` as a padded id sequence.

    Args:
        tok: Fitted tokenizer exposing ``texts_to_sequences``.
        df: One-row frame produced by ``get_text`` / ``no_stop_inp``.
        c: Name of the text column.
        maxlen: Padded length. Defaults to the notebook-level ``max_length``
            the model was built and trained with; the previous hard-coded 16
            did not match that length.

    Returns:
        The post-padded sequence array for the one input sentence.
    """
    if maxlen is None:
        maxlen = max_length
    sequences = tok.texts_to_sequences([df[c][0]])
    return pad_sequences(sequences, maxlen=maxlen, padding='post')

def predinp(model, x):
    """Return the flat index of the highest score in ``model.predict(x)``."""
    scores = model.predict(x)
    return np.argmax(scores)

def botp(df3, pred, known_words=None, fallback=None):
    """Override the prediction when the input contains no known word.

    Args:
        df3: One-row frame whose ``user`` column holds the normalised input.
        pred: Class id predicted by the model.
        known_words: Container of vocabulary words. Defaults to the
            notebook-level ``words_``.
        fallback: Class id to return when no token of the input is known.
            Defaults to ``mapper['confused']``. The previous hard-coded ``1``
            is the id of 'arts' in the label mapping, not of 'confused'.

    Returns:
        ``pred`` if at least one token is in the vocabulary, else ``fallback``.
    """
    vocabulary = words_ if known_words is None else known_words
    if any(token in vocabulary for token in df3.user[0].split()):
        return pred
    if fallback is None:
        fallback = mapper['confused']
    return fallback

def botop(df2, pred):
    """Pick a random reply for class ``pred``.

    ``df2`` needs a ``labels`` column of class ids and a ``bot`` column of
    reply texts; one reply from the ``pred`` group is chosen with
    ``np.random.randint``.
    """
    candidates = list(df2.groupby('labels').get_group(pred).bot)
    choice = np.random.randint(0, len(candidates))
    return candidates[choice]
  

In [None]:
def start_test():
    """Announce that the test is starting (placeholder: prints a message only)."""
    print("starting test...")

In [None]:

# One chat turn: read the input, then either start the test or answer.
df3 = get_text()
# get_text() returns the int 0 for the "start my test" command and a DataFrame
# otherwise. `df3 == 0` on a DataFrame is element-wise and raises "truth value
# of a DataFrame is ambiguous" inside `if`, so branch on the type instead.
if isinstance(df3, pd.DataFrame):
    tok = joblib.load('tokenizer_t.pkl')
    df3 = no_stop_inp(tokenizer, df3, 'user')   # normalise the text
    inp = inpenc(tok, df3, 'user')              # encode and pad
    pred = predinp(model, inp)                  # model's class id
    pred = botp(df3, pred)                      # override when nothing is recognised
    ans = botop(df2, pred)                      # random reply for that class
else:
    ans = "Sure, good luck!"
    start_test()
print("CounselBot : ", ans)

START MY TEST!!!
starting test...
CounselBot :  Sure, good luck!


In [None]:
# List the artifacts produced by this notebook (model checkpoint, pickles, bot.csv).
!ls

bot.csv      questions.json  tokenizer_t.pkl  words.pkl
botmodel.h5  sample_data     tokens.pkl
