<a href="https://colab.research.google.com/github/DhaneshGavimath/NLP-Projects/blob/main/model/chatbot%20modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# pip install jupyterthemes
# !jt -l

In [2]:
# Choosing the notebook theme
# !jt -t monokai

Imports

In [3]:
import pandas as pd
import numpy as np
import json
import nltk 
import sklearn

In [4]:
import tensorflow as tf


In [5]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from string import punctuation

In [6]:
# Fetch the NLTK data used further down: the stopword lists read through
# stopwords.words(...) and the "punkt" tokenizer data (used by word_tokenize).
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
"""
classes
"""
# Intent labels the classifier can predict, held in alphabetical order so that
# the integer code of every label is simply its position in this list.
intents_classes = sorted([
    "greetings", "goodbye", "fee_structure", "campus_placement", "departments",
    "adress", "activities", "commute", "gratitude",
])
# Two-way lookup tables: integer code -> label and label -> integer code.
classes_code_label = dict(enumerate(intents_classes))
classes_label_code = {label: code for code, label in classes_code_label.items()}

Creating a one-hot vector representation for each output class

In [8]:
def vector(output_class):
    """Return the one-hot encoding of ``output_class``.

    The result is a float array with one slot per entry of
    ``intents_classes``; the slot at the label's position holds 1 and every
    other slot holds 0. An unknown label raises ``ValueError`` (from
    ``list.index``).
    """
    position = intents_classes.index(output_class)
    one_hot = np.zeros(len(intents_classes))
    one_hot[position] = 1
    return one_hot

In [9]:
# Lookup from every intent label to its one-hot vector.
classes_vec_label = {intent: vector(intent) for intent in intents_classes}

print(classes_vec_label)

{'activities': array([1., 0., 0., 0., 0., 0., 0., 0., 0.]), 'adress': array([0., 1., 0., 0., 0., 0., 0., 0., 0.]), 'campus_placement': array([0., 0., 1., 0., 0., 0., 0., 0., 0.]), 'commute': array([0., 0., 0., 1., 0., 0., 0., 0., 0.]), 'departments': array([0., 0., 0., 0., 1., 0., 0., 0., 0.]), 'fee_structure': array([0., 0., 0., 0., 0., 1., 0., 0., 0.]), 'goodbye': array([0., 0., 0., 0., 0., 0., 1., 0., 0.]), 'gratitude': array([0., 0., 0., 0., 0., 0., 0., 1., 0.]), 'greetings': array([0., 0., 0., 0., 0., 0., 0., 0., 1.])}


### Load Data
 Here we load the JSON file we made, which contains the questions a user might ask.
 When this notebook is run it should load the training file, perform data cleaning and vector representation,
 and finally create a dataframe and write it to the current folder.
 We only run this notebook to retrain the model with updated data, since the new data may contain new words.
    

In [10]:
def load_chat_data(path="train_classes.json"):
    """Load the intent training sentences into a two-column DataFrame.

    The JSON file must hold an object that maps each intent label to a list
    of example sentences, e.g. ``{"greetings": ["Hi", "Hello"], ...}``.

    Parameters
    ----------
    path : str or os.PathLike, optional
        Location of the training JSON. Defaults to ``"train_classes.json"``
        in the current working directory, which is what the notebook used
        before the path became configurable.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with the columns ``"input"`` (the sentence) and
        ``"target"`` (the intent label the sentence was listed under).
    """
    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # default encoding when sentences contain non-ASCII characters.
    with open(path, "r", encoding="utf-8") as jdata:
        intents = json.load(jdata)

    train_data = [
        (sentence, intent)
        for intent, sentences in intents.items()
        for sentence in sentences
    ]
    return pd.DataFrame(train_data, columns=["input", "target"])

# Build the raw training frame (one row per example sentence) and preview it.
data = load_chat_data()
data.head()

Unnamed: 0,input,target
0,Hi,greetings
1,Hello,greetings
2,Good Morning!,greetings
3,Good Evening!,greetings
4,bye,goodbye


In [11]:
"""
Replacing the target with numeric value
"""
# Each intent label in "target" is swapped for its one-hot vector from
# classes_vec_label, so the column ends up holding numpy arrays.
data["target"] = data["target"].map(classes_vec_label)

In [12]:
# data.head()

In [13]:
def tokenize(data):
    """Split the text ``data`` into word tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(data)

def remove_junks(tokens):
    """Drop punctuation marks and English stopwords from a token list.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence, in reading order.

    Returns
    -------
    list of str
        The surviving tokens in their original order, repeats included.
        Keeping repeats matters because the later bag-of-words step counts
        occurrences; the earlier set-based version collapsed every repeated
        word to a single occurrence and returned tokens in arbitrary order.
    """
    punctuation_marks = set(punctuation)
    stop_words = set(stopwords.words('english'))
    return [
        token
        for token in tokens
        # The NLTK stopword list is lowercase, so compare case-insensitively;
        # otherwise capitalised words such as "What" or "The" slip through.
        if token not in punctuation_marks and token.lower() not in stop_words
    ]

def stemming(clean_words):
    """Lower-case every word and reduce it to its Porter stem.

    Returns a new list with one stem per input word, in the same order.
    """
    stemmer = PorterStemmer()
    lowered = [word.lower() for word in clean_words]
    return [stemmer.stem(word) for word in lowered]

def create_vocabulary(data, column):
    """Return the set of distinct tokens found in ``data[column]``.

    Every cell of the column is expected to be an iterable of tokens; the
    tokens of all rows are pooled into a single set.
    """
    return {token for record in data[column].values for token in record}

def clean_data(df, column):
    """Tokenize, filter and stem the text held in ``df[column]``.

    The column is overwritten in place after each stage (tokenize ->
    remove_junks -> stemming) and the same frame object is returned.
    """
    for stage in (tokenize, remove_junks, stemming):
        df[column] = df[column].apply(stage)
    return df

In [14]:
# Turn every sentence in "input" into a list of stemmed tokens. clean_data
# overwrites the column on the frame it receives and returns that same frame.
data = clean_data(data,"input")


In [15]:
# Pool the distinct stemmed tokens of all sentences (a set), then wrap them as
# a list inside a dict so the vocabulary can be serialised to JSON.
vocabulary = create_vocabulary(data, "input")
vocab_dict = {"vocabulary": list(vocabulary)}

In [16]:
# Writing Vocabulary to json file
with open("vocabulary.json","w") as voc_save_file:
  json.dump(vocab_dict,voc_save_file)

In [17]:
# Let's represent each text with bag-of-words method
def bag_of_words_representation(df, column, vocab=None):
    """Replace a token-list column with one count column per vocabulary word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column`` holds an iterable of tokens in every row.
        The frame itself is left untouched; a new frame is returned.
    column : str
        Name of the token-list column to encode.
    vocab : sequence of str, optional
        Ordered vocabulary; position ``i`` becomes the i-th count column.
        When omitted, the notebook-level ``vocab_dict["vocabulary"]`` list
        is used, as before.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` except ``column``, followed by one float
        column per vocabulary word holding how often that word occurs in
        the row. Tokens that are not in the vocabulary are ignored.
    """
    if vocab is None:
        vocab = vocab_dict["vocabulary"]
    vocab = list(vocab)

    # Word -> column position. setdefault keeps the first position if a word
    # is listed twice, matching what list.index would have returned.
    word_position = {}
    for position, word in enumerate(vocab):
        word_position.setdefault(word, position)

    records = df[column].values
    counts = np.zeros((len(records), len(vocab)))
    for row, record in enumerate(records):
        for word in record:
            position = word_position.get(word)
            if position is not None:
                counts[row, position] += 1

    # Column labels come from the same ordered list that produced the
    # positions (a set is not a valid ``columns`` argument in recent pandas),
    # and reusing df.index keeps concat from misaligning rows when the frame
    # does not carry a default RangeIndex.
    bow_df = pd.DataFrame(counts, columns=vocab, index=df.index)
    return pd.concat([df.drop(columns=[column]), bow_df], axis=1)
  


In [18]:
# Swap the token lists in "input" for bag-of-words count columns, one column
# per vocabulary word; "target" is carried over unchanged.
data = bag_of_words_representation(data,"input")

In [19]:
# data.head()

In [29]:
# Separate the label column from the bag-of-words feature columns, then show
# a preview of the features and the full label frame.
y_data = data["target"].to_frame()
x_data = data.drop(columns="target")
print(x_data.head())
print("****************************")
print(y_data)


   campu  metro  placement  locat  structur  adress  fee_structur  subject  \
0    0.0    0.0        0.0    0.0       0.0     0.0           0.0      0.0   
1    0.0    0.0        0.0    0.0       0.0     0.0           0.0      0.0   
2    0.0    0.0        0.0    0.0       0.0     0.0           0.0      0.0   
3    0.0    0.0        0.0    0.0       0.0     0.0           0.0      0.0   
4    0.0    0.0        0.0    0.0       0.0     0.0           0.0      0.0   

   nearbi  bye  ...  avail  provid  travel  sport  night  cours  choos  taxi  \
0     0.0  0.0  ...    0.0     0.0     0.0    0.0    0.0    0.0    0.0   0.0   
1     0.0  0.0  ...    0.0     0.0     0.0    0.0    0.0    0.0    0.0   0.0   
2     0.0  0.0  ...    0.0     0.0     0.0    0.0    0.0    0.0    0.0   0.0   
3     0.0  0.0  ...    0.0     0.0     0.0    0.0    0.0    0.0    0.0   0.0   
4     0.0  1.0  ...    0.0     0.0     0.0    0.0    0.0    0.0    0.0   0.0   

   opertun  thank  
0      0.0    0.0  
1      0.0

Converting the pandas inputs and targets to tensors


In [32]:
x_tensors = tf.convert_to_tensor(x_data)
# Every cell of y_data["target"] is a one-hot numpy vector, so the column is an
# object array that cannot be converted directly. Stack the vectors into a
# single 2-D float matrix (one row per sample, one column per intent) first.
# y_tensors has to exist because the training cell passes it to model.fit.
y_tensors = tf.convert_to_tensor(np.stack(y_data["target"].to_numpy()))


**Model**



In [None]:
# Small feed-forward intent classifier:
# bag-of-words vector -> 32 ReLU units -> one raw score per intent class.
input_layer = tf.keras.Input(shape=(x_data.shape[1],))
middle_layer = tf.keras.layers.Dense(32, activation='relu')
# Size the output from the label list rather than a hard-coded 9, so adding or
# removing an intent in intents_classes cannot leave the model out of step
# with the one-hot targets.
output_layer = tf.keras.layers.Dense(len(intents_classes))

model = tf.keras.Sequential([input_layer, middle_layer, output_layer])

model.summary()

In [None]:
# The targets are one-hot intent vectors and the output layer emits raw scores
# (no activation), so train with categorical cross-entropy on logits instead of
# mean squared error, and report accuracy while fitting.
model.compile(
    optimizer='sgd',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)


In [None]:
# Train for 10 passes over the data in mini-batches of 32 samples.
model.fit(
    x=x_tensors,
    y=y_tensors,
    batch_size=32,
    epochs=10,
)