In [110]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [125]:
import json
with open('intents2.json', 'r') as file:
    data = json.load(file)

In [126]:
pattern_data = []
intent_data = []
for intent in data["intents"]:
    patterns = intent["patterns"]
    intent_tag = intent["tag"]
    pattern_data.extend(patterns)
    intent_data.extend([intent_tag] * len(patterns))

df = pd.DataFrame({"pattern": pattern_data, "intent": intent_data})
df

Unnamed: 0,pattern,intent
0,Hi,greeting
1,watsup bro,greeting
2,hiiiii,greeting
3,hi,greeting
4,hey how are you,greeting
...,...,...
1957,what is the hod's approach to student feedback...,hod
1958,what is the hod's role in faculty recruitment ...,hod
1959,how does the hod promote a conducive learning ...,hod
1960,what are the hod's initiatives for industry-ac...,hod


In [127]:
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,pattern,intent
0,See you on the other side,goodbye
1,What is the number to call?,number
2,Are there any ongoing projects or initiatives ...,principal
3,"Hey there, what brings you here?",greeting
4,What are the holiday dates for this academic y...,vacation
...,...,...
1957,I'm indebted to you for your help.,salutaion
1958,Do they offer any special discounts or promoti...,menu
1959,How many days will be allocated for the vacation?,vacation
1960,"Good evening, how's your evening going?",greeting


In [128]:
df['intent'].nunique()

31

In [129]:
df['intent'].value_counts()

intent
task           85
salutaion      84
scholarship    83
course         82
vacation       79
name           79
event          75
creator        72
number         68
principal      67
library        66
ragging        63
facilities     63
greeting       62
sem            62
size           61
goodbye        60
location       60
hours          60
syllabus       58
admission      56
fees           56
placement      55
uniform        55
seats          55
sports         54
hod            52
menu           48
canteen        48
hostel         47
random         47
Name: count, dtype: int64

In [130]:
list1 = df['intent'].unique().tolist()

In [132]:
import nltk
from nltk.corpus import wordnet
import pandas as pd
def perform_synonym_replacement(pattern, num_samples):
    augmented_samples = []
    tokens = nltk.word_tokenize(pattern)
    for token in tokens:
        synonyms = []
        for syn in wordnet.synsets(token):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        if synonyms:
            augmented_samples.extend([pattern.replace(token, synonym, 1) for synonym in synonyms])
    augmented_samples = list(set(augmented_samples))
    if len(augmented_samples) > num_samples:
        augmented_samples = augmented_samples[:num_samples]

    return augmented_samples
def augment_dataset_with_synonyms(dataset, intents_with_lower_samples, num_samples_per_intent):
    augmented_dataset = dataset.copy()

    for intent in intents_with_lower_samples:
        intent_data = dataset[dataset['intent'] == intent]
        patterns = intent_data['pattern'].tolist()
        augmented_samples = []
        for pattern in patterns:
            samples = perform_synonym_replacement(pattern, num_samples_per_intent)
            augmented_samples.extend(samples)
        augmented_samples_df = pd.DataFrame({
            'pattern': augmented_samples,
            'intent': [intent] * len(augmented_samples)
        })
        augmented_dataset = pd.concat([augmented_dataset, augmented_samples_df], ignore_index=True)
    return augmented_dataset
intents_with_lower_samples = list1
num_samples_per_intent = 2
df = augment_dataset_with_synonyms(df, intents_with_lower_samples, num_samples_per_intent)

In [133]:
df['intent'].value_counts()

intent
task           255
scholarship    249
salutaion      248
course         246
name           237
vacation       237
event          225
creator        216
number         204
principal      201
library        198
ragging        189
facilities     189
sem            185
size           183
hours          180
location       179
goodbye        178
syllabus       174
greeting       173
admission      168
fees           168
placement      165
uniform        165
seats          165
sports         162
hod            155
menu           144
canteen        144
hostel         141
random         141
Name: count, dtype: int64

In [134]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')

In [135]:
import re
from nltk.corpus import stopwords

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = re.sub(r"\d+", "NUM", text)
    
    stop_words = set(stopwords.words("english"))

    # Custom list of words to exclude from stopwords
    custom_stopwords = ["how", "are", "you", "other", "question", "phrases"]
    stop_words.difference_update(custom_stopwords)

    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    
    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have"
    }
    tokens = text.split()
    processed_tokens = [contractions.get(word, word) for word in tokens]
    processed_text = " ".join(processed_tokens)

    return processed_text

In [136]:
df['pattern'] = df['pattern'].apply(preprocess_text)

In [138]:
df['pattern'].isnull().sum()

0

In [139]:
import re
def fun1(text):
    tokens = word_tokenize(text)
    lm = WordNetLemmatizer()
    lemmatized_tokens = [lm.lemmatize(token) for token in tokens]
    return lemmatized_tokens

df['pattern'] = df['pattern'].apply(fun1)


In [140]:
df = df[df['pattern'].apply(lambda x: len(x) > 0)]

In [141]:
df.head(30)

Unnamed: 0,pattern,intent
0,"[see, you, on, the, other, side]",goodbye
1,"[what, is, the, number, to, call]",number
2,"[are, there, any, ongoing, project, or, initia...",principal
3,"[hey, there, what, brings, you, here]",greeting
4,"[what, are, the, holiday, date, for, this, aca...",vacation
5,"[are, there, any, quota, or, reservation, for,...",seats
6,"[are, there, any, counseling, round, for, seat...",seats
7,"[when, can, i, access, the, facility, of, the,...",hours
8,"[what, study, track, or, pathway, are, availab...",course
9,"[who, are, your, creator]",creator


In [142]:
pattern_column = df['pattern'].tolist()
print(max(len(pattern) for pattern in pattern_column))

17


In [143]:
import torch
import numpy as np
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def extract_bert_features(tokenized_input):
    features = []
    for tokens in tokenized_input:
        if len(tokens) == 0:
            continue
        encoded_input = tokenizer.encode_plus(
            tokens,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=17,
            return_tensors='pt'
        )
        with torch.no_grad():
            outputs = model(**encoded_input)
        sentence_features = outputs.last_hidden_state.squeeze(0).numpy()
        features.append(sentence_features)
    return features


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [144]:
df['features'] = extract_bert_features(df['pattern'])

In [145]:
df

Unnamed: 0,pattern,intent,features
0,"[see, you, on, the, other, side]",goodbye,"[[0.11272174, 0.07082397, 0.084916234, 0.03197..."
1,"[what, is, the, number, to, call]",number,"[[-0.17751953, 0.08031044, 0.18810888, -0.0735..."
2,"[are, there, any, ongoing, project, or, initia...",principal,"[[-0.20244382, -0.31306756, -0.4767528, 0.1021..."
3,"[hey, there, what, brings, you, here]",greeting,"[[0.1496106, 0.041372396, -0.044739466, -0.129..."
4,"[what, are, the, holiday, date, for, this, aca...",vacation,"[[-0.13180786, 0.21631685, 0.13757367, -0.0108..."
...,...,...,...
5859,"[what, be, your, favorite, season]",random,"[[-0.19847973, -0.11630121, -0.24292564, -0.11..."
5860,"[behave, you, believe, in, alien]",random,"[[-0.014306758, 0.27058882, -0.10485062, 0.017..."
5861,"[come, you, believe, in, alien]",random,"[[0.09136562, 0.32485095, -0.2598656, -0.07789..."
5862,"[do, you, stargaze]",random,"[[-0.046534434, 0.6012542, -0.55832464, -0.293..."


In [146]:
x = df['features']

In [147]:
x = np.array(x.to_list())


In [148]:
y = df['intent']

In [149]:
y

0         goodbye
1          number
2       principal
3        greeting
4        vacation
          ...    
5859       random
5860       random
5861       random
5862       random
5863       random
Name: intent, Length: 5864, dtype: object

In [150]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [152]:
num_classes = len(encoder.classes_)
y = tf.one_hot(y, num_classes)

In [153]:
y = y.numpy()

In [155]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [156]:
y_train.shape

(4691, 31)

In [157]:
x_train.shape

(4691, 17, 768)

In [158]:
x_test.shape

(1173, 17, 768)

Bert Max(92.02 acc)

In [159]:
import tensorflow as tf
from transformers import TFBertModel

bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Wrap the BERT model in a Keras layer
bert_layer = tf.keras.layers.Wrapper(bert_model)

model = tf.keras.Sequential()
model.add(bert_layer)
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(256, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(31, activation='softmax'))

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [160]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [161]:
hist = model.fit(x_train,y_train,epochs = 15 , batch_size= 24 , validation_split=0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [162]:
acc = model.evaluate(x_test,y_test)



In [164]:
model.save('Final_Model_GRU')



INFO:tensorflow:Assets written to: Final_Model_GRU1\assets


INFO:tensorflow:Assets written to: Final_Model_GRU1\assets


Loading

In [3]:
import tensorflow as tf
model = tf.keras.models.load_model('C:/Users/adith/Downloads/chatbot/Final_Model_GRU')

TESTING

In [169]:
import torch
import numpy as np
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model1 = BertModel.from_pretrained('bert-base-uncased')

def extract_bert_features(tokenized_input):
    features = []
    encoded_input = tokenizer.encode_plus(
        tokenized_input,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=17,
        return_tensors='pt'
    )
    with torch.no_grad():
        outputs = model1(**encoded_input)
    sentence_features = outputs.last_hidden_state.squeeze(0).numpy()
    features.append(sentence_features)
    return features

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [221]:
input = "what all kinds of food items will i get at the canteen"
input = preprocess_text(input)
input = fun1(input)
features = extract_bert_features(input)

In [222]:
features = features[0]

In [223]:
features = np.reshape(features , (1,17,768))

In [224]:
test = model.predict(features)



In [225]:
predicted_label = np.argmax(test)
predicted_class_name = encoder.classes_[predicted_label]
print("Predicted class:", predicted_class_name)

Predicted class: canteen


In [180]:
encoder.classes_

array(['admission', 'canteen', 'course', 'creator', 'event', 'facilities',
       'fees', 'goodbye', 'greeting', 'hod', 'hostel', 'hours', 'library',
       'location', 'menu', 'name', 'number', 'placement', 'principal',
       'ragging', 'random', 'salutaion', 'scholarship', 'seats', 'sem',
       'size', 'sports', 'syllabus', 'task', 'uniform', 'vacation'],
      dtype=object)

Text To Speech

In [181]:
import random
for x in data['intents']:
    if x['tag'] == predicted_class_name:
        if len(x['responses'])>1:
            print(random.choice(x['responses']))
        else:
            print(x['responses'])

Hello!


In [226]:
import random
for x in data['intents']:
    if x['tag'] == predicted_class_name:
        if len(x['responses'])>1:
            text_to_speech(random.choice(x['responses']))
        else:
            text_to_speech(x['responses'])

In [183]:
import pyttsx3
def text_to_speech(text):
    # Initialize the pyttsx3 engine
    engine = pyttsx3.init()

    # Set the speech rate (speed) if desired
    # engine.setProperty('rate', 150)  # You can experiment with different values

    # Set the text to be spoken
    engine.say(text)

    # Play the speech
    engine.runAndWait()