In [20]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json

In [21]:
with open('intents2.json', 'r') as file:
    data = json.load(file)

In [22]:
pattern_data = []
intent_data = []
for intent in data["intents"]:
    patterns = intent["patterns"]
    intent_tag = intent["tag"]
    pattern_data.extend(patterns)
    intent_data.extend([intent_tag] * len(patterns))

df = pd.DataFrame({"pattern": pattern_data, "intent": intent_data})
df

Unnamed: 0,pattern,intent
0,Hi,greeting
1,watsupp,greeting
2,wassup,greeting
3,watsup bro,greeting
4,hiiiii,greeting
...,...,...
1905,what is the hod's approach to student feedback...,hod
1906,what is the hod's role in faculty recruitment ...,hod
1907,how does the hod promote a conducive learning ...,hod
1908,what are the hod's initiatives for industry-ac...,hod


In [23]:
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,pattern,intent
0,What are the working days?,hours
1,Tell me about the entrepreneurship events or s...,event
2,Tell me about the placement opportunities at t...,placement
3,Can we wear accessories or jewelry with the un...,uniform
4,How are the seats allocated in different branches,seats
...,...,...
1905,What are the dates for the next career fair?,event
1906,What are the holidays in this year?,vacation
1907,When can I expect the holiday season?,vacation
1908,Is there a self-service or cafeteria-style set...,canteen


In [24]:
df['intent'].nunique()

29

In [25]:
df['intent'].value_counts()

intent
course         95
salutation     84
scholarship    83
canteen        81
name           79
vacation       79
event          75
placement      75
ragging        75
creator        72
sem            70
number         68
principal      67
library        66
greeting       64
hod            61
size           61
hours          60
goodbye        60
location       60
task           56
admission      56
fees           55
seats          55
uniform        55
sports         54
random         50
facilities     47
hostel         47
Name: count, dtype: int64

In [26]:
list1 = df['intent'].unique().tolist()

In [27]:
import nltk
from nltk.corpus import wordnet
import pandas as pd

# Function to perform synonym replacement
def perform_synonym_replacement(pattern, num_samples):
    augmented_samples = []
    tokens = nltk.word_tokenize(pattern)

    for token in tokens:
        # Get the synonyms for the current token
        synonyms = []
        for syn in wordnet.synsets(token):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())

        # Replace the token with a random synonym (if available)
        if synonyms:
            augmented_samples.extend([pattern.replace(token, synonym, 1) for synonym in synonyms])

    # Shuffle and select a subset of augmented samples
    augmented_samples = list(set(augmented_samples))
    if len(augmented_samples) > num_samples:
        augmented_samples = augmented_samples[:num_samples]

    return augmented_samples

# Function to augment the dataset with generated samples
def augment_dataset_with_synonyms(dataset, intents_with_lower_samples, num_samples_per_intent):
    augmented_dataset = dataset.copy()

    for intent in intents_with_lower_samples:
        # Filter the dataset for the current intent
        intent_data = dataset[dataset['intent'] == intent]

        # Get the patterns for the current intent
        patterns = intent_data['pattern'].tolist()

        # Generate augmented samples for the intent
        augmented_samples = []
        for pattern in patterns:
            samples = perform_synonym_replacement(pattern, num_samples_per_intent)
            augmented_samples.extend(samples)

        # Create a DataFrame for the augmented samples
        augmented_samples_df = pd.DataFrame({
            'pattern': augmented_samples,
            'intent': [intent] * len(augmented_samples)
        })

        # Append the augmented samples to the augmented dataset
        augmented_dataset = pd.concat([augmented_dataset, augmented_samples_df], ignore_index=True)

    return augmented_dataset


intents_with_lower_samples = list1
# Specify the number of samples to generate per intent
num_samples_per_intent = 2

# Augment the dataset with synonym replacements
df = augment_dataset_with_synonyms(df, intents_with_lower_samples, num_samples_per_intent)


In [28]:
df['intent'].value_counts()

intent
course         285
scholarship    249
salutation     248
canteen        243
vacation       237
name           237
placement      225
ragging        225
event          225
creator        216
sem            210
number         204
principal      201
library        198
size           183
hod            182
hours          180
location       179
goodbye        178
greeting       175
task           168
admission      168
fees           165
seats          165
uniform        165
sports         162
random         150
facilities     141
hostel         141
Name: count, dtype: int64

In [29]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')

In [30]:
import re
from nltk.corpus import stopwords

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = re.sub(r"\d+", "NUM", text)
    
    stop_words = set(stopwords.words("english"))

    # Custom list of words to exclude from stopwords
    custom_stopwords = ["how", "are", "you", "other", "question", "phrases"]
    stop_words.difference_update(custom_stopwords)

    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    
    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have"
    }
    tokens = text.split()
    processed_tokens = [contractions.get(word, word) for word in tokens]
    processed_text = " ".join(processed_tokens)

    return processed_text

In [31]:
df['pattern'] = df['pattern'].apply(preprocess_text)

In [32]:
df

Unnamed: 0,pattern,intent
0,what are the working days,hours
1,tell me about the entrepreneurship events or s...,event
2,tell me about the placement opportunities at t...,placement
3,can we wear accessories or jewelry with the un...,uniform
4,how are the seats allocated in different branches,seats
...,...,...
5700,what follow the library rules and regulations,library
5701,ace need information about the college library,library
5702,i need information about the college library,library
5703,can you direct me to the library,library


In [33]:
df['pattern'].isnull().sum()

0

In [34]:
import re
def fun1(text):
    tokens = word_tokenize(text)
    lm = WordNetLemmatizer()
    lemmatized_tokens = [lm.lemmatize(token) for token in tokens]
    return lemmatized_tokens

df['pattern'] = df['pattern'].apply(fun1)


In [35]:
df = df[df['pattern'].apply(lambda x: len(x) > 0)]

In [36]:
df.head(30)

Unnamed: 0,pattern,intent
0,"[what, are, the, working, day]",hours
1,"[tell, me, about, the, entrepreneurship, event...",event
2,"[tell, me, about, the, placement, opportunity,...",placement
3,"[can, we, wear, accessory, or, jewelry, with, ...",uniform
4,"[how, are, the, seat, allocated, in, different...",seats
5,"[provide, detail, about, the, hod]",hod
6,"[distance, between, hostel, and, college]",hostel
7,"[is, there, any, placement, oppertunities, aft...",placement
8,"[what, s, your, name, if, any]",name
9,[admission],admission


In [37]:
pattern_column = df['pattern'].tolist()
print(max(len(pattern) for pattern in pattern_column))

17


In [38]:
import torch
import numpy as np
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def extract_bert_features(tokenized_input):
    features = []
    for tokens in tokenized_input:
        if len(tokens) == 0:
            continue
        encoded_input = tokenizer.encode_plus(
            tokens,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=14,
            return_tensors='pt'
        )
        with torch.no_grad():
            outputs = model(**encoded_input)
        sentence_features = outputs.last_hidden_state.squeeze(0).numpy()
        features.append(sentence_features)
    return features


In [39]:
df['features'] = extract_bert_features(df['pattern'])

In [40]:
df

Unnamed: 0,pattern,intent,features
0,"[what, are, the, working, day]",hours,"[[0.19312496, 0.25102663, -0.2094245, -0.20383..."
1,"[tell, me, about, the, entrepreneurship, event...",event,"[[0.030883595, -0.14463073, -0.33744907, 0.162..."
2,"[tell, me, about, the, placement, opportunity,...",placement,"[[0.23510706, -0.06939392, -0.25611755, 0.1512..."
3,"[can, we, wear, accessory, or, jewelry, with, ...",uniform,"[[0.060613558, -0.09857152, -0.15630046, 0.278..."
4,"[how, are, the, seat, allocated, in, different...",seats,"[[-0.15844814, -0.08488568, -0.092611015, 0.22..."
...,...,...,...
5700,"[what, follow, the, library, rule, and, regula...",library,"[[-0.07125845, 0.15415755, -0.2806683, -0.1253..."
5701,"[ace, need, information, about, the, college, ...",library,"[[0.08756759, 0.1496042, -0.061282653, 0.10965..."
5702,"[i, need, information, about, the, college, li...",library,"[[0.118558615, 0.15734714, -0.12687284, -0.031..."
5703,"[can, you, direct, me, to, the, library]",library,"[[0.11008467, 0.103716895, -0.07538706, 0.1841..."


In [41]:
x = df['features']

In [42]:
x = np.array(x.to_list())


In [43]:
y = df['intent']

In [44]:
y

0           hours
1           event
2       placement
3         uniform
4           seats
          ...    
5700      library
5701      library
5702      library
5703      library
5704      library
Name: intent, Length: 5705, dtype: object

In [46]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [47]:
print(y)

[11  4 16 ... 12 12 12]


In [48]:
num_classes = len(encoder.classes_)
y = tf.one_hot(y, num_classes)

In [49]:
y = y.numpy()

In [50]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [51]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [52]:
y_train.shape

(4564, 29)

In [53]:
x_train.shape

(4564, 14, 768)

In [54]:
x_test.shape

(1141, 14, 768)

Bert Max(92.02 acc)

In [55]:
import tensorflow as tf
from transformers import TFBertModel

bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Wrap the BERT model in a Keras layer
bert_layer = tf.keras.layers.Wrapper(bert_model)

model = tf.keras.Sequential()
model.add(bert_layer)
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(256, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(29, activation='softmax'))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [56]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [57]:
hist = model.fit(x_train,y_train,epochs = 10 , batch_size= 24 , validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [58]:
acc = model.evaluate(x_test,y_test)



In [59]:
model.save('Final_Model_GRU')

INFO:tensorflow:Assets written to: Final_Model_GRU\assets


INFO:tensorflow:Assets written to: Final_Model_GRU\assets


Loading

In [3]:
import tensorflow as tf
model = tf.keras.models.load_model('C:/Users/adith/Downloads/chatbot/Final_Model_GRU')

TESTING

In [60]:
import torch
import numpy as np
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model1 = BertModel.from_pretrained('bert-base-uncased')

def extract_bert_features(tokenized_input):
    features = []
    encoded_input = tokenizer.encode_plus(
        tokenized_input,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=14,
        return_tensors='pt'
    )
    with torch.no_grad():
        outputs = model1(**encoded_input)
    sentence_features = outputs.last_hidden_state.squeeze(0).numpy()
    features.append(sentence_features)
    return features

In [61]:
input = "tell me about the hods"
input = preprocess_text(input)
input = fun1(input)
features = extract_bert_features(input)

In [62]:
features = features[0]

In [63]:
features = np.reshape(features , (1,14,768))

In [64]:
test = model.predict(features)



In [65]:
predicted_label = np.argmax(test)
predicted_class_name = encoder.classes_[predicted_label]
print("Predicted class:", predicted_class_name)

Predicted class: hod


In [66]:
encoder.classes_

array(['admission', 'canteen', 'course', 'creator', 'event', 'facilities',
       'fees', 'goodbye', 'greeting', 'hod', 'hostel', 'hours', 'library',
       'location', 'name', 'number', 'placement', 'principal', 'ragging',
       'random', 'salutation', 'scholarship', 'seats', 'sem', 'size',
       'sports', 'task', 'uniform', 'vacation'], dtype=object)

Text To Speech

In [181]:
import random
for x in data['intents']:
    if x['tag'] == predicted_class_name:
        if len(x['responses'])>1:
            print(random.choice(x['responses']))
        else:
            print(x['responses'])

Hello!


In [226]:
import random
for x in data['intents']:
    if x['tag'] == predicted_class_name:
        if len(x['responses'])>1:
            text_to_speech(random.choice(x['responses']))
        else:
            text_to_speech(x['responses'])

In [183]:
import pyttsx3
def text_to_speech(text):
    # Initialize the pyttsx3 engine
    engine = pyttsx3.init()

    # Set the speech rate (speed) if desired
    # engine.setProperty('rate', 150)  # You can experiment with different values

    # Set the text to be spoken
    engine.say(text)

    # Play the speech
    engine.runAndWait()