In [1]:
import json
import numpy as np
import nltk
# from nltk_utils import tokenize_fn, stem_fn, lemmatize_fn ,bag_of_words

import torch
from torch.utils.data import Dataset, DataLoader
from classes import createChatDataset

In [2]:
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
import re

# Download necessary NLTK data if not already downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Define stop words using nltk's stopword list
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# The ignore list containing words to exclude
ignore_list = [""]

# Function to clean contractions
def clean_contractions(word):
    return re.sub(r"['’]s$", "", word).replace("n't", "")

# Function to tokenize and lemmatize while filtering

def preprocess_words_fn(pattern, stop_words, punctuation):
    """
    Preprocess a sentence or a list of words:
    - Tokenizes if it's a string.
    - Filters stopwords and punctuation.
    - Lemmatizes the words.
    """
    all_word=[]
    if isinstance(pattern, str):  # Check if input is a string
        words = nltk.word_tokenize(pattern.lower())  # Tokenize and convert to lowercase
    elif isinstance(pattern, list):  # If it's already a list
        words = [word.lower() for word in pattern]  # Normalize case
    else:
        raise ValueError("Input must be a string or a list of words.")

    words = [clean_contractions(w) for w in words]  # Clean contractions like 's and n't
    words = [lemmatizer.lemmatize(w) for w in words]  # Lemmatize words
    words = [w for w in words if w not in stop_words and w not in punctuation]  # Filter
    words = [ w for w in words if len(w)>1]
    # print("---<" , words)
    all_word.extend(words)
    # print("\n \n \n <<>>>>>" , len(all_words))
    # print("\n \n \n <<>>>>>" , len(sorted(set(all_words))))
    return sorted(set(all_word))  # Return unique words




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\University\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\University\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\University\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
with open('intents2.json', "r") as f:
    all_intents = json.load(f)


In [4]:
# import string
all_words = []
tags = []
xy = []
# print(all_intents)
for intent in all_intents["intents"] :
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = preprocess_words_fn(pattern=pattern , stop_words= stop_words ,
                             punctuation=punctuation )
        all_words.extend(w)
        xy.append((w,tag))

In [5]:
print(len(xy) , "patterns \n" )
print(len(tags), "tags:\n", tags)
print(len(all_words),"all words:\n"," all_words")

154 patterns 

7 tags:
 ['greeting', 'goodbye', 'practical_programs', 'study_AI', 'study_tips', 'best_study_time', 'student_support']
495 all words:
  all_words


In [6]:
import numpy as np
def bag_of_words(sentence, all_words, preprocess_words_fn, stop_words, punctuation):
    """
    Return a bag of words: 1 for each known word that exists in the sentence, 0 otherwise.
    sentence = "hello how are you"
    all_words = ["hi", "hello", "I", "bye", "thank", "cool"]
    bag = [0, 1, 0, 0, 0, 0]
    """
    # Preprocess the sentence (e.g., tokenization, lemmatization)
    sentence_words = preprocess_words_fn(sentence, stop_words, punctuation)

    # Initialize the bag with 0s for each word in all_words|
    bag = np.zeros(len(all_words), dtype=np.float32)
    # print(bag)
    # Loop through each word in all_words
    for idx, word in enumerate(all_words):
        if word in sentence_words:  # If the word exists in the sentence
            bag[idx] = 1  # Mark it as present

    return bag


In [7]:
## bag of the words 
# create training data
X_train = []
y_train = [] 

for (pattern_sentence , tag) in xy:
    # print(pattern_sentence)
    bag =  bag_of_words(pattern_sentence, all_words ,preprocess_words_fn, 
                        stop_words, punctuation)
    # print("bag" , bag)
    X_train.append(bag)

    label = tags.index(tag)
    y_train.append(label) # for calculate the loss_fn -- CreossEntropy
    classes = tags 

In [8]:
# # Example data
# pat = "How can I manage my time effectively?"
    

# # Preprocess words
# allword = preprocess_words(pat, stop_words, punctuation)

# # Output the processed words
# print(allword)

In [9]:
X_train = np.array(X_train)
y_train = np.array(y_train)

X_train,  y_train

(array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 1., 1., 1.]], dtype=float32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]))

In [10]:
train_dataset = createChatDataset(X_train,y_train,classes, all_words)

print(len(train_dataset))

154


In [11]:
train_dataset.classes


['greeting',
 'goodbye',
 'practical_programs',
 'study_AI',
 'study_tips',
 'best_study_time',
 'student_support']

In [12]:
train_dataset.all_words[:4] , len(train_dataset.all_words)

(['hello', 'hi', 'hey', 'good'], 495)

In [13]:
torch.save(train_dataset, "train_dataset_v2.pt")

In [14]:
print("done")

done
