In [1]:
import torch
import torch.nn as nn
import re
from torch.utils.data import Dataset, DataLoader
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [15]:
# Load Data
def load_data(file_x, file_y):
    """
    Read a pair of line-aligned, comma-separated text files.

    Parameters
    ----------
    file_x : path of the file whose lines hold comma-separated tokens.
    file_y : path of the file whose lines hold comma-separated labels.

    Returns
    -------
    (sentences, labels) : two lists of lists of strings, one inner list per
    line of the respective file. Leading/trailing whitespace of each whole
    file is stripped before it is split into lines.
    """
    with open(file_x, 'r') as tokens_file, open(file_y, 'r') as tags_file:
        raw_tokens = tokens_file.read()
        raw_tags = tags_file.read()

    def _rows(text):
        # One row per line, one field per comma-separated item.
        return [row.split(',') for row in text.strip().split('\n')]

    return _rows(raw_tokens), _rows(raw_tags)

print("Loading Data Started")
# Each load_data call yields (token rows, label rows) for one split:
# first the training split, then the dev split used for testing.
(train_sentences, train_labels), (test_sentences, test_labels) = (
    load_data('database/labeler/x_train.txt', 'database/labeler/y_train.txt'),
    load_data('database/labeler/x_dev.txt', 'database/labeler/y_dev.txt'),
)
print("Loading Data Done")

Loading Data Started
Loading Data Done


In [16]:
# Build Vocabulary
# A word is kept only when its recorded count is strictly greater than this.
FREQ_THRESH = 0

def build_vocab(vocab_path="database/labeler/vocabulary.txt", freq_thresh=None):
    """
    Build the token -> index mapping from a ``word,count`` vocabulary file.

    Parameters
    ----------
    vocab_path : path of the vocabulary file; each non-blank line must be
        ``word,count``. Defaults to the project's labeler vocabulary.
    freq_thresh : words are kept only when ``count > freq_thresh``. When
        ``None`` (default) the module-level ``FREQ_THRESH`` is read at call
        time.

    Returns
    -------
    dict mapping each kept word to an index starting at 2 (words are numbered
    in sorted order), plus ``'<PAD>' -> 0`` and ``'<UNK>' -> 1``.

    Raises
    ------
    ValueError if a non-blank line is not exactly ``word,count`` or the count
    is not an integer.
    """
    if freq_thresh is None:
        freq_thresh = FREQ_THRESH

    vocab = set()
    with open(vocab_path, "r") as fv:
        for line in fv:
            line = line.strip()
            if not line:
                # Blank or trailing lines carry no entry; skip them
                # instead of failing on the unpack below.
                continue
            voc, thresh = line.split(",")
            if int(thresh) > freq_thresh:
                vocab.add(voc)

    # Sorted order keeps indices stable across runs, which saved checkpoints rely on.
    word2idx = {word: idx + 2 for idx, word in enumerate(sorted(vocab))}  # Reserve 0 for padding, 1 for unknown words
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    return word2idx

def build_label_vocab(labels):
    """
    Map every distinct label found in ``labels`` to an integer index.

    ``labels`` is an iterable of label sequences. Indices are assigned from 0
    upwards following the sorted order of the distinct labels.
    """
    distinct_tags = set()
    for tag_row in labels:
        distinct_tags.update(tag_row)

    ordered_tags = sorted(distinct_tags)
    return dict(zip(ordered_tags, range(len(ordered_tags))))

print("Building Vocab Started")
word2idx = build_vocab()
label2idx = build_label_vocab(train_labels)
# Reverse lookup (index -> label) used to decode model predictions.
idx2label = {label2idx[label]: label for label in label2idx}
print("Building Vocab Done")

Building Vocab Started
Building Vocab Done


In [17]:
# Print every vocabulary entry as "token,index", in the mapping's own order.
for word in word2idx:
    idx = word2idx[word]
    print(f"{word},{idx}")

200millilit,2
500millilit,3
500ml,4
DIGIT,5
ale,6
alfredo,7
all,8
also,9
american,10
an,11
anchovi,12
ani,13
appl,14
applewood,15
artichok,16
arugula,17
avoid,18
bacon,19
balsam,20
balzam,21
banana,22
barbecu,23
basil,24
bay,25
bbq,26
bean,27
beef,28
big,29
bit,30
black,31
bottl,32
broccoli,33
brocoli,34
buffalo,35
can,36
caramel,37
carrot,38
cauliflow,39
cheddar,40
chees,41
cheeseburg,42
cherri,43
chicago,44
chicken,45
chorizo,46
chorrizo,47
coffe,48
coke,49
combin,50
crust,51
cumin,52
deep,53
deepdish,54
dew,55
diet,56
dish,57
doctor,58
dough,59
dr,60
dri,61
eight,62
eleven,63
everi,64
everyth,65
extra,66
fanta,67
fat,68
feta,69
fifteen,70
five,71
fl,72
flake,73
fluid,74
four,75
fourteen,76
free,77
fri,78
garlic,79
ginger,80
glaze,81
gluten,82
glutenfre,83
green,84
grill,85
ground,86
ham,87
hate,88
have,89
hawaiian,90
high,91
hold,92
hot,93
ice,94
italian,95
jalapeno,96
just,97
kalamata,98
keto,99
larg,100
leav,101
lemon,102
lettuc,103
liter,104
littl,105
lot,106
lover,107
low,108
lu

In [18]:
# Show the label -> index mapping that was built from the training labels.
print(label2idx)

{'CONTAINERTYPE': 0, 'DRINKTYPE': 1, 'NONE': 2, 'NOT_STYLE': 3, 'NOT_TOPPING': 4, 'NUMBER': 5, 'PIZZA': 6, 'QUANTITY': 7, 'SIZE': 8, 'STYLE': 9, 'TOPPING': 10, 'VOLUME': 11}


In [20]:
# Prepare Dataset
class SequenceDataset(Dataset):
    """
    Fixed-length (token ids, label ids) pairs for sequence labelling.

    Parameters
    ----------
    sentences : list of token lists.
    labels : list of label lists, aligned with ``sentences``.
    word2idx : token -> index mapping; must contain ``'<UNK>'``. Its
        ``'<PAD>'`` index (0 when absent) is used to pad token sequences.
    label2idx : label -> index mapping; must contain ``'NONE'`` (used to pad
        label sequences) and, when any labels are present, ``'QUANTITY'``.
    max_len : every item is truncated or right-padded to this length.

    Tokens missing from ``word2idx`` map to ``'<UNK>'``.
    NOTE(review): labels missing from ``label2idx`` map to ``'QUANTITY'``,
    which is kept as-is for compatibility but looks like a surprising
    fallback — confirm it is intended.
    """

    def __init__(self, sentences, labels, word2idx, label2idx, max_len=50):
        self.sentences = [[word2idx.get(word, word2idx['<UNK>']) for word in sentence] for sentence in sentences]
        self.labels = [[label2idx.get(label, label2idx["QUANTITY"]) for label in label_list] for label_list in labels]
        self.max_len = max_len
        # Capture the pad indices from the mappings handed to this instance so
        # __getitem__ does not depend on notebook-level globals.
        self.pad_word_idx = word2idx.get('<PAD>', 0)
        self.pad_label_idx = label2idx['NONE']

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids)`` as two tensors of length ``max_len``."""
        sentence = self.sentences[idx]
        label = self.labels[idx]
        # A negative multiplier yields [], so over-long rows are only truncated.
        sentence = sentence[:self.max_len] + [self.pad_word_idx] * (self.max_len - len(sentence))
        label = label[:self.max_len] + [self.pad_label_idx] * (self.max_len - len(label))
        return torch.tensor(sentence), torch.tensor(label)

print("Preparing Dataset Started")
# Training split: batches are reshuffled on every pass.
train_dataset = SequenceDataset(train_sentences, train_labels, word2idx, label2idx)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Dev split: iterated in file order.
test_dataset = SequenceDataset(test_sentences, test_labels, word2idx, label2idx)
test_loader = DataLoader(test_dataset, batch_size=32)
print("Preparing Dataset Done")

Preparing Dataset Started
Preparing Dataset Done


In [21]:
# Define Model
class RNNSequenceLabeling(nn.Module):
    """
    Token-level tagger: embedding -> 2-layer bidirectional LSTM -> linear layer.

    Parameters
    ----------
    vocab_size : number of rows in the embedding table.
    embedding_dim : size of each token embedding.
    hidden_dim : LSTM hidden size per direction.
    output_dim : number of labels scored for every position.
    padding_idx : token index treated as padding by the embedding layer.
        Defaults to 0, the index the notebook's vocabulary gives ``'<PAD>'``,
        so the model no longer reads the global ``word2idx``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=2,
                            bidirectional=True,
                            batch_first=True)
        # Forward and backward hidden states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """
        Score every label at every position.

        ``x`` is a batch-first tensor of token indices (the LSTM is built with
        ``batch_first=True``); the result has one trailing axis of size
        ``output_dim`` holding unnormalised label scores.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        predicted = self.fc(lstm_out)
        return predicted

In [22]:

# Rebuild the tagger with the hyper-parameters used for training and restore
# its weights from the saved checkpoint.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

vocab_size = len(word2idx)
embedding_dim = 100
hidden_dim = 128
output_dim = len(label2idx)

model = RNNSequenceLabeling(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds; it avoids executing arbitrary pickled code and
# silences the torch.load FutureWarning. The tensors are read onto the CPU and
# copied into the (possibly CUDA) parameters by load_state_dict.
model.load_state_dict(
    torch.load('models/active/Entity.pth', map_location=torch.device('cpu'), weights_only=True)
)
model.eval()

  model.load_state_dict(torch.load('models/active/Entity.pth', map_location=torch.device('cpu')))


RNNSequenceLabeling(
  (embedding): Embedding(228, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=12, bias=True)
)

In [9]:
# Training
# Trains a freshly initialised tagger; `model` is re-bound here, so whatever was
# previously held under that name is replaced.
print("Training Started")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters; embedding-table and output sizes follow the two vocabularies.
vocab_size = len(word2idx)
embedding_dim = 100
hidden_dim = 128
output_dim = len(label2idx)

model = RNNSequenceLabeling(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # sum of per-batch losses for this epoch
    for sentences, labels in train_loader:
        sentences, labels = sentences.to(device), labels.to(device)
        
        optimizer.zero_grad()
        predictions = model(sentences)

        # Flatten to one row of label scores per position so the criterion sees
        # every position of every sequence in the batch.
        # NOTE(review): no mask is applied, so positions that SequenceDataset
        # padded with the NONE label are part of this loss as well.
        loss = criterion(predictions.view(-1, output_dim), labels.view(-1)) 
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    # One checkpoint per epoch, written relative to the current working directory.
    torch.save(model.state_dict(), f"BLSTM_IT_{epoch + 1}.pth")
    # The reported value is the mean of the per-batch losses of this epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader):.25f}")
print("Training Done")

Training Started
Epoch 1/20, Loss: 0.0008272594174388413082802
Epoch 2/20, Loss: 0.0000000000492898536707681
Epoch 3/20, Loss: 0.0000000000082441232608919
Epoch 4/20, Loss: 0.0000000000049044159261700
Epoch 5/20, Loss: 0.0000000000036028010328189
Epoch 6/20, Loss: 0.0000000000028777762404607
Epoch 7/20, Loss: 0.0000000000024361614213390
Epoch 8/20, Loss: 0.0000000000021498397217536
Epoch 9/20, Loss: 0.0000000000020081347981390
Epoch 10/20, Loss: 0.0000000000018848709002082
Epoch 11/20, Loss: 0.0000000000048567948986891
Epoch 12/20, Loss: 0.0000000000308412314694920
Epoch 13/20, Loss: 0.0000005678250542069031777
Epoch 14/20, Loss: 0.0000000191948637415595313
Epoch 15/20, Loss: 0.0000008389367586936262401
Epoch 16/20, Loss: 0.0000006606241880591917679
Epoch 17/20, Loss: 0.0000004368849924189691590
Epoch 18/20, Loss: 0.0000009468243555622430261
Epoch 19/20, Loss: 0.0000003498500169720110484
Epoch 20/20, Loss: 0.0000018562301402992116068
Training Done


In [23]:
# Evaluation
def evaluate(model, loader, pad_idx=None, device=None):
    """
    Compute position-level accuracy of ``model`` over ``loader``.

    Parameters
    ----------
    model : module returning label scores whose last axis indexes labels.
    loader : iterable of ``(sentences, labels)`` tensor batches.
    pad_idx : when given, positions whose token index equals ``pad_idx`` are
        excluded from both the numerator and the denominator. With the
        default ``None`` every position is counted, padded ones included
        (the original behaviour).
    device : device the batches are moved to. Defaults to the device of the
        model's first parameter (CPU for a parameter-less model), so the
        function does not depend on a notebook-level ``device`` variable.

    Returns
    -------
    float in [0, 1]; ``0.0`` when there is nothing to score.

    The model is switched to eval mode as a side effect.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for sentences, labels in loader:
            sentences, labels = sentences.to(device), labels.to(device)
            predictions = model(sentences).argmax(dim=-1)
            hits = predictions == labels
            if pad_idx is None:
                total += labels.numel()
            else:
                keep = sentences != pad_idx
                hits = hits & keep
                total += int(keep.sum().item())
            correct += int(hits.sum().item())

    # An empty loader (or an all-padding one) would otherwise divide by zero.
    return correct / total if total else 0.0
# Score the currently bound `model` on the dev-split loader and report it as a percentage.
# NOTE(review): evaluate() is called without any padding filter here, so every position
# of the fixed-length sequences — including the padded tail — counts towards the figure.
print("Testing Started")
accuracy = evaluate(model, test_loader)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print("Testing Done")

Testing Started
Test Accuracy: 95.22%
Testing Done


In [15]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()


# Single characters deleted from raw text by Normalizer.remove_punctuations.
# Each entry must be exactly one character: they are combined into one regex
# character class. The entry labelled "Em Dash" used to be a second "_"
# (underscore is still listed below), so real em dashes were never removed.
PUNCTUATIONS=[
    ".",  # Period
    ",",  # Comma
    ";",  # Semicolon
    ":",  # Colon
    "!",  # Exclamation Mark
    "?",  # Question Mark
    "'",  # Apostrophe
    '"',  # Quotation Marks
    "—",  # Em Dash
    "-",  # Hyphen
    "[",  # Left Bracket
    "]",  # Right Bracket
    "{",  # Left Brace
    "}",  # Right Brace
    "/",  # Slash
    "\\", # Backslash
    "|",  # Vertical Bar
    "@",  # At Symbol
    "#",  # Hash
    "$",  # Dollar Sign
    "%",  # Percent
    "^",  # Caret
    "&",  # Ampersand
    "*",  # Asterisk
    "_",  # Underscore
    "+",  # Plus
    "=",  # Equals
    "<",  # Less Than
    ">",  # Greater Than
    "~",  # Tilde
    "`"   # Grave Accent
]

# Words removed from queries by Normalizer.remove_words. The list is stemmed
# further down, so entries may be given stemmed or unstemmed.
# Every group must end with a trailing comma: adjacent string literals are
# silently concatenated by Python ('nobodi' "love" -> 'nobodilove'), which
# previously dropped 'nobodi', 'love', 'care' and 'the' from the list.
BLACKLIST = [
    # human references
    'i', 'me', 'my', 'mine', 'myself',
    'you', 'your', 'yourself',
    'he', 'him', 'hi', 'himself',
    'she', 'her', 'herself',
    'it', 'itself',
    'we', 'us', 'our', 'ourselv',
    'they', 'them', 'their', 'themselv',
    'person', 'peopl', 'human',
    'individu', 'man', 'men',
    'woman', 'women', 'child', 'children',
    'adult', 'someon', 'somebodi',
    'anyon', 'anybodi', 'everyon', 'everybodi',
    'no on', 'nobodi',

    # verbs of liking / preference
    "love", "like", "admire", "adore", "cherish",
    "appreciate", "respect", "idolize", "enjoy",
    "value", "revere", "treasure", "favor",
    "prefer", "esteem", "venerate", "worship",
    "fancy", "savor", "delight", "care",

    # extra
    'the', 'and', 'or', 'but', 'within', 'to', 'by',

    'id', 'ive', 'iam', 'along', 'on', 'in', 'over',
]

PIZZA_WORDS = ["pizza", "pie", "slice"]



# Reduce both word lists to their distinct stems. The result comes out of a
# set, so the order of the final lists is not fixed.
BLACKLIST    = list({stemmer.stem(entry) for entry in BLACKLIST})
PIZZA_WORDS  = list({stemmer.stem(entry) for entry in PIZZA_WORDS})


In [16]:
import re
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

class Normalizer:
    """
    Turns a raw query into the cleaned, stemmed, space-separated string that
    is later split into tokens.

    Relies on the module-level ``PUNCTUATIONS`` and ``BLACKLIST`` lists and on
    the module-level ``stemmer``; all three are looked up at call time.
    """

    def remove_punctuations(self, TOP):
        """
        Delete every character listed in ``PUNCTUATIONS`` from ``TOP``.

        Entries are escaped before being placed in the character class, so
        regex-special characters such as backslash, ``]``, ``^`` and ``-``
        are matched literally.
        """
        if not PUNCTUATIONS:
            return TOP  # an empty character class would be a regex error
        char_class = "".join(re.escape(punc) for punc in PUNCTUATIONS)
        return re.sub(f"[{char_class}]", '', TOP)

    def remove_words(self, TOP):
        """
        Delete every whole-word occurrence of a ``BLACKLIST`` entry from ``TOP``.

        Matching is case-sensitive and the surrounding spaces are left in
        place. The previous pattern anchored with ``\\b|^`` on the left and
        ``\\b|&`` on the right (``&`` being a typo for ``$``); for entries that
        start and end with a word character both reduce to a plain ``\\b``,
        which is what is used here.
        """
        if not BLACKLIST:
            return TOP
        alternatives = "|".join(re.escape(word) for word in BLACKLIST)
        return re.sub(fr"\b(?:{alternatives})\b", '', TOP)

    def replace_numbers(self, TOP):
        """
        Replace each standalone run of digits, and the standalone lowercase
        word ``a``, with the placeholder ``DIGIT``.
        """
        regex = r'\b(?:\d+|a)\b' # TODO: revise this
        return re.sub(regex, 'DIGIT', TOP)

    def reconstruct_words(self, TOP):
        """Hook for word-rewriting rules; currently returns ``TOP`` unchanged."""
        return TOP

    def reorganize_spaces(self, TOP):
        """
        Collapse every run of whitespace into a single space and drop
        leading and trailing whitespace.
        """
        return re.sub(r'\s+', ' ', TOP).strip()

    def stem_word(self, token):
        """
        Return ``stemmer.stem(token)``. NLTK's Porter stemmer lowercases its
        input by default, so the result is lowercase as well.
        """
        return stemmer.stem(token)

    def stem_sentence(self, sentence):
        """Stem each single-space-separated word of ``sentence`` and re-join with spaces."""
        return " ".join(self.stem_word(word) for word in sentence.split(' '))

    def normalize(self, sentence):
        """
        Normalize ``sentence`` by applying, in this order:

        1. ``remove_punctuations``
        2. ``remove_words``
        3. ``replace_numbers``
        4. ``reconstruct_words``
        5. ``reorganize_spaces``
        6. ``stem_sentence``

        NOTE(review): ``BLACKLIST`` holds stems, but ``remove_words`` runs
        before stemming and is case-sensitive, so only words whose written
        form already equals a listed stem are removed. The order is kept
        as-is because changing it alters the text the tagger receives.
        """
        NEXT = self.remove_punctuations(sentence)
        NEXT = self.remove_words(NEXT)
        NEXT = self.replace_numbers(NEXT)
        NEXT = self.reconstruct_words(NEXT)
        NEXT = self.reorganize_spaces(NEXT)
        NEXT = self.stem_sentence(NEXT)
        return NEXT
    


In [17]:
# Predict labels for a single query
def preprocess_sentence(sentence, word2idx, max_len=50):
    """
    Convert one raw query into a ``(1, max_len)`` tensor of token indices.

    The query is normalized, the lowercased ``digit`` placeholder is restored
    to ``DIGIT``, and the text is printed for inspection. It is then split on
    whitespace; tokens missing from ``word2idx`` map to ``'<UNK>'`` and the
    index list is truncated or right-padded with 0 to ``max_len``.
    """
    cleaner = Normalizer()
    text = cleaner.reorganize_spaces(cleaner.normalize(sentence))
    text = text.replace("digit", "DIGIT")

    print (f"After Preprocessing: {text}")

    token_ids = []
    for token in text.split():
        token_ids.append(word2idx.get(token, word2idx['<UNK>']))

    # Truncate first, then pad; the pad count is never positive once truncated.
    del token_ids[max_len:]
    token_ids.extend([0] * (max_len - len(token_ids)))
    return torch.tensor([token_ids])  # leading axis is the batch of one

In [18]:
# Predict labels for a single query
def predict_labels(model, sentence, word2idx, idx2label, device, max_len=50):
    """
    Predict one label per position for a single raw query.

    The query is preprocessed into a one-row batch, moved to ``device`` and
    scored by ``model`` without gradient tracking; the highest-scoring label
    index at each position is translated through ``idx2label``. Indices
    missing from ``idx2label`` are skipped.

    NOTE(review): the input is padded to ``max_len``, so the returned list
    covers the padded positions too, not just the query's own tokens.
    """
    batch = preprocess_sentence(sentence, word2idx, max_len)
    batch = batch.to(device)

    with torch.no_grad():
        logits = model(batch)
        best = logits.argmax(dim=-1).squeeze(0)

    tags = []
    for position in best:
        tag_id = position.item()
        if tag_id in idx2label:
            tags.append(idx2label[tag_id])
    return tags

In [19]:
# Example testing
# Run one raw, un-normalized query through preprocessing and the tagger, then
# print the query and the labels predicted for it.
new_sentence = "i'd like a medium pizza with marinara sauce, extra cheese and pepperoni"

print(f"Sentence: {new_sentence}")
predicted_labels = predict_labels(model, new_sentence, word2idx, idx2label, device)
print(f"Predicted Labels: {predicted_labels}")

Sentence: i'd like a medium pizza with marinara sauce, extra cheese and pepperoni
After Preprocessing: DIGIT medium pizza with marinara sauc extra chees pepperoni
Predicted Labels: ['NUMBER', 'SIZE', 'PIZZA', 'NONE', 'VOLUME', 'TOPPING', 'QUANTITY', 'TOPPING', 'TOPPING', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE']
