In [1]:
import utils
import feature_extractor
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import pickle

# Constant Definition

In [18]:
NUM_CLASSES=23
BATCH_SIZE=64
EPOCHS=10
HIDDEN_SIZE=64
VECTOR_SIZE = 768  # Size of word vectors
WINDOW_SIZE = 5  # Context window size
THREADS = 4  # Number of threads to use for training
CUTOFF_FREQ = 1  # Minimum frequency for a word to be included in vocabulary
TRAINING_SIZE = 100  
TEST_SIZE = 10

# reading dataset

In [3]:
complete_data=utils.read_data("../data/fixed_PIZZA_train.json")

In [19]:
data = complete_data[:TRAINING_SIZE]
corpus, top, decoupled = utils.get_train_dataset(data)

In [20]:
entites_output_as_number_labels,intents_output_as_number_labels, input_as_tokenized_string=utils.label_complete_input_bert(corpus, decoupled, top)

# Embedding model

In [6]:
reviews_tokens = []
with open('../data/food_review.txt', 'r', encoding='utf-8') as f:
    for line in f:
        reviews_tokens.append(utils.tokenize_string(utils.clean_string(line.strip())))
print("Done1")
count=0
with open('../data/food_review2.txt', 'r', encoding='utf-8') as f:
    for line in f:
        count+=1
        reviews_tokens.append(utils.tokenize_string(utils.clean_string(line.strip())))
        if count==TRAINING_SIZE:
            break
print("Done2")
all_trainig = []
not_all_corpus, _, _ = utils.get_train_dataset(data)
for i in range(len(not_all_corpus)):
    all_trainig.append(utils.tokenize_string(utils.clean_string(not_all_corpus[i]))) 
print("Done3")
emb_model = feature_extractor.train_gensim_w2v_model(all_trainig+reviews_tokens, VECTOR_SIZE)

Done1
Done2
Done3


In [None]:
# from gensim.models import Word2Vec, FastText  # For Word2Vec model
# emb_model = Word2Vec.load('../embedding_models/word2vec_whole_stemmed.model')

In [21]:
emb_model, emb_tokenizer = feature_extractor.init_bert()

In [None]:
print(len(all_trainig+reviews_tokens), +10000+2456446)

In [None]:
# emb_model.wv.most_similar('pleas')

[('plz', 0.47857165336608887),
 ('pl', 0.46934911608695984),
 ('kindli', 0.38020795583724976),
 ('exponenti', 0.28583502769470215),
 ('jaan', 0.2755764126777649),
 ('lije', 0.2750872075557709),
 ('differenli', 0.27329814434051514),
 ('poor', 0.26954880356788635),
 ('hope', 0.2683272957801819),
 ('ur', 0.2626297175884247)]

# NER Model

In [22]:
class LargeDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data 
        self.labels = labels 
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

def collate_fn(batch):
    sequences, labels = zip(*batch)
    #I believe we can transform words into embeddings here
    embeddings=[]
    # print(sequences)
    for seq in sequences:
        x=[]
        # join seq_string
        seq_string = ' '.join(seq)
        for token in seq:
            emb =feature_extractor.get_word_bert_embedding(token, seq_string, emb_tokenizer, emb_model)
            x.append(emb)
        embeddings.append(x)
    sequences=embeddings
    labels = [torch.tensor(label, dtype=torch.long) for label in labels]
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-1)
    sequences = [torch.tensor(seq) for seq in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=True)
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    return padded_sequences, padded_labels, lengths

class LargeWordRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(LargeWordRNN, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True, num_layers=3)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
    
    def forward(self, x, lengths):
        packed_x = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed_x)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)
        out = self.fc(out)
        return out

# Checking if CUDA is working

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.cuda.is_available()) 
print(torch.version.cuda)
print(f'Using device: {device}')

False
None
Using device: cpu


# Training Model

In [24]:


labels = entites_output_as_number_labels

dataset = LargeDataset(input_as_tokenized_string, labels)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True, num_workers=0)

model = LargeWordRNN(input_size=VECTOR_SIZE, hidden_size=HIDDEN_SIZE, num_classes=NUM_CLASSES).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(EPOCHS): 
    for padded_sequences, padded_labels, lengths in dataloader:
        padded_sequences=padded_sequences.to(device).float()
        padded_labels=padded_labels.to(device)
        lengths=lengths.to(device).float()
        optimizer.zero_grad()
        outputs = model(padded_sequences, lengths).float()
        loss = criterion(outputs.view(-1, NUM_CLASSES), padded_labels.view(-1))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 3.0540
Epoch 2, Loss: 2.9065
Epoch 3, Loss: 2.7127
Epoch 4, Loss: 2.4520
Epoch 5, Loss: 2.2182
Epoch 6, Loss: 2.0913
Epoch 7, Loss: 2.0993
Epoch 8, Loss: 1.9842
Epoch 9, Loss: 1.8757
Epoch 10, Loss: 1.9668


In [None]:
# test_data=complete_data[200000:200010]
# print(test_data)
# print("---------------------------------------------")
# test_corpus,_,_= utils.get_train_dataset(test_data)
# print(test_corpus)
# test_as_tokenized_string=feature_extractor.list_of_lists(test_corpus)

In [None]:
# print(test_as_tokenized_string)

In [None]:
# class TestLargeDataset(Dataset):
#     def __init__(self, data):
#         self.data = data 
    
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         return self.data[idx]

# def test_collate_fn(batch):
#     sequences = batch
#     #I believe we can transform words into embeddings here
#     embeddings=[]
#     for seq in sequences:
#         print(seq)
#         x=[]
#         for token in seq:
#             x.append(emb_model.wv[token])
#         embeddings.append(x)
#     sequences=embeddings
#     sequences = [torch.tensor(seq) for seq in sequences]
#     padded_sequences = pad_sequence(sequences, batch_first=True)
#     lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
#     return padded_sequences, lengths


# test_as_tokenized_string
# dataset = TestLargeDataset(test_as_tokenized_string)
# dataloader = DataLoader(dataset, batch_size=1, collate_fn=test_collate_fn, shuffle=False, num_workers=0)

# for padded_sequences, lengths in dataloader:
#     print("-------------------------------")
#     padded_sequences=padded_sequences.to(device)
#     lengths=lengths.to(device)
#     outputs = model(padded_sequences, lengths)
#     entity_to_num = {"I_NUMBER": 0, "I_SIZE": 1, "I_TOPPING": 2, "I_STYLE": 3, "I_DRINKTYPE": 4, "I_CONTAINERTYPE": 5, "I_VOLUME": 6, "I_QUANTITY": 7, "B_NUMBER": 8, "B_SIZE": 9, "B_TOPPING": 10, "B_STYLE": 11, "B_DRINKTYPE": 12, "B_CONTAINERTYPE": 13, "B_VOLUME": 14, "B_QUANTITY": 15, "I_NOT_TOPPING": 16, "B_NOT_TOPPING": 17, "NONE": 18}
#     for i, out in enumerate(outputs[0]):
#         num = torch.argmax(out).int().item()
#         for key, value in entity_to_num.items():
#             if value == num:
#                 print(key)
#                 break


    
# #  entity_to_num = {"I_NUMBER": 0, "I_SIZE": 1, "I_TOPPING": 2, "I_STYLE": 3, "I_DRINKTYPE": 4, "I_CONTAINERTYPE": 5, "I_VOLUME": 6, "I_QUANTITY": 7, "B_NUMBER": 8, "B_SIZE": 9, "B_TOPPING": 10, "B_STYLE": 11, "B_DRINKTYPE": 12, "B_CONTAINERTYPE": 13, "B_VOLUME": 14, "B_QUANTITY": 15, "I_NOT_TOPPING": 16, "B_NOT_TOPPING": 17, "NONE": 18}


In [None]:
    # entity_to_num = {"I_NUMBER": 0, "I_SIZE": 1, "I_TOPPING": 2, "I_STYLE": 3, "I_DRINKTYPE": 4, "I_CONTAINERTYPE": 5, "I_VOLUME": 6, "I_QUANTITY": 7, "B_NUMBER": 8, "B_SIZE": 9, "B_TOPPING": 10, "B_STYLE": 11, "B_DRINKTYPE": 12, "B_CONTAINERTYPE": 13, "B_VOLUME": 14, "B_QUANTITY": 15, "I_NOT_TOPPING": 16, "B_NOT_TOPPING": 17, "NONE": 18}


In [26]:
dev_data=utils.read_data("../data/fixed_PIZZA_dev.json")
dev_corpus, dev_top = utils.get_dev_dataset(dev_data[:2])
gold_dev_labels,dev_as_tokenized_string=utils.label_complete_dev_bert(dev_corpus, dev_top)

class TestLargeDataset(Dataset):
    def __init__(self, data):
        self.data = data 
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

def test_collate_fn(batch):
    sequences = batch
    embeddings=[]
    for seq in sequences:
        x=[]
        seq_string = ' '.join(seq)
        for token in seq:
            emb =feature_extractor.get_word_bert_embedding(token, seq_string, emb_tokenizer, emb_model)
            x.append(emb)
        embeddings.append(x)
    sequences=embeddings
    sequences = [torch.tensor(seq) for seq in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=True)
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    return padded_sequences, lengths

dev_dataset = TestLargeDataset(dev_as_tokenized_string)
dataloader = DataLoader(dev_dataset, batch_size=1, collate_fn=test_collate_fn, shuffle=False, num_workers=0)
model_output=[]
for padded_sequences, lengths in dataloader:
    print("-------------------------------")
    labels = []
    padded_sequences=padded_sequences.to(device).float()
    lengths=lengths.to(device).float()
    outputs = model(padded_sequences, lengths)
    entity_to_num = {"I_NUMBER": 0, "I_SIZE": 1, "I_TOPPING": 2, "I_STYLE": 3, "I_DRINKTYPE": 4, "I_CONTAINERTYPE": 5, "I_VOLUME": 6, "I_QUANTITY": 7, "B_NUMBER": 8, "B_SIZE": 9, "B_TOPPING": 10, "B_STYLE": 11, "B_DRINKTYPE": 12, "B_CONTAINERTYPE": 13, "B_VOLUME": 14, "B_QUANTITY": 15, "I_NOT_TOPPING": 16, "B_NOT_TOPPING": 17,"I_NOT_STYLE": 18, "B_NOT_STYLE": 19, "B_NOT_QUANTITY": 20, "I_NOT_QUANTITY": 21, "NONE": 22}
    for i, out in enumerate(outputs[0]):
        num = torch.argmax(out).int().item()
        labels.append(num)
    model_output.append(labels)

confusion_matrix, accuracy=utils.calc_accuracy(dev_corpus,model_output, gold_dev_labels)
print(confusion_matrix)
print(accuracy)

-------------------------------
-------------------------------
Wrong prediction in 0 th sentence at 4 th token
Wrong prediction in 0 th sentence at 5 th token
Wrong prediction in 0 th sentence at 8 th token
Wrong prediction in 0 th sentence at 10 th token
Wrong prediction in 0 th sentence at 11 th token
Wrong prediction in 0 th sentence at 13 th token
Wrong prediction in 0 th sentence at 14 th token
Wrong prediction in 0 th sentence at 17 th token
Wrong prediction in 0 th sentence at 19 th token
Wrong prediction in 0 th sentence at 20 th token
Wrong prediction in 0 th sentence at 22 th token
Wrong prediction in 0 th sentence at 23 th token
Wrong prediction in 0 th sentence at 26 th token
Wrong prediction in 0 th sentence at 28 th token
Sentence: i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage
Pred: [22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 

In [None]:
# import pandas as pd
# import os
# import opendatasets as od
# # ! pip install opendatasets
# # Assign the Kaggle data set URL into variable
# dataset = 'https://www.kaggle.com/datasets/ghaithqoudeiamti/pizza-dataset?select=food_review2.txt'
# # Using opendatasets let's download the data sets
# od.download(dataset)

In [None]:
# model = pickle.load(open('../models/model_1m.pk1' , 'rb'))
# model = torch.load('../models/model_1m.pk1', map_location=torch.device('cpu'))
model = torch.load('../models/model_1m.pk1', map_location=torch.device('cpu'))



In [8]:
import re
pattern = re.compile(r'(?<=train.TOP).*\(TOPPING [a-z]* \) \(TOPPING [a-z]* \).*(?=train.TOP-DECOUPLED)')

with open('../data/fixed_PIZZA_train.json', 'r') as file:
    for i, line in enumerate(file):
        # Process each line
        matches = pattern.findall(line)
        if matches:
            print(i)
            print(line)
            break


1956451
{"train.SRC": "i'd like a pizza with cheeseburger sausages and garlic powder", "train.EXR": "(ORDER (PIZZAORDER (NUMBER 1 ) (TOPPING CHEESEBURGER ) (TOPPING SAUSAGE ) (TOPPING GARLIC_POWDER ) ) )", "train.TOP": "(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza with (TOPPING cheeseburger ) (TOPPING sausages ) and (TOPPING garlic powder ) ) )", "train.TOP-DECOUPLED": "(ORDER (PIZZAORDER (NUMBER a ) (TOPPING cheeseburger ) (TOPPING sausages ) (TOPPING garlic powder ) ) )"},

