In [59]:
import format
import utils
import feature_extractor
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# Constant Definition

In [60]:
NUM_CLASSES=19
BATCH_SIZE=64
EPOCHS=20
HIDDEN_SIZE=64
VECTOR_SIZE = 50  # Size of word vectors
WINDOW_SIZE = 5  # Context window size
THREADS = 4  # Number of threads to use for training
CUTOFF_FREQ = 1  # Minimum frequency for a word to be included in vocabulary

# reading dataset

In [61]:
data = utils.read_data("../data/fixed_PIZZA_train.json")[:100000]
corpus,top_tokenized,dec_tokenized= utils.get_train_dataset(data)

In [62]:
entites_output_as_number_labels,intents_output_as_number_labels=utils.label_complete_input(corpus,top_tokenized,dec_tokenized)
input_as_tokenized_string=feature_extractor.list_of_lists(corpus)

# Embedding model

In [63]:
emb_model = feature_extractor.train_gensim_w2v_model(corpus)

# NER Model

In [None]:
class LargeDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data 
        self.labels = labels 
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

def collate_fn(batch):
    sequences, labels = zip(*batch)
    #I believe we can transform words into embeddings here
    embeddings=[]
    print(sequences)
    for seq in sequences:
        x=[]
        for token in seq:
            x.append(emb_model.wv[token])
        embeddings.append(x)
    sequences=embeddings
    labels = [torch.tensor(label, dtype=torch.long) for label in labels]
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-1)
    sequences = [torch.tensor(seq) for seq in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=True)
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    return padded_sequences, padded_labels, lengths

class LargeWordRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(LargeWordRNN, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
    
    def forward(self, x, lengths):
        packed_x = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed_x)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)
        out = self.fc(out)
        return out

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')


labels = entites_output_as_number_labels

dataset = LargeDataset(input_as_tokenized_string, labels)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True, num_workers=0)

model = LargeWordRNN(input_size=VECTOR_SIZE, hidden_size=HIDDEN_SIZE, num_classes=NUM_CLASSES).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(EPOCHS): 
    for padded_sequences, padded_labels, lengths in dataloader:
        # print(padded_sequences,padded_labels,lengths)
        padded_sequences=padded_sequences.to(device)
        padded_labels=padded_labels.to(device)
        lengths=lengths.to(device)
        optimizer.zero_grad()
        outputs = model(padded_sequences, lengths)
        loss = criterion(outputs.view(-1, NUM_CLASSES), padded_labels.view(-1))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")


Using device: cpu
Epoch 1, Loss: 0.0012
Epoch 2, Loss: 0.0002
Epoch 3, Loss: 0.0001
Epoch 4, Loss: 0.0001
Epoch 5, Loss: 0.0001
Epoch 6, Loss: 0.0000
Epoch 7, Loss: 0.0000
Epoch 8, Loss: 0.0000
Epoch 9, Loss: 0.0000
Epoch 10, Loss: 0.0000
Epoch 11, Loss: 0.0000
Epoch 12, Loss: 0.0000
Epoch 13, Loss: 0.0000
Epoch 14, Loss: 0.0000
Epoch 15, Loss: 0.0000
Epoch 16, Loss: 0.0000
Epoch 17, Loss: 0.0000
Epoch 18, Loss: 0.0000
Epoch 19, Loss: 0.0000
Epoch 20, Loss: 0.0000


In [None]:
dataset = LargeDataset(input_as_tokenized_string[:10], labels[:10])
dataloader = DataLoader(dataset, batch_size=1, collate_fn=collate_fn, shuffle=True, num_workers=0)
print(dataset.data)

for padded_sequences, padded_labels, lengths in dataloader:
    
    padded_sequences=padded_sequences.to(device)
    padded_labels=padded_labels.to(device)
    lengths=lengths.to(device)
    outputs = model(padded_sequences, lengths)
    for out in outputs[0]:
        print(torch.argmax(out))

[['can', 'i', 'have', 'a', 'large', 'bbq', 'pulled', 'pork'], ['large', 'pie', 'with', 'green', 'pepper', 'and', 'with', 'extra', 'peperonni'], ['i', "'d", 'like', 'a', 'large', 'vegetarian', 'pizza'], ['party', 'size', 'stuffed', 'crust', 'pie', 'with', 'american', 'cheese', 'and', 'with', 'mushroom'], ['can', 'i', 'have', 'one', 'personal', 'sized', 'artichoke'], ['pie', 'with', 'banana', 'pepper', 'and', 'peppperonis', 'and', 'extra', 'low', 'fat', 'cheese'], ['i', 'want', 'one', 'regular', 'pizza', 'without', 'any', 'fried', 'onions'], ['i', 'want', 'a', 'stuffed', 'crust', 'pizza', 'with', 'american', 'cheese', 'and', 'a', 'little', 'bit', 'of', 'peperonni'], ['can', 'i', 'have', 'one', 'party', 'sized', 'high', 'rise', 'dough', 'pizza', 'with', 'american', 'cheese', 'and', 'a', 'lot', 'of', 'peperonni'], ['pie', 'with', 'green', 'olive', 'and', 'pesto', 'sauce']]
(['can', 'i', 'have', 'one', 'personal', 'sized', 'artichoke'],)
tensor(18)
tensor(18)
tensor(18)
tensor(8)
tensor(9)


In [None]:
    entity_to_num = {"I_NUMBER": 0, "I_SIZE": 1, "I_TOPPING": 2, "I_STYLE": 3, "I_DRINKTYPE": 4, "I_CONTAINERTYPE": 5, "I_VOLUME": 6, "I_QUANTITY": 7, "B_NUMBER": 8, "B_SIZE": 9, "B_TOPPING": 10, "B_STYLE": 11, "B_DRINKTYPE": 12, "B_CONTAINERTYPE": 13, "B_VOLUME": 14, "B_QUANTITY": 15, "I_NOT_TOPPING": 16, "B_NOT_TOPPING": 17, "NONE": 18}
