In [41]:
import torch
import torch.nn as nn
import re
import pickle
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader
from nltk.stem import PorterStemmer
# Module-level Porter stemmer instance.
# NOTE(review): nothing in the cells visible here references `stemmer`; confirm it is still needed.
stemmer = PorterStemmer()

In [32]:
# Load Data
def load_data(file_x, file_y, encoding="utf-8"):
    """Read a pair of line-aligned, comma-separated files into token lists.

    Line ``i`` of ``file_x`` is split on commas into the tokens of sentence
    ``i``; line ``i`` of ``file_y`` is split the same way into its labels.
    Leading/trailing whitespace of each file as a whole is stripped before
    splitting into lines.

    Args:
        file_x: Path of the sentence file.
        file_y: Path of the label file.
        encoding: Text encoding used to read both files.

    Returns:
        ``(sentences, labels)``: two lists of lists of strings. Both are empty
        when the files contain no text.

    Raises:
        ValueError: If the two files do not contain the same number of lines,
            because pairing them would silently drop or misalign examples.
    """
    def read_rows(path):
        with open(path, 'r', encoding=encoding) as handle:
            text = handle.read().strip()
        # "".split('\n') yields [''], which would invent one bogus empty row.
        if not text:
            return []
        return [line.split(',') for line in text.split('\n')]

    sentences = read_rows(file_x)
    labels = read_rows(file_y)
    if len(sentences) != len(labels):
        raise ValueError(
            f"{file_x} has {len(sentences)} lines but {file_y} has {len(labels)} lines"
        )
    return sentences, labels

Folder = "/kaggle/input/yarabn5ls/OrderLabeler"

# Fixed seed so the shuffle below, and therefore the train/test split, is the
# same on every run of the notebook.
SPLIT_SEED = 42
# After shuffling, the first 25% of the pooled examples is used for training
# and the last 5% for testing; the examples in between are left unused.
TRAIN_END_FRACTION = 0.25
TEST_START_FRACTION = 0.95

print("Loading Data Started")
train_sentences, train_labels = load_data(f'{Folder}/x_train.txt', f'{Folder}/y_train.txt')
test_sentences, test_labels = load_data(f'{Folder}/x_dev.txt', f'{Folder}/y_dev.txt')

# Pool the provided train and dev files, then re-split them after shuffling.
sentences = train_sentences + test_sentences
labels = train_labels + test_labels

entries = list(zip(sentences, labels))
# A dedicated Random instance keeps the split reproducible without reseeding
# the global `random` generator.
random.Random(SPLIT_SEED).shuffle(entries)

train = entries[: int(TRAIN_END_FRACTION * len(entries))]
test = entries[int(TEST_START_FRACTION * len(entries)):]

# Unzip the (sentence, labels) pairs back into parallel lists.
train_sentences = [x for x, _ in train]
train_labels = [y for _, y in train]
test_sentences = [x for x, _ in test]
test_labels = [y for _, y in test]


print("Loading Data Done")

Loading Data Started
Loading Data Done


In [33]:
# Peek at the first example of each split: its tokens first, then its labels.
for split in (train_sentences, train_labels):
    print(split[0])

for split in (test_sentences, test_labels):
    print(split[0])

['sm_num', 'pizzas', 'with', 'pepperoni', 'and', 'sm_num', 'pies', 'with', 'gold', 'leaf', 'and', 'low', 'fat', 'cheese']
['B_PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'E_PIZZAORDER', 'NONE', 'B_PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'E_PIZZAORDER']
["i'd", 'like', 'a', 'pizza', 'with', 'cheddar', 'camembert', 'and', 'caramelized', 'onions', 'hold', 'the', 'fruit']
['NONE', 'NONE', 'B_PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'PIZZAORDER', 'E_PIZZAORDER']


In [37]:
from gensim.models import Word2Vec
# Word2Vec hyper-parameters, kept together so they are easy to tune.
w2v_settings = dict(
    vector_size=100,   # size of each word vector
    window=5,          # context window size
    min_count=1,       # minimum word frequency
    sg=1,              # 1 = skip-gram, 0 = CBOW
    epochs=10,         # number of training epochs
)
# Train the embeddings on the tokenized training sentences.
vocab_model = Word2Vec(sentences=train_sentences, **w2v_settings)

In [47]:
# Index 0 is reserved for padding and index 1 for unknown words, so the
# Word2Vec vocabulary entries start at index 2.
word2idx = {word: idx + 2 for idx, word in enumerate(vocab_model.wv.index_to_key)}
word2idx['<PAD>'] = 0  # Padding token
word2idx['<UNK>'] = 1  # Unknown-word token; must not share the padding index
# Reverse lookup from index back to token, built from the mapping defined above.
idx_to_word = {idx: word for word, idx in word2idx.items()}

In [48]:
# Row i holds the Word2Vec vector of the token whose index is i. Tokens the
# Word2Vec model does not know (e.g. the special tokens) keep an all-zero row.
# The width comes from the trained model itself, so this cell does not depend
# on a variable defined in a later cell; the height covers the largest index.
embedding_values = np.zeros((max(word2idx.values()) + 1, vocab_model.wv.vector_size))
for word, idx in word2idx.items():
    if word in vocab_model.wv:
        embedding_values[idx] = vocab_model.wv[word]
embedding_matrix = torch.tensor(embedding_values, dtype=torch.float32)

In [49]:
# Build Vocabulary
FREQ_THRESH = 0  # NOTE(review): not read by any code visible in this notebook.

def build_vocab(sentences, vocab_path=None, encoding="utf-8"):
    """Build a word-to-index mapping from a one-word-per-line vocabulary file.

    Args:
        sentences: Unused; kept so existing calls keep working.
        vocab_path: Path of the vocabulary file. Defaults to
            ``f"{Folder}/vocabulary.txt"`` (``Folder`` is a notebook global).
        encoding: Text encoding used to read the file.

    Returns:
        dict mapping each distinct non-empty word to an index starting at 2
        (assigned in sorted word order), plus ``'<PAD>': 0`` and ``'<UNK>': 1``.
    """
    if vocab_path is None:
        vocab_path = f"{Folder}/vocabulary.txt"
    vocab = set()
    with open(vocab_path, "r", encoding=encoding) as fv:
        for line in fv:
            voc = line.strip()
            # Skip blank lines so the empty string never becomes a vocabulary entry.
            if voc:
                vocab.add(voc)
    word2idx = {word: idx + 2 for idx, word in enumerate(sorted(vocab))}
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    return word2idx

def build_label_vocab(labels):
    """Map every distinct label to a dense integer id.

    Args:
        labels: Iterable of label sequences (one sequence per sentence).

    Returns:
        dict from label to id, where ids follow the sorted order of the labels.
    """
    distinct = set()
    for label_list in labels:
        distinct.update(label_list)
    return dict(zip(sorted(distinct), range(len(distinct))))

print("Building Vocab Started")
# The file-based word vocabulary is disabled; only the label vocabulary is built here.
#word2idx = build_vocab(train_sentences + test_sentences)
label2idx = build_label_vocab([*train_labels, *test_labels])
# Inverse mapping: id -> label.
idx2label = dict(zip(label2idx.values(), label2idx.keys()))
print("Building Vocab Done")

Building Vocab Started
Building Vocab Done


In [50]:
# Show the label-to-id mapping, followed by the number of entries in word2idx.
print(label2idx)
print(f"{len(word2idx)}")

{'B_DRINKORDER': 0, 'B_PIZZAORDER': 1, 'DRINKORDER': 2, 'E_DRINKORDER': 3, 'E_PIZZAORDER': 4, 'NONE': 5, 'PIZZA': 6, 'PIZZAORDER': 7}
96056


In [51]:
# Prepare Dataset
class SequenceDataset(Dataset):
    """Fixed-length (token ids, label ids) pairs for sequence labeling.

    Sentences and label sequences are converted to ids once, at construction.
    On access each pair is truncated or right-padded to ``max_len``.

    Args:
        sentences: List of token lists.
        labels: List of label lists, parallel to ``sentences``.
        word2idx: Token-to-id mapping; must contain ``'<UNK>'``. Tokens missing
            from it are mapped to the ``'<UNK>'`` id.
        label2idx: Label-to-id mapping; must contain ``pad_label``.
        max_len: Length every returned sequence is truncated/padded to.
        pad_label: Label whose id fills the padded label positions.
    """

    def __init__(self, sentences, labels, word2idx, label2idx, max_len=50, pad_label='NONE'):
        unk_id = word2idx['<UNK>']
        self.sentences = [[word2idx.get(word, unk_id) for word in sentence] for sentence in sentences]
        self.labels = [[label2idx[label] for label in label_list] for label_list in labels]
        self.max_len = max_len
        # Resolve the pad ids from the mappings given to this instance, so
        # __getitem__ never reads a module-level variable.
        self.pad_token_id = word2idx.get('<PAD>', 0)
        self.pad_label_id = label2idx[pad_label]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids)`` tensors, each of length ``max_len``."""
        sentence = self.sentences[idx]
        label = self.labels[idx]
        # A negative repeat count yields an empty list, so over-long sequences
        # are only truncated.
        sentence = sentence[:self.max_len] + [self.pad_token_id] * (self.max_len - len(sentence))
        label = label[:self.max_len] + [self.pad_label_id] * (self.max_len - len(label))
        return torch.tensor(sentence), torch.tensor(label)

print("Preparing Dataset Started")
# Wrap each split in a SequenceDataset that shares the same word and label vocabularies.
train_dataset, test_dataset = (
    SequenceDataset(split_sentences, split_labels, word2idx, label2idx)
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (test_sentences, test_labels),
    )
)

# Only the training loader shuffles its examples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=512)
test_loader = DataLoader(dataset=test_dataset, batch_size=1024)
print("Preparing Dataset Done")

Preparing Dataset Started
Preparing Dataset Done


In [55]:
# Define Model
class RNNSequenceLabeling(nn.Module):
    """Two-layer bidirectional LSTM tagger over pretrained word embeddings.

    NOTE(review): a later cell of this notebook defines another class with the
    same name (an ``nn.RNN`` variant); when the cells run in order, that later
    definition replaces this one.

    Args:
        vocab_size: Stored as ``self.vocab_size``; not otherwise used.
        embedding_dim: Width of the embedding vectors fed to the LSTM.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of label classes.
        pretrained_embeddings: Optional 2-D float tensor of embedding weights.
            Defaults to the notebook global ``embedding_matrix``.
        padding_idx: Optional index of the padding token. Defaults to
            ``word2idx['<PAD>']`` from the notebook globals.

    Raises:
        ValueError: If ``embedding_dim`` differs from the embedding width.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 pretrained_embeddings=None, padding_idx=None):
        super(RNNSequenceLabeling, self).__init__()
        # Fall back to the notebook-level objects only when nothing is passed in.
        if pretrained_embeddings is None:
            pretrained_embeddings = embedding_matrix
        if padding_idx is None:
            padding_idx = word2idx['<PAD>']
        if pretrained_embeddings.shape[1] != embedding_dim:
            raise ValueError(
                f"embedding_dim={embedding_dim} does not match pretrained embedding "
                f"width {pretrained_embeddings.shape[1]}"
            )

        self.vocab_size = vocab_size
        # from_pretrained keeps the weights frozen unless freeze=False is passed.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=padding_idx)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=2,
                            bidirectional=True,
                            batch_first=True)
        self.lstm_dropout = nn.Dropout(p=0.25)
        # Forward and backward states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Map a batch of token-id sequences to per-token class scores."""
        embedded = self.embedding(x)

        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.lstm_dropout(lstm_out)

        predicted = self.fc(lstm_out)
        return predicted

In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNSequenceLabeling(nn.Module):
    """Two-layer unidirectional RNN tagger over pretrained word embeddings.

    Args:
        input_dim: Stored as ``self.vocab_size``; not otherwise used.
        embedding_size: Width of the embedding vectors fed to the RNN.
        hidden_dim: Hidden size of the RNN.
        output_dim: Number of label classes.
        pretrained_embeddings: Optional 2-D float tensor of embedding weights.
            Defaults to the notebook global ``embedding_matrix``.
        padding_idx: Optional index of the padding token. Defaults to
            ``word2idx['<PAD>']`` from the notebook globals.

    Raises:
        ValueError: If ``embedding_size`` differs from the embedding width.
    """

    def __init__(self, input_dim, embedding_size, hidden_dim, output_dim,
                 pretrained_embeddings=None, padding_idx=None):
        super(RNNSequenceLabeling, self).__init__()
        # Fall back to the notebook-level objects only when nothing is passed in.
        if pretrained_embeddings is None:
            pretrained_embeddings = embedding_matrix
        if padding_idx is None:
            padding_idx = word2idx['<PAD>']
        if pretrained_embeddings.shape[1] != embedding_size:
            raise ValueError(
                f"embedding_size={embedding_size} does not match pretrained embedding "
                f"width {pretrained_embeddings.shape[1]}"
            )

        self.vocab_size = input_dim
        # from_pretrained keeps the weights frozen unless freeze=False is passed.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=padding_idx)
        self.rnn = nn.RNN(embedding_size, hidden_dim,
                          num_layers=2,
                          bidirectional=False,
                          batch_first=True)
        self.rnn_dropout = nn.Dropout(p=0.25)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a batch of token-id sequences to per-token class scores."""
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        rnn_out = self.rnn_dropout(rnn_out)
        predicted = self.fc(rnn_out)
        return predicted

In [59]:
# Training
print("Training Started")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

vocab_size = len(word2idx)
embedding_dim = 100
hidden_dim = 128
output_dim = len(label2idx)

# Seed torch so weight initialisation is repeatable across runs.
TORCH_SEED = 42
torch.manual_seed(TORCH_SEED)

model = RNNSequenceLabeling(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    # Batch-specific names keep the loop from overwriting the module-level
    # `sentences` / `labels` lists created when the data was loaded.
    for batch_sentences, batch_labels in train_loader:
        batch_sentences, batch_labels = batch_sentences.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        predictions = model(batch_sentences)

        # Flatten (batch, seq, classes) -> (batch*seq, classes) so the loss is
        # computed per token position.
        loss = criterion(predictions.reshape(-1, output_dim), batch_labels.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader):.25f}")
print("Training Done")

Training Started
Epoch 1/10, Loss: 0.0874930511477092914818243
Epoch 2/10, Loss: 0.0325627161391700281622974
Epoch 3/10, Loss: 0.0291640496486797913544553
Epoch 4/10, Loss: 0.0276005555829033251413751
Epoch 5/10, Loss: 0.0266389916899303601238724
Epoch 6/10, Loss: 0.0259778965202470610296182
Epoch 7/10, Loss: 0.0254920858703553687707455
Epoch 8/10, Loss: 0.0251200397623081994602501
Epoch 9/10, Loss: 0.0248817181618263306985828
Epoch 10/10, Loss: 0.0245961157170434784047242
Training Done


In [23]:
# Saving Results
# Remember where the model lives so it can be moved back after saving; the
# cells that follow still send their batches to the training device.
_first_param = next(model.parameters(), None)
original_device = _first_param.device if _first_param is not None else torch.device("cpu")

# Move the model to CPU before pickling (presumably so it can be unpickled
# on a machine without a GPU).
model.to("cpu")
try:
    # NOTE(review): this pickles the whole module object, so loading requires
    # the same class definition to be importable; only unpickle trusted files.
    with open("model", "wb") as modelFile:
        pickle.dump(model, modelFile)
    with open("word2idx", "wb") as word2idxFile:
        pickle.dump(word2idx, word2idxFile)
    # NOTE(review): the file is named "label2idx.idx" but stores idx2label;
    # the name is kept so existing loaders keep working.
    with open("label2idx.idx", "wb") as idx2labelFile:
        pickle.dump(idx2label, idx2labelFile)
finally:
    # Restore the device even if a dump fails.
    model.to(original_device)

In [60]:
# Evaluation
from sklearn.metrics import classification_report
# Flat, per-position predictions and gold labels collected by the most recent
# evaluate() call; read afterwards to build the classification report.
all_predictions = []
all_labels = []
def evaluate(model, loader):
    """Compute per-position accuracy of ``model`` over ``loader``.

    Side effect: ``all_predictions`` and ``all_labels`` are emptied and then
    filled with the flattened predictions/labels of this call.

    NOTE: every position of every batch is scored, so any padded positions
    supplied by the loader count towards the accuracy as well.

    Args:
        model: Module mapping a batch of token ids to per-position class scores.
        loader: Iterable of ``(sentences, labels)`` tensor batches.

    Returns:
        Fraction of positions predicted correctly; ``0.0`` for an empty loader.
    """
    # Start from empty buffers so repeated calls do not accumulate old results.
    all_predictions.clear()
    all_labels.clear()

    # Send batches to wherever the model actually lives, not to a global device.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for batch_sentences, batch_labels in loader:
            batch_sentences = batch_sentences.to(model_device)
            batch_labels = batch_labels.to(model_device)
            predictions = model(batch_sentences).argmax(dim=-1)
            total += batch_labels.numel()
            correct += (predictions == batch_labels).sum().item()
            all_predictions.extend(predictions.reshape(-1).cpu().tolist())
            all_labels.extend(batch_labels.reshape(-1).cpu().tolist())

    return correct / total if total else 0.0
accuracy = evaluate(model, test_loader)

# Pass the label ids and their names explicitly, in id order, so every report
# row is tied to the right class even when a class is missing from this split.
report_label_ids = sorted(label2idx.values())
report_label_names = [name for name, _ in sorted(label2idx.items(), key=lambda item: item[1])]
report = classification_report(
    all_labels,
    all_predictions,
    labels=report_label_ids,
    target_names=report_label_names,
    zero_division=0,  # report 0 instead of warning when a class is never predicted
)

print("Testing Started")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(report)
print("Testing Done")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Testing Started
Test Accuracy: 98.77%

Classification Report:
              precision    recall  f1-score   support

B_DRINKORDER       0.90      0.74      0.81     92097
B_PIZZAORDER       0.83      0.94      0.88    124067
  DRINKORDER       0.98      0.96      0.97    163200
E_DRINKORDER       0.96      0.98      0.97     92097
E_PIZZAORDER       0.90      0.90      0.90    124917
        NONE       1.00      1.00      1.00   4850071
       PIZZA       0.00      0.00      0.00         1
  PIZZAORDER       0.98      0.97      0.97    695550

    accuracy                           0.99   6142000
   macro avg       0.82      0.81      0.81   6142000
weighted avg       0.99      0.99      0.99   6142000

Testing Done


  _warn_prf(average, modifier, msg_start, len(result))
