In [1]:
import sys
import utils
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import pickle
from gensim.models import Word2Vec
import random

In [None]:
# ---- Notebook configuration -------------------------------------------
# Output sizes of the two token taggers trained further down.
NER_NUM_CLASSES = 23
IS_NUM_CLASSES = 7

# Optimisation settings shared by both training loops.
BATCH_SIZE = 256
EPOCHS = 12

# Model dimensions. VECTOR_SIZE is passed as the LSTM input size, so it has
# to equal the dimensionality of the vectors returned by the embedding model.
HIDDEN_SIZE = 768
VECTOR_SIZE = 200
NUM_LAYERS = 1

# Number of (shuffled) training examples kept from the full training file.
TRAINING_SIZE = 300000

# Seed every RNG this notebook relies on so that "Restart & Run All" gives the
# same shuffles (random.shuffle) and the same initial model weights (torch).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device is', device)
print("-------------------------------------------------")

In [None]:
# Load a previously saved gensim Word2Vec model from disk. Its keyed vectors
# (`emb_model.wv`) are what data_generator uses to turn tokens into vectors.
# NOTE(review): the file name suggests the embeddings were trained on
# train+dev text — confirm against the script that produced the model.
model_path = '../models/word2vec_with_dev.model'
emb_model = Word2Vec.load(model_path)
print('Load Embedding Model Done')
print("-------------------------------------------------")

In [None]:
# Read the full training file through the project helper, then shuffle it in
# place so that the later `[:TRAINING_SIZE]` slice is a random subset.
train_json_path = '../data/fixed_PIZZA_train.json'
complete_data = utils.read_data(train_json_path)
random.shuffle(complete_data)

print('Read Training Data Done')
print("-------------------------------------------------")

In [None]:
# Read the dev file, split it with the project helper, and derive the per-token
# NER / IS label sequences together with the tokenized inputs.
dev_json_path = "../data/fixed_PIZZA_dev.json"
dev_data = utils.read_data(dev_json_path)
dev_corpus, dev_top = utils.get_dev_dataset(dev_data)

dev_labelled = utils.label_complete_dev(dev_corpus, dev_top)
(
    ner_dev_labels,
    is_dev_labels,
    dev_as_tokenized_string,
) = dev_labelled

print('Read Dev Data Done')
print("-------------------------------------------------")

In [None]:
# Keep the first TRAINING_SIZE examples of the already-shuffled training data
# and label them the same way the dev set is labelled.
data = complete_data[:TRAINING_SIZE]
corpus, top, decoupled = utils.get_train_dataset(data)

# NOTE(review): the dev-named helper is reused for the training split and
# `decoupled` is not used anywhere in the visible cells — confirm both are intended.
train_labelled = utils.label_complete_dev(corpus, top)
(
    ner_train_labels,
    is_train_labels,
    input_as_tokenized_string,
) = train_labelled

print('Parse Data Done')

In [None]:
# Bundle each example as (ner_labels, is_labels, tokens) so that one shuffle
# keeps the three parallel sequences aligned.
zipped_train = [*zip(ner_train_labels, is_train_labels, input_as_tokenized_string)]
zipped_dev = [*zip(ner_dev_labels, is_dev_labels, dev_as_tokenized_string)]

# Train and dev examples are pooled into a single training set and shuffled.
train_dev = [*zipped_train, *zipped_dev]
random.shuffle(train_dev)

# Un-zip back into three parallel lists.
(
    ner_total_train_labels,
    is_total_train_labels,
    total_train_as_tokenized_strings,
) = (list(column) for column in zip(*train_dev))

print("Shuffle Train+Dev Done")

In [None]:
import numpy as np
def data_generator(data, labels, batch_size, keyed_vectors=None):
    """Yield padded (embeddings, labels) mini-batches for sequence tagging.

    Every example is included: when ``len(data)`` is not a multiple of
    ``batch_size`` the last batch is simply smaller than the others.

    Args:
        data: Sequence of token sequences; ``data[i]`` is the token list of
            example ``i``.
        labels: Sequence of integer label sequences, parallel to ``data``.
        batch_size: Maximum number of examples per yielded batch (>= 1).
        keyed_vectors: Object mapping a token to its embedding vector through
            ``keyed_vectors[token]``. Defaults to the notebook-level
            ``emb_model.wv``.

    Yields:
        Tuple ``(padded_sequences, padded_labels)`` built with
        ``pad_sequence(..., batch_first=True)``: the embeddings are padded
        with zeros, the labels with ``-1`` so that padding positions can be
        ignored by the loss.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or if ``data`` and
            ``labels`` differ in length.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(data) != len(labels):
        raise ValueError(
            f"data and labels must have the same length, got {len(data)} and {len(labels)}"
        )
    if keyed_vectors is None:
        # Fall back to the embedding model loaded at notebook level.
        keyed_vectors = emb_model.wv

    for start in range(0, len(data), batch_size):
        stop = start + batch_size

        # Stack the per-token vectors of each sequence into one array first;
        # building a tensor from a single ndarray avoids the slow
        # list-of-arrays conversion path.
        sequences = [
            torch.tensor(np.array([keyed_vectors[token] for token in tokens]))
            for tokens in data[start:stop]
        ]
        labels_batch = [
            torch.tensor(label, dtype=torch.long) for label in labels[start:stop]
        ]

        padded_sequences = pad_sequence(sequences, batch_first=True)
        padded_labels = pad_sequence(labels_batch, batch_first=True, padding_value=-1)

        yield padded_sequences, padded_labels

class LargeWordLSTM(nn.Module):
    """Bidirectional LSTM followed by a per-token linear classifier.

    Args:
        embedding_dim: Size of each input token vector (LSTM input size).
        hidden_size: Hidden size of the LSTM in each direction.
        num_classes: Number of output classes per token.
        num_layers: Number of stacked LSTM layers. Defaults to the
            notebook-level ``NUM_LAYERS`` constant when omitted.
    """

    def __init__(self, embedding_dim, hidden_size, num_classes, num_layers=None):
        super().__init__()
        if num_layers is None:
            # Keep the original behaviour for existing callers.
            num_layers = NUM_LAYERS
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward hidden states are concatenated, hence "* 2".
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return per-token class scores for a batch-first input tensor.

        ``x`` is fed to a ``batch_first=True`` LSTM, so it is expected as
        (batch, seq_len, embedding_dim); the result is
        (batch, seq_len, num_classes) unnormalised scores.
        """
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out)

# ---- Train the NER tagger ----------------------------------------------
ner_model = LargeWordLSTM(embedding_dim=VECTOR_SIZE, hidden_size=HIDDEN_SIZE, num_classes=NER_NUM_CLASSES).to(device)
# ignore_index=-1 matches the padding value data_generator uses for labels,
# so padded positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.NAdam(ner_model.parameters(), lr=0.001)
ner_model.train()
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    num_batches = 0
    for padded_sequences, padded_labels in data_generator(total_train_as_tokenized_strings, ner_total_train_labels, BATCH_SIZE):
        padded_sequences = padded_sequences.to(device)
        padded_labels = padded_labels.to(device)

        optimizer.zero_grad()
        outputs = ner_model(padded_sequences)
        # Flatten (batch, seq, classes) -> (batch * seq, classes) for the loss.
        loss = criterion(outputs.reshape(-1, NER_NUM_CLASSES), padded_labels.reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise RuntimeError("data_generator produced no batches; check the training data and BATCH_SIZE")
    # Report the mean loss over the epoch rather than the last batch only.
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / num_batches:.4f}")

# Persisting the model is currently disabled. If it is re-enabled, prefer
# torch.save(ner_model.state_dict(), ...) or at least a `with open(...)` block
# so the file handle is closed.
# pickle.dump(ner_model , open('ner_model_train_dev.pk1' , 'wb'))
print('Finish Training, ner_model was not saved (saving is disabled)')

In [None]:
# ---- Train the IS tagger -----------------------------------------------
is_model = LargeWordLSTM(embedding_dim=VECTOR_SIZE, hidden_size=HIDDEN_SIZE, num_classes=IS_NUM_CLASSES).to(device)
# ignore_index=-1 matches the padding value data_generator uses for labels,
# so padded positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.NAdam(is_model.parameters(), lr=0.001)
# Explicit for symmetry with the NER training cell.
is_model.train()
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    num_batches = 0
    for padded_sequences, padded_labels in data_generator(total_train_as_tokenized_strings, is_total_train_labels, BATCH_SIZE):
        padded_sequences = padded_sequences.to(device)
        padded_labels = padded_labels.to(device)

        optimizer.zero_grad()
        outputs = is_model(padded_sequences)
        # Flatten (batch, seq, classes) -> (batch * seq, classes) for the loss.
        loss = criterion(outputs.reshape(-1, IS_NUM_CLASSES), padded_labels.reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise RuntimeError("data_generator produced no batches; check the training data and BATCH_SIZE")
    # Report the mean loss over the epoch rather than the last batch only.
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / num_batches:.4f}")

# Persisting the model is currently disabled. If it is re-enabled, prefer
# torch.save(is_model.state_dict(), ...) or at least a `with open(...)` block
# so the file handle is closed.
# pickle.dump(is_model , open('is_model_train_dev.pk1' , 'wb'))
print('Finish Training, is_model was not saved (saving is disabled)')