In [None]:
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import pickle
from gensim.models import Word2Vec, FastText  # For Word2Vec model
import random
import numpy as np
import utils

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device is', device)

# --- Notebook-wide settings -------------------------------------------------
# NUM_LAYERS and DROP_OUT are read by LargeWordLSTM when it builds its nn.LSTM.
# The remaining constants are not referenced in the visible cells; presumably
# they mirror the training configuration -- confirm before changing them.
NUM_CLASSES = 23
BATCH_SIZE = 256
EPOCHS = 10

HIDDEN_SIZE = 768
VECTOR_SIZE = 768
TRAINING_SIZE = 500_000

NUM_LAYERS = 1
DROP_OUT = 0

In [None]:
import csv

def get_test_dataset_from_csv(file_path):
    """Read the test CSV and return its rows as ``(id, order)`` tuples.

    Args:
        file_path: Path to a UTF-8 encoded CSV file whose header row contains
            at least the columns ``id`` and ``order``.

    Returns:
        A list of ``(id, order)`` tuples, one per data row, in file order.

    Raises:
        KeyError: If the header lacks an ``id`` or ``order`` column.
        OSError: If the file cannot be opened.
    """
    # newline='' is the mode the csv module documents for file objects: the
    # reader then does its own line-ending handling, so line breaks embedded
    # in quoted fields come back unchanged instead of being translated.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        return [(row["id"], row["order"]) for row in csv.DictReader(file)]

# Load the (id, order) rows, then tokenize only the order text of each row.
testset = get_test_dataset_from_csv("../data/del.csv")
test_as_tokenized_string = [utils.tokenize_string(row[1]) for row in testset]

In [None]:
# Pair every row id with the output of utils.tokenize_string_bert for its
# order text; these (id, tokens) pairs are what jsons_to_csv receives later.
unstemmed_test_as_tokenized_string = [
    (row[0], utils.tokenize_string_bert(row[1])) for row in testset
]

In [12]:
class LargeWordLSTM(nn.Module):
    """Embedding -> bidirectional LSTM -> linear layer, scoring every token.

    The LSTM depth and dropout come from the module-level ``NUM_LAYERS`` and
    ``DROP_OUT`` constants, read when an instance is constructed.

    NOTE(review): the pickled models loaded later in this notebook are
    presumably instances of this class, so its name and attribute names
    (``embedding``, ``lstm``, ``fc``) should stay as they are -- confirm.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (the LSTM input size).
        hidden_size: Hidden size of the LSTM in each direction.
        num_classes: Number of scores produced for every token.
    """

    def __init__(self, vocab_size, embedding_dim,hidden_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=NUM_LAYERS,
            dropout=DROP_OUT,
            bidirectional=True,
            batch_first=True,
        )
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Map a batch-first tensor of token indices to per-token class scores."""
        token_vectors = self.embedding(x)
        hidden_states, _ = self.lstm(token_vectors)
        return self.fc(hidden_states)

def _load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# SECURITY: pickle.load can execute arbitrary code while deserializing. Only
# run this cell against artifact files that come from a trusted source.
# NOTE(review): the two models are presumably LargeWordLSTM instances, in
# which case that class must already be defined when they are loaded -- confirm.
is_model = _load_pickle("../models/is_model.pk1")
ner_model = _load_pickle("../models/ner_model.pk1")
# Token -> index mapping; test_collate_fn looks tokens up in it.
word_to_index = _load_pickle("../models/word_to_index.pk1")

In [13]:
class TestLargeDataset(Dataset):
    """Map-style dataset that hands out the items of ``data`` unchanged."""

    def __init__(self, data):
        # Stored by reference: nothing is copied or preprocessed here.
        self.data = data

    def __len__(self):
        """Return how many items ``data`` holds."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        return self.data[idx]

def test_collate_fn(batch):
    sequences = batch
    embeddings=[]
    for seq in sequences:
        x=[]
        for token in seq:
            if token not in word_to_index:
                x.append(word_to_index['i'])
            else:
                x.append(word_to_index[token])
        embeddings.append(x)
    sequences=embeddings
    sequences = [torch.tensor(seq) for seq in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=True)
    return padded_sequences


In [None]:
test_dataset = TestLargeDataset(test_as_tokenized_string)
# batch_size must stay 1: the inference loops read only outputs[0], i.e. the
# first (and only) sequence of every batch.
dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=test_collate_fn, shuffle=False, num_workers=0)
ner_model_output = []

# The inputs are moved to `device` inside the loop, so the unpickled model is
# moved there explicitly as well; otherwise its parameters could sit on a
# different device than the inputs and the forward pass would fail.
ner_model.to(device)
ner_model.eval()
with torch.no_grad():
    for padded_sequences in dataloader:
        padded_sequences = padded_sequences.to(device)
        outputs = ner_model(padded_sequences)
        # One predicted label (arg-max of the scores) per entry of outputs[0].
        ner_model_output.append(
            [torch.argmax(token_scores).item() for token_scores in outputs[0]]
        )

In [15]:
is_model_output = []

# The inputs are moved to `device` inside the loop, so the unpickled model is
# moved there explicitly as well; otherwise its parameters could sit on a
# different device than the inputs and the forward pass would fail.
is_model.to(device)
is_model.eval()
with torch.no_grad():
    for padded_sequences in dataloader:
        padded_sequences = padded_sequences.to(device)
        outputs = is_model(padded_sequences)
        # The loader yields one sequence per batch, so only outputs[0] is read;
        # one predicted label (arg-max of the scores) per entry of it.
        is_model_output.append(
            [torch.argmax(token_scores).item() for token_scores in outputs[0]]
        )

# Post-processing passes from utils, applied in this order; the last one also
# receives the NER predictions.
is_model_output = utils.intent_post_processing(is_model_output)
is_model_output = utils.intent_post_processing_extra(is_model_output)
is_model_output = utils.intent_post_processing2(is_model_output, ner_model_output)

In [16]:
import csv
import reformat_results
def jsons_to_csv(input_tokens,ner_out, is_out, output_path="../data/sample2.csv"):
    """Write one ``id,output`` CSV row per example.

    For every index ``i`` below ``len(ner_out)`` the tokens and both label
    sequences are passed to ``utils.convert_to_json``; its result goes through
    ``reformat_results.parse_tree`` and is written next to the example id.

    Args:
        input_tokens: Sequence of ``(id, tokens)`` pairs.
        ner_out: Per-example NER label sequences; its length sets the number
            of rows written.
        is_out: Per-example intent label sequences, indexed like ``ner_out``.
        output_path: Destination CSV file. Defaults to the location that was
            previously hard-coded, so existing three-argument calls behave
            as before.

    Raises:
        IndexError: If ``input_tokens`` or ``is_out`` has fewer entries than
            ``ner_out``.
    """
    # The same file name is handed to utils.convert_to_json for every row.
    # NOTE(review): what that helper does with it is not visible here.
    file1 = "file1.json"
    with open(output_path, mode="w", newline='', encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["id","output"])
        for i in range(len(ner_out)):
            test_tree = utils.convert_to_json(file1, input_tokens[i][1], ner_out[i], is_out[i])
            top = reformat_results.parse_tree(test_tree)
            csv_writer.writerow([input_tokens[i][0], top])

# Write the results: (id, tokens) pairs plus the NER and intent predictions.
jsons_to_csv(
    input_tokens=unstemmed_test_as_tokenized_string,
    ner_out=ner_model_output,
    is_out=is_model_output,
)