In [8]:
from charge_dataset import ChargeDataset
import random
from tokenizer import Tokenizer
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from utils import load_conf

In [11]:
from utils import load_conf
# load configurations
CONF = load_conf()
# add special characters to config
CONF["special_tokens"] = {
    CONF["unk_token"]: 0,
    CONF["pad_token"]: 1,
    CONF["sos_token"]: 2,
    CONF["eos_token"]: 3,
}

In [2]:

TEST_SET = False
SPLIT = 0.1
with open("Charge.txt", encoding='utf-8') as f:
    templates = f.readlines()
    # templates = [line.strip().split(" ") for line in templates]
    templates = [['<sos>'] + line.strip().split(" ") + ['<eos>'] for line in templates]


random.shuffle(templates)
num_val_samples = int(SPLIT * len(templates))
if TEST_SET:
    num_train_samples = len(templates) - 2 * num_val_samples
    train_templates = templates[:num_train_samples]
    val_templates = templates[num_train_samples : num_train_samples + num_val_samples]
    test_templates = templates[num_train_samples + num_val_samples :]
else:
    num_train_samples = len(templates) - num_val_samples
    train_templates = templates[:num_train_samples]
    val_templates = templates[num_train_samples : num_train_samples + num_val_samples]
    test_templates = []


print(f"{len(templates)} total templates")
print(f"{len(train_templates)} training templates")
print(f"{len(val_templates)} validation templates")
print(f"{len(test_templates)} test templates")

67 total templates
61 training templates
6 validation templates
0 test templates


In [3]:
BATCH_SIZE = 32
MAX_SEQ_LEN = 45
NUM_WORDS = 250
steps_per_epoch=50000
validation_steps=30000
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')
x_tokenizer = Tokenizer(num_words=NUM_WORDS, max_seq_len=MAX_SEQ_LEN, for_sentence=True)
y_tokenizer = Tokenizer(num_words=NUM_WORDS, max_seq_len=MAX_SEQ_LEN, for_sentence=False, sos_eos=True)

td = ChargeDataset(train_templates, x_tokenizer, y_tokenizer, steps_per_epoch, DEVICE)
x, y = td.__getitem__(0)

In [7]:
y

tensor([12,  1,  6,  2,  1,  5,  7,  1,  1,  1,  3,  1,  1,  1,  4, 10,  1,  1,
         1, 13])

In [6]:
print(x_tokenizer.seq_to_words(x.cpu().numpy()))
# print(y_tokenizer.seq_to_words(y.argmax(axis=1).cpu().numpy()))
print(y_tokenizer.seq_to_words(y.cpu().numpy()))

['<sos>', 'شارژ', 'رایتل', 'معمولی', 'مبلغ', '5242069', 'میلیارد', 'بوسیله', 'حساب', 'شماره', '2038562210457219', 'سی', 'خط', 'شماره', 'زیحث', 'حوص', 'برای', 'خودم', 'میخری', '<eos>']
['<sos>', 'O', 'operator', 'charge_type', 'O', 'amount', 'unit', 'O', 'O', 'O', 'bnumber', 'O', 'O', 'O', 'pnumber', 'pnumber_post', 'O', 'O', 'O', '<eos>']


In [9]:
def collate_fn(batch):
    PAD_IDX = CONF["special_tokens"][CONF["pad_token"]]
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(src_sample)
        tgt_batch.append(src_sample)
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, tgt_batch

dataloader = DataLoader(td, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [12]:
for i_batch, sample_batched in enumerate(dataloader):
    # print(sample_batched[1])
    print(sample_batched[0].shape)
    if i_batch > 15 :
        break

print(i_batch)
print(sample_batched[0].shape)

torch.Size([32, 21])
torch.Size([32, 23])
torch.Size([32, 20])
torch.Size([32, 19])
torch.Size([32, 21])
torch.Size([32, 25])
torch.Size([32, 24])
torch.Size([32, 23])
torch.Size([32, 23])
torch.Size([32, 26])
torch.Size([32, 22])
torch.Size([32, 23])
torch.Size([32, 23])
torch.Size([32, 22])
torch.Size([32, 23])
torch.Size([32, 21])
torch.Size([32, 22])
16
torch.Size([32, 22])


In [17]:
print(sample_batched[1].size())

torch.Size([16, 45, 14])


In [9]:
len(td.test)

5001

In [13]:
a = [1,2,3]
print(len(a))
random.choice(a)
len(a)

3


3

In [26]:
from charge_dataset import ChargeDataset
import random
from tokenizer import Tokenizer
import torch
from torch.utils.data import DataLoader
from utils import  *
from model import *
from train import train
CONF = load_conf()
CONF["special_tokens"] = {
    # CONF["unk_token"]: 1,
    CONF["pad_token"]: 0,
    # CONF["sos_token"]: 12,
    # CONF["eos_token"]: 13,
}

encoder = Encoder(input_dim = CONF["num_words"]*2,
                  emb_dim = CONF['embed_size'],
                  hid_dim = CONF['hidden_size'],
                  n_layers = CONF['encoder_layers'],
                  kernel_size = CONF['kernel_size'],
                  dropout = CONF['dropout'],
                  device = DEVICE,
                  max_length=CONF['max_length'],
                  )

decoder = Decoder(output_dim = CONF["num_words"]*2,
                  emb_dim = CONF['embed_size'],
                  hid_dim = CONF['hidden_size'],
                  n_layers = CONF['decoder_layers'],
                  kernel_size = CONF['kernel_size'],
                  dropout = CONF['dropout'],
                  tgt_pad_idx = CONF["special_tokens"][CONF['pad_token']],
                  device = DEVICE,
                  max_length=CONF['max_length'],
                  )

convs2s = Seq2Seq(encoder, decoder, device = DEVICE,)
chkpoint = torch.load(os.path.join("checkpoint", 'bestmodel.pt'))
convs2s.load_state_dict(chkpoint)

<All keys matched successfully>

In [48]:
from tokenizer import Tokenizer
import os
import numpy as np
model = torch.load(os.path.join("checkpoint", 'bestmodel.pt'))
# Create Tokenizers
x_tokenizer = Tokenizer(num_words=250, max_seq_len=45,
                        for_sentence=True)
y_tokenizer = Tokenizer(num_words=250, max_seq_len=45,
                        for_sentence=False, sos_eos=True)


sentence = "شارژ شگفت انگیز ایرنسل 50 هزار تومنی"
input_seq = x_tokenizer.words_to_seq(sentence.split(" "))
slots = y_tokenizer.words_to_seq(["O", "charge_type", "charge_type_post",
                                  "operator", "amount", "unit", "unit"])
print(len(slots))
input_seq = torch.tensor([input_seq])
slots = torch.tensor([slots])

output, _ = convs2s(input_seq, slots[:,:-1])
output_dim = output.shape[-1]
output = output.contiguous().view(-1, output_dim)
slots = slots[:,1:].contiguous().view(-1)

45


In [49]:
slots

tensor([2, 8, 6, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [54]:
output.argmax(axis=1).size()

torch.Size([44])