In [1]:
import numpy as np
idx_intent = np.load('data/total_idx_intent.npy', allow_pickle=True).item()
idx_titles = np.load('data/total_idx_titles.npy', allow_pickle=True).item()

intents = []
titles = []
for k,v in idx_intent.items():
    intents.append(v)
    titles.append(idx_titles[k])
    
# split train, test set = 8:2
test_num = int(len(intents)*0.2)

train_intent = intents[:-test_num]
train_titles = titles[:-test_num]
test_intent = intents[-test_num:]
test_titles = titles[-test_num:]

train_intent.pop(1031)
train_titles.pop(1031)
test_intent.pop(816)
test_titles.pop(816)

'Nestle Hot Cocoa Mix Rich Chocolate - 70/0.75oz. Envelopes, Community Coffee Whole Bean Coffee, French Roast, 12-Ounce Bags (Pack of 3)'

In [2]:
data = {'titles':train_titles,
       'intents':train_intent}

In [3]:
vali_data = {'titles':test_titles,
       'intents':test_intent}

In [4]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List

vocab_transform = {}

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# helper function to yield list of tokens
def yield_tokens(data_iter: list, language: str) -> List[str]:
    # language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield tokenizer(data_sample)

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in ['titles', 'intents']:
    # Training data Iterator
    # train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_data = data[ln]
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_data, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set UNK_IDX as the default index. This index is returned when the token is not found.
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
for ln in ['titles', 'intents']:
    vocab_transform[ln].set_default_index(UNK_IDX)

In [5]:
len(vocab_transform['intents'])

3085

In [6]:
SRC_LANGUAGE = 'titles'
TGT_LANGUAGE = 'intents'

In [7]:
from torch import nn
# Defining the Encoder part of the model
class Encoder(nn.Module):
    
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p, bidirectional=True)
        
#         self.h_projection = nn.Linear(in_features=self.hidden_size * 2, out_features=self.hidden_size, bias=False)
#         self.c_projection = nn.Linear(in_features=self.hidden_size * 2, out_features=self.hidden_size, bias=False)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding)
        hidden_list, cell_list = [], []
        for i in range(self.num_layers):
            add_dire = hidden[i,:,:] + hidden[i+self.num_layers,:,:]
            hidden_list.append(add_dire)
            add_dire = cell[i,:,:] + cell[i+self.num_layers,:,:]
            cell_list.append(add_dire)
        final_hidden = torch.stack(hidden_list)
        final_cell = torch.stack(cell_list)
        # outputs shape: (seq_length, N, hidden_size)
#         init_decoder_hidden = self.h_projection(torch.cat([hidden[0], hidden[1]], dim=1))
#         init_decoder_cell = self.c_projection(torch.cat([cell[0], cell[1]], dim=1))
        
        return final_hidden, final_cell

In [8]:
# Defining the Decoder part

class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)

        # predictions shape: (1, N, length_target_vocabulary) to send it to
        # loss function we want it to be (N, length_target_vocabulary) so we're
        # just gonna remove the first dim
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

In [9]:
import random

# Defining the complete model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
        # TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
        # target_vocab_size = len(english.vocab)
        target_vocab_size = len(vocab_transform[TGT_LANGUAGE])

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)
#         print(hidden.size())
        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]
#         print(x.size(), hidden.size(), cell.size())
        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [10]:
import torch

# Hyperparameters
num_epochs = 30
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(vocab_transform[SRC_LANGUAGE])
input_size_decoder = len(vocab_transform[TGT_LANGUAGE])
output_size = len(vocab_transform[TGT_LANGUAGE])
encoder_embedding_size = 300
decoder_embedding_size = 300

hidden_size = 512
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

In [11]:
encoder_net = Encoder(input_size_encoder, 
                      encoder_embedding_size,
                      hidden_size,num_layers, 
                      enc_dropout).to(device)


decoder_net = Decoder(input_size_decoder, 
                      decoder_embedding_size,
                      hidden_size,output_size,num_layers, 
                      dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

print(model)

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(18264, 300)
    (rnn): LSTM(300, 512, num_layers=2, dropout=0.5, bidirectional=True)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(3085, 300)
    (rnn): LSTM(300, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=3085, bias=True)
  )
)


In [12]:
from torch.utils import data

class TransData(data.Dataset):
    def __init__(self, titles, intents, is_training=True):
        """
        Dataset formatter adapted pair-wise algorithms
        Parameters
        ----------
        neg_set : List,
        is_training : bool,
        """
        super(TransData, self).__init__()
        self.titles = titles
        self.intents = intents

        # self.features_fill = []

        # for u, i, r, js in neg_set:
        #     u, i, r = int(u), int(i), np.float32(1)
        #     if is_training:
        #         for j in js:
        #             self.features_fill.append([u, i, j, r])
        #     else:
        #         self.features_fill.append([u, i, i, r])

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        # features = self.features_fill
        # user = features[idx][0]
        # item_i = features[idx][1]
        # item_j = features[idx][2]
        # label = features[idx][3]

        return self.titles[idx], self.intents[idx]

In [13]:
data4Train = TransData(train_titles, train_intent)

In [14]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# src and tgt language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(tokenizer, #Tokenization token_transform[ln]
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [15]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    # train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_iter = data4Train
    train_dataloader = DataLoader(train_iter, batch_size=batch_size, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:-1, :]

#         src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

#         logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        logits = model(src, tgt_input)
        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)


def evaluate(model):
    model.eval()
    losses = 0

    # val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_iter = TransData(test_titles, test_intent)
    val_dataloader = DataLoader(val_iter, batch_size=batch_size, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:-1, :]

        logits = model(src, tgt_input)

#         logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(val_dataloader)

In [16]:
from timeit import default_timer as timer
NUM_EPOCHS = num_epochs

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer)
    end_time = timer()
    val_loss = evaluate(model)
#     val_loss = 0.0
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))




Epoch: 1, Train loss: 6.051, Val loss: 5.718, Epoch time = 13.060s
Epoch: 2, Train loss: 5.619, Val loss: 5.709, Epoch time = 12.925s
Epoch: 3, Train loss: 5.479, Val loss: 5.686, Epoch time = 13.015s
Epoch: 4, Train loss: 5.358, Val loss: 5.683, Epoch time = 13.053s
Epoch: 5, Train loss: 5.239, Val loss: 5.689, Epoch time = 13.041s
Epoch: 6, Train loss: 5.135, Val loss: 5.736, Epoch time = 12.933s
Epoch: 7, Train loss: 5.052, Val loss: 5.760, Epoch time = 12.928s
Epoch: 8, Train loss: 4.960, Val loss: 5.842, Epoch time = 12.921s
Epoch: 9, Train loss: 4.880, Val loss: 5.915, Epoch time = 12.912s
Epoch: 10, Train loss: 4.803, Val loss: 5.959, Epoch time = 12.980s
Epoch: 11, Train loss: 4.711, Val loss: 6.021, Epoch time = 12.982s
Epoch: 12, Train loss: 4.600, Val loss: 6.084, Epoch time = 12.871s
Epoch: 13, Train loss: 4.493, Val loss: 6.196, Epoch time = 12.890s
Epoch: 14, Train loss: 4.386, Val loss: 6.319, Epoch time = 13.018s
Epoch: 15, Train loss: 4.256, Val loss: 6.378, Epoch time

In [17]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, max_len, start_symbol):
    src = src.to(device)
#     src_mask = src_mask.to(device)

    hidden, cell = model.encoder(src)
    ys = torch.ones(1,1).fill_(start_symbol).type(torch.long).to(device)
#     print(ys.size(),hidden.size(), cell.size())
    for i in range(max_len-1):
#         print(ys.shape, hidden.shape, cell.shape)
        with torch.no_grad():
            hidden = hidden.to(device)
            cell = cell.to(device)
    #         tgt_mask = (generate_square_subsequent_mask(ys.size(0))
    #                     .type(torch.bool)).to(DEVICE)
            out, hidden, cell = model.decoder(ys[-1], hidden, cell)
            best_guess = out.argmax(1).item()
#             print('best_guess')
#         out = out.transpose(0, 1)
#         prob = model.generator(out[:, -1])
#         _, next_word = torch.max(prob, dim=1)
        next_word = best_guess

        ys = torch.cat([ys,
                        torch.ones(1,1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    tgt_tokens = greedy_decode(model, src, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
#     print(tgt_tokens)
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [18]:
# for i in range(len(test_intent)):
#     print(test_intent[i])
#     print(translate(model, test_titles[i]))
#     print('----')
#     break

In [18]:
preds = []
for i in range(len(test_intent)):
    # print(test_intent[i])
    pred = translate(model, test_titles[i])
    preds.append(pred)

In [19]:
import datasets
rouge = datasets.load_metric('./rouge.py')


In [20]:
rouge_output = rouge.compute(predictions=preds, references=test_intent, rouge_types=["rouge2"])["rouge2"].mid

In [21]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.0083, 0.005, 0.0059)

In [22]:
rouge_output = rouge.compute(predictions=preds, references=test_intent, rouge_types=["rouge1"])["rouge1"].mid

In [23]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.0799, 0.044, 0.0521)

In [24]:
rouge_output = rouge.compute(predictions=preds, references=test_intent, rouge_types=["rougeL"])["rougeL"].mid

In [25]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.0793, 0.0433, 0.0514)

In [18]:
all_bundle = np.load('data/cloth_evaluation.npy', allow_pickle=True).item()

intents = []
titles = []
for k,v in all_bundle.items():
    intents.append(v[2])
    titles.append(v[1])

In [20]:
finalres = []
for i in range(len(intents)):
    pred = translate(model, titles[i])
    finalres.append((pred, intents[i]))

In [21]:
finalres

[(' ', 'Leggings'),
 (' all for for ', 'These all essential for everyday person'),
 (" bracelets of the , of to : Transactional with Transactional is a a or service service they 're , different different of searchers – even with similar habits could could seeking something",
  'same bracelets of different colors'),
 (' and photography ', 'wallet and card holder similar purpose'),
 (' ', 'Top'),
 (' ', 'coats'),
 (' they ', 'women bottom'),
 (" 's in , , of : : : : : is a a or or service they 're , different different of searchers – even with",
  "Women's Trousers (different style)"),
 (' style of charms ', 'different style of earrings'),
 (" 's , , , with it it ", "Men's cufflinks"),
 (' and clothing ', 'Footwear and socks'),
 (' Product use use ', 'All Product use this same time '),
 (' are all hand hand jewelry ', 'They are all hand jewelry '),
 (' are all hand hand jewelry ',
  'Women Rings Items 1,3,4 and 5 are Alternative products'),
 (' ', 'sandal'),
 (" 's fashion fashion ", "Wo

In [None]:
np.save('transformer_cloth.npy', finalres)