In [1]:
import sys
import os

from nlp_datasets import YahooDataset
from nlp_datasets import BaseDataset
from nlp_datasets import WordSim353Dataset
from nlp_modeltrainers import BaseTrainerModule
from nlp_metrics import Metrics

import torch
import numpy as np
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from tqdm import tqdm
from torch.nn import Module, Linear, Embedding
from torch.utils.data import DataLoader

from scipy.stats import spearmanr
import fastwer

from nltk import word_tokenize

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from pytorch_lightning import LightningModule

dev = 'cpu'
if torch.cuda.is_available():
    dev = "cuda:0"

device = torch.device(dev)

In [2]:
from utils.SubwordHash import subwordhash
from utils.SubwordEmbedding import subwordembedding
from utils.config import *

In [3]:
class Word_Preprocessor:
    def __init__(self):
        pass
    
    def __call__(self, sample):
        tokenized = word_tokenize(sample["input"])
        return [w for w in tokenized]

In [4]:
class Hash_Preprocessor:
    def __init__(self):
        pass
    
    def __call__(self, sample):
        tokenized = word_tokenize(sample["input"])
        tokenized_hashes = self.hash_tokenize(tokenized)
        output_id = self.padding(tokenized_hashes, padding_idx=0)
        
        return {"input": output_id, "target": sample['target']-1}
    
    def hash_tokenize(self, data):
        tokenized_id = [subword_hashes(w) for w in data]
        return tokenized_id
    
    def padding(self, data, padding_idx=0):
        if len(data) >= max_sample_len:
            return torch.tensor(data[:max_sample_len], dtype = torch.long).to(device)
        data.extend(np.array([[padding_idx]*max_sw_hash_len]*(max_sample_len - len(data))))
        return torch.tensor(data, dtype = torch.long).to(device)

In [5]:
class FastTextDataset(Dataset):
    def __init__(self, 
                 word_dataset, 
                 context_size,
                 neg_num,
                 device,
                 uniform = False):
        self.word_dataset = word_dataset
        self.device = device
        w_dict, w_to_id, v_id_f  = self.make_dict()
        self.word_dict = w_dict
        self.vocab_idx_freq = v_id_f
        self.word_to_idx = w_to_id
        self.context_size = context_size
        self.neg_num = neg_num
        self.data = np.array(self.get_training_data(context_size, neg_num, uniform), dtype = object)
    
    def __getitem__(self, idx):
        target = torch.from_numpy(subword_hashes(self.word_dict[int(self.data[idx,0])])).to(self.device)
        context = torch.from_numpy(subword_hashes(self.word_dict[int(self.data[idx,1])])).to(self.device)
        negatives = torch.from_numpy(np.array([subword_hashes(self.word_dict[n_idx]) for n_idx in self.data[idx,2]])).to(self.device)
            
        output = {'input':{ 'target':target, 
                            'context':context, 
                            'negatives':negatives},
                  'target':[]}
        return output
        
    def __len__(self):
        return len(self.data)
    
    def make_dict(self):
        word_dict = {}
        word_to_idx = {}
        vocab_freq = {}
        count = 0
        for sample in self.word_dataset:
            for word in sample:
                word_dict[count] = word
                if word not in vocab_freq:
                    vocab_freq[word] = 0
                    word_to_idx[word] = len(vocab_freq)-1
                vocab_freq[word] += 1
                count += 1
        vocab_idx_freq = {word_to_idx[w]:vocab_freq[w] for w in vocab_freq}
        return word_dict, word_to_idx, vocab_idx_freq
    
    def get_training_data(self, context_size, neg_num, uniform):
        t_pos = 0
        training_data = []
        for sample in tqdm(self.word_dataset):
            for tp in range(len(sample)):
                context_pos = []
                for sign in [-1,1]:
                    for window in range(1, context_size+1):
                        c_pos = t_pos + sign*(window)
                        if c_pos not in range(len(sample)):
                            break
                        context_pos.append(c_pos)
                
                vocab_list, vocab_dist = self.negative_dist(t_pos, context_pos, uniform)
                vocab = list(vocab_list)
                for c_pos in context_pos:
                    negative_idxs = np.random.choice(vocab_list, neg_num, p = vocab_dist, replace = True)
                    training_data.append([t_pos, c_pos, negative_idxs])
                t_pos += 1
        return training_data
    
    def negative_dist(self, t_pos, c_pos, uniform):
        vocab_idx_bag = self.vocab_idx_freq.copy()
        exclude_words = []
        for pos in c_pos:
            exclude_words.append(self.word_to_idx[self.word_dict[pos]])
        exclude_words.append(self.word_to_idx[self.word_dict[t_pos]])
        exclude_words = set(exclude_words)
        for w_idx in exclude_words:
            vocab_idx_bag.pop(w_idx)
            
        if uniform:
            noise_dist = np.array([1/len(vocab_idx_bag)]*len(vocab_idx_bag))
        else:
            word_freq = np.array(list(vocab_idx_bag))
            unigram_dist = word_freq/np.sum(word_freq)
            noise_dist = unigram_dist**(0.75)/np.sum(unigram_dist**(0.75))
            
        return list(vocab_idx_bag), noise_dist

In [6]:
class FastTextTrainer(BaseTrainerModule):
    def __init__(self, word_embedding, device, debug = False):
        super().__init__()
        self.word_embedding = word_embedding
        self._device = device
        self.debug = debug
        
    def forward(self, target, context, negatives):
        # (batch_size, ngram_size, embedding_dim)
        target_vec = self.word_embedding(target)
        # (batch_size, ngram_size, embedding_dim)
        context_vec = self.word_embedding(context)
        # (batch_size, negatives_num, ngram_size, embedding_dim)
        negatives_vec = self.word_embedding(negatives)
        
        return target_vec, context_vec, negatives_vec
    
    def entropy_loss_func(self, vec1, vec2):
        """
        vec1: (batch_size, ngram_size, embedding_dim)
        vec2: (batch_size, ngram_size, embedding_dim)
        """
        if self.debug: 
            print('vec1.shape:\t', vec1.shape)
            print('vec2.shape:\t', vec2.shape)
            
        vec_product = torch.mul(vec1, vec2).to(self._device)
        if self.debug: print('vec_product.shape:\t', vec_product.shape)
        """ (batch_size, ngram_size, emb_dim) """
        
        vec_product_sum = vec_product.sum(1).sum(1).to(self._device)
        if self.debug: print('vec_product_sum.shape:\t', vec_product_sum.shape)
        """ (batch_size) """
        
        positive_loss = F.logsigmoid(vec_product_sum).to(self._device)
        if self.debug: print("positive_loss:\t", positive_loss)
        return positive_loss
    
    def negative_loss_func(self, t_vec, n_vec):
        if self.debug:
            print("n_vec.shape", n_vec.shape)
            print("t_vec.shape", t_vec.shape)
        BMM = torch.einsum('bnhd,bhdp->bnhp', n_vec, t_vec.unsqueeze(3)).to(self._device)
        """ (bs, neg_num, ngram_size, 1)"""
        if self.debug: print('BMM.shape:\t', BMM.shape)
        neg_loss = F.logsigmoid(BMM).squeeze(3).sum(2).sum(1).to(self._device)
        return neg_loss
        
    def loss_func(self, t_vec, c_vec, n_vec):
        positive_loss = self.entropy_loss_func(t_vec, c_vec)
        negative_loss = self.negative_loss_func(t_vec, n_vec)
        
#         print('positive_loss: {} negative_loss: {}'.format(positive_loss.neg(), negative_loss.neg()))
        if self.debug: print('positive_loss', positive_loss.mean())
        total_loss = -(positive_loss + negative_loss).mean()
        
        return total_loss
    
    def cal_loss(self, outputs, targets=None):
        t_vec, c_vec, n_vec = outputs

        t_vec = t_vec.float()
        c_vec = t_vec.float()
        n_vec = n_vec.float()
        return self.loss_func(t_vec, c_vec, n_vec)
    
    def cal_metrics(self, outputs, targets=None):
        return None

In [7]:
dataset = YahooDataset(max_samples=max_samples, local_dir="../datasets/small_yahoo_dataset")

In [8]:
subword_hashes = subwordhash(dataset.train)

word_num = subword_hashes.word_num
max_sw_hash_len = subword_hashes.max_hash
max_sample_len = subword_hashes.max_sample
print(max_sw_hash_len)
print(max_sample_len)

100%|█████████████████████████████████████████████████████████████████████████████████| 9000/9000 [00:08<00:00, 1066.91it/s]


377
995


In [9]:
word_embedding = subwordembedding(num_embeddings = num_emb, 
                                  embedding_dim = emb_dim, 
                                  device = device, 
                                  padding_idx = 0,
                                  sumfirst = False)

In [10]:
preprocessor = Hash_Preprocessor()
dataset.train.set_preprocessor(preprocessor)
dataset.val.set_preprocessor(preprocessor)
dataset.test.set_preprocessor(preprocessor)

dataloader_train = DataLoader(dataset.train, batch_size = batch_size, shuffle = True)
dataloader_val = DataLoader(dataset.val, batch_size = batch_size, shuffle = False)
dataloader_test = DataLoader(dataset.test, batch_size = batch_size, shuffle = False)

In [11]:
word_dataset = YahooDataset(max_samples=max_samples, local_dir="../datasets/small_yahoo_dataset_text")

preprocessor = Word_Preprocessor()
word_dataset.train.set_preprocessor(preprocessor)
word_dataset.val.set_preprocessor(preprocessor)
word_dataset.test.set_preprocessor(preprocessor)

In [12]:
fasttext_train_dataset = FastTextDataset(word_dataset.train, context_size, neg_num, device, uniform)
fasttext_val_dataset = FastTextDataset(word_dataset.val, context_size, neg_num, device, uniform)
fasttext_test_dataset = FastTextDataset(word_dataset.test, context_size, neg_num, device, uniform)

fasttext_loader_train = DataLoader(fasttext_train_dataset, batch_size = batch_size, shuffle = True)
fasttext_loader_val = DataLoader(fasttext_val_dataset, batch_size = batch_size, shuffle = False)
fasttext_loader_test = DataLoader(fasttext_test_dataset, batch_size = batch_size, shuffle = False)

100%|█████████████████████████████████████████████████████████████████████████████████████| 900/900 [01:40<00:00,  8.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.05it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:47<00:00,  9.27it/s]


In [13]:
logger = pl.loggers.CSVLogger("../Trained_Models/SubwordEmbedding/logs", name = f"Fasttext_{emb_dim}_{dist}")
checkpoint = pl.callbacks.ModelCheckpoint(
    dirpath = "../Trained_Models/SubwordEmbedding/checkpoints",
    filename = 'best_model_latesum',
    monitor = 'val_loss',
    mode = 'min'
)
class LitProgressBar(pl.callbacks.ProgressBar):
    def init_validation_tqdm(self):
        bar = tqdm(disable=True)
        return bar
bar = LitProgressBar()

In [14]:
fasttext_model = FastTextTrainer(word_embedding, 
                                 device,
                                 debug = False)
torch.cuda.empty_cache()
trainer = pl.Trainer(logger = logger, 
                     gpus = '0', 
                     callbacks = [checkpoint, bar], 
                     num_sanity_val_steps = 0, 
                     auto_lr_find = True,
                     max_epochs = max_epochs)
# trainer = pl.Trainer(logger=logger, callbacks=[checkpoint, bar], max_epochs=100)
trainer.fit(fasttext_model, 
            train_dataloader = fasttext_loader_train, 
            val_dataloaders = fasttext_loader_val)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | word_embedding | subwordembedding | 100 M 
----------------------------------------------------
100 M     Trainable params
0         Non-trainable params
100 M     Total params
400.000   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [15]:
trainer.test(test_dataloaders = fasttext_loader_test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 1332.9322509765625}
--------------------------------------------------------------------------------


[{'test_loss': 1332.9322509765625}]

In [16]:
torch.save(word_embedding.state_dict(), emb_path+'_latesum')

In [17]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070'

In [18]:
'a'+'b'

'ab'

In [19]:
bs = 3
ngr_s = 7
ngnum = 2
emb_dim = 5

neg = torch.randn(bs, ngnum, ngr_s, emb_dim)
target = torch.randn(bs, ngr_s, emb_dim)
a = torch.einsum('bnhd,bhdp->bnhp', neg, target.unsqueeze(3))
# lsm = F.logsigmoid(a).squeeze(3).sum(1).sum(1)
# product = torch.mul(target,target)
print(a.shape)

# neg = torch.randn(bs, ngnum, emb_dim)
# target = torch.randn(bs, emb_dim)
# b = torch.bmm(neg,target.unsqueeze(2))
# b.shape

torch.Size([3, 2, 7, 1])


In [20]:
# class test(Module):
    
#     def __init__(self):
#         super().__init__()
#         self.embed = Embedding(20, 5)
#         self.embed.weight.data.uniform_(-0.05,0.05)
        
#     def forward(self):
#         idx = torch.tensor([0,1], dtype = torch.long)
#         multiplier = self.embed(idx)
#         target = torch.tensor([[i for i in range(5)]], dtype = torch.long)
#         print("idx.shape: ", idx.shape)
#         print("idx: ", idx)
#         print("\nmultiplier.shape: ", multiplier.shape)
#         print("multiplier: ", multiplier)
#         print("\ntarget.shape: ", target.shape)
#         print("target: ", target)
        
#         product = torch.mul(target, multiplier)
#         print("\nproduct.shape: ", product.shape)
#         print("product: ", product)
#         emb_sum = torch.sum(product, dim=1)
#         print("\nsum.shape: ", emb_sum.shape)
#         print("sum: ", emb_sum)
        
#         noise_dist = torch.ones(20)
#         ng = torch.multinomial(noise_dist,5, replacement = True)
        
#         return

In [21]:
print(int(True))

1


In [22]:
'''
100d
- uniform: 140 test
- noise: 
50d
- uniform 127 test
'''

'\n100d\n- uniform: 140 test\n- noise: \n50d\n- uniform 127 test\n'