In [1]:
import sys
import os

from nlp_datasets import YahooDataset
from nlp_datasets import BaseDataset
from nlp_datasets import WordSim353Dataset
from nlp_modeltrainers import BaseTrainerModule
from nlp_metrics import Metrics

import torch
import numpy as np
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from tqdm import tqdm
from torch.nn import Module, Linear, Embedding
from torch.utils.data import DataLoader

from scipy.stats import spearmanr
import fastwer
import re
import math

import nltk
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords 

lem = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from pytorch_lightning import LightningModule

dev = 'cpu'
if torch.cuda.is_available():
    dev = "cuda:0"

device = torch.device(dev)

In [2]:
from utils.SubwordHash import subwordhash, Hash_Preprocessor
from utils.SubwordHash import Word_Preprocessor
from utils.SubwordEmbedding import subwordembedding
from utils.config import *
from utils.replace_dict import rep
from utils.dict_freq import get_freq_dict

In [3]:
class FastTextDataset(Dataset):
    def __init__(self, 
                 word_dataset, 
                 context_size,
                 neg_num,
                 device,
                 uniform = False):
        self.word_dataset = word_dataset
        self.device = device
        w_dict, w_to_id, v_id_f  = self.make_dict()
        self.word_dict = w_dict
#         self.vocab_freq = v_f
        self.vocab_idx_freq = v_id_f
        self.word_to_idx = w_to_id
        self.context_size = context_size
        self.neg_num = neg_num
        print('Making instances (t,c,[ns])...')
        self.data = np.array(self.get_training_data(context_size, neg_num, uniform), dtype = object)
    
    def __getitem__(self, idx):
        target = torch.from_numpy(subword_hashes(self.word_dict[int(self.data[idx,0])])).to(self.device)
        context = torch.from_numpy(subword_hashes(self.word_dict[int(self.data[idx,1])])).to(self.device)
        negatives = torch.from_numpy(np.array([subword_hashes(self.word_dict[n_idx]) for n_idx in self.data[idx,2]])).to(self.device)
            
        output = {'input':{ 'target':target, 
                            'context':context, 
                            'negatives':negatives},
                  'target':[]}
        return output
        
    def __len__(self):
        return len(self.data)
    
    def make_dict(self):
        word_dict = {}
        word_to_idx = {}
        vocab_freq = {}
        count = 0
        for sample in self.word_dataset:
            for word in sample:
                word_dict[count] = word
                if word not in vocab_freq:
                    vocab_freq[word] = 0
                    word_to_idx[word] = len(vocab_freq)-1
                vocab_freq[word] += 1
                count += 1
        vocab_idx_freq = {word_to_idx[w]:vocab_freq[w] for w in vocab_freq}
        return word_dict, word_to_idx, vocab_idx_freq
    
    def get_training_data(self, context_size, neg_num, uniform):
        t_pos = 0
        training_data = []
        for sample in tqdm(self.word_dataset):
            for tp in range(len(sample)):
                context_pos = []
                for sign in [-1,1]:
                    for window in range(1, context_size+1):
                        c_pos = t_pos + sign*(window)
                        if c_pos not in range(len(sample)):
                            break
                        context_pos.append(c_pos)
                
                vocab_list, vocab_dist = self.negative_dist(t_pos, context_pos, uniform)
                vocab = list(vocab_list)
                for c_pos in context_pos:
                    negative_idxs = np.random.choice(vocab_list, neg_num, p = vocab_dist, replace = True)
                    training_data.append([t_pos, c_pos, negative_idxs])
                t_pos += 1
        return training_data
    
    def negative_dist(self, t_pos, c_pos, uniform):
        vocab_idx_bag = self.vocab_idx_freq.copy()
        exclude_words = []
        for pos in c_pos:
            exclude_words.append(self.word_to_idx[self.word_dict[pos]])
        exclude_words.append(self.word_to_idx[self.word_dict[t_pos]])
        exclude_words = set(exclude_words)
        for w_idx in exclude_words:
            vocab_idx_bag.pop(w_idx)
            
        if uniform:
            noise_dist = np.array([1/len(vocab_idx_bag)]*len(vocab_idx_bag))
        else:
            word_freq = np.array(list(vocab_idx_bag))
            unigram_dist = word_freq/np.sum(word_freq)
            noise_dist = unigram_dist**(0.75)/np.sum(unigram_dist**(0.75))
            
        return list(vocab_idx_bag), noise_dist

In [4]:
class FastTextTrainer(BaseTrainerModule):
    def __init__(self, word_embedding, device, debug = False):
        super().__init__()
        self.word_embedding = word_embedding
        self._device = device
        self.debug = debug
        
    def forward(self, target, context, negatives):
        # (batch_size, embedding_dim)
        target_vec = self.word_embedding(target)
        # (batch_size, embedding_dim)
        context_vec = self.word_embedding(context)
        # (batch_size, negatives_num, embedding_dim)
        negatives_vec = self.word_embedding(negatives)
        
        return target_vec, context_vec, negatives_vec
    
    def entropy_loss_func(self, vec1, vec2):
        """
        vec1: (batch_size, embedding_dim)
        vec2: (batch_size, embedding_dim)
        """
        if self.debug: 
            print('vec1.shape:\t', vec1.shape)
            print('vec2.shape:\t', vec2.shape)
            
        vec_product = torch.mul(vec1, vec2).to(self._device)
        if self.debug: print('vec_product.shape:\t', vec_product.shape)
        if self.debug: print('vec_product:\t', vec_product)
        """ (batch_size, emb_dim) """
        
        vec_product_sum = vec_product.sum(dim=1).to(self._device)
        if self.debug: print('vec_product_sum.shape:\t', vec_product_sum.shape)
        if self.debug: print('vec_product_sum:\t', vec_product_sum)
        """ (batch_size) """
        
        positive_loss = F.logsigmoid(vec_product_sum).to(self._device)
        if self.debug: print("positive_loss:\t", positive_loss)
        return positive_loss
    
    def negative_loss_func(self, t_vec, n_vec):
        BMM = torch.bmm(n_vec.neg(), t_vec.unsqueeze(2)).to(self._device)
        """ (bs, neg_num, 1)"""
#         print('BMM.shape:\t', BMM.shape)
        neg_loss = F.logsigmoid(BMM).squeeze(2).sum(1).to(self._device)
        return neg_loss
        
    def loss_func(self, t_vec, c_vec, n_vec):
        positive_loss = self.entropy_loss_func(t_vec, c_vec)
        negative_loss = self.negative_loss_func(t_vec, n_vec)
        
#         print('positive_loss: {} negative_loss: {}'.format(positive_loss.neg(), negative_loss.neg()))
        total_loss = -(positive_loss + negative_loss).mean()
        
        return total_loss
    
    def cal_loss(self, outputs, targets=None):
        t_vec, c_vec, n_vec = outputs

        t_vec = t_vec.float()
        c_vec = t_vec.float()
        n_vec = n_vec.float()
        return self.loss_func(t_vec, c_vec, n_vec)
    
    def cal_metrics(self, outputs, targets=None):
        return None

In [5]:
word_dataset = YahooDataset(max_samples=max_samples, local_dir="../datasets/small_yahoo_dataset_text")
freq_dict = get_freq_dict(word_dataset.train)

100%|████████████████████████████████████████████████████████████████████████████████████| 900/900 [00:03<00:00, 267.60it/s]


In [6]:
word_dataset.train[1]['input']

"Why has TAVFX fallen so much yesterday?\nIt appears that the question period has expired. Other answerers deserve to receive the 'best answer' vote for their attempt at answering your question.  If you have received an answer that meets your needs, _please_ choose one of those as a 'best answer' as soon as you can; otherwise, this question will go to an automatic vote. If you haven't received a good answer for your question, you may want to consider the following,\\n\\n1) Remove this version of your question and re-post your question. Newer questions get more activity on Yahoo! Answers than old ones.\\n2) If you do re-post your question, consider why it wasn't answered the first time. Could it be more specific? Could it be worded better? Were there grammatical or spelling errors? Was it in the best category? Can you provide more helpful details?\\n\\nIf it doesn't seem likely that re-posting your question will help you, then here's a listing of my favorite 'answer sites'. Maybe one of

In [7]:
freq_dict

{'if': 439,
 'you': 1220,
 'hold': 30,
 'it': 1032,
 'to': 2160,
 'tight': 2,
 'will': 272,
 'fall': 17,
 'out': 173,
 'of': 1587,
 'your': 418,
 'hand': 23,
 'loosely': 3,
 'break': 14,
 'what': 564,
 'be': 3131,
 'an': 278,
 'egg': 12,
 'why': 167,
 'have': 807,
 'tavfx': 1,
 'so': 217,
 'much': 90,
 'yesterday': 1,
 'appear': 12,
 'that': 948,
 'the': 3779,
 'question': 94,
 'period': 11,
 'expire': 4,
 'other': 179,
 'answerers': 1,
 'deserve': 6,
 'receive': 14,
 'best': 152,
 'answer': 110,
 'vote': 18,
 'for': 807,
 'their': 164,
 'attempt': 10,
 'at': 265,
 'meet': 27,
 'need': 100,
 'please': 23,
 'choose': 28,
 'one': 960,
 'those': 55,
 'as': 413,
 'a': 1996,
 'soon': 14,
 'can': 475,
 'otherwise': 6,
 'this': 361,
 'go': 186,
 'automatic': 2,
 'not': 657,
 'good': 149,
 'may': 71,
 'want': 148,
 'consider': 40,
 'follow': 31,
 'remove': 14,
 'version': 14,
 'and': 1672,
 're': 74,
 'post': 16,
 'newer': 1,
 'get': 295,
 'more': 238,
 'activity': 4,
 'on': 563,
 'yahoo': 70,

In [8]:
train_preprocessor = Word_Preprocessor(freq_dict = freq_dict, 
                                       train = True,
                                       subsampling = subsampling)
word_dataset.train.set_preprocessor(train_preprocessor)

preprocessor = Word_Preprocessor(freq_dict = freq_dict, 
                                 train = False,
                                 subsampling = subsampling)
word_dataset.val.set_preprocessor(preprocessor)
word_dataset.test.set_preprocessor(preprocessor)

In [9]:
subword_hashes = subwordhash(word_dataset.train)

word_num = subword_hashes.word_num
max_sw_hash_len = subword_hashes.max_hash
max_sample_len = subword_hashes.max_sample
print(max_sw_hash_len)
print(max_sample_len)

100%|████████████████████████████████████████████████████████████████████████████████████| 900/900 [00:01<00:00, 486.55it/s]


356
811


In [10]:
word_dataset.train[20]

['what',
 's',
 'the',
 'longest',
 'english',
 'word',
 'without',
 'a',
 'vowel',
 'in',
 'it',
 'and',
 'what',
 'do',
 'that',
 'word',
 'mean',
 'if',
 'it',
 's',
 'not',
 'a',
 'common',
 'word',
 'the',
 'longest',
 'word',
 'without',
 'a',
 'vowel',
 'be',
 'rhythm',
 'it',
 'be',
 'reference',
 'on',
 'below',
 'web',
 'site',
 'where',
 'you',
 'can',
 'find',
 'more',
 'fun',
 'facts']

In [11]:
word_embedding = subwordembedding(num_embeddings = num_emb, 
                                  embedding_dim = emb_dim, 
                                  device = device, 
                                  padding_idx = 0)

In [12]:
fasttext_train_dataset = FastTextDataset(word_dataset.train, context_size, neg_num, device, uniform)
fasttext_val_dataset = FastTextDataset(word_dataset.val, context_size, neg_num, device, uniform)
fasttext_test_dataset = FastTextDataset(word_dataset.test, context_size, neg_num, device, uniform)

fasttext_loader_train = DataLoader(fasttext_train_dataset, batch_size = batch_size, shuffle = True)
fasttext_loader_val = DataLoader(fasttext_val_dataset, batch_size = batch_size, shuffle = False)
fasttext_loader_test = DataLoader(fasttext_test_dataset, batch_size = batch_size, shuffle = False)

  0%|                                                                                       | 1/900 [00:00<01:53,  7.95it/s]

Making instances (t,c,[ns])...


100%|█████████████████████████████████████████████████████████████████████████████████████| 900/900 [01:38<00:00,  9.12it/s]
  0%|                                                                                               | 0/100 [00:00<?, ?it/s]

Making instances (t,c,[ns])...


100%|█████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 26.87it/s]
  0%|                                                                                              | 0/1000 [00:00<?, ?it/s]

Making instances (t,c,[ns])...


100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:41<00:00,  9.87it/s]


In [13]:
logger = pl.loggers.CSVLogger("../Trained_Models/SubwordEmbedding/logs", name = f"Fasttext_{emb_dim}_{dist}")
checkpoint = pl.callbacks.ModelCheckpoint(
    dirpath = "../Trained_Models/SubwordEmbedding/checkpoints",
    filename = 'best_model_{epoch}_loss-{val_loss:.2f}',
    monitor = 'val_loss',
    mode = 'min'
)
class LitProgressBar(pl.callbacks.ProgressBar):
    def init_validation_tqdm(self):
        bar = tqdm(disable=True)
        return bar
bar = LitProgressBar()

In [14]:
fasttext_model = FastTextTrainer(word_embedding, device, debug = False)
torch.cuda.empty_cache()
trainer = pl.Trainer(logger = logger, 
                     gpus = '0', 
                     callbacks = [checkpoint, bar], 
                     num_sanity_val_steps = 0, 
                     auto_lr_find = True,
                     max_epochs = max_epochs)
# trainer = pl.Trainer(logger=logger, callbacks=[checkpoint, bar], max_epochs=100)
trainer.fit(fasttext_model, 
            train_dataloader = fasttext_loader_train, 
            val_dataloaders = fasttext_loader_val)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type             | Params
----------------------------------------------------
0 | word_embedding | subwordembedding | 200 M 
----------------------------------------------------
200 M     Trainable params
0         Non-trainable params
200 M     Total params
800.000   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [15]:
trainer.test(test_dataloaders = fasttext_loader_test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 186.8179168701172}
--------------------------------------------------------------------------------


[{'test_loss': 186.8179168701172}]

In [16]:
torch.save(word_embedding.state_dict(), emb_path)

In [17]:
# best_model_path = checkpoint.best_model_path
# checkpoint.best_model_path

In [18]:
# cp = torch.load(best_model_path)

In [19]:
# cp['state_dict']['word_embedding.subword_embedding.weight']

In [20]:
# word_embedding.state_dict()

In [21]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070'

In [22]:
max([1,4,10,55,12,13,22])

55

In [23]:
g = {1:2, 2:3}
sum(list(g.values()))

5

In [24]:
# class test(Module):
    
#     def __init__(self):
#         super().__init__()
#         self.embed = Embedding(20, 5)
#         self.embed.weight.data.uniform_(-0.05,0.05)
        
#     def forward(self):
#         idx = torch.tensor([0,1], dtype = torch.long)
#         multiplier = self.embed(idx)
#         target = torch.tensor([[i for i in range(5)]], dtype = torch.long)
#         print("idx.shape: ", idx.shape)
#         print("idx: ", idx)
#         print("\nmultiplier.shape: ", multiplier.shape)
#         print("multiplier: ", multiplier)
#         print("\ntarget.shape: ", target.shape)
#         print("target: ", target)
        
#         product = torch.mul(target, multiplier)
#         print("\nproduct.shape: ", product.shape)
#         print("product: ", product)
#         emb_sum = torch.sum(product, dim=1)
#         print("\nsum.shape: ", emb_sum.shape)
#         print("sum: ", emb_sum)
        
#         noise_dist = torch.ones(20)
#         ng = torch.multinomial(noise_dist,5, replacement = True)
        
#         return

In [25]:
print(int(True))

1


In [26]:
'''
100d
- uniform: 140 test
- noise: 
50d
- uniform 127 test
'''

'\n100d\n- uniform: 140 test\n- noise: \n50d\n- uniform 127 test\n'