In [1]:
import sys
import os

# from Utilities.NLP_ModelTrainers.SentenceClassification.MulticlassSentenceClassificationModule import MulticlassSentenceClassificationTrainerModule

from nlp_datasets import YahooDataset
from nlp_datasets import BaseDataset
# from nlp_datasets import SpellingSimilarityDataset
from nlp_modeltrainers import BaseTrainerModule
from nlp_modeltrainers import MulticlassSentenceClassificationTrainerModule
# from nlp_modeltrainers import VectorCosineSimilarityTrainerModule


import torch
import fastwer
from string import ascii_letters as letters
L = list(letters)
import numpy as np
import pytorch_lightning as pl
import matplotlib.pyplot as plt

from tqdm import tqdm
from torch.nn import Module, Linear, Embedding, ModuleList
from torch.utils.data import DataLoader

from nltk import word_tokenize

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from pytorch_lightning import LightningModule

if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = 'cpu'

device = torch.device(dev)

In [2]:
class subwordhash:
    def __init__(self, dataset):
        word_num, hash_len, sample_len = self.average_subword_num(dataset)
        self.word_num = word_num
        self.max_hash = hash_len
        self.max_sample = sample_len
        
    def __call__(self, word):
        return self.subword_hashes(word, max_hash_num = self.max_hash)
    
    def fnv1a(self, txt, K = int(2e6 + 1)):
        # 64 bit fnv-1a
        txt = bytes(txt, 'utf-8')
        hval = 0xcbf29ce484222325
        fnv_prime = 0x100000001b3
        for c in txt:
            hval = hval ^ c
            hval = (hval * fnv_prime) % K
        return hval + 1        

    def subword_hashes(self, word, max_hash_num = None, get_len = False):
        sub_hash = []
        tword = '<' + word + '>'
        sub_hash.append(self.fnv1a(tword))
        for n in range(3,7):
            for i in range(len(tword)-n+1):
                sub_hash.append(self.fnv1a(tword[i:i+n]))
                if len(sub_hash) == max_hash_num:
                    return np.array(sub_hash[:max_hash_num])
        if max_hash_num is not None:
            sub_hash.extend([0]*(max_hash_num - len(sub_hash)))
        if get_len:
            return len(sub_hash)
        return np.array(sub_hash)

    def average_subword_num(self, dataset):
        max_sample_len = 0
        hash_len_dist = {}
        len_dist = {}
        for sample in tqdm(dataset):
            tokens = word_tokenize(sample["input"])
            if len(tokens) not in len_dist:
                len_dist[len(tokens)] = 0
            len_dist[len(tokens)] += 1
            max_sample_len = max(max_sample_len, len(tokens))
            
        for L in list(len_dist):
            hash_len_dist[self.subword_hashes('a'*L, get_len = True)] = len_dist[L]
        
        total = 0
        weighted_hash_len = []
        for L in list(hash_len_dist):
            total += hash_len_dist[L]
            weighted_hash_len.append(hash_len_dist[L]*L)
        avg = sum(weighted_hash_len)/total
        
        return int(total), int(avg), max_sample_len
        

In [3]:
class SubwordEmbedding(Module):
    def __init__(self, num_embeddings, embedding_dim, device, padding_idx = 0):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.device =  device
        self.padding_idx = padding_idx
        
        self.subword_embedding = Embedding(num_embeddings = num_embeddings, 
                                           embedding_dim = embedding_dim, 
                                           padding_idx = padding_idx)
    def forward(self, token_ids):
        # token_ids: (batch_size, word_num, hash_size)
        # return: (batch_size, word_num, embedding_dim)
        debug = False
        
        subword_embed = self.subword_embedding(token_ids).to(self.device)
        # (batch_size, word_num, hash_size, embedding_dim)
        if debug: print("subword_embed.shape: ", subword_embed.shape)
        
        word_embed = subword_embed.sum(dim = len(subword_embed.shape) -2).to(self.device)
        # (batch_size, word_num, embedding_dim)
        if debug: print("word_embed.shape: ", word_embed.shape)
        
        if debug: print("\n########################################\n")
        return word_embed
        

In [4]:
class Hash_Preprocessor:
    def __init__(self):
        pass
    
    def __call__(self, sample):
        tokenized = word_tokenize(sample["input"])
        tokenized_hashes = self.hash_tokenize(tokenized)
        output_id = self.padding(tokenized_hashes, padding_idx=0)
        
        return {"input": output_id, "target": sample['target']-1}
    
    def hash_tokenize(self, data):
        tokenized_id = [subword_hashes(w) for w in data]
        return tokenized_id
    
    def padding(self, data, padding_idx=0):
        if len(data) >= max_sample_len:
            return torch.tensor(data[:max_sample_len], dtype = torch.long).to(device)
        data.extend(np.array([[padding_idx]*max_sw_hash_len]*(max_sample_len - len(data))))
        return torch.tensor(data, dtype = torch.long).to(device)

In [5]:
class Misspell_Preprocessor:
    def __init__(self):
        pass
    
    def __call__(self, sample):
        random_freq = True
        tokenized = {i:w for i,w in enumerate(word_tokenize(sample["input"]))}
        tokenized = self.misspell(tokenized, random_freq)
        tokenized_hashes = self.hash_tokenize(tokenized)
        output_id = self.padding(tokenized_hashes, padding_idx=0)
        
        return {"input": output_id, "target": sample['target']-1}
    
    def misspell(self, data, random_freq=False):
        if random_freq: 
            msp_f = np.random.uniform(0.1,0.5)
        else: 
            msp_f = misspell_freq
        misspell_num = int(len(data)*msp_f)
        misspell_idx = np.random.choice(len(data), misspell_num, replace = False)
        m_type = {i:mt for i,mt in enumerate(np.random.randint(0, 4, misspell_num))}
        m_dict = {0:self.delete, 1:self.insert, 2:self.replace, 3:self.swap}
        for i in range(misspell_num):
            mp = data[misspell_idx[i]]
            if len(mp) > 1:
                mp = m_dict[m_type[i]](list(mp))
            else:
                mp = self.replace(list(mp))
            data[misspell_idx[i]] = mp
        return [data[w] for w in sorted(data)]
    
    def delete(self, word):
        idx = np.random.randint(len(word))
        word.pop(idx)
        return ''.join(map(str,word))

    def insert(self, word):
        idx = np.random.randint(len(word))
        letter = np.random.choice(L)
        word.insert(idx, letter)
        return ''.join(map(str,word))

    def replace(self, word):
        idx = np.random.randint(len(word))
        letter = np.random.choice(L)
        word.pop(idx)
        word.insert(idx,letter)
        return ''.join(map(str,word))

    def swap(self, word):
        idx1 = np.random.randint(len(word))
        if idx1 == 0:
            idx2 = idx1 + 1
        elif idx1 == len(word)-1:
            idx2 = idx1 -1
        else:
            idx2 = np.random.choice([idx1+1,idx1-1])
        first_idx = min(idx1,idx2)
        second_idx = max(idx1,idx2)
        temp = word.pop(first_idx)
        word.insert(second_idx, temp)
        return ''.join(map(str,word))
        
    def hash_tokenize(self, data):
        tokenized_id = [subword_hashes(w) for w in data]
        return tokenized_id
    
    def padding(self, data, padding_idx=0):
        if len(data) >= max_sample_len:
            return torch.tensor(data[:max_sample_len], dtype = torch.long).to(device)
        data.extend(np.array([[padding_idx]*max_sw_hash_len]*(max_sample_len - len(data))))
        return torch.tensor(data, dtype = torch.long).to(device)

In [6]:
class YahooClassifier(Module):
    def __init__(self, word_embedding, embedding_dim, class_zize,device):
        super().__init__()
        self.device = device
        self.word_embedding = word_embedding.to(device)
        self.linear_classifier = Linear(embedding_dim, class_size).to(device)
    
    def forward(self, token_ids):
        """
        toekn_ids: (batch_size, worrd_num, hash_size)
        """
#         print(token_ids.shape)
        # (batch_size, words_num, embedding_dim)
        outputs = self.word_embedding(token_ids).to(self.device)
#         print(outputs.shape)
        # (batch_size, embedding_dim)
        outputs = torch.max(outputs, dim=1)[0].to(self.device)
#         print(outputs.shape)
        # (batch_size, class_size)
        outputs = self.linear_classifier(outputs).to(self.device)
#         print(outputs.shape)
        return outputs

In [7]:
max_samples = 10000
batch_size = 8
emb_dim = 50
num_emb = int(2e6+1)
context_size = 3
neg_num = 5
uniform = True
max_epochs = 100
class_size = 10
misspell_freq = 0.5

if uniform: dist = 'uniform'
else: dist = 'noisedist'
if max_epochs is None:
    epoch = 'ULepochs'
else: epoch = f'{max_epochs}e'
emb_path = f"./SubwordEmbedding/trained_model/trained_model_{emb_dim}d_{dist}_{epoch}_{context_size}w"

In [8]:
dataset = YahooDataset(max_samples=max_samples, local_dir="small_yahoo_dataset")
subword_hashes = subwordhash(dataset.train)

word_num = subword_hashes.word_num
max_sw_hash_len = subword_hashes.max_hash
max_sample_len = subword_hashes.max_sample
print(max_sw_hash_len)
print(max_sample_len)

100%|█████████████████████████████████████████████████████████████████████████████████| 9000/9000 [00:07<00:00, 1193.52it/s]


377
995


In [9]:
word_embedding = SubwordEmbedding(num_embeddings = num_emb, embedding_dim = emb_dim, device = device, padding_idx = 0)
word_embedding = word_embedding.to(device)
word_embedding.load_state_dict(torch.load(emb_path))
print(f'Loaded model: trained_model_{emb_dim}d_{dist}_{epoch}_{context_size}w')

Loaded model: trained_model_50d_uniform_100e_3w


In [10]:
preprocessor = Hash_Preprocessor()

dataset.train.set_preprocessor(preprocessor)
dataset.val.set_preprocessor(preprocessor)
dataset.test.set_preprocessor(preprocessor)

In [11]:
dataloader_train = DataLoader(dataset.train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset.val, batch_size=batch_size, shuffle=False)
dataloader_test = DataLoader(dataset.test, batch_size=batch_size, shuffle=False)

In [12]:
misspell_dataset = YahooDataset(max_samples=max_samples, local_dir="small_yahoo_dataset")

In [13]:
preprocessor = Misspell_Preprocessor()

misspell_dataset.train.set_preprocessor(preprocessor)
misspell_dataset.val.set_preprocessor(preprocessor)
misspell_dataset.test.set_preprocessor(preprocessor)

In [14]:
msploader_train = DataLoader(misspell_dataset.train, batch_size=batch_size, shuffle=True)
msploader_val = DataLoader(misspell_dataset.val, batch_size=batch_size, shuffle=False)
msploader_test = DataLoader(misspell_dataset.test, batch_size=batch_size, shuffle=False)

In [15]:
yahoo_classifier = YahooClassifier(word_embedding, emb_dim, class_size,device).to(device)

In [16]:
classifier_model = MulticlassSentenceClassificationTrainerModule(yahoo_classifier).to(device)

In [17]:
misspell = True
if misspell:
    logger = pl.loggers.CSVLogger("Classification/logs", name = f"MisspellText_{emb_dim}d")
else:
    logger = pl.loggers.CSVLogger("Classification/logs", name = f"CorrectedText_{emb_dim}d")
    
checkpoint = pl.callbacks.ModelCheckpoint(
    dirpath = "Classification/checkpoints",
    filename = 'best_model',
    monitor = 'val_loss',
    mode = 'min'
)
class LitProgressBar(pl.callbacks.ProgressBar):
    def init_validation_tqdm(self):
        bar = tqdm(disable=True)
        return bar
bar = LitProgressBar()

In [18]:
mode = 'cor'
loader = {
    'msp':{'train':msploader_train, 'val':msploader_val, 'test':msploader_test},
    'cor':{'train':dataloader_train, 'val':dataloader_val, 'test':dataloader_test}
}
torch.cuda.empty_cache()
trainer = pl.Trainer(logger = logger, 
                     gpus = '0', 
                     callbacks=[checkpoint, bar], 
                     num_sanity_val_steps=0, 
                     max_epochs=max_epochs)

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [18]:
trainer.fit(classifier_model, 
            train_dataloader = loader[mode]['train'],
            val_dataloaders = loader[mode]['val'])

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | YahooClassifier | 100 M 
------------------------------------------
100 M     Trainable params
0         Non-trainable params
100 M     Total params
400.002   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...')


In [22]:
torch.cuda.empty_cache()
checkpoint = torch.load("./Classification/checkpoints/best_model.ckpt")
# print(checkpoint)
w_e = SubwordEmbedding(num_embeddings = num_emb, embedding_dim = emb_dim, device = device, padding_idx = 0)
yahoo_classifier = YahooClassifier(word_embedding, emb_dim, class_size,device).to(device)
classifier_model = MulticlassSentenceClassificationTrainerModule(yahoo_classifier).to(device)
# w_e.load_state_dict(checkpoint['state_dict'])
classifier_model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']


NameError: name 'optimizer' is not defined

In [31]:
trainer.test(test_dataloaders = loader[mode]['test'])

RuntimeError: Error(s) in loading state_dict for MulticlassSentenceClassificationTrainerModule:
	While copying the parameter named "model.word_embedding.subword_embedding.weight", whose dimensions in the model are torch.Size([2000001, 50]) and whose dimensions in the checkpoint are torch.Size([2000001, 50]), an exception occurred : ('CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.',).
	While copying the parameter named "model.linear_classifier.weight", whose dimensions in the model are torch.Size([10, 50]) and whose dimensions in the checkpoint are torch.Size([10, 50]), an exception occurred : ('CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.',).
	While copying the parameter named "model.linear_classifier.bias", whose dimensions in the model are torch.Size([10]) and whose dimensions in the checkpoint are torch.Size([10]), an exception occurred : ('CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.',).

In [None]:
trainer.test(test_dataloaders = loader['msp']['test'])

In [32]:
save_path = f"./Classification/trained_model/trained_classification_model_{emb_dim}d_{epoch}"

In [33]:
torch.save(yahoo_classifier.state_dict(), save_path)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
a = {True:1, False:2}
a

In [None]:
a = [1,2,3]
b = [4,5,6]
c = [7,8,9]
ld = {
    'msp':{'train':a},
    'cor':{'train':c}
}
print(ld['msp']['train'])