In [1]:
import sys
import os

from nlp_datasets import YahooDataset
from nlp_datasets import BaseDataset
from nlp_modeltrainers import BaseTrainerModule
from nlp_modeltrainers.sentence_classification import MulticlassSentenceClassificationTrainerModule
# from nlp_modeltrainers import VectorCosineSimilarityTrainerModule


import torch
import fastwer
from string import ascii_letters as letters
L = list(letters)
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from tqdm import tqdm
from torch.nn import Module, Linear, Embedding
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from nltk import word_tokenize

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from pytorch_lightning import LightningModule

dev = 'cpu'
if torch.cuda.is_available():
    dev = "cuda:0"

device = torch.device(dev)
CUDA_LAUNCH_BLOCKING=1

In [2]:
from utils.SubwordHash import subwordhash
from utils.SubwordEmbedding import subwordembedding
from utils.config import *

In [3]:
class Hash_Preprocessor:
    def __init__(self):
        pass
    
    def __call__(self, sample):
        tokenized = word_tokenize(sample["input"])
        tokenized_hashes = self.hash_tokenize(tokenized)
        output_id = self.padding(tokenized_hashes, padding_idx=0)
        
        return {"input": output_id, "target": sample['target']-1}
    
    def hash_tokenize(self, data):
        tokenized_id = [subword_hashes(w) for w in data]
        return tokenized_id
    
    def padding(self, data, padding_idx=0):
        if len(data) >= max_sample_len:
            return torch.tensor(data[:max_sample_len], dtype = torch.long).to(device)
        data.extend(np.array([[padding_idx]*max_sw_hash_len]*(max_sample_len - len(data))))
        return torch.tensor(data, dtype = torch.long).to(device)

In [4]:
class Misspell_Preprocessor:
    def __init__(self):
        pass
    
    def __call__(self, sample):
        random_freq = True
        tokenized = {i:w for i,w in enumerate(word_tokenize(sample["input"]))}
        tokenized = self.misspell(tokenized, random_freq)
        tokenized_hashes = self.hash_tokenize(tokenized)
        output_id = self.padding(tokenized_hashes, padding_idx=0)
        
        return {"input": output_id, "target": sample['target']-1}
    
    def misspell(self, data, random_freq=False):
        if random_freq: 
            msp_f = np.random.uniform(0.1,0.5)
        else: 
            msp_f = misspell_freq
        misspell_num = int(len(data)*msp_f)
        misspell_idx = np.random.choice(len(data), misspell_num, replace = False)
        m_type = {i:mt for i,mt in enumerate(np.random.randint(0, 4, misspell_num))}
        m_dict = {0:self.delete, 1:self.insert, 2:self.replace, 3:self.swap}
        for i in range(misspell_num):
            mp = data[misspell_idx[i]]
            if len(mp) > 1:
                mp = m_dict[m_type[i]](list(mp))
            else:
                mp = self.replace(list(mp))
            data[misspell_idx[i]] = mp
        return [data[w] for w in sorted(data)]
    
    def delete(self, word):
        idx = np.random.randint(len(word))
        word.pop(idx)
        return ''.join(map(str,word))

    def insert(self, word):
        idx = np.random.randint(len(word))
        letter = np.random.choice(L)
        word.insert(idx, letter)
        return ''.join(map(str,word))

    def replace(self, word):
        idx = np.random.randint(len(word))
        letter = np.random.choice(L)
        word.pop(idx)
        word.insert(idx,letter)
        return ''.join(map(str,word))

    def swap(self, word):
        idx1 = np.random.randint(len(word))
        if idx1 == 0:
            idx2 = idx1 + 1
        elif idx1 == len(word)-1:
            idx2 = idx1 -1
        else:
            idx2 = np.random.choice([idx1+1,idx1-1])
        first_idx = min(idx1,idx2)
        second_idx = max(idx1,idx2)
        temp = word.pop(first_idx)
        word.insert(second_idx, temp)
        return ''.join(map(str,word))
        
    def hash_tokenize(self, data):
        tokenized_id = [subword_hashes(w) for w in data]
        return tokenized_id
    
    def padding(self, data, padding_idx=0):
        if len(data) >= max_sample_len:
            return torch.tensor(data[:max_sample_len], dtype = torch.long).to(device)
        data.extend(np.array([[padding_idx]*max_sw_hash_len]*(max_sample_len - len(data))))
        return torch.tensor(data, dtype = torch.long).to(device)

In [5]:
dataset = YahooDataset(max_samples=max_samples, local_dir="../datasets/small_yahoo_dataset")
subword_hashes = subwordhash(dataset.train)

word_num = subword_hashes.word_num
max_sw_hash_len = subword_hashes.max_hash
max_sample_len = subword_hashes.max_sample
print(max_sw_hash_len)
print(max_sample_len)

100%|█████████████████████████████████████████████████████████████████████████████████| 9000/9000 [00:08<00:00, 1079.73it/s]


377
995


In [6]:
word_embedding = subwordembedding(num_embeddings = num_emb, embedding_dim = emb_dim, device = device, padding_idx = 0)
word_embedding = word_embedding.to(device)
word_embedding.load_state_dict(torch.load(emb_path))
print(f'Loaded model: {emb_path}')

Loaded model: ../Trained_Models/SubwordEmbedding/trained_model/trained_model_50d_uniform_100e_3w


In [7]:
preprocessor = Hash_Preprocessor()
dataset.train.set_preprocessor(preprocessor)
dataset.val.set_preprocessor(preprocessor)
dataset.test.set_preprocessor(preprocessor)


In [8]:
dataloader_train = DataLoader(dataset.train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset.val, batch_size=batch_size, shuffle=False)
dataloader_test = DataLoader(dataset.test, batch_size=batch_size, shuffle=False)

In [9]:
misspell_dataset = YahooDataset(max_samples=max_samples, local_dir="../datasets/misspell_dataset")

In [10]:
preprocessor = Misspell_Preprocessor()

misspell_dataset.train.set_preprocessor(preprocessor)
misspell_dataset.val.set_preprocessor(preprocessor)
misspell_dataset.test.set_preprocessor(preprocessor)

In [11]:
msploader_train = DataLoader(misspell_dataset.train, batch_size=batch_size, shuffle=True)
msploader_val = DataLoader(misspell_dataset.val, batch_size=batch_size, shuffle=False)
msploader_test = DataLoader(misspell_dataset.test, batch_size=batch_size, shuffle=False)

In [12]:
class YahooClassifier(Module):
    def __init__(self, word_embedding, embedding_dim, class_zize,device):
        super().__init__()
        self.device = device
        self.word_embedding = word_embedding.to(device)
        self.linear_classifier = Linear(embedding_dim, class_size).to(device)
    
    def forward(self, token_ids):
        """
        toekn_ids: (batch_size, worrd_num, hash_size)
        """
#         print(token_ids.shape)
        # (batch_size, words_num, embedding_dim)
        outputs = self.word_embedding(token_ids).to(self.device)
#         print(outputs.shape)
        # (batch_size, embedding_dim)
#         print(outputs.shape)
#         print(len(outputs.shape))
        outputs = torch.max(outputs, dim=len(outputs.shape)-2)[0].to(self.device)
#         print(outputs.shape)
#         print(outputs)
        # (batch_size, class_size)
        outputs = self.linear_classifier(outputs).to(self.device)
#         print(outputs.shape)
        return outputs

In [13]:
# worddata = YahooDataset(max_samples=max_samples, local_dir="../datasets/small_yahoo_dataset")

# data = []
# for i in range(8960, 8976):
#     data.append(dataset.test[i])
# t_loader = DataLoader(data, batch_size = 1,shuffle = False)
# for batch in dataloader_test:
#     print(batch['input'].shape)
#     break
# # i = 8960
# # for batch in tqdm(t_loader):
# #     print(i, end = " ")
# #     print(batch['input'].shape)
# #     out = yahoo_classifier.forward(batch['input'])
# #     i += 1
# print(len(worddata.test[8975]['input']))
# print (dataset.test[8975])
# print(yahoo_classifier.forward(dataset.test[8975]['input'].unsqueeze(0)))


In [14]:
save_path = f"../Trained_Models/Classification/trained_model/trained_classification_model_{emb_dim}d_{classify_epochs}e_sumpool"
yahoo_classifier = YahooClassifier(word_embedding, emb_dim, class_size,device).to(device)
yahoo_classifier.load_state_dict(torch.load(save_path))

<All keys matched successfully>

In [15]:
save_path_latesum = f"../Trained_Models/Classification/trained_model/trained_classification_model_{emb_dim}d_{classify_epochs}e_latesum"
latesum_yahoo_classifier = YahooClassifier(word_embedding, emb_dim, class_size,device).to(device)
latesum_yahoo_classifier.load_state_dict(torch.load(save_path_latesum))

<All keys matched successfully>

In [16]:
def classification_eval(models, test_loader):    
    target_class = np.array([])
    prediction = [np.array([])]*len(models)
    first_model = True
    for batch in tqdm(test_loader):
        model = 0
        for i in range(len(models)):
            out = models[i].forward(batch['input'])
            model_class = torch.max(out, dim = 1)[1].cpu().detach().numpy()
            prediction[i] = np.append(prediction[i], model_class, 0)
        tgc = batch['target'].cpu().detach().numpy()
        target_class = np.append(target_class, tgc, 0)
#     plt.figure(figsize=[20,5])
#     for i in range(len(models)):
#         plt.plot(miss[i])
#     plt.grid()
#     plt.show()
    return prediction, target_class

In [17]:
prediction, target_class= classification_eval([yahoo_classifier], dataloader_test)

100%|███████████████████████████████████████████████████████████████████████████████████| 1250/1250 [07:07<00:00,  2.92it/s]


In [20]:
prediction, target_class= classification_eval([yahoo_classifier], dataloader_train)

100%|███████████████████████████████████████████████████████████████████████████████████| 1125/1125 [07:03<00:00,  2.66it/s]


In [21]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
k = confusion_matrix(target_class,prediction[0])
print(k)
print()
# l = confusion_matrix(target_class,prediction[1])
print(l)

[[   0    0    4    0    0    0   28    0  501    1]
 [   0    0   11    0    0    0   43    0  958    1]
 [   0    0   12    0    0    0   26    0  647    1]
 [   0    0   11    0    0    0   57    0  751    2]
 [   0    0   16    0    0    0   41    0 1197    0]
 [   0    0    3    0    0    0   44    0  586    0]
 [   0    0   34    0    0    0  159    0 2085    1]
 [   0    0    9    0    0    0   53    0  572    1]
 [   0    0   12    0    0    0   19    0  467    0]
 [   0    0    4    0    0    0   28    0  613    2]]



NameError: name 'l' is not defined

In [None]:
class_res = [{{'correct':0, 'incorrect':0}}]*2
print(len(target_class))
for i in tqdm(range(len(target_class))):
    for model in range(2):
        if target_class[i] == prediction[model][i]:
            class_res[model][int(target_class[i])]['correct'] += 1
        else:
            class_res[model][int(target_class[i])]['incorrect'] += 1
print(class_res)

In [None]:
for j in range(2):
    print(f'model #{j}')
    for i,res in enumerate(class_res[j]):
        print(res)
#         print(f'class {i}: {res}\t acc: {(res['correct'])*100/(res['correct']+res['incorrect'])}%')

In [None]:
import collections
for i,model in enumerate(miss):
    total = len(model)
    correct = collections.Counter(model)[0]
    print(f'Accuracy #{i}: {correct*100/total} ({correct}/{total})')
    

In [None]:
dist_train = np.array([])


for batch in tqdm(dataloader_train):
    dist_train = np.append(dist_train, batch['target'],0)


# classes_val = []
# classes_test = []

In [None]:
plt.hist(dist_train, bins=10)

In [None]:
dist_val = np.array([])
for batch in tqdm(dataloader_val):
    dist_val = np.append(dist_val, batch['target'],0)
plt.hist(dist_val, bins=10)

In [None]:
dist_test = np.array([])
for batch in tqdm(dataloader_test):
    dist_test = np.append(dist_test, batch['target'],0)

In [None]:
plt.hist(dist_test, bins=10)

In [None]:
a = np.array([1,2,3])
b = np.array([4,5,6])
c = np.append(a,b,0)
c