In [1]:
import sys
import os

from nlp_datasets import YahooDataset
from nlp_datasets import BaseDataset
from nlp_modeltrainers import BaseTrainerModule
from nlp_modeltrainers.sentence_classification import MulticlassSentenceClassificationTrainerModule
# from nlp_modeltrainers import VectorCosineSimilarityTrainerModule


import torch
import fastwer
from string import ascii_letters as letters
L = list(letters.lower())
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from tqdm import tqdm
from torch.nn import Module, Linear, Embedding
from torch.utils.data import DataLoader

import re

import nltk
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords 

lem = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from pytorch_lightning import LightningModule

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else 'cpu'
device = torch.device(dev)

In [2]:
from utils.SubwordHash import subwordhash, Hash_Preprocessor, Word_Preprocessor
from utils.SubwordEmbedding import subwordembedding
from utils.config import *
from utils.replace_dict import rep
from utils.dict_freq import get_freq_dict

In [3]:
class YahooClassifier(Module):
    """Sentence classifier: embed the tokens, max-pool over axis 1, apply a linear layer.

    Args:
        word_embedding: module called on ``token_ids``; it is moved to ``device``.
        embedding_dim: input width of the linear classification layer, i.e. the
            size of the last axis produced by ``word_embedding``.
        class_zize: number of output classes (output width of the linear layer).
            The misspelled name is kept so existing callers are not broken.
        device: torch device the sub-modules are moved to.
    """

    def __init__(self, word_embedding, embedding_dim, class_zize, device):
        super().__init__()
        self.device = device
        self.word_embedding = word_embedding.to(device)
        # Size the output layer from the constructor argument. The previous code
        # read a module-level `class_size` here, which ignored the argument and
        # raised NameError whenever that global was not defined.
        self.linear_classifier = Linear(embedding_dim, class_zize).to(device)

    def forward(self, token_ids):
        """Return one row of class scores per sample.

        Args:
            token_ids: input for ``word_embedding``; described by the original
                author as (batch_size, word_num, hash_size).

        Returns:
            Tensor of shape (batch_size, number of classes).
        """
        # (batch_size, words_num, embedding_dim)
        embedded = self.word_embedding(token_ids).to(self.device)
        # Max over the word axis -> (batch_size, embedding_dim).
        # torch.max with `dim` returns (values, indices); only the values are used.
        pooled = torch.max(embedded, dim=1)[0]
        # (batch_size, class_size)
        return self.linear_classifier(pooled)

In [4]:
# Dataset read from the `_text` directory; its train split is handed to
# get_freq_dict to build `freq_dict`.
word_dataset = YahooDataset(
    local_dir="../datasets/small_yahoo_dataset_text",
    max_samples=max_samples,
)
freq_dict = get_freq_dict(word_dataset.train)

100%|████████████████████████████████████████████████████████████████████████████████████| 900/900 [00:02<00:00, 301.60it/s]


In [5]:
# Attach a train-mode word preprocessor to the train split, then hash that split.
train_preprocessor = Word_Preprocessor(train=True, freq_dict=freq_dict)
word_dataset.train.set_preprocessor(train_preprocessor)

subword_hashes = subwordhash(word_dataset.train)

# Sizes reported by the hash object; the two maxima are reused further down
# when the hash preprocessors are built.
word_num, max_sw_hash_len, max_sample_len = (
    subword_hashes.word_num,
    subword_hashes.max_hash,
    subword_hashes.max_sample,
)
print(max_sw_hash_len, max_sample_len, sep="\n")

100%|████████████████████████████████████████████████████████████████████████████████████| 900/900 [00:01<00:00, 508.29it/s]


356
811


In [6]:
# Second dataset object, read from the directory without the `_text` suffix;
# this is the one the hash preprocessors and data loaders are attached to.
dataset = YahooDataset(
    local_dir="../datasets/small_yahoo_dataset",
    max_samples=max_samples,
)

In [7]:
# Build the sub-word embedding module and restore its weights from `emb_path`.
word_embedding = subwordembedding(
    num_embeddings=num_emb,
    embedding_dim=emb_dim,
    device=device,
    padding_idx=0,
)
word_embedding = word_embedding.to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on
# a CUDA machine still loads when this notebook falls back to the CPU.
word_embedding.load_state_dict(torch.load(emb_path, map_location=device))
print(f'Loaded model: {emb_path}')

Loaded model: ../Trained_Models/SubwordEmbedding/trained_model/trained_model_100d_noisedist_100e_3w_10000sample_min1_culled_noURL_lemmatized


In [8]:
# The two hash preprocessors are configured identically apart from `train`.
hash_args = (max_sw_hash_len, max_sample_len, subword_hashes, device)
hash_kwargs = {'freq_dict': freq_dict, 'subsampling': subsampling}

train_preprocessor = Hash_Preprocessor(*hash_args, train=True, **hash_kwargs)
preprocessor = Hash_Preprocessor(*hash_args, train=False, **hash_kwargs)

# Train split gets the train-mode instance; val and test share the other one.
dataset.train.set_preprocessor(train_preprocessor)
dataset.val.set_preprocessor(preprocessor)
dataset.test.set_preprocessor(preprocessor)

In [9]:
# One DataLoader per split; only the training split is shuffled.
dataloader_train, dataloader_val, dataloader_test = (
    DataLoader(split, batch_size=batch_size, shuffle=shuffle)
    for split, shuffle in (
        (dataset.train, True),
        (dataset.val, False),
        (dataset.test, False),
    )
)

In [10]:
# Arguments are positional: embedding module, embedding width, class count, device.
yahoo_classifier = YahooClassifier(
    word_embedding,
    emb_dim,
    class_size,
    device,
)

In [11]:
# Wrap the classifier in the trainer module, then move the wrapper to `device`.
classifier_model = (
    MulticlassSentenceClassificationTrainerModule(yahoo_classifier)
    .to(device)
)

In [12]:
misspell = False

# CSV logs go to one directory; the run name records which text variant is used.
log_dir = "../Trained_Models/Classification/logs"
run_name = f"MisspellText_{emb_dim}d" if misspell else f"CorrectedText_{emb_dim}d"
logger = pl.loggers.CSVLogger(log_dir, name = run_name)

# Checkpoint callback monitoring 'val_loss' in 'min' mode.
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor = 'val_loss',
    mode = 'min',
    dirpath = "../Trained_Models/Classification/checkpoints",
    filename = 'best_model_{epoch}_{val_loss}',
)
class LitProgressBar(pl.callbacks.ProgressBar):
    """Progress-bar callback whose validation bar is a disabled tqdm instance."""

    def init_validation_tqdm(self):
        """Return a tqdm bar constructed with ``disable=True``."""
        return tqdm(disable=True)


bar = LitProgressBar()

In [13]:
mode = 'cor'
# Data loaders keyed by text variant, then by split. Only 'cor' is registered;
# the 'msp' entry stays commented out because the msploader_* objects are not
# created in the visible cells.
loader = {
#     'msp':{'train':msploader_train, 'val':msploader_val, 'test':msploader_test},
    'cor':{'train':dataloader_train, 'val':dataloader_val, 'test':dataloader_test}
}
torch.cuda.empty_cache()
# Select GPU 0 explicitly when CUDA is present and no GPU otherwise. This matches
# the cuda:0 / cpu fallback used for `device` and avoids the string form '0',
# which failed on CPU-only machines.
# NOTE(review): auto_lr_find presumably only takes effect through trainer.tune();
# confirm against the installed Lightning version.
trainer = pl.Trainer(logger = logger, 
                     gpus = [0] if torch.cuda.is_available() else None, 
                     callbacks = [checkpoint, bar], 
                     num_sanity_val_steps = 0,
                     auto_lr_find = True,
                     max_epochs = classify_epochs)

# Train on the selected variant's train split, validating on its val split.
trainer.fit(classifier_model, 
            train_dataloader = loader[mode]['train'],
            val_dataloaders = loader[mode]['val'])

  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | YahooClassifier | 200 M 
------------------------------------------
200 M     Trainable params
0         Non-trainable params
200 M     Total params
800.004   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

NameError: name 'max_sample_len' is not defined

In [None]:
# Save only the classifier's state_dict; the file name carries the embedding
# width and the `emb_add` tag.
save_path = f"../Trained_Models/Classification/trained_model/trained_classification_model_{emb_dim}d_{emb_add}"
torch.save(
    obj=yahoo_classifier.state_dict(),
    f=save_path,
)

In [None]:
# Run the test loop on the test split of the variant selected by `mode`.
trainer.test(
    test_dataloaders=loader[mode]['test'],
)

In [None]:
# `loader` only has an 'msp' entry when that line of the dict is enabled;
# indexing it unconditionally raised KeyError, so skip the run when it is absent.
msp_results = (
    trainer.test(test_dataloaders = loader['msp']['test'])
    if 'msp' in loader
    else None
)
if msp_results is None:
    print("Skipped: `loader` has no 'msp' entry")
msp_results

In [None]:
from nltk import word_tokenize

# Scratch cell: tokenize a lower-cased sample sentence.
sample_text = """When's the next friday? 10/12 bbbbBBB Sydney"""
word_tokenize(sample_text.lower())

In [None]:
# Scratch cell: a dictionary keyed by the two booleans.
a = dict(((True, 1), (False, 2)))
a

In [None]:
# Scratch cell: nested-dict lookup using the same variant -> split key layout
# as `loader`.
a, b, c = [1, 2, 3], [4, 5, 6], [7, 8, 9]
ld = {variant: {'train': values} for variant, values in (('msp', a), ('cor', c))}
print(ld['msp']['train'])