# NPLM

In [1]:
%load_ext lab_black

## 0. Imports

In [2]:
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from collections import Counter
from torch.utils.data import Dataset, DataLoader
from konlpy.tag import Mecab
from tqdm import tqdm

from typing import List

## 1. Preprocess

In [3]:
def preprocess(
    data_path: str,
    word_index: dict = None,
    num_words: int = 10000,
):
    """Tokenize pickled documents into nouns and encode them as index lists.

    Args:
        data_path: Path to a pickle file holding an iterable of documents.
        word_index: Existing word -> index mapping to reuse (for example the
            one built from the training split). When it is missing or empty,
            a new mapping is built from the `num_words` most frequent nouns,
            with index 0 reserved for "<UNK>".
        num_words: Maximum number of nouns kept when a new vocabulary is built.

    Returns:
        A tuple `(corpus, word_index, index_word)`: `corpus` is a list of
        index lists (one per document that yielded at least one noun),
        `word_index` maps word -> index and `index_word` is its inverse.
    """
    tokenizer = Mecab()

    # 0. data load
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(data_path, "rb") as f:
        data = pickle.load(f)

    # 1. bag-of-words
    noun_counts, docs = Counter(), []
    for doc in tqdm(data):
        if not doc:
            continue
        # The NSMC data contains NaN entries that the tokenizer cannot handle;
        # skip those. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still propagate instead of being silently swallowed.
        try:
            nouns = tokenizer.nouns(doc)
        except Exception:
            continue
        noun_counts.update(nouns)
        docs.append(nouns)

    # 2. build vocab (only when no usable mapping was supplied)
    if not word_index:
        # 3. add unknown token: index 0 is reserved for out-of-vocabulary words
        word_index = {"<UNK>": 0}
        for idx, (word, _) in enumerate(noun_counts.most_common(num_words), 1):
            word_index[word] = idx

    index_word = {idx: word for word, idx in word_index.items()}

    # 4. create corpus: documents without any noun are dropped, unknown words map to 0
    corpus = [[word_index.get(word, 0) for word in doc] for doc in docs if doc]

    return corpus, word_index, index_word

In [4]:
# Pickled NSMC splits, given relative to the notebook's working directory.
train_path = "../data/nsmc/train_data.pkl"
test_path = "../data/nsmc/test_data.pkl"

# The vocabulary is built from the training split and then reused for the
# test split, so both corpora share the same word indices.
train_corpus, word_index, index_word = preprocess(data_path=train_path)
test_corpus, _, _ = preprocess(data_path=test_path, word_index=word_index)

100%|██████████| 150000/150000 [00:08<00:00, 17047.07it/s]
100%|██████████| 50000/50000 [00:02<00:00, 17624.96it/s]


### inputs and targets

In [5]:
def create_contexts_target(corpus, window=3):
    """Slide a fixed-size window over each document to build training pairs.

    For every position, the `window` preceding tokens form the context and
    the token right after them is the target. Pairs whose target is 0 are
    skipped.

    Returns:
        A tuple `(contexts, targets)` of parallel lists.
    """
    contexts, targets = [], []

    for tokens in tqdm(corpus):
        # The range is empty when the document has `window` tokens or fewer.
        for start in range(len(tokens) - window):
            stop = start + window
            target = tokens[stop]
            if target == 0:
                continue
            contexts.append(tokens[start:stop])
            targets.append(target)

    return contexts, targets

In [6]:
# Build (context, target) pairs for both splits using the default window size.
train_contexts, train_targets = create_contexts_target(corpus=train_corpus)
test_contexts, test_targets = create_contexts_target(corpus=test_corpus)

100%|██████████| 141731/141731 [00:00<00:00, 425976.74it/s]
100%|██████████| 47238/47238 [00:00<00:00, 354582.07it/s]


In [7]:
# Number of (context, target) pairs extracted from the test corpus.
len(test_contexts)

136414

### Dataset

In [8]:
class NSMCDataset(Dataset):
    """Map-style dataset pairing each context window with its target index."""

    def __init__(self, contexts: List[List[int]], targets: List[int]):
        # Both lists are stored by reference; nothing is copied or converted here.
        self.contexts, self.targets = contexts, targets

    def __len__(self):
        """Return the number of stored contexts."""
        return len(self.contexts)

    def __getitem__(self, idx):
        """Return `(context, target)` for `idx`; `idx` is applied to both lists."""
        context = self.contexts[idx]
        target = self.targets[idx]
        return context, target

In [9]:
# Wrap the extracted pairs so they can be fed to a DataLoader.
trainset = NSMCDataset(contexts=train_contexts, targets=train_targets)
testset = NSMCDataset(contexts=test_contexts, targets=test_targets)

In [10]:
# Slicing is forwarded to the underlying lists, so this shows
# (first two contexts, first two targets) rather than a list of pairs.
trainset[:2]

([[204, 290, 1], [290, 1, 45]], [45, 783])

### collate function

In [11]:
def collate_fn(batch):
    """Stack a list of `(context, target)` samples into two LongTensors.

    Returns:
        `(tokens, targets)`: the first elements of every sample and the
        second elements of every sample, each as a `torch.LongTensor`.
    """
    contexts, labels = [], []
    for entry in batch:
        contexts.append(entry[0])
        labels.append(entry[1])

    return torch.LongTensor(contexts), torch.LongTensor(labels)

### dataloader

In [12]:
# Settings shared by both loaders; only the dataset and shuffling differ.
loader_kwargs = dict(batch_size=256, collate_fn=collate_fn, num_workers=8)

# Training batches are reshuffled every epoch; evaluation order stays fixed.
train_loader = DataLoader(dataset=trainset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(dataset=testset, shuffle=False, **loader_kwargs)

In [13]:
# Grab a single batch for inspection without leaving a stray loop variable
# (`batch`) in the notebook namespace.
sample = next(iter(train_loader))

## 2. Model

In [22]:
class NPLM(pl.LightningModule):
    """Feed-forward neural probabilistic language model.

    The context word embeddings are concatenated into one vector `x`, and the
    logits over the vocabulary are computed as `W(x) + U(tanh(H(x)))`.

    Args:
        vocab_size: Number of rows in the embedding table and size of the output.
        window: Number of context words per sample.
        embed_dim: Size of each word embedding.
        hidden_dim: Size of the tanh hidden layer.
    """

    def __init__(
        self,
        vocab_size: int,
        window: int,
        embed_dim: int = 100,
        hidden_dim: int = 50,
    ):
        super().__init__()

        context_dim = window * embed_dim  # size of the concatenated context vector

        self.C = nn.Embedding(vocab_size, embed_dim)  # word embedding table
        self.H = nn.Linear(context_dim, hidden_dim)  # context -> hidden
        self.U = nn.Linear(hidden_dim, vocab_size, bias=False)  # hidden -> logits
        self.W = nn.Linear(context_dim, vocab_size)  # direct context -> logits

    def forward(self, x):
        """Return vocabulary logits for a batch of context index windows."""
        embedded = self.C(x)  # [batch_size, window, embed_dim]
        flat = embedded.reshape(
            -1, embedded.shape[1] * embedded.shape[2]
        )  # [batch_size, window * embed_dim]
        hidden = torch.tanh(self.H(flat))  # [batch_size, hidden_dim]
        return self.W(flat) + self.U(hidden)  # [batch_size, vocab_size]

    def loss_fn(self, logits, targets):
        """Cross-entropy between the logits and the target word indices."""
        return F.cross_entropy(logits, targets)

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def training_step(self, train_batch, batch_idx):
        """Compute, log and return the loss for one training batch."""
        contexts, targets = train_batch
        loss = self.loss_fn(self.forward(contexts), targets)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Compute and log the loss for one validation batch."""
        contexts, targets = val_batch
        loss = self.loss_fn(self.forward(contexts), targets)
        self.log("val_loss", loss)

## 3. Train

In [23]:
# model init
# The vocabulary size includes the "<UNK>" entry; the window must match the
# context length produced when the (context, target) pairs were created.
vocab_size = len(word_index)
window = 3

model = NPLM(vocab_size=vocab_size, window=window)

In [24]:
# Train for at most 10 epochs on two GPUs with the "dp" accelerator;
# val_check_interval=0.5 triggers validation halfway through each epoch
# (the progress log below starts validating after half of the training batches).
trainer = pl.Trainer(gpus=2, max_epochs=10, val_check_interval=0.5, accelerator="dp")

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


In [25]:
# NOTE(review): the test split is passed as the validation loader here, so
# "val_loss" is measured on test data — confirm this is intended.
trainer.fit(model, train_loader, test_loader)


  | Name | Type      | Params
-----------------------------------
0 | C    | Embedding | 1.0 M 
1 | H    | Linear    | 15.1 K
2 | U    | Linear    | 500 K 
3 | W    | Linear    | 3.0 M 
-----------------------------------
4.5 M     Trainable params
0         Non-trainable params
4.5 M     Total params


Epoch 0:   0%|          | 8/2662 [00:00<01:52, 23.60it/s, loss=9.36, v_num=1]



Epoch 0:  30%|██▉       | 798/2662 [00:15<00:36, 50.41it/s, loss=7.02, v_num=1]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/533 [00:00<?, ?it/s][A
Epoch 0:  30%|███       | 802/2662 [00:15<00:37, 50.22it/s, loss=7.02, v_num=1]
Epoch 0:  32%|███▏      | 841/2662 [00:16<00:34, 52.33it/s, loss=7.02, v_num=1]
Epoch 0:  33%|███▎      | 881/2662 [00:16<00:32, 54.48it/s, loss=7.02, v_num=1]
Epoch 0:  35%|███▍      | 921/2662 [00:16<00:30, 56.58it/s, loss=7.02, v_num=1]
Epoch 0:  36%|███▌      | 961/2662 [00:16<00:29, 57.48it/s, loss=7.02, v_num=1]
Validating:  33%|███▎      | 176/533 [00:01<00:02, 139.41it/s][A
Epoch 0:  38%|███▊      | 1001/2662 [00:17<00:28, 58.29it/s, loss=7.02, v_num=1]
Validating:  40%|███▉      | 213/533 [00:01<00:02, 113.19it/s][A
Validating:  43%|████▎     | 227/533 [00:01<00:02, 107.24it/s][A
Epoch 0:  39%|███▉      | 1041/2662 [00:17<00:27, 59.07it/s, loss=7.02, v_num=1]
Validating:  47%|████▋     | 252/533 [00:01<00:02, 98.75it/s] [A
Valid

1

## 4. Check using gensim

In [39]:
import numpy as np

from gensim.models import KeyedVectors

### create pre-trained vectors file

In [34]:
# Extract the learned embedding table as a NumPy array.
# .cpu() is needed because Tensor.numpy() raises on CUDA tensors and the
# model is trained on GPUs; it is a no-op when the weights are already on CPU.
embedding = model.C.weight.detach().cpu().numpy()

In [38]:
# Write the embeddings in word2vec text format: a "<count> <dim>" header
# followed by one "<word> <v1> ... <vN>" line per word.
# The dimension is read from the matrix itself so that the header cannot
# disagree with the vectors that are actually written.
embed_dim = embedding.shape[1]
with open("./vectors.txt", "w", encoding="utf8") as f:
    # Index 0 ("<UNK>") is not exported, hence the -1 in the word count.
    f.write(f"{len(word_index)-1} {embed_dim}\n")
    for word, idx in word_index.items():
        if idx != 0:
            str_vec = " ".join(map(str, embedding[idx]))
            f.write(f"{word} {str_vec}\n")

In [40]:
# Load the exported vectors (text word2vec format) back through gensim.
nplm = KeyedVectors.load_word2vec_format("./vectors.txt", binary=False)

In [44]:
# `nplm` is already a KeyedVectors instance, so query it directly: the `.wv`
# indirection is deprecated on KeyedVectors in gensim 3.x and removed in 4.x.
nplm.most_similar("영화")

  """Entry point for launching an IPython kernel.


[('지루', 0.4514681398868561),
 ('순대', 0.32524368166923523),
 ('라오스', 0.32409751415252686),
 ('그만큼', 0.31804704666137695),
 ('순례', 0.3157411813735962),
 ('어웨이', 0.3132662773132324),
 ('도신', 0.30069756507873535),
 ('스킵', 0.2967820167541504),
 ('베리모어', 0.2919543385505676),
 ('원영의', 0.28816911578178406)]