In [None]:
!pip install lightning wandb konlpy

In [None]:
import torch
import random
import numpy as np

# One named seed shared by every RNG the notebook uses, so a fresh
# "Restart & Run All" repeats the same shuffling and weight initialisation.
SEED = 0

np.random.seed(SEED)
random.seed(SEED)
# The trailing semicolon keeps the cell from echoing manual_seed's return value.
torch.manual_seed(SEED);

In [1]:
from requests import get
from os.path import exists

def download(url, filename):
    """Download ``url`` to ``filename`` unless that file already exists.

    The response is fetched and its HTTP status checked *before* the target
    file is opened, so a failed request never leaves behind an empty file
    that a later run would mistake for a finished download.

    Args:
        url: Address of the resource to fetch.
        filename: Local path to write the response body to.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if exists(filename):
        print(f"{filename} already exists")
        return

    # A timeout keeps an unresponsive server from hanging the cell forever.
    response = get(url, timeout=60)
    response.raise_for_status()

    with open(filename, "wb") as file:
        file.write(response.content)

# Fetch both ratings files (skipped when they are already on disk).
download("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt")
download("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt")

# Preview the header line and the first four reviews. The encoding is given
# explicitly, as in the reads below, so the Korean text does not depend on the
# platform's default codec.
with open("ratings_train.txt", "r", encoding="utf-8") as file:
    for i in range(5):
        print(file.readline())

def load_ratings(path):
    """Read a tab-separated ratings file into a list of row lists.

    The first line (the ``id / document / label`` header) is dropped and empty
    lines are skipped; every remaining line is split on tabs.

    Args:
        path: Path of the UTF-8 encoded ratings file.

    Returns:
        A list with one list of string fields per data line.
    """
    with open(path, "r", encoding="utf-8") as file:
        lines = file.read().split("\n")[1:]
    return [line.split("\t") for line in lines if len(line) > 0]


train_data = load_ratings("ratings_train.txt")
test_data = load_ratings("ratings_test.txt")

id	document	label

9976970	아 더빙.. 진짜 짜증나네요 목소리	0

3819312	흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나	1

10265843	너무재밓었다그래서보는것을추천한다	0

9045019	교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정	0



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentClassifier(nn.Module):
    """Embedding + two-layer MLP over a fixed-length sequence of token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding (default 32).
        max_len: Number of token ids per input sequence (default 100).
        hidden_dim: Width of the hidden linear layer (default 100).
        num_classes: Number of output logits (default 2).
    """

    def __init__(self, vocab_size, embed_dim=32, max_len=100, hidden_dim=100, num_classes=2):
        super().__init__()
        # Width of one flattened sequence: all token embeddings side by side.
        self.flat_dim = embed_dim * max_len
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(self.flat_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; each sequence must contain
                ``max_len`` ids for the flattening below to line up.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised logits.
        """
        x = self.embedding(x)
        # Concatenate the embeddings of one sequence into a single feature row.
        x = x.view(-1, self.flat_dim)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [None]:
import lightning as pl

class SentimentClassifierPL(pl.LightningModule):
    """Lightning wrapper that trains and evaluates a classifier with cross-entropy.

    Logs ``train_loss`` per training step. For validation and test it logs the
    per-batch loss (``val_loss`` / ``test_loss``) and, at the end of the epoch,
    the mean of the batch losses (``avg_val_loss`` / ``avg_test_loss``) and the
    accuracy over all collected batches (``val_accuracy`` / ``test_accuracy``).

    Args:
        sentiment_classifier: Module mapping a batch of inputs to class logits.
    """

    def __init__(self, sentiment_classifier):
        super().__init__()
        self.model = sentiment_classifier
        self.loss = nn.CrossEntropyLoss()

        # Per-batch (loss, logits, labels) tuples, filled during an evaluation
        # epoch and emptied again once the epoch metrics have been logged.
        self.validation_step_outputs = []
        self.test_step_outputs = []
        # The wrapped network is a module already held as self.model, so it is
        # left out of the saved hyperparameters instead of being stored twice.
        self.save_hyperparameters(ignore=["sentiment_classifier"])

    def _evaluate_batch(self, batch, loss_name, store):
        """Run one evaluation batch, log its loss and remember its outputs."""
        inputs, labels = batch
        outputs = self.model(inputs)
        loss = self.loss(outputs, labels)
        self.log(loss_name, loss)
        store.append((loss, outputs, labels))
        return loss, outputs, labels

    def _log_epoch_metrics(self, store, stage):
        """Log mean batch loss and overall accuracy for ``stage``, then empty ``store``."""
        if not store:
            # Nothing was evaluated this epoch; stacking an empty list would raise.
            return

        avg_loss = torch.stack([x[0] for x in store]).mean()
        self.log(f"avg_{stage}_loss", avg_loss)

        all_outputs = torch.cat([x[1] for x in store])
        all_labels = torch.cat([x[2] for x in store])
        all_preds = all_outputs.argmax(dim=1)
        accuracy = (all_preds == all_labels).float().mean()
        self.log(f"{stage}_accuracy", accuracy)
        store.clear()

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss of one training batch."""
        inputs, labels = batch
        outputs = self.model(inputs)
        loss = self.loss(outputs, labels)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch; returns ``(loss, outputs, labels)``."""
        return self._evaluate_batch(batch, "val_loss", self.validation_step_outputs)

    def on_validation_epoch_end(self):
        """Log ``avg_val_loss`` and ``val_accuracy`` over the collected batches."""
        self._log_epoch_metrics(self.validation_step_outputs, "val")

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch; returns ``(loss, outputs, labels)``."""
        return self._evaluate_batch(batch, "test_loss", self.test_step_outputs)

    def on_test_epoch_end(self):
        """Log ``avg_test_loss`` and ``test_accuracy`` over the collected batches."""
        self._log_epoch_metrics(self.test_step_outputs, "test")

    def configure_optimizers(self):
        """Return an Adam optimizer over the wrapped model's parameters (lr=1e-3)."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        return optimizer

In [None]:
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    """Dataset of ``(tokens, label)`` pairs encoded as fixed-length id tensors.

    Args:
        data: Indexable collection whose items hold the tokens at position 0
            and the label (anything ``int()`` accepts) at position 1.
        vocab: Dict mapping token to integer id.
        max_len: Length every id sequence is truncated or padded to (default 100).
        pad_id: Id used for padding (default 0).
        unk_id: Id used for tokens missing from ``vocab`` (default 1).
    """

    def __init__(self, data, vocab, max_len=100, pad_id=0, unk_id=1):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len
        self.pad_id = pad_id
        self.unk_id = unk_id

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` tensors for the sample at ``index``."""
        label = int(self.data[index][1])
        tokens = self.data[index][0]

        token_ids = [self.vocab.get(token, self.unk_id) for token in tokens]

        # Truncate first, then right-pad; for an over-long sequence the pad
        # count is zero, so one code path covers both cases.
        token_ids = token_ids[:self.max_len]
        token_ids = token_ids + [self.pad_id] * (self.max_len - len(token_ids))

        return torch.tensor(token_ids), torch.tensor(label)

In [None]:
import wandb
from lightning.pytorch.loggers import WandbLogger

wandb.login()

def check_vocab_properties(vocab):
    """Print the number of vocabulary entries and its first five (token, id) pairs."""
    size = len(vocab)
    preview = list(vocab.items())[:5]
    print(f"Vocab size: {size}")
    print(f"Vocab items: {preview}")


def check_performance(vocab,train_data, test_data, wandb_log_name,
                      max_epochs=1, batch_size=64, num_workers=4, accelerator="auto"):
    """Train a fresh classifier on ``train_data`` and report metrics to W&B.

    Args:
        vocab: Token-to-id dict; its length is used as the embedding size.
        train_data: ``(tokens, label)`` pairs used for training.
        test_data: ``(tokens, label)`` pairs used for validation and testing.
        wandb_log_name: Name of the W&B run (project "NLP", group "Lec02").
        max_epochs: Number of training epochs (default 1).
        batch_size: Batch size of every data loader (default 64).
        num_workers: Worker processes per data loader (default 4).
        accelerator: Passed to ``pl.Trainer``; "auto" lets Lightning pick the
            available hardware instead of failing on machines without a GPU.
    """
    wandb_logger = WandbLogger(project="NLP", name=wandb_log_name, group="Lec02")

    try:
        model = SentimentClassifier(len(vocab))
        pl_model = SentimentClassifierPL(model)

        train_dataset = SentimentDataset(train_data, vocab)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

        # NOTE(review): validation and test both read `test_data`, so the
        # validation metrics are not an independent held-out estimate.
        eval_dataset = SentimentDataset(test_data, vocab)
        val_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
        test_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer = pl.Trainer(max_epochs=max_epochs,
                             accelerator=accelerator,
                             logger=wandb_logger
                             )

        trainer.fit(model=pl_model,
                    train_dataloaders=train_loader,
                    val_dataloaders=val_loader)

        # Pass the trained module explicitly so the in-memory weights are
        # evaluated rather than relying on an implicit checkpoint reload.
        trainer.test(model=pl_model, dataloaders=test_loader)
    finally:
        # Close the W&B run even when training fails, so the next call starts
        # a new run instead of logging into a stale one.
        wandb.finish()

### (기존) Char Vocab

In [None]:
# Character-level baseline: the tokenizer returns the text unchanged, so
# iterating over a sample's "tokens" yields its individual characters.
tokenizer = lambda x: x

tokenized_train_dataset = [(tokenizer(row[1]), row[2]) for row in train_data]
tokenized_test_dataset = [(tokenizer(row[1]), row[2]) for row in test_data]

# Ids 0 and 1 are reserved; every new character seen in the training split
# receives the next free id, which always equals the current vocab size.
char_vocab = {"[PAD]":0, "[UNK]":1}
for sample_tokens, _ in tokenized_train_dataset:
    for char in sample_tokens:
        if char not in char_vocab:
            char_vocab[char] = len(char_vocab)

char_vocab_idx = len(char_vocab)

In [None]:
check_vocab_properties(char_vocab)

In [None]:
check_performance(char_vocab, tokenized_train_dataset, tokenized_test_dataset, "char_vocab")

### Whitespace tokenizer를 이용한 단순 Vocab

In [None]:
from collections import Counter
from tqdm import tqdm

# Whitespace tokenisation: each whitespace-separated chunk is one token.
tokenizer = lambda x: x.split()

tokenized_train_dataset = [(tokenizer(row[1]), row[2]) for row in train_data]
tokenized_test_dataset = [(tokenizer(row[1]), row[2]) for row in test_data]

# Ids 0 and 1 are reserved; every unseen training token receives the next
# free id, which always equals the current vocab size.
whitespace_vocab = {"[PAD]":0, "[UNK]":1}
for sample_tokens, _ in tokenized_train_dataset:
    for word in sample_tokens:
        if word not in whitespace_vocab:
            whitespace_vocab[word] = len(whitespace_vocab)

whitespace_vocab_idx = len(whitespace_vocab)

In [None]:
check_vocab_properties(whitespace_vocab)

In [None]:
check_performance(whitespace_vocab, tokenized_train_dataset, tokenized_test_dataset, "whitespace")

### 등장빈도가 너무 적은 토큰 삭제

In [None]:
from collections import Counter

tokenizer = lambda x: x.split()

# Count how often each token occurs across the whole training split.
token_counter = Counter(
    token
    for sample_tokens, _ in tokenized_train_dataset
    for token in sample_tokens
)

In [None]:
len(token_counter)

In [None]:
token_counter.most_common(10)

In [None]:
token_counter.most_common()[-10:]

In [None]:
import matplotlib.pyplot as plt

def plot_frequency_distribution(counter):
    """Plot the counts stored in ``counter`` in its iteration order.

    Args:
        counter: Mapping from token to frequency, e.g. a ``collections.Counter``.
    """
    # A dict view is not a sequence; materialise it so matplotlib receives a
    # plain list of numbers.
    plt.plot(list(counter.values()))
    plt.ylabel('Token Frequency')
    plt.show()

In [None]:
plot_frequency_distribution(token_counter)

In [None]:
min_count = 1
min1removed_vocab = {"[PAD]":0, "[UNK]":1}

# Keep only tokens seen more than `min_count` times. A corpus token that
# happens to equal a reserved entry is skipped: overwriting "[PAD]"/"[UNK]"
# would move the reserved id and push later ids past len(vocab), which is the
# size the embedding table is built with.
for token, count in token_counter.items():
    if count > min_count and token not in min1removed_vocab:
        min1removed_vocab[token] = len(min1removed_vocab)

# Next free id (always equal to the vocabulary size).
min1removed_vocab_idx = len(min1removed_vocab)

In [None]:
check_vocab_properties(min1removed_vocab)

In [None]:
check_performance(min1removed_vocab, tokenized_train_dataset, tokenized_test_dataset, "whitespace_min_count_1")

In [None]:
token_counter.most_common()[-40:]

### 문장부호 및 자음, 모음(e.g. ㅋㅋ) 제거

In [None]:
import re

tokenizer = lambda x: x.split()


def _tokenize_cleaned(rows):
    """Blank out punctuation and standalone jamo in each row's text, then tokenize it."""
    cleaned_rows = []
    for row in rows:
        without_punct = re.sub(r'[,.!?;:()\"\'-]', ' ', row[1])
        without_jamo = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]', ' ', without_punct)
        cleaned_rows.append((tokenizer(without_jamo), row[2]))
    return cleaned_rows


tokenized_train_dataset = _tokenize_cleaned(train_data)
tokenized_test_dataset = _tokenize_cleaned(test_data)

# Token frequencies over the cleaned training split.
token_counter = Counter(
    token
    for sample_tokens, _ in tokenized_train_dataset
    for token in sample_tokens
)

In [None]:
len(token_counter)

In [None]:
token_counter.most_common(10)

In [None]:
token_counter.most_common()[-10:]

In [None]:
min_count = 1
cleaned_vocab = {"[PAD]":0, "[UNK]":1}

# Keep only tokens seen more than `min_count` times. A corpus token that
# happens to equal a reserved entry is skipped: overwriting "[PAD]"/"[UNK]"
# would move the reserved id and push later ids past len(vocab), which is the
# size the embedding table is built with.
for token, count in token_counter.items():
    if count > min_count and token not in cleaned_vocab:
        cleaned_vocab[token] = len(cleaned_vocab)

# Next free id (always equal to the vocabulary size).
cleaned_vocab_idx = len(cleaned_vocab)

In [None]:
check_vocab_properties(cleaned_vocab)

In [None]:
check_performance(cleaned_vocab, tokenized_train_dataset, tokenized_test_dataset, "whitespace_mc1_cleaned")

### 정규화와 Stemming (with Konlpy)

In [None]:
from konlpy.tag import Okt          #https://konlpy.org/ko/latest/install/#ubuntu

tokenizer = Okt().morphs

okt_test_str = "'다 알바생인가 내용도 없고 무서운거도 없고 웃긴거도 하나도 없음 완전 별싱거운 영화.ㅇ.ㅇ내ㅇ시간 넘 아까움 .. . 완전 낚잌ㅋㅋ"

# Tokenize the same sentence three ways: default options, with norm=True,
# and with norm=True plus stem=True.
for morph_options in ({}, {"norm": True}, {"norm": True, "stem": True}):
    print(tokenizer(okt_test_str, **morph_options))

In [None]:
from tqdm import tqdm
import pickle

okt_train_file = "okt_train_dataset.pkl"
okt_test_file = "okt_test_dataset.pkl"

# Reuse the cached tokenisation only when BOTH pickles are present; if a run
# was interrupted after writing just the train file, everything is rebuilt
# instead of failing on the missing test file.
# NOTE(review): pickle.load can execute code stored in the file, so only load
# cache files that this notebook wrote itself.
if exists(okt_train_file) and exists(okt_test_file):
    print(f"{okt_train_file} already exists")
    with open(okt_train_file, "rb") as file:
        tokenized_train_dataset = pickle.load(file)
    with open(okt_test_file, "rb") as file:
        tokenized_test_dataset = pickle.load(file)

else:
    tokenized_train_dataset = []
    tokenized_test_dataset = []

    for data in tqdm(train_data):
        # Blank out punctuation and standalone jamo before morpheme analysis.
        text = re.sub(r'[,.!?;:()\"\'-]', ' ', data[1])
        text = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]', ' ', text)
        tokens = tokenizer(text)
        labels = data[2]
        tokenized_train_dataset.append((tokens, labels))

    with open(okt_train_file, "wb") as file:
        pickle.dump(tokenized_train_dataset, file)

    for data in tqdm(test_data):
        text = re.sub(r'[,.!?;:()\"\'-]', ' ', data[1])
        text = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]', ' ', text)
        tokens = tokenizer(text)
        labels = data[2]
        tokenized_test_dataset.append((tokens, labels))

    with open(okt_test_file, "wb") as file:
        pickle.dump(tokenized_test_dataset, file)

In [None]:
# Token frequencies over the current tokenized training split.
token_counter = Counter(
    token
    for sample_tokens, _ in tokenized_train_dataset
    for token in sample_tokens
)

In [None]:
token_counter.most_common(10)

In [None]:
token_counter.most_common()[-10:]

In [None]:
plot_frequency_distribution(token_counter)

In [None]:
min_count = 1
okt_vocab = {"[PAD]":0, "[UNK]":1}

# Keep only tokens seen more than `min_count` times. A corpus token that
# happens to equal a reserved entry is skipped: overwriting "[PAD]"/"[UNK]"
# would move the reserved id and push later ids past len(vocab), which is the
# size the embedding table is built with.
for token, count in token_counter.items():
    if count > min_count and token not in okt_vocab:
        okt_vocab[token] = len(okt_vocab)

# Next free id (always equal to the vocabulary size).
okt_vocab_idx = len(okt_vocab)

In [None]:
check_vocab_properties(okt_vocab)

In [None]:
check_performance(okt_vocab, tokenized_train_dataset, tokenized_test_dataset, "okt")

In [None]:
okt_train_file = "okt_norm_train_dataset.pkl"
okt_test_file = "okt_norm_test_dataset.pkl"

# Reuse the cached tokenisation only when BOTH pickles are present; if a run
# was interrupted after writing just the train file, everything is rebuilt
# instead of failing on the missing test file.
# NOTE(review): pickle.load can execute code stored in the file, so only load
# cache files that this notebook wrote itself.
if exists(okt_train_file) and exists(okt_test_file):
    print(f"{okt_train_file} already exists")
    with open(okt_train_file, "rb") as file:
        tokenized_train_dataset = pickle.load(file)
    with open(okt_test_file, "rb") as file:
        tokenized_test_dataset = pickle.load(file)

else:
    tokenized_train_dataset = []
    tokenized_test_dataset = []

    for data in tqdm(train_data):
        # Blank out punctuation and standalone jamo before morpheme analysis.
        text = re.sub(r'[,.!?;:()\"\'-]', ' ', data[1])
        text = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]', ' ', text)
        tokens = tokenizer(text, norm=True)
        labels = data[2]
        tokenized_train_dataset.append((tokens, labels))

    with open(okt_train_file, "wb") as file:
        pickle.dump(tokenized_train_dataset, file)

    for data in tqdm(test_data):
        text = re.sub(r'[,.!?;:()\"\'-]', ' ', data[1])
        text = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]', ' ', text)
        tokens = tokenizer(text, norm=True)
        labels = data[2]
        tokenized_test_dataset.append((tokens, labels))

    with open(okt_test_file, "wb") as file:
        pickle.dump(tokenized_test_dataset, file)

In [None]:
# Token frequencies over the current tokenized training split.
token_counter = Counter(
    token
    for sample_tokens, _ in tokenized_train_dataset
    for token in sample_tokens
)

In [None]:
token_counter.most_common(10)

In [None]:
token_counter.most_common()[-10:]

In [None]:
min_count = 1
okt_norm_vocab = {"[PAD]":0, "[UNK]":1}

# Keep only tokens seen more than `min_count` times. A corpus token that
# happens to equal a reserved entry is skipped: overwriting "[PAD]"/"[UNK]"
# would move the reserved id and push later ids past len(vocab), which is the
# size the embedding table is built with.
for token, count in token_counter.items():
    if count > min_count and token not in okt_norm_vocab:
        okt_norm_vocab[token] = len(okt_norm_vocab)

# Next free id (always equal to the vocabulary size).
okt_norm_vocab_idx = len(okt_norm_vocab)

In [None]:
check_vocab_properties(okt_norm_vocab)

In [None]:
check_performance(okt_norm_vocab, tokenized_train_dataset, tokenized_test_dataset, "okt_norm")

In [None]:
okt_train_file = "okt_stem_train_dataset.pkl"
okt_test_file = "okt_stem_test_dataset.pkl"

# Reuse the cached tokenisation only when BOTH pickles are present; if a run
# was interrupted after writing just the train file, everything is rebuilt
# instead of failing on the missing test file.
# NOTE(review): pickle.load can execute code stored in the file, so only load
# cache files that this notebook wrote itself.
if exists(okt_train_file) and exists(okt_test_file):
    print(f"{okt_train_file} already exists")
    with open(okt_train_file, "rb") as file:
        tokenized_train_dataset = pickle.load(file)
    with open(okt_test_file, "rb") as file:
        tokenized_test_dataset = pickle.load(file)

else:
    tokenized_train_dataset = []
    tokenized_test_dataset = []

    for data in tqdm(train_data):
        # Blank out punctuation and standalone jamo before morpheme analysis.
        text = re.sub(r'[,.!?;:()\"\'-]', ' ', data[1])
        text = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]', ' ', text)
        tokens = tokenizer(text, norm=True, stem=True)
        labels = data[2]
        tokenized_train_dataset.append((tokens, labels))

    with open(okt_train_file, "wb") as file:
        pickle.dump(tokenized_train_dataset, file)

    for data in tqdm(test_data):
        text = re.sub(r'[,.!?;:()\"\'-]', ' ', data[1])
        text = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]', ' ', text)
        tokens = tokenizer(text, norm=True, stem=True)
        labels = data[2]
        tokenized_test_dataset.append((tokens, labels))

    with open(okt_test_file, "wb") as file:
        pickle.dump(tokenized_test_dataset, file)

In [None]:
# Token frequencies over the current tokenized training split.
token_counter = Counter(
    token
    for sample_tokens, _ in tokenized_train_dataset
    for token in sample_tokens
)

In [None]:
token_counter.most_common(10)

In [None]:
token_counter.most_common()[-10:]

In [None]:
min_count = 1
okt_stem_vocab = {"[PAD]":0, "[UNK]":1}

# Keep only tokens seen more than `min_count` times. A corpus token that
# happens to equal a reserved entry is skipped: overwriting "[PAD]"/"[UNK]"
# would move the reserved id and push later ids past len(vocab), which is the
# size the embedding table is built with.
for token, count in token_counter.items():
    if count > min_count and token not in okt_stem_vocab:
        okt_stem_vocab[token] = len(okt_stem_vocab)

# Next free id (always equal to the vocabulary size).
okt_stem_vocab_idx = len(okt_stem_vocab)

In [None]:
check_vocab_properties(okt_stem_vocab)

In [None]:
check_performance(okt_stem_vocab, tokenized_train_dataset, tokenized_test_dataset, "okt_stem")