In [None]:
!pip install sentencepiece
!pip install transformers
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
# Quick-run switch read by load_data(): when True, at most 50 rows are taken
# from each topic CSV; when False, every row of each CSV is used.
SHORT_MODE = True # True for first ppt, False for second ppt

In [None]:
# import packages

import os
import csv
import json

import numpy as np

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

In [None]:
# create tokenizer
# example code from https://github.com/SKT-AI/KoGPT2
# NOTE(review): the checkpoint loaded below is 'skt/kobert-base-v1' and the tokenizer
# package is installed from the SKTBrain/KoBERT repository, so the KoGPT2 link above
# may be stale — confirm the intended source.

from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

# Load the tokenizer and the BERT encoder from the same pretrained checkpoint id.
# Both objects are module-level globals used by load_data() and main().
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1')

# First CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
def move_to(obj, device):
    """Recursively copy every tensor nested inside ``obj`` onto ``device``.

    Args:
        obj: A tensor, or an arbitrarily nested dict / list / tuple whose
            leaves are all tensors.
        device: Target handed to ``Tensor.to`` (a ``torch.device`` or a
            device string).

    Returns:
        The moved tensor for tensor input, a new dict for dict input, and a
        new *list* for both list and tuple input (tuples are not preserved).
        The input containers themselves are left untouched.

    Raises:
        TypeError: If a leaf is neither a tensor nor one of the supported
            containers. The message names the offending type.
    """
    if torch.is_tensor(obj):
        return obj.to(device)
    if isinstance(obj, dict):
        return {key: move_to(value, device) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [move_to(item, device) for item in obj]
    # Report the offending type in the exception itself instead of printing the
    # whole object to stdout before raising.
    raise TypeError(f"Invalid type for move_to: {type(obj).__name__}")

In [None]:
# load data + preprocessing

class TokenDataset(Dataset):
    """Dataset that tokenizes every sentence once, at construction time.

    Each item is a ``(features, label)`` tuple where ``features`` is a dict
    with the keys ``input_ids``, ``attention_mask`` and ``token_type_ids``
    and ``label`` is ``torch.tensor(label)``.
    """

    def __init__(self, data_text, data_label, tokenizer):
        self.data_text = data_text
        self.data_label = data_label
        self.tokenizer = tokenizer

        # Encode eagerly so __getitem__ is a plain list lookup.
        self.data = [
            self._encode(self.data_text[position], self.data_label[position])
            for position in range(len(self))
        ]

    def _encode(self, sentence, label):
        """Tokenize one sentence and pair the resulting tensors with its label."""
        # Ask the tokenizer for PyTorch tensors, truncated and padded to 512.
        encoded = self.tokenizer(
            sentence,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            max_length=512
        )

        # squeeze(0) removes the leading dimension of the tokenizer output.
        mask = encoded['attention_mask'].squeeze(0)
        features = {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': mask,
            # All-zero segment ids with the same shape/dtype as the mask.
            'token_type_ids': torch.zeros_like(mask),
        }
        return (features, torch.tensor(label))

    def __len__(self):
        return len(self.data_text)

    def __getitem__(self, idx):
        return self.data[idx]

def load_data(data_dir, crawled_dir, source):
    """Load paired texts for one topic and label them 1.0 / 0.0.

    Reads ``<data_dir>/<source>.csv`` (UTF-8) and
    ``<crawled_dir>/<source>.csv`` (CP949). For every row of the first file,
    two texts are emitted in this order:

    1. column 5 of that row, labelled ``1.0``;
    2. column 4 of the crawled row whose index is given by column 0 of that
       row (rounded to the nearest int), labelled ``0.0``.

    Relies on the module-level ``tokenizer`` and ``SHORT_MODE`` globals. When
    ``SHORT_MODE`` is true at most 50 rows of the first file are used.

    Args:
        data_dir: Directory containing the first CSV.
        crawled_dir: Directory containing the crawled CSV.
        source: Topic name; used as the CSV file stem in both directories.

    Returns:
        ``(texts, labels)``: ``texts`` is a list of space-joined token strings
        and ``labels`` is a list of floats alternating ``1.0, 0.0``.
    """
    path = os.path.join(data_dir, "{}.csv".format(source))
    crawled_path = os.path.join(crawled_dir, "{}.csv".format(source))

    # Context managers close the files deterministically; newline="" is what
    # the csv module requires so quoted fields with line breaks parse correctly.
    with open(path, encoding="utf8", newline="") as data_file:
        dataset = list(csv.reader(data_file))
    with open(crawled_path, encoding="cp949", newline="") as crawled_file:
        crawled_dataset = list(csv.reader(crawled_file))

    n = len(dataset)
    length = min(50, n) if SHORT_MODE else n

    # NOTE(review): texts are returned as space-joined token strings and are
    # tokenized a second time by TokenDataset — confirm this is intended.
    texts = []
    for row in dataset[:length]:
        crawled_idx = int(round(float(row[0])))
        texts.append(' '.join(tokenizer.tokenize(row[5])))
        texts.append(' '.join(tokenizer.tokenize(crawled_dataset[crawled_idx][4])))

    # One (1.0, 0.0) pair per processed row, matching the append order above.
    labels = [1.0, 0.0] * length
    return texts, labels

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by a linear head with a sigmoid output.

    The representation at sequence position 0 of the encoder's
    ``last_hidden_state`` is (optionally) passed through dropout, projected by
    a linear layer and squashed with a sigmoid.

    Args:
        bert: Callable encoder; invoked with ``input_ids``, ``attention_mask``
            and ``token_type_ids`` keyword arguments and expected to return a
            mapping containing ``'last_hidden_state'``.
        hidden_size: Size of the encoder's hidden vectors.
        num_classes: Number of outputs of the linear head.
        dr_rate: Dropout probability; ``None`` or ``0`` disables dropout.
        params: Unused; kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=1,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        # Always define the attribute: forward() reads it, and a missing
        # attribute raised AttributeError whenever dr_rate was left unset.
        self.dropout = nn.Dropout(p=dr_rate) if dr_rate else None

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return sigmoid outputs of shape ``(batch, num_classes)``."""
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Use the hidden state at position 0 as the sequence summary.
        pooled = output['last_hidden_state'][:, 0, :]
        if self.dropout is not None:
            pooled = self.dropout(pooled)
        logits = self.classifier(pooled)
        return torch.sigmoid(logits)

In [None]:
# train and evaluation function

def train(model, data_loader, loss_fn, optimizer):
    """Run one training epoch and report mean loss and accuracy.

    Assumes the model emits one sigmoid probability per sample, i.e. an output
    of shape ``(batch, 1)``; a sample is predicted positive when that
    probability is at least 0.5.

    Args:
        model: Module called as ``model(**inputs)``.
        data_loader: Iterable of ``(inputs, labels)`` batches that exposes a
            sized ``dataset`` attribute.
        loss_fn: Callable ``loss_fn(predictions, labels)`` returning a scalar.
        optimizer: Optimizer stepping the model's parameters.

    Returns:
        ``(mean_loss, accuracy)``, both averaged over ``len(data_loader.dataset)``.
    """
    model.train()

    running_loss = 0.0
    corr = 0

    for inputs, labels in data_loader:
        optimizer.zero_grad()
        probs = model(**inputs).squeeze(dim=1)
        loss = loss_fn(probs, labels)
        loss.backward()
        optimizer.step()

        # Threshold the probability. Taking max over dim=1 of a (batch, 1)
        # tensor always yields index 0, which made every prediction "0".
        pred = (probs.detach() >= 0.5).to(labels.dtype)
        corr += pred.eq(labels).sum().item()
        # Weight by batch size so a smaller final batch is averaged correctly.
        running_loss += loss.item() * labels.size(0)

    num_samples = len(data_loader.dataset)
    return running_loss / num_samples, corr / num_samples

def evaluate(model, data_loader, loss_fn):
    """Compute mean loss and accuracy over a data loader without gradients.

    Assumes the model emits one sigmoid probability per sample, i.e. an output
    of shape ``(batch, 1)``; a sample is predicted positive when that
    probability is at least 0.5.

    Args:
        model: Module called as ``model(**inputs)``.
        data_loader: Iterable of ``(inputs, labels)`` batches that exposes a
            sized ``dataset`` attribute.
        loss_fn: Callable ``loss_fn(predictions, labels)`` returning a scalar.

    Returns:
        ``(mean_loss, accuracy)``, both averaged over ``len(data_loader.dataset)``.
    """
    model.eval()

    corr = 0
    running_loss = 0.0

    with torch.no_grad():
        for inputs, labels in data_loader:
            # Drop the trailing size-1 dimension so predictions and labels have
            # the same shape; comparing (batch, 1) with (batch,) broadcasts to
            # (batch, batch) and produces a meaningless loss.
            probs = model(**inputs).squeeze(dim=1)

            # Threshold rather than argmax: max over dim=1 of a (batch, 1)
            # tensor is always index 0.
            pred = (probs >= 0.5).to(labels.dtype)
            corr += pred.eq(labels).sum().item()

            running_loss += loss_fn(probs, labels).item() * labels.size(0)

    num_samples = len(data_loader.dataset)
    return running_loss / num_samples, corr / num_samples

In [None]:
# main function

def main(
    data_dir="data/",
    crawl_dir="output/",
    log_dir="log/",
    topics=["culture", "economy", "it_science", "politics", "society", "world"],
    train_test_ratio=0.1,
    num_epoch=100
):
    """Train the classifier on all topics and write test metrics to JSON.

    Steps: load texts/labels for every topic, split them into train and test
    sets, fine-tune ``BERTClassifier`` for ``num_epoch`` epochs, then score the
    test set and dump the metrics to ``<log_dir>/result.json``.

    Args:
        data_dir: Directory passed to ``load_data`` as its first CSV location.
        crawl_dir: Directory passed to ``load_data`` as the crawled CSV location.
        log_dir: Directory for ``result.json``; created if it does not exist.
        topics: Topic names; one CSV per topic is read from each directory.
            The list is only iterated, never modified.
        train_test_ratio: ``test_size`` given to ``train_test_split``.
        num_epoch: Number of training epochs.

    Side effects:
        Prints progress and the final metrics; writes ``result.json``.
    """
    torch.manual_seed(42)

    texts_list, labels_list = [], []
    for topic in topics:
        texts, labels = load_data(data_dir, crawl_dir, topic)
        texts_list.extend(texts)
        labels_list.extend(labels)

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts_list, labels_list, test_size=train_test_ratio, random_state=42, shuffle=True,
    )

    train_dataset = TokenDataset(texts_train, labels_train, tokenizer)
    test_dataset = TokenDataset(texts_test, labels_test, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    # The test loader is NOT shuffled so that the predictions collected after
    # training line up index-for-index with labels_test.
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    print('Loading done')

    # NOTE: model and batches stay on the default (CPU) device, as before.
    # Moving the model to `device` also requires moving every batch there.
    model = BERTClassifier(bertmodel, dr_rate=0.5)
    optimizer = Adam(model.parameters(), lr=1e-5)
    loss_fn = nn.MSELoss()

    for epoch in range(1, num_epoch + 1):
        print(f'[Epoch {epoch}]', end=' ')

        train_loss, train_acc = train(model, train_loader, loss_fn, optimizer)
        val_loss, val_acc = evaluate(model, test_loader, loss_fn)

        print(f'train_loss / train_acc / test_loss / test_acc : {train_loss} / {train_acc} / {val_loss} / {val_acc}')

    # The model is a plain nn.Module with no predict()/predict_proba() methods,
    # so collect the positive-class probabilities with a forward pass.
    model.eval()
    proba_chunks = []
    with torch.no_grad():
        for inputs, _ in test_loader:
            proba_chunks.append(model(**inputs).reshape(-1))
    result_proba = torch.cat(proba_chunks).cpu().numpy().astype(np.float64)
    labels_arr = np.asarray(labels_test, dtype=np.float64)
    result = (result_proba >= 0.5).astype(int)

    # Confusion-matrix counts.
    kind = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
    for res, label in zip(result, labels_test):
        if res == 1:
            kind["tp" if res == label else "fp"] += 1
        else:
            kind["tn" if res == label else "fn"] += 1

    # Guard each ratio against an empty denominator (e.g. no positive predictions).
    predicted_pos = kind["tp"] + kind["fp"]
    actual_pos = kind["tp"] + kind["fn"]
    precision = kind["tp"] / predicted_pos if predicted_pos else 0.0
    recall = kind["tp"] / actual_pos if actual_pos else 0.0
    f_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    accuracy = (kind["tp"] + kind["tn"]) / len(labels_test)

    # Binary cross-entropy; clip so log() never sees exactly 0 or 1.
    eps = 1e-12
    clipped = np.clip(result_proba, eps, 1.0 - eps)
    ce_loss = -np.mean(labels_arr * np.log(clipped) + (1.0 - labels_arr) * np.log(1.0 - clipped))

    data = {
        # Computed from the predictions above rather than taken from the last
        # epoch's evaluate() call, so it is defined even when num_epoch is 0.
        "test_accuracy": accuracy,
        "test_precision": precision,
        "test_recall": recall,
        "F_score": f_score,
        # Mean of squared errors (the old expression squared the summed error).
        "mse_loss": float(np.mean((labels_arr - result_proba) ** 2)),
        "ce_loss": float(ce_loss),
        "label_and_result": list(zip(labels_test, result.tolist(), result_proba.tolist())),
    }
    print(data)

    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(os.path.join(log_dir, "result.json"), "w") as result_file:
        json.dump(data, result_file, indent=4)

In [None]:
# run main function
# Uses main()'s defaults: CSVs are read from data/ and output/, training runs
# for 100 epochs, and the metrics are written to log/result.json.

main()