In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from kobert_transformers import get_kobert_model, get_tokenizer

In [2]:
MODEL_NAME = "monologg/kobert"
electra = "monologg/koelectra-small-v2-discriminator"

model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
model.classifier.out_features=1
#model.cuda()
tokenizer = get_tokenizer()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
tokens = tokenizer.tokenize("한국어 모델을 공유합니다")
print("Tokens: {}".format(tokens))

# This is not sufficient for the model, as it requires integers as input, 
# not a problem, let's convert tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the required special tokens
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Now we're ready to go through BERT with out input
output = model(tokens_pt)
#outputs = output['last_hidden_state']
#pooled = output['pooler_output']
#print("Token wise output: {}, Pooled output: {}".format(outputs.shape, pooled.shape))

Tokens: ['▁한국', '어', '▁모델', '을', '▁공유', '합니다']
Tokens id: [4958, 6855, 2046, 7088, 1050, 7843]
Tokens PyTorch: tensor([[   2, 4958, 6855, 2046, 7088, 1050, 7843,    3]])


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('./data/news_train.csv')
train, val, train_label, val_label = train_test_split(dataset['content'], dataset['info'])

In [5]:
train_encodings = tokenizer(list(train.values), truncation=True, padding=True)
val_encodings = tokenizer(list(val.values), truncation=True, padding=True)

#print(tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][0]))

In [6]:
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = NewsDataset(train_encodings, train_label)
val_dataset = NewsDataset(val_encodings, val_label)

In [11]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        
model.eval()

KeyError: 83743