In [1]:
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer
from kobert_transformers import get_kobert_model, get_tokenizer

In [2]:
# Checkpoint identifiers referenced by this notebook
# (presumably Hugging Face Hub ids — verify before relying on them).
MODEL_NAME = "monologg/kobert"
electra = "monologg/koelectra-small-v2-discriminator"

# Load the KoBERT model and its tokenizer through the kobert_transformers helpers.
model, tokenizer = get_kobert_model(), get_tokenizer()

In [3]:
# Split the sentence into subword tokens.
tokens = tokenizer.tokenize("한국어 모델을 공유합니다")
print("Tokens: {}".format(tokens))

# The model consumes integer ids, not strings, so map each token to its vocabulary id.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the special tokens the model expects around the sequence.
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# Wrap the ids in an outer list so the tensor has a leading batch dimension of 1.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Now we're ready to go through BERT with our input.
# This is inference only, so skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    output = model(tokens_pt)
outputs = output['last_hidden_state']   # one hidden vector per token
pooled = output['pooler_output']        # one vector for the whole sequence
print("Token wise output: {}, Pooled output: {}".format(outputs.shape, pooled.shape))

Tokens: ['▁한국', '어', '▁모델', '을', '▁공유', '합니다']
Tokens id: [4958, 6855, 2046, 7088, 1050, 7843]
Tokens PyTorch: tensor([[   2, 4958, 6855, 2046, 7088, 1050, 7843,    3]])
Token wise output: torch.Size([1, 8, 768]), Pooled output: torch.Size([1, 768])


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split (and every result derived from it)
# is identical after Restart Kernel -> Run All.
SPLIT_SEED = 42

# Load the news corpus and split texts ('content') and labels ('info') into
# train / validation parts using train_test_split's default split proportion.
dataset = pd.read_csv('./data/news_train.csv')
train, val, train_label, val_label = train_test_split(
    dataset['content'],
    dataset['info'],
    random_state=SPLIT_SEED,
)

In [5]:
# Tokenizer options shared by both splits: truncate over-long texts and pad
# every sequence within a call to a common length.
encode_kwargs = dict(truncation=True, padding=True)

train_encodings = tokenizer(list(train.values), **encode_kwargs)
val_encodings = tokenizer(list(val.values), **encode_kwargs)

# Decode the first training example back to tokens to eyeball the encoding.
first_train_ids = train_encodings['input_ids'][0]
print(tokenizer.convert_ids_to_tokens(first_train_ids))

['[CLS]', '▁매출액', '은', '▁2', '7%', '▁줄어든', '▁27', '억', '4', '800', '만', '▁달러', '를', '▁기록했다', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [6]:
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            per-example sequence; anything exposing ``.items()`` works.
        labels: One label per example, in the same order as the rows of
            ``encodings``. May be a list, array, or pandas Series.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Store labels positionally. A pandas Series produced by
        # train_test_split keeps the shuffled original row index, so
        # ``labels[idx]`` would be a label-based lookup: it either raises
        # KeyError or returns the label of a different row than
        # ``encodings[idx]``. A plain list is always indexed by position.
        self.labels = list(labels)

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        item = {
            name: torch.tensor(values[idx])
            for name, values in self.encodings.items()
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels so they can be handed to the Trainer.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_label)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_label)

In [9]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Trainer needs a model whose forward pass accepts `labels` and returns a loss.
# The `model` loaded via get_kobert_model() is used above as a plain encoder
# (it yields last_hidden_state / pooler_output only), so load the same
# checkpoint with a sequence-classification head under a separate name.
# Assumes the 'info' column holds integer class ids 0..num_labels-1 — TODO confirm.
num_labels = int(dataset['info'].nunique())
clf_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log training metrics every 10 steps
)

trainer = Trainer(
    model=clf_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

ImportError: cannot import name 'Bert' from 'transformers' (/home/jiuk/miniconda3/envs/nh/lib/python3.8/site-packages/transformers/__init__.py)

In [23]:
# Sanity check: show which fields a single training example exposes.
train_dataset[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])