In [None]:
import os
import sys

# Resolve `root_dir` as the directory two levels above the current working
# directory (presumably the project root when the notebook is launched from
# its own folder — TODO confirm).
root_dir = os.path.abspath(os.path.join(os.getcwd(),os.pardir,os.pardir))
# Put the project's source folder on the import path; presumably this is what
# makes the bare `import tokenization` below resolve — TODO confirm.
sys.path.append(os.path.join(root_dir,'src/laboro_distilbert'))

from tqdm.notebook import tqdm
import tokenization
from transformers import BatchEncoding
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import pickle

import numpy as np


In [None]:
# Directory holding the livedoor data, and the tokenized file for each split
# (train / dev / test, in that order).
livedoor_path = os.path.join(root_dir,'data/livedoor')
train_path, val_path, test_path = (
    os.path.join(livedoor_path, split_file)
    for split_file in ('train_tokenized.txt', 'dev_tokenized.txt', 'test_tokenized.txt')
)


In [None]:
def read_livedoor(path):
    """Read a tab-separated livedoor split file into parallel text/label lists.

    Each non-blank line is stripped and split on tab characters; the first
    field is kept as the text and the second field is parsed as an integer
    label. Any further fields are ignored. Blank lines are skipped.

    NOTE(review): the label ids presumably index the nine livedoor categories
    (dokujo-tsushin, it-life-hack, kaden-channel, livedoor-homme, movie-enter,
    peachy, smax, sports-watch, topic-news) — confirm against the script that
    produced the tokenized files.

    Args:
        path: Path to a UTF-8 encoded file with one example per line.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a list of str and ``labels``
        is a list of int, both in file order.

    Raises:
        IndexError: If a non-blank line has no tab-separated label field.
        ValueError: If the label field cannot be parsed as an integer.
    """
    texts = []
    labels = []
    # A context manager guarantees the handle is closed, even when a line
    # fails to parse part-way through the file.
    with open(path, encoding='utf8') as split_file:
        for line in split_file:
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. trailing newlines at end of file).
                continue
            fields = line.split('\t')
            texts.append(fields[0])
            labels.append(int(fields[1]))
    return texts, labels


In [None]:
# Load the (texts, labels) pair of every split, in train / dev / test order.
(train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels) = (
    read_livedoor(split_path) for split_path in (train_path, val_path, test_path)
)


In [None]:
def texts2encoding(texts,vocab_file,do_lower_case=True,max_seq_len=512):
    """Convert an iterable of texts into a padded ``BatchEncoding``.

    Every text is passed through ``text2feature`` and the resulting id and
    mask lists are collected under the ``input_ids`` and ``attention_mask``
    keys.

    Args:
        texts: Iterable of input strings; progress is reported with tqdm.
        vocab_file: Vocabulary file path handed to ``text2feature``.
        do_lower_case: Forwarded to ``text2feature``.
        max_seq_len: Forwarded to ``text2feature``; each row is padded to
            this length.

    Returns:
        A ``BatchEncoding`` with ``input_ids`` and ``attention_mask`` lists,
        one row per input text.
    """
    input_ids = []
    attention_mask = []
    for text in tqdm(texts):
        # Forward the caller's settings; they used to be hard-coded to
        # True / 512 here, which silently ignored this function's arguments.
        ids, mask = text2feature(
            text,
            vocab_file,
            do_lower_case=do_lower_case,
            max_seq_len=max_seq_len,
        )
        input_ids.append(ids)
        attention_mask.append(mask)

    return BatchEncoding({'input_ids':input_ids,'attention_mask':attention_mask})

# Tokenizers already built by text2feature, keyed by (vocab_file, do_lower_case),
# so the vocabulary is not re-read from disk for every single text.
_TOKENIZER_CACHE = {}

def text2feature(text,vocab_file,do_lower_case=True,max_seq_len=512,tokenizer=None):
    """Turn one text into padded ``input_ids`` and ``attention_mask`` lists.

    The text is tokenized, wrapped in ``[CLS]`` ... ``[SEP]``, converted to
    ids, and right-padded with id 0 (mask 0) up to ``max_seq_len``.

    Args:
        text: The string to encode.
        vocab_file: Vocabulary file used to build a
            ``tokenization.FullTokenizer`` when ``tokenizer`` is not given.
        do_lower_case: Passed to ``FullTokenizer`` when one is built here.
        max_seq_len: Exact length of both returned lists.
        tokenizer: Optional ready-made tokenizer exposing ``tokenize`` and
            ``convert_tokens_to_ids``. When omitted, a tokenizer is built once
            per ``(vocab_file, do_lower_case)`` pair and reused on later calls
            (assumes a tokenizer instance can be reused across texts).

    Returns:
        A ``(input_ids, attention_mask)`` tuple of two lists of length
        ``max_seq_len``.

    Raises:
        AssertionError: If the text, including the two special tokens, is
            longer than ``max_seq_len``.
    """
    if tokenizer is None:
        cache_key = (vocab_file, do_lower_case)
        tokenizer = _TOKENIZER_CACHE.get(cache_key)
        if tokenizer is None:
            tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
            _TOKENIZER_CACHE[cache_key] = tokenizer

    tokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
    _input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    _attention_mask = [1]*len(_input_ids)

    assert len(_input_ids)<=max_seq_len, (
        'sequence of %d tokens exceeds max_seq_len=%d' % (len(_input_ids), max_seq_len)
    )
    # Right-pad both lists in one step; padded positions get id 0 and mask 0.
    pad_len = max_seq_len - len(_input_ids)
    _input_ids.extend([0]*pad_len)
    _attention_mask.extend([0]*pad_len)

    return _input_ids,_attention_mask



In [None]:
vocab_file = os.path.join(root_dir,'model/laboro_distilbert/tokenizer/ccc_13g_unigram_vocab_lower.txt')

# Encode every split with the tokenizer vocabulary above.
train_encodings = texts2encoding(train_texts,vocab_file)
val_encodings = texts2encoding(val_texts,vocab_file)
test_encodings = texts2encoding(test_texts,vocab_file)

train_encodings_path = os.path.join(livedoor_path,'train_encodings.pickle')
val_encodings_path = os.path.join(livedoor_path,'dev_encodings.pickle')
test_encodings_path = os.path.join(livedoor_path,'test_encodings.pickle')

# Persist the encodings. The context managers make sure each file is flushed
# and closed instead of leaving the handle to the garbage collector.
with open(train_encodings_path,'wb') as fout:
    pickle.dump(train_encodings,fout)
with open(val_encodings_path,'wb') as fout:
    pickle.dump(val_encodings,fout)
with open(test_encodings_path,'wb') as fout:
    pickle.dump(test_encodings,fout)

In [None]:
class LivedoorDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with their labels.

    ``encodings`` is a mapping whose values are indexable per example and
    ``labels`` is an indexable sequence; the dataset length is the number of
    labels.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        The dict has one tensor per key of ``encodings`` plus a ``labels``
        tensor built from ``labels[idx]``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Reload the cached encodings. Only load pickles written by this notebook:
# unpickling an untrusted file can execute arbitrary code.
with open(train_encodings_path,'rb') as fin:
    train_encodings = pickle.load(fin)
with open(val_encodings_path,'rb') as fin:
    val_encodings = pickle.load(fin)
with open(test_encodings_path,'rb') as fin:
    test_encodings = pickle.load(fin)

train_dataset = LivedoorDataset(train_encodings, train_labels)
val_dataset = LivedoorDataset(val_encodings, val_labels)
test_dataset = LivedoorDataset(test_encodings, test_labels)

train_dataset_path = os.path.join(livedoor_path,'train_dataset.pickle')
val_dataset_path = os.path.join(livedoor_path,'dev_dataset.pickle')
test_dataset_path = os.path.join(livedoor_path,'test_dataset.pickle')

# Persist the datasets. The context managers make sure each file is flushed
# and closed instead of leaving the handle to the garbage collector.
with open(train_dataset_path,'wb') as fout:
    pickle.dump(train_dataset,fout)
with open(val_dataset_path,'wb') as fout:
    pickle.dump(val_dataset,fout)
with open(test_dataset_path,'wb') as fout:
    pickle.dump(test_dataset,fout)


In [None]:
# Reload the pickled datasets, closing each file as soon as it has been read.
# Only load pickles written by this notebook: unpickling an untrusted file can
# execute arbitrary code.
with open(train_dataset_path,'rb') as fin:
    train_dataset = pickle.load(fin)
with open(val_dataset_path,'rb') as fin:
    val_dataset = pickle.load(fin)
with open(test_dataset_path,'rb') as fin:
    test_dataset = pickle.load(fin)


In [None]:
# NOTE: `num_labels` in the model's config.json must be 9 for the livedoor news
# classification task.
# Rename config_livedoor.json to config.json before running this task, then use
# the (disabled) snippet below to confirm that model.num_labels equals 9.

'''
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

model_path = os.path.join(root_dir,'model/laboro_distilbert')
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.num_labels
'''


In [None]:
# Fine-tune the model stored under model/laboro_distilbert on the livedoor
# training split. Checkpoints and logs share the same output folder.
livedoor_output_dir = os.path.join(root_dir,'model/laboro_distilbert/output_livedoor')

training_args = TrainingArguments(
    output_dir=livedoor_output_dir,    # where checkpoints are written
    num_train_epochs=10,               # number of passes over the training set
    per_device_train_batch_size=8,     # training batch size per device
    per_device_eval_batch_size=64,     # evaluation batch size per device
    warmup_steps=500,                  # learning-rate scheduler warmup steps
    weight_decay=0.01,                 # weight decay strength
    logging_dir=livedoor_output_dir,   # where logs are written
    logging_steps=100,
)

model_path = os.path.join(root_dir,'model/laboro_distilbert')
model = DistilBertForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

trainer.train()

In [None]:
# Load a fine-tuned checkpoint and run it over the test split. The Trainer is
# used for prediction only here, hence zero training epochs.
livedoor_output_dir = os.path.join(root_dir,'model/laboro_distilbert/output_livedoor')

training_args = TrainingArguments(
    output_dir=livedoor_output_dir,    # where checkpoints are written
    num_train_epochs=0,                # no training in this cell
    per_device_train_batch_size=8,     # training batch size per device
    per_device_eval_batch_size=8,      # evaluation batch size per device
    warmup_steps=500,                  # learning-rate scheduler warmup steps
    weight_decay=0.01,                 # weight decay strength
    logging_dir=livedoor_output_dir,   # where logs are written
    logging_steps=10,
)

model_path = os.path.join(root_dir,'model/laboro_distilbert/output_livedoor/checkpoint-5500')
model = DistilBertForSequenceClassification.from_pretrained(model_path)

trainer = Trainer(model=model, args=training_args)

# Keep only the raw model outputs from the prediction result.
prediction_output = trainer.predict(test_dataset=test_dataset)
predictions = prediction_output.predictions


In [None]:
# Collapse the per-class scores into predicted class ids. The ndim guard keeps
# this cell re-runnable: once `predictions` already holds class ids, applying
# argmax with axis=1 again would raise an AxisError.
predictions = np.asarray(predictions)
if predictions.ndim > 1:
    predictions = np.argmax(predictions, axis=1)
correct_labels = test_dataset.labels

# The two lengths are printed so a size mismatch is visible at a glance.
print(len(predictions))
print(len(correct_labels))

# Accuracy = number of matching (prediction, label) pairs / number of predictions.
cor = sum(int(pred == gold) for pred, gold in zip(predictions, correct_labels))
print(cor/len(predictions))