# import tool

In [1]:
import os
import numpy as np
import pandas as pd

import random
import json
from tqdm import tqdm
import datetime
from statistics import mean 
import pickle

from IPython.display import clear_output
# torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split

# bert
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
clear_output()

# import self define module

In [2]:
from module.class_label_preprocessing import label_preprocess
with open('./module/label_encoding.pkl' , 'rb') as input:
    label_preprocessing = pickle.load(input)

# Parameter

In [3]:
PRETRAINED_MODEL_NAME = "bert-large-cased" # https://huggingface.co/transformers/pretrained_models.html

BATCH_SIZE = 64
EPOCHS = 50
DEVICE = "cuda: 1"
ifLIMIT = False
MAX_LENGTH = 100
ID = "bert_large_2"

# read data

In [4]:
with open('./data/X_train.json') as json_file:
    X_train = json.load(json_file)
with open('./data/y_train.json') as json_file:
    y_train = json.load(json_file)
with open('./data/X_test.json') as json_file:
    X_test = json.load(json_file)
with open('./data/y_test.json') as json_file:
    y_test = json.load(json_file)

In [5]:
if( ifLIMIT ):
    X_train = X_train[:100]
    y_train = y_train[:100]
    X_test = X_test[:100]
    y_test = y_test[:100]

In [6]:
len(y_train[0])

43

In [7]:
def create_data_loader(X, y, batch_size_):
    X_text = [i[0] for i in X]
    X_reply = [i[1] for i in X]
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

    buf = [tokenizer.encode_plus(i[0], i[1], do_lower_case = False, add_special_tokens = True, max_length = MAX_LENGTH, pad_to_max_length = True) for i in tqdm(X)]   
    input_ids = torch.LongTensor( [i['input_ids'] for i in buf] )
    token_type_ids = torch.LongTensor( [i['token_type_ids'] for i in buf])
    attention_mask = torch.LongTensor( [i['attention_mask'] for i in buf])

    label = torch.FloatTensor(y)

    dataset = TensorDataset(input_ids, token_type_ids, attention_mask, label)
    loader = torch.utils.data.DataLoader(dataset = dataset, batch_size = batch_size_, shuffle = True)

    return(loader)

In [8]:
train_loader = create_data_loader(X_train, y_train, batch_size_ = BATCH_SIZE)
train_loader_1 = create_data_loader(X_train, y_train, batch_size_ = 1)
test_loader = create_data_loader(X_test, y_test, batch_size_ = 1)

100%|██████████| 28800/28800 [00:12<00:00, 2335.07it/s]
100%|██████████| 28800/28800 [00:12<00:00, 2307.37it/s]
100%|██████████| 3200/3200 [00:01<00:00, 2360.36it/s]


In [9]:
# next(iter(train_loader))

# model

In [10]:
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, 
                                                      num_labels = len(y_train[0]))
model.to(DEVICE)
clear_output()

# optimizer

In [11]:
LEARNING_RATE = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

# Loss

In [12]:
loss_fn = nn.MSELoss(reduction='sum')

# Loss_for_test

In [13]:
# data_loader = test_loader

In [14]:
def acc_for_update(data_loader):
    buf = []
    with torch.no_grad():  
        for data in data_loader:
            input_ids, token_type_ids, attention_mask, labels = [t.to(DEVICE) for t in data]

            outputs = model(input_ids = input_ids, 
                                token_type_ids = token_type_ids, 
                                attention_mask = attention_mask) 

            predict_prop = list(outputs[0].cpu().detach().numpy()[0])
            X = list(np.arange(len(predict_prop)))
            X.sort(key=dict(zip(list(X), list(predict_prop))).get, reverse=True)

            predict_label = X[:6]

            true_label = list(np.where(data[3][0].numpy() == 1)[0])
            buf = buf + [ [i in predict_label for i in true_label] ]
            
    return( mean([mean(i) for i in buf]) )

# Start Training

In [None]:
state_of_the_art = 0
for epoch in range(EPOCHS):
    running_loss = 0.0
    
    for data in train_loader:    
        input_ids, token_type_ids, attention_mask, labels = [t.to(DEVICE) for t in data]
        
        optimizer.zero_grad()
        
         # forward pass
        outputs = model(input_ids = input_ids, 
                        token_type_ids = token_type_ids, 
                        attention_mask = attention_mask)          
        
        # loss
        loss = loss_fn(outputs[0], labels)
        # loss = loss_fn(torch.sigmoid(buf1), buf2)
        
        loss.backward()
        optimizer.step()
        running_loss = running_loss + loss.item()
        
    print("\n===EPOCH %d/%d==="% (epoch+1, EPOCHS))
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print("running_loss: %.4f" %  running_loss)
    print("evaluation score for training set: %.4f" % acc_for_update(train_loader_1))
    
    buf = acc_for_update(test_loader)
    print("evaluation score for testing set: %.4f" % buf)
    
    if( buf > state_of_the_art):
        save_directory = "%s_model_%d" % (ID, buf*10000)
        os.chdir("./model_save") 
        os.mkdir(save_directory)
        model.save_pretrained(save_directory)
        os.chdir("..")
        
        state_of_the_art = buf
        print("save model : %s" % save_directory)


===EPOCH 1/50===
2020-06-07 15:20:04
running_loss: 50909.3655
evaluation score for training set: 0.4137
evaluation score for testing set: 0.4055
save model : bert_large_2_model_4054

===EPOCH 2/50===
2020-06-07 15:38:16
running_loss: 37882.6317
evaluation score for training set: 0.4833
evaluation score for testing set: 0.4618
save model : bert_large_2_model_4618

===EPOCH 3/50===
2020-06-07 15:55:58
running_loss: 36800.7956
