In [1]:
import pickle as pickle
import os
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, XLMRobertaConfig, XLMRobertaTokenizer
from transformers import XLMRobertaModel
import numpy as np
import matplotlib.pyplot as plt
import random
from itertools import chain
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import copy
import csv
import json
import logging
import os
import torch.nn as nn
from tqdm.auto import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
import logging
import torch.nn.functional as F

In [2]:
# Seed every random number generator used in this notebook with the same value.
random_seed=42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)  # needed when several GPUs are used
# Request deterministic cuDNN behaviour and switch off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [3]:
# Dataset wrapper around the tokenized feature columns.
class RE_Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of feature columns.

    ``tokenized_dataset`` maps a feature name to an indexable column (tensor,
    list, ...). It must contain a ``'label'`` column, which defines the length.
    """

    def __init__(self, tokenized_dataset):
        self.tokenized_dataset = tokenized_dataset

    def __getitem__(self, idx):
        """Return ``{feature_name: tensor}`` for the example at position ``idx``."""
        item = {}
        for key, val in self.tokenized_dataset.items():
            row = val[idx]
            if torch.is_tensor(row):
                # torch.tensor(<Tensor>) emits a copy-construct warning;
                # clone().detach() produces the same independent copy silently.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Number of examples, taken from the ``'label'`` column."""
        return len(self.tokenized_dataset['label'])

# Convert the freshly loaded TSV frame into the DataFrame layout used downstream.
# See the baseline code description image for the target layout.
def preprocessing_dataset(dataset):
    """Return a new DataFrame with only the columns the pipeline uses.

    The columns ``sentence``, ``entity_01``, ``entity_02`` and ``label`` are
    copied (in that order) into a frame with a fresh 0..n-1 index; every other
    column of ``dataset`` is discarded.
    """
    kept_columns = ('sentence', 'entity_01', 'entity_02', 'label')
    return pd.DataFrame({column: list(dataset[column]) for column in kept_columns})

# Load a TSV file.
def load_data(dataset_dir):
    """Read the tab-separated file at ``dataset_dir`` and return it after
    ``preprocessing_dataset`` has reduced it to the columns used downstream.
    """
    raw_frame = pd.read_csv(dataset_dir, sep='\t')
    return preprocessing_dataset(raw_frame)

# XLMRoberta input을 위한 tokenizing.
# tip! 다양한 종류의 tokenizer와 special token들을 활용하는 것으로도 새로운 시도를 해볼 수 있습니다.
# baseline code에서는 2가지 부분을 활용했습니다.
# def append_token(dataset, tokenizer):
#     for (ex_index, example) in enumerate(dataset):
        
    

# def tokenized_dataset(dataset, tokenizer):
#     concat_entity = []
#     for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
#         temp = ''
#         temp = e01 + '[SEP]' + e02
#         concat_entity.append(temp)
#         tokenized_sentences = tokenizer(
#       #concat_entity,
#           list(dataset['sentence']),
#           return_tensors="pt",
#           padding=True,
#           truncation=True,
#           max_length=150,
#           add_special_tokens=True,
#           )
#     return tokenized_sentences

def tokenized_dataset_len(dataset, tokenizer):
    """Tokenize every entry of ``dataset['sentence']``.

    Returns a list with one token list per sentence (callers take ``len`` of
    each entry to study the length distribution).
    """
    return [tokenizer.tokenize(text) for text in dataset['sentence']]


In [4]:
def compute_metrics(preds, labels):
    """Check that predictions and labels have the same length, then return the
    metric dict produced by ``acc_and_f1``.
    """
    n_preds, n_labels = len(preds), len(labels)
    assert n_preds == n_labels
    return acc_and_f1(preds, labels)


def simple_accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    Both arguments are converted with ``np.asarray`` first, so plain Python
    sequences work as well as NumPy arrays (a list-to-list ``==`` would yield
    a single bool with no ``.mean()``).
    """
    return (np.asarray(preds) == np.asarray(labels)).mean()


def official_f1(result_path='/opt/ml/eval/result.txt'):
    """Parse the macro score from the last non-blank line of a result file.

    The summary line is expected to look like ``"...: <name> = 82.28% >>>"``:
    the text after the first ``:`` is taken, ``>>>`` is removed, and the value
    after ``=`` is read as a percentage.

    :param result_path: file to read; defaults to the path used by the original
        hard-coded implementation.
    :return: the score as a fraction (percentage divided by 100).
    :raises IndexError: if the file has no non-blank line or the line does not
        contain the expected ``:`` / ``=`` separators.
    """
    with open(result_path, "r", encoding="utf-8") as f:
        # Ignore trailing blank lines so the summary line is still found.
        non_blank = [line for line in f if line.strip()]

    summary = non_blank[-1]
    score_text = summary.split(":")[1].replace(">>>", "").strip()
    percent = score_text.split("=")[1].strip().replace("%", "")
    return float(percent) / 100

def acc_and_f1(preds, labels, average="macro"):
    """Return the metric dict for ``preds`` against ``labels``.

    Only ``"acc"`` (from ``simple_accuracy``) is reported; the F1 entry is
    disabled and ``average`` is accepted but not used.
    """
    metrics = {}
    metrics["acc"] = simple_accuracy(preds, labels)
    # "f1" (official_f1) is intentionally left out.
    return metrics


In [12]:
def convert_sentence_to_features(train_dataset, tokenizer, max_len):
    """Tokenize entity-marked sentences into fixed-length model inputs.

    Every sentence must tokenize to at least two ``#`` tokens (delimiting
    entity 1) and at least two ``@`` tokens (delimiting entity 2); the first
    two occurrences of each marker are used. The CLS token is prepended, no
    SEP token is appended, and the sequence is right-padded to ``max_len``.

    Sentences that do not fit into ``max_len`` (including the CLS token) are
    skipped, so the returned dataset can be shorter than the input; the number
    of skipped rows is logged as a warning.

    :param train_dataset: mapping/DataFrame with ``'sentence'`` and ``'label'``
        columns of equal length.
    :param tokenizer: tokenizer providing ``tokenize``, ``convert_tokens_to_ids``
        and ``cls_token``; ``pad_token_id`` is used when available.
    :param max_len: length of every produced sequence.
    :return: ``RE_Dataset`` with ``input_ids``, ``attention_mask``,
        ``token_type_ids``, ``e1_mask``, ``e2_mask`` and ``label`` tensors.
    :raises IndexError: if a sentence has fewer than two ``#`` or ``@`` tokens.
    """
    max_seq_len = max_len
    cls_token = tokenizer.cls_token
    cls_token_segment_id = 0
    # The original hard-coded pad id 1; ask the tokenizer and keep 1 as fallback.
    pad_token = getattr(tokenizer, 'pad_token_id', None)
    if pad_token is None:
        pad_token = 1
    pad_token_segment_id = 0
    sequence_a_segment_id = 0

    # Materialise the columns so iteration is positional, whatever the index is.
    sentences = list(train_dataset['sentence'])
    labels = list(train_dataset['label'])

    all_input_ids = []
    all_attention_mask = []
    all_token_type_ids = []
    all_e1_mask = []
    all_e2_mask = []
    all_label = []
    skipped = 0

    for idx, sentence in enumerate(tqdm(sentences, desc='tokenizing')):
        token = tokenizer.tokenize(sentence)

        e1_p = [pos for pos, tok in enumerate(token) if tok == '#']
        e2_p = [pos for pos, tok in enumerate(token) if tok == '@']
        if len(e1_p) < 2 or len(e2_p) < 2:
            raise IndexError(
                "sentence {} needs two '#' and two '@' marker tokens, found {} and {}".format(
                    idx, len(e1_p), len(e2_p)
                )
            )

        # Shift by one because the CLS token is prepended below.
        e11_p = e1_p[0] + 1  # the start position of entity1
        e12_p = e1_p[1] + 1  # the end position of entity1
        e21_p = e2_p[0] + 1  # the start position of entity2
        e22_p = e2_p[1] + 1  # the end position of entity2

        # CLS takes one slot, so at most max_seq_len - 1 word pieces fit.
        # (The previous check let len(token) == max_seq_len through and then
        # failed the length assertion.)
        if len(token) + 1 > max_seq_len:
            skipped += 1
            continue

        token_type_ids = [cls_token_segment_id] + [sequence_a_segment_id] * len(token)
        input_ids = tokenizer.convert_tokens_to_ids([cls_token] + token)

        # 1 marks real tokens, 0 marks padding.
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + [pad_token] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        token_type_ids = token_type_ids + [pad_token_segment_id] * padding_length

        # Entity masks cover the span from the opening to the closing marker, inclusive.
        e1_mask = [0] * len(attention_mask)
        e2_mask = [0] * len(attention_mask)
        for i in range(e11_p, e12_p + 1):
            e1_mask[i] = 1
        for i in range(e21_p, e22_p + 1):
            e2_mask[i] = 1

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(
            len(token_type_ids), max_seq_len
        )

        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_token_type_ids.append(token_type_ids)
        all_e1_mask.append(e1_mask)
        all_e2_mask.append(e2_mask)
        all_label.append(labels[idx])

    if skipped:
        logging.getLogger(__name__).warning(
            "Skipped %d of %d sentences that do not fit into max_len=%d",
            skipped, len(sentences), max_seq_len,
        )

    all_features = {
        'input_ids': torch.tensor(all_input_ids),
        'attention_mask': torch.tensor(all_attention_mask),
        'token_type_ids': torch.tensor(all_token_type_ids),
        'e1_mask': torch.tensor(all_e1_mask),
        'e2_mask': torch.tensor(all_e2_mask),
        'label': torch.tensor(all_label),
    }
    return RE_Dataset(all_features)



In [6]:
def reduce_loss(loss, reduction='mean'):
    """Reduce ``loss`` with ``.mean()`` or ``.sum()``; any other ``reduction``
    value returns ``loss`` unchanged.
    """
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss

# Implementation from fastai https://github.com/fastai/fastai2/blob/master/fastai2/layers.py#L338
class LabelSmoothingCrossEntropy(nn.Module):
    """Cross entropy mixed with a uniform-target term.

    ``e`` is the smoothing weight; ``reduction`` is passed both to
    ``F.nll_loss`` and to ``reduce_loss``.
    """

    def __init__(self, e:float=0.05, reduction='mean'):
        super().__init__()
        self.e = e
        self.reduction = reduction

    def forward(self, output, target):
        """Return ``(1 - e) * NLL + e * (sum of -log p over classes) / num_classes``."""
        num_classes = output.size()[-1]
        log_probs = F.log_softmax(output, dim=-1)
        # H(u, p): negative log-probabilities summed over the classes, then reduced.
        uniform_term = reduce_loss(-log_probs.sum(dim=-1), self.reduction) / num_classes
        # H(q, p): ordinary negative log-likelihood of the target class.
        target_term = F.nll_loss(log_probs, target, reduction=self.reduction)
        return (1-self.e)*target_term + self.e*uniform_term

In [7]:
class FCLayer(nn.Module):
    """Dropout, optional tanh, then a linear projection (applied in that order)."""

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Apply dropout, tanh when ``use_activation`` is set, and the linear layer."""
        hidden = self.dropout(x)
        hidden = self.tanh(hidden) if self.use_activation else hidden
        return self.linear(hidden)


class RXLMRoberta(XLMRobertaModel):
    """Relation classifier built on a pretrained XLM-RoBERTa encoder.

    The pooled output and the averaged hidden states of the two entity spans
    each pass through their own ``FCLayer``; the three vectors are concatenated
    and fed to a final linear classifier.

    NOTE(review): ``super().__init__(config)`` presumably builds the parent's
    own encoder in addition to ``self.XLMRoberta``, which is the only encoder
    ``forward`` uses — confirm whether the extra weights are intended.
    """

    def __init__(self,  model_name, config, dropout_rate):
        super(RXLMRoberta, self).__init__(config)
        self.XLMRoberta = XLMRobertaModel.from_pretrained(model_name, config=config)  # Load pretrained XLMRoberta

        self.num_labels = config.num_labels

        self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)
        self.entity_fc_layer1 = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)
        self.entity_fc_layer2 = FCLayer(config.hidden_size, config.hidden_size, dropout_rate)

        self.label_classifier = FCLayer(
            config.hidden_size * 3,
            config.num_labels,
            dropout_rate,
            use_activation=False,
        )

    @staticmethod
    def entity_average(hidden_output, e_mask):
        """
        Average the hidden state vectors selected by an entity mask.
        :param hidden_output: [batch_size, max_seq_len, dim]
        :param e_mask: [batch_size, max_seq_len]
                e.g. e_mask[0] == [0, 0, 0, 1, 1, 1, 0, 0, ... 0]
        :return: [batch_size, dim]; a row whose mask is all zeros yields a zero
                 vector instead of NaN
        """
        e_mask_unsqueeze = e_mask.unsqueeze(1)  # [b, 1, max_seq_len]
        length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1)  # [batch_size, 1]

        # [b, 1, max_seq_len] * [b, max_seq_len, dim] = [b, 1, dim] -> [b, dim]
        sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        # Clamp the count so an empty mask divides by 1 rather than by 0.
        avg_vector = sum_vector.float() / length_tensor.float().clamp(min=1.0)  # broadcasting
        return avg_vector

    def forward(self, input_ids, attention_mask, token_type_ids, labels, e1_mask, e2_mask):
        """Run the encoder and the classification head.

        :param labels: target class ids, or ``None`` to skip the loss.
        :param e1_mask: 0/1 mask over the sequence marking the entity-1 span.
        :param e2_mask: 0/1 mask over the sequence marking the entity-2 span.
        :return: tuple ``(loss), logits, (hidden_states), (attentions)``; the
            loss is present only when ``labels`` is not ``None``.
        """
        outputs = self.XLMRoberta(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = outputs[1]  # [CLS]

        e1_h = self.entity_average(sequence_output, e1_mask)
        e2_h = self.entity_average(sequence_output, e2_mask)
        # Dropout -> tanh -> fc_layer (separate FC layers for CLS, e1 and e2)
        pooled_output = self.cls_fc_layer(pooled_output)
        e1_h = self.entity_fc_layer1(e1_h)
        e2_h = self.entity_fc_layer2(e2_h)
        # Concat -> fc_layer
        concat_h = torch.cat([pooled_output, e1_h, e2_h], dim=-1)
        logits = self.label_classifier(concat_h)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                # A single output unit is treated as regression.
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [8]:
# Report whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

True

In [9]:
logger = logging.getLogger(__name__)


class Trainer(object):
    """Builds, trains, evaluates and saves an ``RXLMRoberta`` classifier.

    The constructor stores the hyper-parameters, creates an ``XLMRobertaConfig``
    for ``Model_name`` and instantiates the model on CUDA when available
    (CPU otherwise).

    Batches are read by key: datasets must yield dicts with ``input_ids``,
    ``attention_mask``, ``token_type_ids``, ``e1_mask``, ``e2_mask`` and
    ``label`` — the keys produced by ``convert_sentence_to_features``.
    """

    # Features passed to the model under the same name they have in the batch.
    _FEATURE_KEYS = ("input_ids", "attention_mask", "token_type_ids", "e1_mask", "e2_mask")

    def __init__(self,num_labels, label_dict,logging_steps, save_steps,max_steps,
                 num_train_epochs,warmup_steps,adam_epsilon,learning_rate,gradient_accumulation_steps,
                 max_grad_norm, eval_batch_size, train_batch_size, model_dir, dropout_rate,
                 weight_decay, Model_name ,train_dataset=None, dev_dataset=None, test_dataset=None):
        self.train_dataset = train_dataset
        self.eval_batch_size = eval_batch_size
        self.train_batch_size = train_batch_size
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset
        self.Model_name = Model_name
        self.label_lst = label_dict
        self.num_labels = num_labels
        self.max_steps = max_steps
        self.weight_decay = weight_decay
        self.learning_rate = learning_rate
        self.adam_epsilon=adam_epsilon
        self.warmup_steps = warmup_steps
        self.num_train_epochs = num_train_epochs
        self.logging_steps = logging_steps
        self.save_steps = save_steps
        self.max_grad_norm = max_grad_norm
        self.model_dir = model_dir
        self.dropout_rate = dropout_rate
        self.gradient_accumulation_steps = gradient_accumulation_steps
        # label_dict is used as id2label and its inverse as label2id.
        # NOTE(review): presumably label_dict maps id -> label name; verify
        # against the pickle the caller loads.
        self.config = XLMRobertaConfig.from_pretrained(
            self.Model_name,
            num_labels=self.num_labels,
            id2label=self.label_lst,
            label2id={value : key for key, value in self.label_lst.items()}
        )
        self.model = RXLMRoberta(
            self.Model_name, config=self.config, dropout_rate = self.dropout_rate,
        )

        # GPU or CPU
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def _build_inputs(self, batch, with_labels=True):
        """Move a batch dict to ``self.device`` and map it to the model's keyword arguments.

        Features are looked up by name instead of by position, so the result
        does not depend on the key order of the dataset's items. With
        ``with_labels=False`` the ``labels`` argument is ``None``.
        """
        inputs = {key: batch[key].to(self.device) for key in self._FEATURE_KEYS}
        inputs["labels"] = batch["label"].to(self.device) if with_labels else None
        return inputs

    def train(self):
        """Train ``self.model`` on ``self.train_dataset``.

        Uses AdamW with a linear warmup/decay schedule, gradient accumulation
        and gradient-norm clipping. Every ``logging_steps`` optimizer steps the
        training set is evaluated, every ``save_steps`` steps the model is saved.

        :return: ``(global_step, mean training loss per optimizer step)``; the
            mean is computed with a divisor of at least 1 so that a run without
            any optimizer step does not raise ``ZeroDivisionError``.
        """
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(
            self.train_dataset,
            sampler=train_sampler,
            batch_size=self.train_batch_size,
        )

        if self.max_steps > 0:
            # A positive max_steps overrides the configured number of epochs.
            t_total = self.max_steps
            self.num_train_epochs = (
                self.max_steps // (len(train_dataloader) // self.gradient_accumulation_steps) + 1
            )
        else:
            t_total = len(train_dataloader) // self.gradient_accumulation_steps * self.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay);
        # biases and LayerNorm weights are excluded from weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.learning_rate,
            eps=self.adam_epsilon,
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=t_total,
        )

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.num_train_epochs)
        logger.info("  Total train batch size = %d", self.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Logging steps = %d", self.logging_steps)
        logger.info("  Save steps = %d", self.save_steps)

        global_step = 0
        tr_loss = 0.0
        self.model.zero_grad()

        train_iterator = tqdm(range(int(self.num_train_epochs)), desc="Epoch")

        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                # evaluate() switches to eval mode, so re-enable train mode every step.
                self.model.train()
                inputs = self._build_inputs(batch)
                outputs = self.model(**inputs)
                loss = outputs[0]

                if self.gradient_accumulation_steps > 1:
                    loss = loss / self.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.logging_steps > 0 and global_step % self.logging_steps == 0:
                        logger.info("  global steps = %d", global_step)
                        # Periodic evaluation runs on the training set itself.
                        self.evaluate("train")

                    if self.save_steps > 0 and global_step % self.save_steps == 0:
                        self.save_model()

                if 0 < self.max_steps < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.max_steps < global_step:
                train_iterator.close()
                break

        return global_step, tr_loss / max(global_step, 1)

    def evaluate(self, mode):
        """Evaluate the model on the dataset selected by ``mode``.

        :param mode: ``"train"``, ``"dev"`` or ``"test"``.
        :return: dict with the mean batch ``"loss"`` plus the entries returned
            by ``compute_metrics``.
        :raises Exception: if ``mode`` is not one of the three supported values.
        """
        datasets = {
            "test": self.test_dataset,
            "dev": self.dev_dataset,
            "train": self.train_dataset,
        }
        if mode not in datasets:
            raise Exception("Only train, dev and test dataset available")
        dataset = datasets[mode]

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation on %s dataset *****", mode)
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Collect per-batch arrays and concatenate once at the end.
        logit_chunks = []
        label_chunks = []

        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            inputs = self._build_inputs(batch)
            with torch.no_grad():
                outputs = self.model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            logit_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs["labels"].detach().cpu().numpy())

        eval_loss = eval_loss / nb_eval_steps
        results = {"loss": eval_loss}
        preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
        out_label_ids = np.concatenate(label_chunks, axis=0)

        result = compute_metrics(preds, out_label_ids)
        results.update(result)

        logger.info("***** Eval results *****")
        for key in sorted(results.keys()):
            logger.info("  {} = {:.4f}".format(key, results[key]))

        return results

    def test_pred(self, output_path='RXLMRoberta_layersplit_nerdata_epoch7.csv'):
        """Predict a class id for every example of ``self.test_dataset``.

        Labels in the test dataset are ignored. The predictions are written to
        ``output_path`` as a CSV file with a single ``pred`` column.

        :param output_path: destination CSV; defaults to the file name the
            original implementation hard-coded.
        """
        test_dataset = self.test_dataset
        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset, sampler=test_sampler,batch_size=self.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation on %s dataset *****", "test")
        logger.info("  Batch size = %d", self.eval_batch_size)

        logit_chunks = []

        self.model.eval()

        for batch in tqdm(test_dataloader, desc="Predicting"):
            inputs = self._build_inputs(batch, with_labels=False)
            with torch.no_grad():
                outputs = self.model(**inputs)
                # Without labels the model returns the logits first.
                pred = outputs[0]

            logit_chunks.append(pred.detach().cpu().numpy())

        preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
        df = pd.DataFrame(preds, columns=['pred'])
        df.to_csv(output_path, index=False)

    def save_model(self):
        """Save the model to ``self.model_dir`` (created if missing), overwriting any previous checkpoint."""
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        # Unwrap a wrapper that exposes the real model as ``.module``.
        model_to_save = self.model.module if hasattr(self.model, "module") else self.model
        model_to_save.save_pretrained(self.model_dir)

        logger.info("Saving model checkpoint to %s", self.model_dir)

    def load_model(self):
        """Load the model stored in ``self.model_dir`` and move it to ``self.device``.

        NOTE(review): ``from_pretrained`` presumably instantiates the class as
        ``cls(config, ...)``, while ``RXLMRoberta.__init__`` expects
        ``(model_name, config, dropout_rate)`` — this call looks like it cannot
        succeed as written; confirm before relying on it.

        :raises Exception: if ``self.model_dir`` does not exist.
        """
        # Check whether model exists
        if not os.path.exists(self.model_dir):
            raise Exception("Model doesn't exists! Train first!")

        self.model = RXLMRoberta.from_pretrained(self.model_dir)
        self.model.to(self.device)
        logger.info("***** Model Loaded *****")

In [10]:
def init_logger():
    """Configure the root logger: INFO level, timestamped ``level - name - message`` lines."""
    logging_options = {
        "format": "%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        "datefmt": "%m/%d/%Y %H:%M:%S",
        "level": logging.INFO,
    }
    logging.basicConfig(**logging_options)


In [None]:
# train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
# test_dataset = load_data("/opt/ml/input/data/test/test.tsv")
# #dev_dataset = load_data("./dataset/train/dev.tsv")
# #train_label = train_dataset['label'].values
# #train_dataset.columns= ['link','sentence' 'entity_01','e1s','e1e','entity_02','e2s','e2e','label']
# ADDITIONAL_SPECIAL_TOKENS = ["<e1>", "</e1>", "<e2>", "</e2>"]
# MODEL_NAME = "xlm-roberta-large"
# #MODEL_NAME = "xlm-roXLMRobertaa-large"
# tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
# tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})
# train_Dataset = convert_sentence_to_features(train_dataset, tokenizer, max_len = 339+2)
# print(train_Dataset[0])

In [None]:

# Tokenize one entity-marked sample sentence to inspect the resulting pieces.
MODEL_NAME = "xlm-roberta-large"
ADDITIONAL_SPECIAL_TOKENS = ["#", "@"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})
s = '용병 @ α CIVILIZATION α 공격수@ 챠디의 부진과 시즌 초 활약한 # β PERSON β 강수일#의 침체, 시즌 중반에 영입한 세르비아 출신 용병 미드필더 오그넨 코로만의 부상 등이 부진의 원인으로 지적되던 가운데 인천은 시즌 마지막 4경기에서 3승 1패를 거두며 막판 승점 쌓기에 성공, 정규리그 순위 5위로 플레이오프에 진출하는 데에 성공했다.'
# Remove the α / β delimiter characters before tokenizing.
s = s.replace('α', '').replace('β', '')
li = tokenizer.tokenize(s)
print(li)

In [None]:

# Exploration cell: inspect tokenizer output, then plot the token-length
# distribution of the training sentences.
MODEL_NAME = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Batch-encode two short sentences to look at the padded tensors.
print(tokenizer(['안녕 만나서 반가워', '왜 이게 이렇게 나오지'],
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=100,
      add_special_tokens=True,
      return_token_type_ids=True,
      ))
# Register the entity / type marker characters as special tokens.
ADDITIONAL_SPECIAL_TOKENS = ["#", "@", "α", "β"]
      
tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})
tokenizer.tokenize('용병 @ α CIVILIZATION α 공격수@ 챠디의 부진과 시즌 초 활약한 # β PERSON β 강수일#의 침체, 시즌 중반에 영입한 세르비아 출신 용병 미드필더 오그넨 코로만의 부상 등이 부진의 원인으로 지적되던 가운데 인천은 시즌 마지막 4경기에서 3승 1패를 거두며 막판 승점 쌓기에 성공, 정규리그 순위 5위로 플레이오프에 진출하는 데에 성공했다.')
# NOTE(review): convert_sentence_to_features indexes its first argument with
# ['sentence'] and ['label'], and looks for '#' / '@' marker tokens; a plain
# list of unmarked strings looks like it will raise here — confirm or remove.
train_Dataset = convert_sentence_to_features(['안녕 만나서 반가워', '왜 이게 이렇게 나오지'], tokenizer, max_len = 339+2)

# Show the special tokens and their ids.
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.cls_token, tokenizer.cls_token_id)
print(tokenizer.sep_token, tokenizer.sep_token_id)
# load dataset
train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
#dev_dataset = load_data("./dataset/train/dev.tsv")
train_label = train_dataset['label'].values
#dev_label = dev_dataset['label'].values


# Token lists per sentence; the prints show the maximum ('최대 길이') and the
# mean ('평균 길이') token count, followed by a histogram of the lengths.
tokenized_len_dataset = tokenized_dataset_len(train_dataset, tokenizer)
print('최대 길이 : ', max(len(i) for i in tokenized_len_dataset))
print('평균 길이 : ', sum(map(len, tokenized_len_dataset))/len(tokenized_len_dataset))
plt.hist([len(s) for s in tokenized_len_dataset], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()


In [None]:
# Exploration cell: wrap both entities of every training sentence in
# <e1>/<e2> tags and plot the resulting token-length distribution.
MODEL_NAME = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ADDITIONAL_SPECIAL_TOKENS = ["<e1>", "</e1>", "<e2>", "</e2>"] 
tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
train_label = train_dataset['label'].values

# NOTE(review): load_data() keeps only sentence/entity_01/entity_02/label
# (see preprocessing_dataset), so the 'e1s'/'e1e'/'e2s'/'e2e' lookups below
# appear to raise KeyError unless load_data differed when this cell was run.
# NOTE(review): `train_dataset['sentence'][idx] = ...` is chained assignment;
# pandas may warn or not write through — confirm.
# The tags are inserted around whichever entity occurs first in the sentence,
# using the inclusive character offsets e*s (start) and e*e (end).
for idx in tqdm(range(len(train_dataset))):
    if train_dataset['e1s'][idx] > train_dataset['e2s'][idx]:
        train_dataset['sentence'][idx] = train_dataset['sentence'][idx][:train_dataset['e2s'][idx]] + ' <e2> ' + train_dataset['sentence'][idx][train_dataset['e2s'][idx]:train_dataset['e2e'][idx]+1] + ' </e2> ' + train_dataset['sentence'][idx][train_dataset['e2e'][idx]+1:train_dataset['e1s'][idx]] + ' <e1> ' + train_dataset['sentence'][idx][train_dataset['e1s'][idx]:train_dataset['e1e'][idx]+1] + ' </e1> ' + train_dataset['sentence'][idx][train_dataset['e1e'][idx]+1:]
    else:
        train_dataset['sentence'][idx] = train_dataset['sentence'][idx][:train_dataset['e1s'][idx]] + ' <e1> ' + train_dataset['sentence'][idx][train_dataset['e1s'][idx]:train_dataset['e1e'][idx]+1] + ' </e1> ' + train_dataset['sentence'][idx][train_dataset['e1e'][idx]+1:train_dataset['e2s'][idx]] + ' <e2> ' + train_dataset['sentence'][idx][train_dataset['e2s'][idx]:train_dataset['e2e'][idx]+1] + ' </e2> ' + train_dataset['sentence'][idx][train_dataset['e2e'][idx]+1:]


tokenized_len_dataset = tokenized_dataset_len(train_dataset, tokenizer)

# The prints show the maximum ('최대 길이') and mean ('평균 길이') token count.
print('최대 길이 : ', max(len(i) for i in tokenized_len_dataset))
print('평균 길이 : ', sum(map(len, tokenized_len_dataset))/len(tokenized_len_dataset))
plt.hist([len(s) for s in tokenized_len_dataset], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Exploration cell: keep only training sentences with at most 250 tokens,
# tag their entities and plot the token-length distribution again.
MODEL_NAME = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ADDITIONAL_SPECIAL_TOKENS = ["#", "@", "α", "β"]   
tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
# NOTE(review): tokenized_len_dataset is not defined in this cell; it relies
# on the value left behind by an earlier cell and must have one entry per row
# of the freshly loaded train_dataset — confirm.
li = [len(i) for i in tokenized_len_dataset]
train_dataset['len'] = li

# Drop rows longer than 250 tokens and renumber the index from 0.
train_dataset = train_dataset[train_dataset['len'] <= 250]
train_dataset = train_dataset.reset_index(drop=False, inplace=False)
del train_dataset['index']
#train_dataset.head(57)
#train_dataset.head()
# NOTE(review): load_data() keeps only sentence/entity_01/entity_02/label
# (see preprocessing_dataset), so the 'e1s'/'e1e'/'e2s'/'e2e' lookups below
# appear to raise KeyError — confirm.
# NOTE(review): the inserted <e1>/<e2> tags are not among the special tokens
# registered above ("#", "@", "α", "β") — confirm this is intended.
for idx in tqdm(range(len(train_dataset))):
    if train_dataset['e1s'][idx] > train_dataset['e2s'][idx]:
        train_dataset['sentence'][idx] = train_dataset['sentence'][idx][:train_dataset['e2s'][idx]] + ' <e2> ' + train_dataset['sentence'][idx][train_dataset['e2s'][idx]:train_dataset['e2e'][idx]+1] + ' </e2> ' + train_dataset['sentence'][idx][train_dataset['e2e'][idx]+1:train_dataset['e1s'][idx]] + ' <e1> ' + train_dataset['sentence'][idx][train_dataset['e1s'][idx]:train_dataset['e1e'][idx]+1] + ' </e1> ' + train_dataset['sentence'][idx][train_dataset['e1e'][idx]+1:]
    else:
        train_dataset['sentence'][idx] = train_dataset['sentence'][idx][:train_dataset['e1s'][idx]] + ' <e1> ' + train_dataset['sentence'][idx][train_dataset['e1s'][idx]:train_dataset['e1e'][idx]+1] + ' </e1> ' + train_dataset['sentence'][idx][train_dataset['e1e'][idx]+1:train_dataset['e2s'][idx]] + ' <e2> ' + train_dataset['sentence'][idx][train_dataset['e2s'][idx]:train_dataset['e2e'][idx]+1] + ' </e2> ' + train_dataset['sentence'][idx][train_dataset['e2e'][idx]+1:]


tokenized_len_dataset = tokenized_dataset_len(train_dataset, tokenizer)

# The prints show the maximum ('최대 길이') and mean ('평균 길이') token count.
print('최대 길이 : ', max(len(i) for i in tokenized_len_dataset))
print('평균 길이 : ', sum(map(len, tokenized_len_dataset))/len(tokenized_len_dataset))
plt.hist([len(s) for s in tokenized_len_dataset], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Exploration cell: token-length distribution of the NER-annotated training
# file (ner_train_ver2.tsv) with "#", "@", "α", "β" registered as special tokens.
MODEL_NAME = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ADDITIONAL_SPECIAL_TOKENS = ["#", "@", "α", "β"]   
tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

train_dataset = load_data("/opt/ml/input/data/train/ner_train_ver2.tsv")
# train_label = train_dataset['label'].values

tokenized_len_dataset = tokenized_dataset_len(train_dataset, tokenizer)
# The prints show the maximum ('최대 길이') and mean ('평균 길이') token count.
print('최대 길이 : ', max(len(i) for i in tokenized_len_dataset))
print('평균 길이 : ', sum(map(len, tokenized_len_dataset))/len(tokenized_len_dataset))
plt.hist([len(s) for s in tokenized_len_dataset], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()


In [None]:
# Exploration cell: same <e1>/<e2> tagging and length histogram as for
# train.tsv, applied to the NER-annotated training file.
MODEL_NAME = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ADDITIONAL_SPECIAL_TOKENS = ["<e1>", "</e1>", "<e2>", "</e2>"] 
tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

train_dataset = load_data("/opt/ml/input/data/train/ner_train_ver2.tsv")
train_label = train_dataset['label'].values

# NOTE(review): load_data() keeps only sentence/entity_01/entity_02/label
# (see preprocessing_dataset), so the 'e1s'/'e1e'/'e2s'/'e2e' lookups below
# appear to raise KeyError — confirm.
# NOTE(review): `train_dataset['sentence'][idx] = ...` is chained assignment;
# pandas may warn or not write through — confirm.
for idx in tqdm(range(len(train_dataset))):
    if train_dataset['e1s'][idx] > train_dataset['e2s'][idx]:
        train_dataset['sentence'][idx] = train_dataset['sentence'][idx][:train_dataset['e2s'][idx]] + ' <e2> ' + train_dataset['sentence'][idx][train_dataset['e2s'][idx]:train_dataset['e2e'][idx]+1] + ' </e2> ' + train_dataset['sentence'][idx][train_dataset['e2e'][idx]+1:train_dataset['e1s'][idx]] + ' <e1> ' + train_dataset['sentence'][idx][train_dataset['e1s'][idx]:train_dataset['e1e'][idx]+1] + ' </e1> ' + train_dataset['sentence'][idx][train_dataset['e1e'][idx]+1:]
    else:
        train_dataset['sentence'][idx] = train_dataset['sentence'][idx][:train_dataset['e1s'][idx]] + ' <e1> ' + train_dataset['sentence'][idx][train_dataset['e1s'][idx]:train_dataset['e1e'][idx]+1] + ' </e1> ' + train_dataset['sentence'][idx][train_dataset['e1e'][idx]+1:train_dataset['e2s'][idx]] + ' <e2> ' + train_dataset['sentence'][idx][train_dataset['e2s'][idx]:train_dataset['e2e'][idx]+1] + ' </e2> ' + train_dataset['sentence'][idx][train_dataset['e2e'][idx]+1:]


tokenized_len_dataset = tokenized_dataset_len(train_dataset, tokenizer)

# The prints show the maximum ('최대 길이') and mean ('평균 길이') token count.
print('최대 길이 : ', max(len(i) for i in tokenized_len_dataset))
print('평균 길이 : ', sum(map(len, tokenized_len_dataset))/len(tokenized_len_dataset))
plt.hist([len(s) for s in tokenized_len_dataset], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')

In [13]:
def main():
    """Build features for the train/test sets, then train and predict.

    Steps, in order: initialise logging, load the train and test TSV files,
    create the XLM-RoBERTa tokenizer with four extra marker tokens, convert
    both frames with ``convert_sentence_to_features``, load the label mapping
    from ``label_type.pkl``, construct a ``Trainer`` and call its ``train()``
    and ``test_pred()`` methods.
    """
    init_logger()

    train_frame = load_data("/opt/ml/input/data/train/ner_train_ver2.tsv")
    test_frame = load_data("/opt/ml/input/data/test/ner_test_ver2.tsv")

    # Single-character markers registered as additional special tokens.
    marker_tokens = ["#", "@", "α", "β"]
    model_name = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({"additional_special_tokens": marker_tokens})

    # NOTE(review): 351 + 2 is presumably the longest tokenized sample plus two
    # wrapper tokens — confirm against the length statistics.
    max_seq_len = 351 + 2
    train_features = convert_sentence_to_features(train_frame, tokenizer, max_len=max_seq_len)
    test_features = convert_sentence_to_features(test_frame, tokenizer, max_len=max_seq_len)

    # pickle.load must only ever be pointed at trusted files.
    with open('/opt/ml/input/data/label_type.pkl', 'rb') as label_file:
        label_mapping = pickle.load(label_file)

    trainer_options = {
        "eval_batch_size": 16,
        "train_batch_size": 16,
        "num_labels": 42,
        "max_steps": -1,
        "weight_decay": 0.0,
        "learning_rate": 2e-5,
        "adam_epsilon": 1e-8,
        "warmup_steps": 0,
        "num_train_epochs": 7,
        "logging_steps": 400,
        "save_steps": 400,
        "max_grad_norm": 1.0,
        "model_dir": './model',
        "gradient_accumulation_steps": 1,
        "dropout_rate": 0.1,
        "label_dict": label_mapping,
        "Model_name": model_name,
        "train_dataset": train_features,
        "test_dataset": test_features,
    }
    trainer = Trainer(**trainer_options)

    # Stage switches; both stages run by default.
    run_training, run_prediction = True, True
    if run_training:
        trainer.train()
    if run_prediction:
        trainer.test_pred()


if __name__ == "__main__":
    main()

HBox(children=(FloatProgress(value=0.0, description='tokenizing', max=9000.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='tokenizing', max=1000.0, style=ProgressStyle(description_…




04/21/2021 00:22:32 - INFO - __main__ -   ***** Running training *****
04/21/2021 00:22:32 - INFO - __main__ -     Num examples = 9000
04/21/2021 00:22:32 - INFO - __main__ -     Num Epochs = 6
04/21/2021 00:22:32 - INFO - __main__ -     Total train batch size = 16
04/21/2021 00:22:32 - INFO - __main__ -     Gradient Accumulation steps = 1
04/21/2021 00:22:32 - INFO - __main__ -     Total optimization steps = 3378
04/21/2021 00:22:32 - INFO - __main__ -     Logging steps = 400
04/21/2021 00:22:32 - INFO - __main__ -     Save steps = 400


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=6.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…

  import sys
04/21/2021 00:30:21 - INFO - __main__ -     global steps = 400
04/21/2021 00:30:21 - INFO - __main__ -   ***** Running evaluation on train dataset *****
04/21/2021 00:30:21 - INFO - __main__ -     Num examples = 9000
04/21/2021 00:30:21 - INFO - __main__ -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=563.0, style=ProgressStyle(description_w…

04/21/2021 00:33:47 - INFO - __main__ -   ***** Eval results *****
04/21/2021 00:33:47 - INFO - __main__ -     acc = 0.7182
04/21/2021 00:33:47 - INFO - __main__ -     loss = 0.9767





04/21/2021 00:34:25 - INFO - __main__ -   Saving model checkpoint to ./model





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…

04/21/2021 00:42:14 - INFO - __main__ -     global steps = 800
04/21/2021 00:42:14 - INFO - __main__ -   ***** Running evaluation on train dataset *****
04/21/2021 00:42:14 - INFO - __main__ -     Num examples = 9000
04/21/2021 00:42:14 - INFO - __main__ -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=563.0, style=ProgressStyle(description_w…

04/21/2021 00:45:40 - INFO - __main__ -   ***** Eval results *****
04/21/2021 00:45:40 - INFO - __main__ -     acc = 0.7900
04/21/2021 00:45:40 - INFO - __main__ -     loss = 0.6796





04/21/2021 00:46:17 - INFO - __main__ -   Saving model checkpoint to ./model





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…

04/21/2021 00:54:05 - INFO - __main__ -     global steps = 1200
04/21/2021 00:54:05 - INFO - __main__ -   ***** Running evaluation on train dataset *****
04/21/2021 00:54:05 - INFO - __main__ -     Num examples = 9000
04/21/2021 00:54:05 - INFO - __main__ -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=563.0, style=ProgressStyle(description_w…

04/21/2021 00:57:31 - INFO - __main__ -   ***** Eval results *****
04/21/2021 00:57:31 - INFO - __main__ -     acc = 0.8381
04/21/2021 00:57:31 - INFO - __main__ -     loss = 0.5338





04/21/2021 00:58:19 - INFO - __main__ -   Saving model checkpoint to ./model
04/21/2021 01:06:07 - INFO - __main__ -     global steps = 1600
04/21/2021 01:06:07 - INFO - __main__ -   ***** Running evaluation on train dataset *****
04/21/2021 01:06:07 - INFO - __main__ -     Num examples = 9000
04/21/2021 01:06:07 - INFO - __main__ -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=563.0, style=ProgressStyle(description_w…

04/21/2021 01:09:33 - INFO - __main__ -   ***** Eval results *****
04/21/2021 01:09:33 - INFO - __main__ -     acc = 0.8909
04/21/2021 01:09:33 - INFO - __main__ -     loss = 0.3705





04/21/2021 01:10:12 - INFO - __main__ -   Saving model checkpoint to ./model





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…

04/21/2021 01:18:00 - INFO - __main__ -     global steps = 2000
04/21/2021 01:18:00 - INFO - __main__ -   ***** Running evaluation on train dataset *****
04/21/2021 01:18:00 - INFO - __main__ -     Num examples = 9000
04/21/2021 01:18:00 - INFO - __main__ -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=563.0, style=ProgressStyle(description_w…

04/21/2021 01:21:25 - INFO - __main__ -   ***** Eval results *****
04/21/2021 01:21:25 - INFO - __main__ -     acc = 0.9159
04/21/2021 01:21:25 - INFO - __main__ -     loss = 0.2790





04/21/2021 01:22:03 - INFO - __main__ -   Saving model checkpoint to ./model





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…

04/21/2021 01:29:50 - INFO - __main__ -     global steps = 2400
04/21/2021 01:29:50 - INFO - __main__ -   ***** Running evaluation on train dataset *****
04/21/2021 01:29:50 - INFO - __main__ -     Num examples = 9000
04/21/2021 01:29:50 - INFO - __main__ -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=563.0, style=ProgressStyle(description_w…

04/21/2021 01:33:16 - INFO - __main__ -   ***** Eval results *****
04/21/2021 01:33:16 - INFO - __main__ -     acc = 0.9458
04/21/2021 01:33:16 - INFO - __main__ -     loss = 0.1951





04/21/2021 01:33:53 - INFO - __main__ -   Saving model checkpoint to ./model
04/21/2021 01:41:41 - INFO - __main__ -     global steps = 2800
04/21/2021 01:41:41 - INFO - __main__ -   ***** Running evaluation on train dataset *****
04/21/2021 01:41:41 - INFO - __main__ -     Num examples = 9000
04/21/2021 01:41:41 - INFO - __main__ -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=563.0, style=ProgressStyle(description_w…

04/21/2021 01:45:07 - INFO - __main__ -   ***** Eval results *****
04/21/2021 01:45:07 - INFO - __main__ -     acc = 0.9601
04/21/2021 01:45:07 - INFO - __main__ -     loss = 0.1363





04/21/2021 01:45:44 - INFO - __main__ -   Saving model checkpoint to ./model





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=563.0, style=ProgressStyle(description_wi…

04/21/2021 01:53:32 - INFO - __main__ -     global steps = 3200
04/21/2021 01:53:32 - INFO - __main__ -   ***** Running evaluation on train dataset *****
04/21/2021 01:53:32 - INFO - __main__ -     Num examples = 9000
04/21/2021 01:53:32 - INFO - __main__ -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=563.0, style=ProgressStyle(description_w…

04/21/2021 01:56:57 - INFO - __main__ -   ***** Eval results *****
04/21/2021 01:56:57 - INFO - __main__ -     acc = 0.9700
04/21/2021 01:56:57 - INFO - __main__ -     loss = 0.1070





04/21/2021 01:57:35 - INFO - __main__ -   Saving model checkpoint to ./model
04/21/2021 02:01:03 - INFO - __main__ -   ***** Running evaluation on test dataset *****
04/21/2021 02:01:03 - INFO - __main__ -     Batch size = 16






HBox(children=(FloatProgress(value=0.0, description='Predicting', max=63.0, style=ProgressStyle(description_wi…


