In [1]:
import csv
from pathlib import Path
import re
from datetime import date
import pandas as pd
import collections
import os
import sys
import random
import numpy as np
from tqdm.notebook import tqdm

import torch
from torch.optim.lr_scheduler import _LRScheduler, Optimizer
from torch import Tensor
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tensorboardX import SummaryWriter

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
sys.path.append(module_path+'/examples/')
fund_dir = Path('/home/advice/notebook/jms/우리은행')
from run_classifier_spm import *

from apex import amp

from pytorch_pretrained_bert.optimization import BertAdam
from  pytorch_pretrained_bert import modeling
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertPreTrainedModel, BertModel, BertConfig

In [29]:
a = Path('a')
b = Path('b')

a/b

PosixPath('a/b')

## set output_dir

In [2]:
today = date.today()
today = str(today).replace('-', '')
output_dir = fund_dir/'output_dir'/today
output_dir.mkdir(exist_ok = 'True')

## set hyper params

In [33]:
random.seed(42)

In [3]:
model_dir = fund_dir/"extract_kobert"

vocab_file = '/home/advice/notebook/jms/kobert/kobert_news_wiki_ko_cased-1087f8699e.spiece'

bert_config_file = model_dir / "kobert_config.json"
init_checkpoint = model_dir / "kobert_model.bin"
bert_model = 'kobert'
data_path = fund_dir/"data/"

train_file = "news_tr.txt"
eval_file = "news_te.txt"


In [4]:
args = {
    "train_file": train_file,
    'eval_file':eval_file,
    "data_dir": data_path,
    "task_name": "news",##'nsmc'
    "no_cuda": False,
    "bert_model": model_dir,
    "output_dir": output_dir,
    "tokenizer": vocab_file,
    "max_seq_length": 512,
    "doc_stride": 128,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "learning_rate": 3e-5,
    "num_train_epochs": 1.0,
    "warmup_proportion": 0.1,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": True,
    'fp16_opt_level':'O1',
    "loss_scale": 128,
    "logging_steps":100
}

In [5]:
class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels):
        super(BertForMultiLabelSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            #loss_fct = BCEWithLogitsLoss()
            #loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits
        
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [6]:
class InputExample(object):

    def __init__(self, guid, text_a, text_b=None, labels=None, doc_span_index = None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.labels = labels
        self.doc_span_index = doc_span_index


class InputFeatures(object):
    def __init__(self,guid, input_ids, input_mask, segment_ids, label_ids, doc_span_index):
        self.guid = guid
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids
        self.doc_span_index = doc_span_index

In [7]:
class DataProcessor(object):

    def get_train_examples(self):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()
    
    def get_test_examples(self):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError() 

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

In [8]:
def _read_tsv(input_file, cls = "\t", quotechar=None):
    reader = csv.reader(input_file.open('r'), delimiter=cls, quotechar=None)
    lines = [line for line in reader]
    return lines

class MultiClassTextProcessor(DataProcessor):

    def __init__(self, data_path, train_file, test_file,labels = None, dev_file = None):
        self.train_path = data_path/train_file
        self.test_path = data_path/test_file
        self.train_file = _read_tsv(data_path/train_file)
        self.test_file = _read_tsv(data_path/test_file)
        if dev_file!= None:
            self.dev_path = data_path/dev_file
            self.dev_file = _read_tsv(data_path/dev_file)
        self.labels = labels
    
    def get_train_examples(self):        
        logger.info("LOOKING AT {}".format(self.train_path))
        return self._create_examples(self.train_file, "train")
        
    def get_dev_examples(self):
        logger.info("LOOKING AT {}".format(self.dev_path))
        if self.dev_file!= None:
            return self._create_examples(self.dev_file, "dev")
        else:
            raise ValueError('There is no dev file')
    
    def get_test_examples(self):
        logger.info("LOOKING AT {}".format(self.test_path))
        return self._create_examples(self.test_file, "test")

    def get_labels(self):
        """See base class."""
        if self.labels == None:
            self.labels = ['0', '1']
        return self.labels

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = line[0]
            text_b = None
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, labels=label))
        return examples

In [9]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, doc_stride):
    """Loads a data file into a list of `InputBatch`s."""
    label_map = {label : i for i, label in enumerate(label_list)}
    features_all = []
    for (ex_index, example) in enumerate(tqdm(examples)):
        
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None

        max_tokens_for_doc = max_seq_length  - 2
        _DocSpan = collections.namedtuple(  
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0

        while start_offset < len(tokens_a):
            length = len(tokens_a) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(tokens_a):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            features = []
            tokens = []
            segment_ids = [0]*max_seq_length
            tokens.append("[CLS]")
            
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                tokens.append(tokens_a[split_token_index])

            tokens.append("[SEP]")

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding

            
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            labels_ids = []
            for label in example.labels:
                labels_ids.append(float(label))

#         label_id = label_map[example.label]
#chris changed

            if ex_index < 10:
                logger.info("*** Example ***")
                logger.info("guid: %s" % (example.guid))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(
                        [str(x) for x in tokens]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info(
                        "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                logger.info("label: %s (id = %s)" % (example.labels, labels_ids))

            features.append(
                    InputFeatures(guid = example.guid,
                                  input_ids=input_ids,
                                  input_mask=input_mask,
                                  segment_ids=segment_ids,
                                  label_ids=labels_ids, 
                                  doc_span_index = doc_span_index))
            features_all.extend(features)## extend가 맞남... 모르겟다링~
    return features_all


In [10]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [11]:
def accuracy(out, labels):
    out_cpu = out.cpu().numpy()
    labels_cpu = labels.cpu().numpy()
    outputs = np.argmax(out_cpu, axis=1)
    return np.sum(outputs == labels_cpu)

def accuracy_thresh(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, sigmoid:bool=True):
    "Compute accuracy when `y_pred` and `y_true` are the same size."
    if sigmoid: y_pred = y_pred.sigmoid()
#     return ((y_pred>thresh)==y_true.byte()).float().mean().item()
    return np.mean(((y_pred>thresh)==y_true.byte()).float().cpu().numpy(), axis=1).sum()


def fbeta(y_pred:Tensor, y_true:Tensor, thresh:float=0.2, beta:float=2, eps:float=1e-9, sigmoid:bool=True):
    "Computes the f_beta between `preds` and `targets`"
    beta2 = beta ** 2
    if sigmoid: y_pred = y_pred.sigmoid()
    y_pred = (y_pred>thresh).float()
    y_true = y_true.float()
    TP = (y_pred*y_true).sum(dim=1)
    prec = TP/(y_pred.sum(dim=1)+eps)
    rec = TP/(y_true.sum(dim=1)+eps)
    res = (prec*rec)/(prec*beta2+rec+eps)*(1+beta2)
    return res.mean().item()

In [12]:
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

In [13]:
processors = {args["task_name"]: MultiClassTextProcessor}

# Setup GPU parameters

if args["local_rank"] == -1 or args["no_cuda"]:
    device = torch.device("cuda" if torch.cuda.is_available() and not args["no_cuda"] else "cpu")
    n_gpu = torch.cuda.device_count()
#     n_gpu = 1
else:
    torch.cuda.set_device(args['local_rank'])
    device = torch.device("cuda", args['local_rank'])
    n_gpu = 1
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args['local_rank'] != -1), args['fp16']))

12/26/2019 07:35:14 - INFO - run_classifier_spm -   device: cuda n_gpu: 4, distributed training: False, 16-bits training: True


In [14]:
args['train_batch_size'] = int(args['train_batch_size'] / args['gradient_accumulation_steps'])

In [15]:
random.seed(args['seed'])
np.random.seed(args['seed'])
torch.manual_seed(args['seed'])
if n_gpu > 0:
    torch.cuda.manual_seed_all(args['seed'])

In [16]:
task_name = args['task_name'].lower()
if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))
    
if args['task_name'] =='news':
    lab = ['0','1','2','3','4','5']
elif args['task_name'] =='nsmc':
    lab = ['0','1']

processor = processors[task_name](data_path = args['data_dir'], 
                                  train_file = args['train_file'], 
                                  test_file = args['eval_file'], 
                                  labels = lab,
                                 )
label_list = processor.get_labels()
num_labels = len(label_list)

In [17]:
tokenizer = BERTSPMTokenizer.from_pretrained(args['tokenizer'])

12/26/2019 07:35:17 - INFO - tokenization_spm -   loading vocabulary file /home/advice/notebook/jms/kobert/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [18]:
# 이거 캐쉬로 저장하는거 추가하면 좋을듯.. 그냥 그렇다고. qa에 있던데..ㅎㅎ
train_examples = None
num_train_steps = None
if args['do_train']:
    train_examples = processor.get_train_examples()
#     train_examples = processor.get_train_examples(args['data_dir'], size=args['train_size'])
    train_features = convert_examples_to_features(
        train_examples, label_list, args['max_seq_length'], tokenizer, doc_stride=args['doc_stride'])
    num_train_steps = int(
        len(train_features) / args['train_batch_size'] / args['gradient_accumulation_steps'] * args['num_train_epochs'])


12/26/2019 07:35:17 - INFO - run_classifier_spm -   LOOKING AT /home/advice/notebook/jms/우리은행/data/news_tr.txt
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=41850.0), HTML(value='')))

12/26/2019 07:35:17 - INFO - run_classifier_spm -   *** Example ***
12/26/2019 07:35:17 - INFO - run_classifier_spm -   guid: train-1
12/26/2019 07:35:17 - INFO - run_classifier_spm -   doc_span_index: 0
12/26/2019 07:35:17 - INFO - run_classifier_spm -   tokens: [CLS] ▁김 예 솔 기자 ▁김 세 정 이 ▁김 시 후 의 ▁죽음 에 ▁관심을 ▁갖 기 ▁시작했다 ▁일 에 ▁방송된 ▁너 의 ▁노래 를 ▁들려 줘 에서는 ▁홍 이 영 ▁김 세 정 ▁이 ▁기억 을 잃 기 ▁전 ▁자신 과 ▁김 이 안 ▁김 시 후 ▁이 ▁연 관 이 ▁있다는 ▁사실을 ▁알게 됐다 ▁앞서 ▁홍 이 영 은 ▁유 제 니 ▁조 유 정 에게 ▁김 이 안 에 ▁대해 ▁물 었다 ▁유 제 니 는 개월 ▁전 ▁네 가 ▁어 시 스트 했던 ▁사람이 다 라고 ▁말했다 ▁하지만 ▁홍 이 영 은 ▁기억 하지 ▁못했다 ▁유 제 니 는 ▁어떻게 보 면 잊 어 버린 ▁게 ▁좋 을 ▁수도 ▁있다 ▁알 던 ▁사람이 ▁죽 으면 ▁기분 ▁나 쁘 지 ▁않 냐 ▁고 ▁말했다 ▁이날 ▁홍 이 영 은 ▁장 윤 ▁연 우 진 을 ▁만나 자 ▁반 갑 게 ▁인사 했다 ▁두 ▁사람은 ▁키스 ▁후 ▁처음 ▁마 주 친 ▁상황 ▁하지만 ▁장 윤 은 ▁홍 이 영 의 ▁눈 을 ▁피 하며 ▁인사 하지 ▁않았다 ▁홍 이 영 은 ▁영 찜 찜 했다 ▁홍 이 영 은 ▁장 윤 에게 ▁왜 ▁날 ▁피 하 냐 며 ▁그냥 ▁돌려 ▁말 하지 ▁않겠다 ▁장 윤 씨는 ▁원래 ▁여자 랑 ▁키스 하고 쌩 까 시 냐 ▁고 ▁물 었다 ▁홍 이 영 은 ▁장 윤 씨가 ▁나 에 ▁대해 ▁알고 ▁싶다 고 ▁하지 ▁않았 냐 ▁나는 ▁되 게 ▁단순 한 ▁사람이 라 ▁그냥 ▁그 렇 구나 라고 ▁생각한다 ▁그래서 ▁그 날 도 ▁마음 ▁가는 대로 솔 직 하게 ▁직 진 했던 거 다 라고 ▁말했

12/26/2019 07:35:18 - INFO - run_classifier_spm -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

12/26/2019 07:35:18 - INFO - run_classifier_spm -   label: 3 (id = [3.0])
12/26/2019 07:35:18 - INFO - run_classifier_spm -   *** Example ***
12/26/2019 07:35:18 - INFO - run_classifier_spm -   guid: train-2
12/26/2019 07:35:18 - INFO - run_classifier_spm -   doc_span_index: 2
12/26/2019 07:35:18 - INFO - run_classifier_spm -   tokens: [CLS] 을 ▁위해 ▁남은 ▁인생 을 ▁저 당 ▁잡 히 는 ▁집 ▁노 예 가 ▁되는 ▁대신 ▁지금 ▁사는 ▁공간 을 ▁제대로 ▁꾸 며 서 ▁살 기로 ▁한다 ▁대학 ▁시절 ▁옥 탑 방 에서 ▁시작 해 ▁마 포 ▁반 지 하 방 ▁문 래 동 ▁오피스텔 을 ▁거쳐 ▁세 입 자 ▁생활 을 ▁마감 하고 ▁신 림 동 에 ▁방 ▁개 짜리 ▁다 세대 주택 을 ▁장 만 한 ▁것 ▁이 낡 은 ▁집 을 ▁일 에 ▁걸쳐 ▁옷 방 과 ▁침 실 ▁서 재 ▁겸 ▁홈 시 어 터 룸 로 ▁구성된 ▁공간 으로 ▁꾸 미 는 ▁과정을 ▁담 았다 ▁욕 실 ▁개 조 ▁등 ▁실 전 ▁인 테 리 어 ▁노하우 가 ▁펼쳐 진다 ▁이해 리 ▁지 음 ▁마 티 ▁만 ▁원 ▁의미 의 ▁자리 한국 ▁시 단 에서 ▁활발 히 ▁활동 ▁중인 ▁저 자의 ▁네 ▁번째 ▁비 평 집 ▁의미 란 ▁무 엇 인 가 를 ▁주제로 ▁김 혜 순 ▁이제 니 ▁장 석 주 ▁등의 ▁작품 을 ▁독 해 한 ▁편 의 ▁글을 수록 했다 ▁이 론 적 ▁시 집 ▁해 설 뿐 ▁아니라 ▁독립 ▁잡 지 ▁문 예 지 ▁현황 ▁시 와 ▁자본 ▁시 인 과 ▁검 열 ▁같은 ▁문 단 ▁현실 에 ▁대한 ▁고 찰 도 ▁담겨 있다 ▁특히 ▁최근 의 ▁화 두 인 ▁번 역 을 ▁두고 ▁시 의 ▁번 역 에서 ▁발생 하는 ▁근 사 치 로서 의 ▁의미 에 ▁대해서도 ▁살 핀 다 ▁조 재 

12/26/2019 07:35:18 - INFO - run_classifier_spm -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

12/26/2019 07:35:18 - INFO - run_classifier_spm -   input_ids: 2 7079 2358 6293 1149 7053 5580 5936 2258 6812 5532 6314 1162 1149 7053 6938 5754 6812 5532 6314 1162 1149 7053 7096 3803 4016 4257 6037 1165 7003 5561 7095 4958 6305 5452 6730 6356 5499 4635 5947 7310 6983 3376 5452 4630 6333 6896 1685 3697 2574 7086 3622 5899 4257 6037 7095 4987 7971 7078 2358 3862 6197 3658 3238 1150 5330 4501 7483 7828 4213 7828 4257 6037 2095 5474 5933 3785 7872 2604 7102 5019 3135 3559 3656 3431 1150 1079 6080 5060 2358 6116 2232 5176 5580 5932 1933 3649 1966 3135 3559 4257 6037 1165 7003 5562 3524 6120 5859 2462 5872 7376 6903 4988 7848 4635 5947 7310 6896 4630 6333 7788 4166 5424 6368 5592 3934 7866 1849 6530 990 7206 5023 7078 2392 993 5019 3135 3559 3656 3431 4984 7398 7079 4259 1881 1165 7003 5561 7095 4635 5947 7310 4630 6333 5468 1725 5859 3376 7063 5525 4236 7794 3810 3886 7044 5808 1165 7003 5561 1312 6295 7344 2574 1820 1073 7848 1633 6198 2358 6116 2230 4946 3656 4489 5760 4016 1881 4402 65

12/26/2019 07:35:18 - INFO - run_classifier_spm -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

12/26/2019 07:35:18 - INFO - run_classifier_spm -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

12/26/2019 07:35:18 - INFO - run_classifier_spm -   label: 1 (id = [1.0])
12/26/2019 07:35:18 - INFO - run_classifier_spm -   *** Example ***
12/26/2019 07:35:18 - INFO - run_classifier_spm -   guid: train-8
12/26/2019 07:35:18 - INFO - run_classifier_spm -   doc_span_index: 1
12/26/2019 07:35:18 - INFO - run_classifier_spm -   tokens: [CLS] ▁변 기 의 ▁물 을 ▁내리 고 ▁변 기를 ▁휴 지 로 ▁한 ▁번 닦 았다 ▁그때 ▁머리 ▁위 에서 ▁인기 척 을 ▁느꼈 다 ▁순간 적으로 위를 쳐 다 봤 지만 ▁아무 것 도 ▁없었다 ▁씨는 ▁변 기에 앉 으면서 도 ▁경계 를 늦 추 지 ▁않고 ▁천 장을 ▁계속 ▁응시 했다고 ▁한다 ▁일 러 스트 ▁정 다운 몇 ▁초 ▁뒤 ▁셀카 ▁모 드 로 ▁설정 된 ▁휴대전화 ▁카메라 가 ▁머리 ▁위로 ▁슬 며 시 ▁올라 왔다 ▁당황 한 ▁씨가 ▁지금 ▁뭐 하는 ▁거 냐 ▁고 ▁소리 치 자 ▁피의자 는 ▁화장실 ▁밖으로 ▁달아 났다 ▁씨는 ▁경찰에 ▁신고 한 ▁뒤 ▁건물 ▁폐쇄 회 로 를 ▁확인 하고 자 ▁했지만 ▁경찰은 ▁화장실 ▁쪽 을 ▁비 추 는 가 ▁없어 ▁드 나 든 ▁사람들 ▁모습을 ▁확인할 ▁수 ▁없다 ▁고 ▁했다 ▁이 ▁안 심 ▁화장실 은 ▁서울시 에서 ▁매달 ▁회 ▁이상 ▁불법 ▁촬영 ▁장비 ▁설치 ▁여부를 ▁점검 한다 ▁범죄 가 ▁일어나 기 ▁사흘 ▁전 인 ▁지난 ▁일 에도 ▁보안 관 이 ▁나와 ▁점검 을 ▁했다 는 ▁표시 가 ▁있었다 ▁서울시 는 ▁지난 년 월 ▁여성 ▁안 심 ▁보안 관 들을 ▁임명 해 ▁공공 ▁민간 개 방 ▁화장실 ▁등 ▁다 중 이 용 시설 에 ▁몰 래 카 메 라 ▁설치 ▁여부 ▁등을 ▁집중 ▁점검 해 ▁왔다 ▁일각에서는 ▁안 심 ▁화

12/26/2019 07:35:18 - INFO - run_classifier_spm -   input_ids: 2 5019 3647 3135 6745 5118 7086 2728 6903 1989 5152 3704 2496 4518 3961 2779 3311 4075 7831 2320 5330 3813 5561 2637 4012 7119 4304 3803 6901 2375 5474 7096 1394 4075 7088 5019 5760 4882 5330 3873 2728 5760 4304 5712 7028 3312 3135 6745 2375 5474 5938 3830 7848 1032 2170 5357 6305 5118 1815 1562 7295 7096 7003 6712 6896 2081 6023 7495 6190 6003 2779 3310 1824 4389 4075 7848 3464 3804 3135 6745 5118 3787 7096 4609 6983 1601 2496 4518 2320 6896 1698 1956 1489 5902 2872 3864 4336 7096 1392 3417 7068 6573 2912 7422 7886 5808 975 7885 7822 1108 2081 7495 6333 7096 3135 6745 5118 3787 7741 6896 3996 7997 4075 1407 7361 6116 2358 4525 1407 7361 5330 3996 7899 3879 2375 5474 7096 1622 7086 3417 7318 3163 909 2358 2496 4518 7088 4930 968 5859 3862 6197 2614 7119 3312 7096 3105 1088 7167 3844 6903 1939 4063 7206 7589 6527 7885 6896 5000 7831 993 1966 3135 6745 5118 7095 5037 6645 7828 2981 5468 1088 5859 2125 5782 2320 5330 2244 5118

12/26/2019 07:35:18 - INFO - run_classifier_spm -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

12/26/2019 07:35:18 - INFO - run_classifier_spm -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 




In [19]:
bert_config = modeling.BertConfig.from_json_file(bert_config_file)
model = BertForMultiLabelSequenceClassification(bert_config, num_labels = num_labels)
model.bert.load_state_dict(torch.load(init_checkpoint))

model.to(device)
logger.info("***Now, Model is on the device!!!***")

12/26/2019 07:37:04 - INFO - run_classifier_spm -   ***Now, Model is on the device!!!***


In [20]:
from torch.optim.lr_scheduler import _LRScheduler, Optimizer

class CyclicLR(object):
    """Sets the learning rate of each parameter group according to
    cyclical learning rate policy (CLR). The policy cycles the learning
    rate between two boundaries with a constant frequency, as detailed in
    the paper `Cyclical Learning Rates for Training Neural Networks`_.
    The distance between the two boundaries can be scaled on a per-iteration
    or per-cycle basis.
    Cyclical learning rate policy changes the learning rate after every batch.
    `batch_step` should be called after a batch has been used for training.
    To resume training, save `last_batch_iteration` and use it to instantiate `CycleLR`.
    This class has three built-in policies, as put forth in the paper:
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each
        cycle iteration.
    This implementation was adapted from the github repo: `bckenstler/CLR`_
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        base_lr (float or list): Initial learning rate which is the
            lower boundary in the cycle for eachparam groups.
            Default: 0.001
        max_lr (float or list): Upper boundaries in the cycle for
            each parameter group. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function. Default: 0.006
        step_size (int): Number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch. Default: 2000
        mode (str): One of {triangular, triangular2, exp_range}.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
            Default: 'triangular'
        gamma (float): Constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
            Default: 1.0
        scale_fn (function): Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored
            Default: None
        scale_mode (str): {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on
            cycle number or cycle iterations (training
            iterations since start of cycle).
            Default: 'cycle'
        last_batch_iteration (int): The index of the last batch. Default: -1
    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> scheduler = torch.optim.CyclicLR(optimizer)
        >>> data_loader = torch.utils.data.DataLoader(...)
        >>> for epoch in range(10):
        >>>     for batch in data_loader:
        >>>         scheduler.batch_step()
        >>>         train_batch(...)
    .. _Cyclical Learning Rates for Training Neural Networks: https://arxiv.org/abs/1506.01186
    .. _bckenstler/CLR: https://github.com/bckenstler/CLR
    """

    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

#         if not isinstance(optimizer, Optimizer):
#             raise TypeError('{} is not an Optimizer'.format(
#                 type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**(x)

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs

In [21]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
t_total = num_train_steps
if args['local_rank'] != -1:
    t_total = t_total // torch.distributed.get_world_size()
if args['fp16']:
    try:
        from apex.contrib.optimizers import FP16_Optimizer
        from apex.optimizers import FusedAdam
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=args['learning_rate'],
                          bias_correction=False)
#     if args['loss_scale'] == 0:
#         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
#     else:
#         optimizer = FP16_Optimizer(optimizer, static_loss_scale=args['loss_scale'])

else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args['learning_rate'],
                         warmup=args['warmup_proportion'],
                         t_total=t_total)

scheduler = CyclicLR(optimizer, base_lr=2e-5, max_lr=5e-5, step_size=2500, last_batch_iteration=0)

In [22]:
if args['fp16']:
    model, optimizer = amp.initialize(model, optimizer, 
                                      opt_level=args['fp16_opt_level'])

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [None]:
if args['local_rank'] != -1:
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    model = DDP(model)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [None]:
model = torch.nn.DataParallel(model)

In [26]:
model

BertForMultiLabelSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): FusedLayerN

In [25]:
import apex
apex.parallel.DistributedDataParallel()

<module 'apex.contrib' from '/usr/local/lib/python3.6/dist-packages/apex/contrib/__init__.py'>

In [27]:
DDP(model, delay_allreduce=True)

AssertionError: Default process group is not initialized

In [None]:
torch.distributed.l

In [23]:
## 이부분은 main으로 짤때에 그때에 torch.distributed.launch.py를 source 해와서 짜자리
# from apex.parallel import DistributedDataParallel as DDP

# model = DDP(model, delay_allreduce=True)

AssertionError: Default process group is not initialized

In [None]:
# # Eval Fn
# eval_examples = processor.get_dev_examples(args['data_dir'], size=args['val_size'])

def eval():

    eval_features = convert_examples_to_features(
        eval_examples, label_list, args['max_seq_length'], tokenizer, doc_stride=args['doc_stride'])
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in eval_features], dtype=torch.long)##민성 change
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args['eval_batch_size'])
    
    all_logits = None
    all_labels = None
    
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)

#         logits = logits.detach().cpu().numpy()
#         label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)
#         tmp_eval_accuracy = accuracy_thresh(logits, label_ids)
        if all_logits is None:
            all_logits = logits.detach().cpu().numpy()
        else:
            all_logits = np.concatenate((all_logits, logits.detach().cpu().numpy()), axis=0)
            
        if all_labels is None:
            all_labels = label_ids.detach().cpu().numpy()
        else:    
            all_labels = np.concatenate((all_labels, label_ids.detach().cpu().numpy()), axis=0)
        

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    

    result = {'eval_loss': eval_loss,
              'eval_accuracy': eval_accuracy}#,
#               'loss': tr_loss/nb_tr_steps,
#               'roc_auc': roc_auc  }

    output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result

In [None]:
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Num features = %d", len(train_features))
logger.info("  Batch size = %d", args['train_batch_size'])
logger.info("  Num steps = %d", num_train_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args['local_rank'] == -1:
    train_sampler = RandomSampler(train_data)
else:
    train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args['train_batch_size'])

In [None]:
model.module.unfreeze_bert_encoder()

In [None]:
tensorboard_dir = output_dir / "tensorboard"
tensorboard_dir.mkdir(exist_ok=True)
tb_writer = SummaryWriter(tensorboard_dir)

global_step = 0
tr_loss, logging_loss, epoch_loss = 0.0, 0.0, 0.0
model.train()
for i_ in tqdm(range(int(args['num_train_epochs'])), desc="Epoch"):

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):

        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss = model(input_ids, segment_ids, input_mask, label_ids)
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        if args['gradient_accumulation_steps'] > 1:
            loss = loss / args['gradient_accumulation_steps']

        if args['fp16']:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        tr_loss += loss.item()
        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % args['gradient_accumulation_steps'] == 0:
#             scheduler.batch_step()
            # modify learning rate with special warm up BERT uses
            lr_this_step = args['learning_rate'] * warmup_linear(global_step/t_total, args['warmup_proportion'])
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            
        if args['logging_steps']>0 and global_step % args['logging_steps']==0:
            tb_writer.add_scalar("loss",(tr_loss - logging_loss) / args['logging_steps'],global_step,)
            tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
        logging_loss = tr_loss
    logger.info('Loss after epoc {}'.format(tr_loss / nb_tr_steps))
    logger.info('Eval after epoc {}'.format(i_+1))
        
tb_writer.close()

In [None]:
input_ids.shape

In [None]:
input_ids.size(0)

In [None]:
# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = os.path.join(output_dir, "finetuned_news_doc_stride_pytorch_model_d128_m512_FP16.bin")
torch.save(model_to_save.state_dict(), output_model_file)

In [None]:
# # Load a trained model that you have fine-tuned
# model_state_dict = torch.load(output_model_file)
# model = BertForMultiLabelSequenceClassification.from_pretrained(args['bert_model'], num_labels = num_labels, state_dict=model_state_dict)
# model.to(device)

In [None]:

test_examples = processor.get_test_examples()

In [None]:
test_features = convert_examples_to_features(
    test_examples, label_list, args['max_seq_length'], tokenizer, doc_stride=args['doc_stride'])


In [None]:
len(test_features)

In [None]:
new_input_data = [{'id':feature.guid, 'doc_index':feature.doc_span_index} for feature in test_features]

In [None]:
all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)


In [None]:
test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

In [None]:
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args['eval_batch_size'])


In [None]:
all_logits = None

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for step, batch in enumerate(tqdm(test_dataloader, desc="Prediction Iteration")):
    input_ids, input_mask, segment_ids = batch
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)
        logits = logits.sigmoid()## softmax

    if all_logits is None:
        all_logits = logits.detach().cpu().numpy()
    else:
        all_logits = np.concatenate((all_logits, logits.detach().cpu().numpy()), axis=0)

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1

In [None]:
a = pd.merge(pd.DataFrame(new_input_data), pd.DataFrame(all_logits, columns=label_list), left_index=True, right_index=True)

In [None]:
a.loc[:, 'pred'] = a.iloc[:,2:].apply(lambda x: x.idxmax(), axis = 1)

In [None]:
with open('/home/advice/notebook/jms/우리은행/data/news_te.txt', "r") as f:
    reader = csv.reader(f, delimiter="\t", quotechar=None)
    lines = []
    for line in reader:
        lines.append(line)
t = [i[1] for i in lines[1:]]
real_val = [{'id':'test-'+str(idx+1), 'real':real}for idx, real in enumerate(t)]

In [None]:
final = pd.merge(pd.DataFrame(a.groupby('id')['pred'].max()).reset_index(),
                 pd.DataFrame(real_val), 
                 on = ['id'])

In [None]:
final[final.pred == final.real].shape[0]/final.shape[0]