**bert**

In [1]:
#pip install numpy==1.23.5

In [2]:
pip install boto3==1.15.18

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install gluonnlp==0.8.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
pip install mxnet==1.5.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
pip install onnxruntime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
pip install sentencepiece==0.1.96

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece==0.1.96
  Downloading sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
  Attempting uninstall: sentencepiece
    Found existing installation: sentencepiece 0.1.99
    Uninstalling sentencepiece-0.1.99:
      Successfully uninstalled sentencepiece-0.1.99
Successfully installed sentencepiece-0.1.96


In [9]:
import torch
from torch import nn


class BERT(nn.Module):
    def __init__(self, bert):
        super(BERT, self).__init__()
        self.bert = bert

    def forward(self, inputs, mode):

        if mode == 'train':
            anchor_attention_mask = self.gen_attention_mask(inputs['anchor']['source'],
                                                            inputs['anchor']['valid_length'])

            positive_attention_mask = self.gen_attention_mask(inputs['positive']['source'],
                                                              inputs['positive']['valid_length'])

            negative_attention_mask = self.gen_attention_mask(inputs['negative']['source'],
                                                              inputs['negative']['valid_length'])

            _, anchor_pooler = self.bert(input_ids=inputs['anchor']['source'],
                                         token_type_ids=inputs['anchor']['segment_ids'],
                                         attention_mask=anchor_attention_mask)

            _, positive_pooler = self.bert(input_ids=inputs['positive']['source'],
                                           token_type_ids=inputs['positive']['segment_ids'],
                                           attention_mask=positive_attention_mask)

            _, negative_pooler = self.bert(input_ids=inputs['negative']['source'],
                                           token_type_ids=inputs['negative']['segment_ids'],
                                           attention_mask=negative_attention_mask)

            return anchor_pooler, positive_pooler, negative_pooler

        else:
            sentence_1_attention_mask = self.gen_attention_mask(inputs['sentence_1']['source'],
                                                                inputs['sentence_1']['valid_length'])

            sentence_2_attention_mask = self.gen_attention_mask(inputs['sentence_2']['source'],
                                                                inputs['sentence_2']['valid_length'])

            _, sentence_1_pooler = self.bert(input_ids=inputs['sentence_1']['source'],
                                             token_type_ids=inputs['sentence_1']['segment_ids'],
                                             attention_mask=sentence_1_attention_mask)

            _, sentence_2_pooler = self.bert(input_ids=inputs['sentence_2']['source'],
                                             token_type_ids=inputs['sentence_2']['segment_ids'],
                                             attention_mask=sentence_2_attention_mask)

            return sentence_1_pooler, sentence_2_pooler

    def encode(self, inputs, device):

        attention_mask = self.gen_attention_mask(inputs['source'], inputs['valid_length'])

        _, embeddings = self.bert(input_ids=inputs['source'].to(device),
                                  token_type_ids=inputs['segment_ids'].to(device),
                                  attention_mask=attention_mask.to(device))

        return embeddings

    def gen_attention_mask(self, token_ids, valid_length):

        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1

        return attention_mask.float()



---








**utils**

In [10]:
pip install tensorboardx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
import os
import torch
import logging
from tensorboardX import SummaryWriter

logger = logging.getLogger(__name__)
writer = SummaryWriter()


class Metric():

    def __init__(self, args):
        self.args = args

    def get_lr(self, optimizer):
        return optimizer.state_dict()['param_groups'][0]['lr']

    def count_parameters(self, model):
        print(sum(p.numel() for p in model.parameters() if p.requires_grad))

    def cal_acc(self, yhat, y):
        with torch.no_grad():
            yhat = yhat.max(dim=-1)[1]  # [0]: max value, [1]: index of max value
            acc = (yhat == y).float().mean()

        return acc

    def cal_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

        return elapsed_mins, elapsed_secs

    def cal_dev_score(self, score, indicator):
        validation_score = score['score'] / score['iter']
        for key, value in indicator.items():
            indicator[key] /= score['iter']

        print("\n\nCosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            indicator['eval_pearson_cosine'], indicator['eval_spearman_cosine']))
        print("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            indicator['eval_pearson_manhattan'], indicator['eval_spearman_manhattan']))
        print("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
            indicator['eval_pearson_euclidean'], indicator['eval_spearman_euclidean']))
        print("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}\n".format(
            indicator['eval_pearson_dot'], indicator['eval_spearman_dot']))

        return validation_score

    def update_indicator(self, indicator, score):
        for key, value in indicator.items():
            if key == 'eval_spearman_cosine':
                indicator[key] += score['eval_spearman_cosine']
            elif key == 'eval_pearson_cosine':
                indicator[key] += score['eval_pearson_cosine']
            elif key == 'eval_spearman_manhattan':
                indicator[key] += score['eval_spearman_manhattan']
            elif key == 'eval_pearson_manhattan':
                indicator[key] += score['eval_pearson_manhattan']
            elif key == 'eval_spearman_euclidean':
                indicator[key] += score['eval_spearman_euclidean']
            elif key == 'eval_pearson_euclidean':
                indicator[key] += score['eval_pearson_euclidean']
            elif key == 'eval_spearman_dot':
                indicator[key] += score['eval_spearman_dot']
            elif key == 'eval_pearson_dot':
                indicator[key] += score['eval_pearson_dot']

    def draw_graph(self, cp):
        writer.add_scalars('loss_graph', {'train': cp['tl'], 'valid': cp['vl']}, cp['ep'])
        writer.add_scalars('acc_graph', {'train': cp['tma'], 'valid': cp['vma']}, cp['ep'])

    def performance_check(self, cp, config):
        print(f'\t==Epoch: {cp["ep"] + 1:02} | Epoch Time: {cp["epm"]}m {cp["eps"]}s==')
        print(f'\t==Train Loss: {cp["tl"]:.4f} | Train acc: {cp["tma"]:.4f}==')
        print(f'\t==Valid Loss: {cp["vl"]:.4f} | Valid acc: {cp["vma"]:.4f}==')
        print(f'\t==Epoch latest LR: {self.get_lr(config["optimizer"]):.9f}==\n')

    def print_size_of_model(self, model):
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p") / 1e6)
        os.remove('temp.p')

    def move2device(self, sample, device):
        if len(sample) == 0:
            return {}

        def _move_to_device(maybe_tensor, device):
            if torch.is_tensor(maybe_tensor):
                return maybe_tensor.to(device)
            elif isinstance(maybe_tensor, dict):
                return {
                    key: _move_to_device(value, device)
                    for key, value in maybe_tensor.items()
                    }
            elif isinstance(maybe_tensor, list):
                return [_move_to_device(x, device) for x in maybe_tensor]
            elif isinstance(maybe_tensor, tuple):
                return [_move_to_device(x, device) for x in maybe_tensor]
            else:
                return maybe_tensor

        return _move_to_device(sample, device)

    def save_model(self, config, cp, pco):
        if not os.path.exists(config['args'].path_to_save):
            os.makedirs(config['args'].path_to_save)

        sorted_path = config['args'].path_to_save + config['args'].ckpt
        if cp['vs'] > pco['best_valid_score']:
            # pco['early_stop_patient'] = 0
            pco['best_valid_score'] = cp['vs']

            state = {'model': config['model'].state_dict(),
                     'optimizer': config['optimizer'].state_dict()}

            torch.save(state, sorted_path)
            print(f'\t## SAVE {sorted_path} |'
                  f' valid_score: {cp["vs"]:.4f} |'
                  f' epochs: {cp["ep"]} |'
                  f' steps: {cp["step"]} ##\n')

        # self.draw_graph(cp)
        # self.performance_check(cp, config)


def pytorch_cos_sim(a, b):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    This function can be used as a faster replacement for 1-scipy.spatial.distance.cdist(a,b)
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = a / a.norm(dim=1)[:, None]
    b_norm = b / b.norm(dim=1)[:, None]
    return torch.mm(a_norm, b_norm.transpose(0, 1))



---








**loss**

In [12]:
import torch
import logging
import numpy as np
import torch.nn as nn
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances

logger = logging.getLogger(__name__)


class Loss():

    def __init__(self, args):
        self.args = args
        self.cos = nn.CosineSimilarity(dim=-1)
        self.metric = Metric(args)

    def train_loss_fct(self, config, a, p, n):

        positive_similarity = self.cos(a.unsqueeze(1), p.unsqueeze(0)) / self.args.temperature
        negative_similarity = self.cos(a.unsqueeze(1), n.unsqueeze(0)) / self.args.temperature
        cosine_similarity = torch.cat([positive_similarity, negative_similarity], dim=1).to(self.args.device)

        labels = torch.arange(cosine_similarity.size(0)).long().to(self.args.device)

        loss = config['criterion'](cosine_similarity, labels)

        return loss

    def evaluation_during_training(self, embeddings1, embeddings2, labels, indicator):

        embeddings1 = embeddings1.cpu().numpy()
        embeddings2 = embeddings2.cpu().numpy()
        labels = labels.cpu().numpy().flatten()

        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
        dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]

        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

        eval_pearson_dot, _ = pearsonr(labels, dot_products)
        eval_spearman_dot, _ = spearmanr(labels, dot_products)

        score = {'eval_pearson_cosine': eval_pearson_cosine,
                 'eval_spearman_cosine': eval_spearman_cosine,
                 'eval_pearson_manhattan': eval_pearson_manhattan,
                 'eval_spearman_manhattan': eval_spearman_manhattan,
                 'eval_pearson_euclidean': eval_pearson_euclidean,
                 'eval_spearman_euclidean': eval_spearman_euclidean,
                 'eval_pearson_dot': eval_pearson_dot,
                 'eval_spearman_dot': eval_spearman_dot}

        self.metric.update_indicator(indicator, score)

        return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot)



---








**aws_s3**

In [13]:
import boto3
import os
import sys
from botocore import UNSIGNED
from botocore.client import Config


class AwsS3Downloader(object):
    def __init__(
        self,
        aws_access_key_id=None,
        aws_secret_access_key=None,
    ):
        self.resource = boto3.Session(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
        ).resource("s3")
        self.client = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            config=Config(signature_version=UNSIGNED),
        )

    def __split_url(self, url: str):
        if url.startswith("s3://"):
            url = url.replace("s3://", "")
        bucket, key = url.split("/", maxsplit=1)
        return bucket, key

    def download(self, url: str, local_dir: str):
        bucket, key = self.__split_url(url)
        filename = os.path.basename(key)
        file_path = os.path.join(local_dir, filename)

        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        meta_data = self.client.head_object(Bucket=bucket, Key=key)
        total_length = int(meta_data.get("ContentLength", 0))

        downloaded = 0

        def progress(chunk):
            nonlocal downloaded
            downloaded += chunk
            done = int(50 * downloaded / total_length)
            sys.stdout.write(
                "\r{}[{}{}]".format(file_path, "█" * done, "." * (50 - done))
            )
            sys.stdout.flush()

        try:
            with open(file_path, "wb") as f:
                self.client.download_fileobj(bucket, key, f, Callback=progress)
            sys.stdout.write("\n")
            sys.stdout.flush()
        except:
            raise Exception(f"downloading file is failed. {url}")
        return file_path


if __name__ == "__main__":
    s3 = AwsS3Downloader()

    s3.download(
        url="s3://skt-lsl-nlp-model/KoBERT/tokenizers/kobert_news_wiki_ko_cased-1087f8699e.spiece",
        local_dir=".cache",
    )

.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]




---







**kobert_utils**

In [14]:
# coding=utf-8
# Copyright 2019 SK T-Brain Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import hashlib
import os


def download(url, chksum=None, cachedir=".cache"):
    cachedir_full = os.path.join(os.getcwd(), cachedir)
    os.makedirs(cachedir_full, exist_ok=True)
    filename = os.path.basename(url)
    file_path = os.path.join(cachedir_full, filename)
    if os.path.isfile(file_path):
        if hashlib.md5(open(file_path, "rb").read()).hexdigest()[:10] == chksum[:10]:
            print(f"using cached model. {file_path}")
            return file_path, True

    s3 = AwsS3Downloader()
    file_path = s3.download(url, cachedir_full)
    if chksum:
        assert (
            chksum[:10] == hashlib.md5(open(file_path, "rb").read()).hexdigest()[:10]
        ), "corrupted file!"
    return file_path, False


def get_tokenizer(cachedir=".cache"):
    """Get KoBERT Tokenizer file path after downloading"""
    tokenizer = {
        "url": "s3://skt-lsl-nlp-model/KoBERT/tokenizers/kobert_news_wiki_ko_cased-1087f8699e.spiece",
        "chksum": "ae5711deb3",
    }

    model_info = tokenizer
    model_path, is_cached = download(model_info["url"], model_info["chksum"], cachedir=cachedir)
    return model_path



---






**kobert_pytorch**

In [17]:
pip uninstall numpy

Found existing installation: numpy 1.24.3
Uninstalling numpy-1.24.3:
  Would remove:
    /usr/local/bin/f2py
    /usr/local/bin/f2py3
    /usr/local/bin/f2py3.10
    /usr/local/lib/python3.10/dist-packages/numpy-1.24.3.dist-info/*
    /usr/local/lib/python3.10/dist-packages/numpy.libs/libgfortran-040039e1.so.5.0.0
    /usr/local/lib/python3.10/dist-packages/numpy.libs/libopenblas64_p-r0-15028c96.3.21.so
    /usr/local/lib/python3.10/dist-packages/numpy.libs/libquadmath-96973f99.so.0.0.0
    /usr/local/lib/python3.10/dist-packages/numpy/*
Proceed (Y/n)? y
  Successfully uninstalled numpy-1.24.3


In [15]:
pip install numpy==1.23.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
# coding=utf-8
# Copyright 2019 SK T-Brain Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from zipfile import ZipFile
import torch
from transformers import BertModel
import gluonnlp

#from kobert import download, get_tokenizer


def get_pytorch_kobert_model(ctx="cpu", cachedir=".cache"):
    def get_kobert_model(model_path, vocab_file, ctx="cpu"):
        bertmodel = BertModel.from_pretrained(model_path, return_dict=False)
        device = torch.device(ctx)
        bertmodel.to(device)
        bertmodel.eval()
        vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
            vocab_file, padding_token="[PAD]"
        )
        return bertmodel, vocab_b_obj

    pytorch_kobert = {
        "url": "s3://skt-lsl-nlp-model/KoBERT/models/kobert_v1.zip",
        "chksum": "411b242919",  # 411b2429199bc04558576acdcac6d498
    }

    # download model
    model_info = pytorch_kobert
    model_path, is_cached = download(
        model_info["url"], model_info["chksum"], cachedir=cachedir
    )
    cachedir_full = os.path.expanduser(cachedir)
    zipf = ZipFile(os.path.expanduser(model_path))
    zipf.extractall(path=cachedir_full)
    model_path = os.path.join(os.path.expanduser(cachedir), "kobert_from_pretrained")
    # download vocab
    vocab_path = get_tokenizer()
    return get_kobert_model(model_path, vocab_path, ctx)


if __name__ == "__main__":
    import torch
    #from kobert import get_pytorch_kobert_model

    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    model, vocab = get_pytorch_kobert_model()
    sequence_output, pooled_output = model(input_ids, input_mask, token_type_ids)
    print(pooled_output.shape)
    print(vocab)
    print(sequence_output[0])

using cached model. /content/.cache/kobert_v1.zip
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
torch.Size([2, 768])
Vocab(size=8002, unk="[UNK]", reserved="['[CLS]', '[SEP]', '[MASK]', '[PAD]']")
tensor([[-0.2461,  0.2428,  0.2590,  ..., -0.4861, -0.0731,  0.0756],
        [-0.2478,  0.2420,  0.2552,  ..., -0.4877, -0.0727,  0.0754],
        [-0.2472,  0.2420,  0.2561,  ..., -0.4874, -0.0733,  0.0765]],
       grad_fn=<SelectBackward0>)




---






**setting**

In [33]:
import torch
import random
import logging
import numpy as np
from argparse import ArgumentParser


class Arguments():

    def __init__(self):
        self.parser = ArgumentParser()

    def add_type_of_processing(self):
        self.add_argument('--opt_level', type=str, default='O1')
        self.add_argument('--fp16', type=str, default='True')
        self.add_argument('--train', type=str, default='True')
        self.add_argument('--test', type=str, default='False')
        #self.add_argument('--test', type=str, default='True')
        self.add_argument('--device', type=str, default=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

    def add_hyper_parameters(self):
        self.add_argument('--patient', type=int, default=10)
        self.add_argument('--dropout', type=int, default=0.1)
        self.add_argument('--max_len', type=int, default=50)
        self.add_argument('--batch_size', type=int, default=256)
        self.add_argument('--epochs', type=int, default=3)
        self.add_argument('--eval_steps', type=int, default=250)
        self.add_argument('--seed', type=int, default=1234)
        self.add_argument('--lr', type=float, default=0.00005)
        self.add_argument('--weight_decay', type=float, default=0.0)
        self.add_argument('--warmup_ratio', type=float, default=0.05)
        self.add_argument('--temperature', type=float, default=0.05)

    def add_data_parameters(self):
        self.add_argument('--train_data', type=str, default='snli_train_ko.tsv')
        self.add_argument('--valid_data', type=str, default='sts-test.tsv')
        self.add_argument('--test_data', type=str, default='xnli_test_ko.tsv')
        self.add_argument('--task', type=str, default='content/drive/MyDrive/Colab Notebooks')
        self.add_argument('--path_to_data', type=str, default='./data/')
        #self.add_argument('--path_to_save', type=str, default='./output/')
        self.add_argument('--path_to_save', type=str, default='/')
        #self.add_argument('--path_to_saved_model', type=str, default='./output/')
        self.add_argument('--path_to_saved_model', type=str, default='/')
        self.add_argument('--ckpt', type=str, default='best_checkpoint.pt')

    def print_args(self, args):
        for idx, (key, value) in enumerate(args.__dict__.items()):
            if idx == 0:print("argparse{\n", "\t", key, ":", value)
            elif idx == len(args.__dict__) - 1:print("\t", key, ":", value, "\n}")
            else:print("\t", key, ":", value)

    def add_argument(self, *args, **kw_args):
        return self.parser.add_argument(*args, **kw_args)

    def parse(self):
        print("여기까지는 실행됨")
        args = self.parser.parse_args("")
        print("여기까지는 실행됨22222")
        self.print_args(args)
        print("여기까지는 실행됨33333")

        return args


class Setting():

    def set_logger(self):

        _logger = logging.getLogger()
        formatter = logging.Formatter(
            '[%(levelname)s] %(asctime)s [ %(message)s ] | file::%(filename)s | line::%(lineno)s')

        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)

        _logger.addHandler(stream_handler)
        _logger.setLevel(logging.DEBUG)

        return _logger

    def set_seed(self, args):

        seed = args.seed

        random.seed(seed)
        np.random.seed(seed)

        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    def run(self):

        parser = Arguments()
        parser.add_type_of_processing()
        parser.add_hyper_parameters()
        parser.add_data_parameters()

        args = parser.parse()
        logger = self.set_logger()
        self.set_seed(args)

        return args, logger








---










**dataloder**

In [32]:

import numpy
import torch
import logging
import gluonnlp as nlp
from torch.utils.data import DataLoader, Dataset
#from KoBERT.kobert.utils import get_tokenizer
#from KoBERT.kobert.pytorch_kobert import get_pytorch_kobert_model

logger = logging.getLogger(__name__)


class ModelDataLoader(Dataset):
    def __init__(self, file_path, args, metric, tokenizer, vocab, type):
        self.type = type
        self.args = args
        self.vocab = vocab
        self.metric = metric

        """NLI"""
        self.anchor = []
        self.positive = []
        self.negative = []

        """STS"""
        self.label = []
        self.sentence_1 = []
        self.sentence_2 = []

        #  -------------------------------------
        self.bert_tokenizer = tokenizer

        self.transform = nlp.data.BERTSentenceTransform(
            self.bert_tokenizer, max_seq_length=self.args.max_len, pad=True, pair=False)

        self.file_path = file_path

        """
        [CLS]: 2
        [PAD]: 1
        [UNK]: 0
        """
        self.init_token = self.vocab.cls_token
        self.pad_token = self.vocab.padding_token
        self.unk_token = self.vocab.unknown_token

        self.init_token_idx = self.vocab.token_to_idx[self.init_token]
        self.pad_token_idx = self.vocab.token_to_idx[self.pad_token]
        self.unk_token_idx = self.vocab.token_to_idx[self.unk_token]

    def load_data(self, type):

        with open(self.file_path) as file:
            lines = file.readlines()

            for line in lines:
                self.data2tensor(line, type)

        if type == 'train':
            assert len(self.anchor) == len(self.positive) == len(self.negative)
        else:
            assert len(self.sentence_1) == len(self.sentence_2) == len(self.label)

    def data2tensor(self, line, type):
        split_data = line.split('\t')

        if type == 'train':
            anchor, positive, negative = split_data
            anchor = self.transform([anchor])
            positive = self.transform([positive])
            negative = self.transform([negative])

            self.anchor.append(anchor)
            self.positive.append(positive)
            self.negative.append(negative)

        else:
            #print(split_data)
            #sentence_1, sentence_2, label = split_data
            genre, filename, year, id, label, sentence_1, sentence_2 = split_data
            sentence_1 = self.transform([sentence_1])
            sentence_2 = self.transform([sentence_2])

            self.sentence_1.append(sentence_1)
            self.sentence_2.append(sentence_2)
            self.label.append(float(label.strip())/5.0)

    def __getitem__(self, index):

        if self.type == 'train':
            inputs = {'anchor': {
                'source': torch.LongTensor(self.anchor[index][0]),
                'valid_length': torch.tensor(self.anchor[index][1]),
                'segment_ids': torch.LongTensor(self.anchor[index][2])
                },
                      'positive': {
                'source': torch.LongTensor(self.positive[index][0]),
                'valid_length': torch.tensor(self.positive[index][1]),
                'segment_ids': torch.LongTensor(self.positive[index][2])
                },
                      'negative': {
                'source': torch.LongTensor(self.negative[index][0]),
                'valid_length': torch.tensor(self.negative[index][1]),
                'segment_ids': torch.LongTensor(self.negative[index][2])
                }}
        else:

            inputs = {'sentence_1': {
                'source': torch.LongTensor(self.sentence_1[index][0]),
                'valid_length': torch.tensor(self.sentence_1[index][1]),
                'segment_ids': torch.LongTensor(self.sentence_1[index][2])
                },
                      'sentence_2': {
                'source': torch.LongTensor(self.sentence_2[index][0]),
                'valid_length': torch.tensor(self.sentence_2[index][1]),
                'segment_ids': torch.LongTensor(self.sentence_2[index][2])
                },
                      'label': torch.FloatTensor([self.label[index]])}

        inputs = self.metric.move2device(inputs, self.args.device)

        return inputs

    def __len__(self):
        if self.type == 'train':
            return len(self.anchor)
        else:
            return len(self.label)


# Get train, valid, test data loader and BERT tokenizer
def get_loader(args, metric):
    bert_model, vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    tokenizer = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    path_to_train_data = '/' + args.task + '/' + args.train_data
    # args.path_to_data + '/' + args.task + '/' + args.train_data
    path_to_valid_data = '/' + args.task + '/' + args.valid_data
    # args.path_to_data + '/' + args.task + '/' + args.valid_data
    path_to_test_data = '/' + args.task + '/' + args.test_data
    # args.path_to_data + '/' + args.task + '/' + args.test_data

    if args.train == 'True' and args.test == 'False':
        train_iter = ModelDataLoader(path_to_train_data, args, metric, tokenizer, vocab, type='train')
        valid_iter = ModelDataLoader(path_to_valid_data, args, metric, tokenizer, vocab, type='valid')

        train_iter.load_data('train')
        valid_iter.load_data('valid')

        loader = {'train': DataLoader(dataset=train_iter,
                                      batch_size=args.batch_size,
                                      shuffle=True),
                  'valid': DataLoader(dataset=valid_iter,
                                      batch_size=args.batch_size,
                                      shuffle=True)}

    elif args.train == 'False' and args.test == 'True':
        test_iter = ModelDataLoader(path_to_test_data, args, metric, tokenizer, vocab, type='test')
        test_iter.load_data('test')

        loader = {'test': DataLoader(dataset=test_iter,
                                     batch_size=args.batch_size,
                                     shuffle=True)}

    else:
        loader = None

    return bert_model, loader, tokenizer


def convert_to_tensor(corpus, transform):
    tensor_corpus = []
    tensor_valid_length = []
    tensor_segment_ids = []
    for step, sentence in enumerate(corpus):
        cur_sentence, valid_length, segment_ids = transform([sentence])

        tensor_corpus.append(cur_sentence)
        tensor_valid_length.append(numpy.array([valid_length]))
        tensor_segment_ids.append(segment_ids)

    inputs = {'source': torch.LongTensor(tensor_corpus),
              'segment_ids': torch.LongTensor(tensor_segment_ids),
              'valid_length': torch.tensor(tensor_valid_length)}

    return inputs


def example_model_setting(model_ckpt):

    from model.simcse.bert import BERT

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    bert_model, vocab = get_pytorch_kobert_model()
    tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(), vocab, lower=False)
    transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=50, pad=True, pair=False)

    model = BERT(bert_model)

    model.load_state_dict(torch.load(model_ckpt)['model'])
    model.to(device)
    model.eval()

    return model, transform, device

#gs = Setting()
#args, logger = gs.run()


#if __name__ == '__main__':
#    get_loader('test')



---









**Processor**

In [19]:
pip install cxxfilt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [21]:
pip install PyYAML

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
pip install pytest

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
pip install packaging

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
%cd /content/drive/MyDrive/Colab Notebooks/apex
!python3 setup.py install

/content/drive/MyDrive/Colab Notebooks/apex
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'

 If your intention is to cross-compile, this is not an error.
By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),
Volta (compute capability 7.0), Turing (compute capability 7.5),
and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).
If you wish to cross-compile for a single specific architecture,
export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.



torch.__version__  = 2.0.1+cu118


running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer, pypa/build or
        other standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ***************************************************************************

In [25]:
import os
import logging
from apex import amp
import torch.nn as nn
from tqdm import tqdm
import torch.quantization
import torch.optim as optim
#from data.dataloader import get_loader

from transformers import get_linear_schedule_with_warmup

logger = logging.getLogger(__name__)


class Processor():

    def __init__(self, args):
        self.args = args
        self.config = None
        self.metric = Metric(args)
        self.loss = Loss(args)
        self.total_steps = 0
        self.model_checker = {'early_stop': False,
                              'early_stop_patient': 0,
                              'best_valid_score': 0}
        self.dev_progress = {'score': 0, 'iter': 0}
        self.model_progress = {'loss': 0, 'iter': 0}

    def run(self, inputs, indicator=None, type=None):

        if type == 'train':
            anchor_embeddings, positive_embeddings, negative_embeddings = self.config['model'](inputs, type)
            loss = self.loss.train_loss_fct(self.config, anchor_embeddings, positive_embeddings, negative_embeddings)
            return loss
        else:
            sentence_1_embeddings, sentence_2_embeddings = self.config['model'](inputs, type)
            score = self.loss.evaluation_during_training(sentence_1_embeddings, sentence_2_embeddings, inputs['label'], indicator)
            return score

    def progress(self, loss):
        self.model_progress['loss'] += loss
        self.model_progress['iter'] += 1

    def progress_validation(self, score):
        self.dev_progress['score'] += score
        self.dev_progress['iter'] += 1

    def return_value(self):
        loss = self.model_progress['loss'].data.cpu().numpy() / self.model_progress['iter']
        acc = self.model_progress['acc'].data.cpu().numpy() / self.model_progress['iter']

        return loss, acc

    def get_object(self, tokenizer, model):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(optimizer_grouped_parameters, lr=self.args.lr)
        print("좌표 : get_object 실행 완료")
        return criterion, optimizer

    def get_scheduler(self, optim, train_loader):
        print("좌표 : get_scheduler 진입")
        print(optim)
        print(train_loader)
        train_total = len(train_loader) * self.args.epochs
        scheduler = get_linear_schedule_with_warmup(optim,
                                                    num_warmup_steps=self.args.warmup_ratio * train_total,
                                                    num_training_steps=train_total)

        return scheduler, train_total

    def model_setting(self):
        model, loader, tokenizer = get_loader(self.args, self.metric)
        model = BERT(model)
        model.to(self.args.device)

        criterion, optimizer = self.get_object(tokenizer, model)

        print("좌표 : model_setting에서 if문 들어가기 직전")
        if self.args.train == 'True':
            print("좌표 : model_setting에서 self.args.train이 참인경우")
            print("좌표 : get_scheduler함수 안에 들어가기 전 optimizer", optimizer)
            print("좌표 : get_scheduler함수 안에 들어가기 전 loader['train']", loader['train'])
            scheduler, total_steps = self.get_scheduler(optimizer, loader['train'])
            self.total_steps = total_steps
        else:
            scheduler = None

        config = {'loader': loader,
                  'optimizer': optimizer,
                  'criterion': criterion,
                  'scheduler': scheduler,
                  'tokenizer': tokenizer,
                  'args': self.args,
                  'model': model}

        if config['args'].fp16 == 'True':
            config['model'], config['optimizer'] = amp.initialize(
                config['model'], config['optimizer'], opt_level=config['args'].opt_level)

        self.config = config

        return self.config

    def train(self, epoch):
        self.config['model'].train()

        for step, batch in enumerate(tqdm(self.config['loader']['train'])):
            self.config['optimizer'].zero_grad()

            inputs = batch

            train_loss = self.run(inputs, type='train')

            if self.args.fp16 == 'True':
                with amp.scale_loss(train_loss, self.config['optimizer']) as scaled_loss:
                    scaled_loss.backward()
            else:
                train_loss.backward()

            self.config['optimizer'].step()
            self.config['scheduler'].step()

            self.progress(train_loss.data)

            if self.model_progress['iter'] % self.args.eval_steps == 0 or self.model_progress['iter'] == self.total_steps:
                valid_score = self.valid()
                performance = {'tl': train_loss, 'vs': valid_score, 'ep': epoch, 'step': self.model_progress['iter']}
                
                self.metric.save_model(self.config, performance, self.model_checker)
                self.config['model'].train()
                
    def valid(self):
        self.config['model'].eval()
        self.dev_progress = self.dev_progress.fromkeys(self.dev_progress, 0)

        score_indicator = {'eval_pearson_cosine': 0,
                           'eval_spearman_cosine': 0,
                           'eval_pearson_manhattan': 0,
                           'eval_spearman_manhattan': 0,
                           'eval_pearson_euclidean': 0,
                           'eval_spearman_euclidean': 0,
                           'eval_pearson_dot': 0,
                           'eval_spearman_dot': 0}

        with torch.no_grad():
            for step, batch in enumerate(self.config['loader']['valid']):
                inputs = batch
                score = self.run(inputs, indicator=score_indicator, type='valid')

                self.progress_validation(score)

        score = self.metric.cal_dev_score(self.dev_progress, score_indicator)

        return score

    def test(self):
        self.config['model'].load_state_dict(torch.load(self.args.path_to_saved_model)['model'], strict=False)
        self.config['model'].eval()
        self.dev_progress = self.dev_progress.fromkeys(self.dev_progress, 0)

        score_indicator = {'eval_pearson_cosine': 0,
                           'eval_spearman_cosine': 0,
                           'eval_pearson_manhattan': 0,
                           'eval_spearman_manhattan': 0,
                           'eval_pearson_euclidean': 0,
                           'eval_spearman_euclidean': 0,
                           'eval_pearson_dot': 0,
                           'eval_spearman_dot': 0}

        with torch.no_grad():
            for step, batch in enumerate(self.config['loader']['test']):
                inputs = batch
                score = self.run(inputs, indicator=score_indicator, type='test')

                self.progress_validation(score)

        logger.info('### TEST SCORE ###')
        score = self.metric.cal_dev_score(self.dev_progress, score_indicator)



---









**main**


In [35]:
#from model.setting import Setting, Arguments
#from model.simcse.processor import Processor


def main(args, logger) -> None:
    processor = Processor(args)
    config = processor.model_setting()
    logger.info('Model Setting Complete')

    if args.train == 'True':
        logger.info('Start Training')
        
        for epoch in range(args.epochs):
            processor.train(epoch+1)

    if args.test == 'True':
        logger.info("Start Test")
        
        processor.test()
        processor.metric.print_size_of_model(config['model'])
        processor.metric.count_parameters(config['model'])


if __name__ == '__main__':
    args, logger = Setting().run()
    main(args, logger)

여기까지는 실행됨
여기까지는 실행됨22222
argparse{
 	 opt_level : O1
	 fp16 : True
	 train : True
	 test : False
	 device : cpu
	 patient : 10
	 dropout : 0.1
	 max_len : 50
	 batch_size : 256
	 epochs : 3
	 eval_steps : 250
	 seed : 1234
	 lr : 5e-05
	 weight_decay : 0.0
	 warmup_ratio : 0.05
	 temperature : 0.05
	 train_data : snli_train_ko.tsv
	 valid_data : sts-test.tsv
	 test_data : xnli_test_ko.tsv
	 task : content
	 path_to_data : ./data/
	 path_to_save : /
	 path_to_saved_model : /
	 ckpt : best_checkpoint.pt 
}
여기까지는 실행됨33333
using cached model. /content/drive/MyDrive/Colab Notebooks/apex/.cache/kobert_v1.zip
using cached model. /content/drive/MyDrive/Colab Notebooks/apex/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /content/drive/MyDrive/Colab Notebooks/apex/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
좌표 : get_object 실행 완료
좌표 : model_setting에서 if문 들어가기 직전
좌표 : model_setting에서 self.args.train이 참인경우
좌표 : get_scheduler함수 안에 들어가기 전 optimizer AdamW (
Parameter Gro



Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


RuntimeError: ignored