## Setup

In [None]:
# Single seed value reused by the seeding helper, the data shuffle and the sampling cells below.
GLOBAL_SEED = 42

import os
# Export the seed as PYTHONHASHSEED.
# NOTE(review): an environment variable set here presumably only affects
# interpreters started afterwards, not the running kernel — confirm intent.
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)

import numpy as np # linear algebra
from numpy import random as np_rnd
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pickle
import glob
import json
from io import open
import unicodedata
import string
import re
import random as rnd
from nltk.corpus import stopwords
import time 
import sentencepiece as spm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, Dataset, DataLoader

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.optim import AdamW
from transformers import get_polynomial_decay_schedule_with_warmup

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# To force CPU execution instead, use: device = torch.device("cpu")

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and asks cuDNN for deterministic kernels. CuPy and TensorFlow are seeded
    only when the notebook has bound the names ``cupy`` / ``tf_rnd``;
    otherwise they are skipped.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)
    # Optional back-ends: look the names up explicitly instead of hiding a
    # NameError (and any real seeding failure) behind a bare ``except``.
    cupy_module = globals().get("cupy")
    if cupy_module is not None:
        cupy_module.random.seed(seed)
    tf_random = globals().get("tf_rnd")
    if tf_random is not None:
        tf_random.set_seed(seed)
    # pytorch random
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels between runs, so keep it off.
    torch.backends.cudnn.benchmark = False

def pickleIO(obj, src, op="r"):
    """Read or write a pickle file.

    Args:
        obj: Object to serialise when ``op == "w"``; ignored when reading.
        src: Path of the pickle file.
        op: ``"r"`` to load and return the stored object, ``"w"`` to dump
            ``obj`` to ``src``.

    Returns:
        The unpickled object for ``"r"``, ``None`` for ``"w"``. For any other
        ``op`` a message is printed and ``obj`` is returned unchanged.
    """
    if op not in ("r", "w"):
        print("unknown operation")
        return obj

    mode = op + "b"
    if op == "r":
        # NOTE: only load pickle files from a trusted source.
        with open(src, mode) as handle:
            return pickle.load(handle)

    with open(src, mode) as handle:
        pickle.dump(obj, handle)
    
def createFolder(directory):
    """Create ``directory`` (including parents) if it does not exist yet.

    Uses ``exist_ok=True`` instead of a separate existence check, so a folder
    created by another process between the check and the creation no longer
    triggers an error. Failures are reported with a printed message rather
    than raised.

    Args:
        directory: Path of the folder to create.
    """
    try:
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error: Creating directory. ' + directory)

def findIdx(data_x, col_names):
    """Return the positions of the items of ``data_x`` that occur in ``col_names``."""
    positions = []
    for position, name in enumerate(data_x):
        if name in col_names:
            positions.append(int(position))
    return positions

def diff(first, second):
    """Return the items of ``first`` that are not in ``second``, keeping order and duplicates."""
    excluded = set(second)
    kept = []
    for item in first:
        if item in excluded:
            continue
        kept.append(item)
    return kept

In [None]:
class CFG:
    """Training hyper-parameters read by the training cells."""
    # Debug switch: shortens training to two epochs when enabled.
    debug = False
    # Mini-batch size used by the data loaders.
    batch_size = 64
    # Learning rate and weight decay passed to the optimizer.
    eta = 5e-4
    weight_decay = 1e-4
    # Number of consecutive non-improving epochs before training stops.
    early_stopping_rounds = 10
    # Maximum number of epochs.
    epochs = 30 if not debug else 2

## Loading data

In [None]:
# Build the training frame: one row per JSON file, one sub-frame per category folder.
df_merge = []
for catPath in glob.glob(r".\022.요약문 및 레포트 생성 데이터\01.데이터\1.Training\라벨링데이터\TL1\*"):
    df = []
    for fpath in glob.glob(catPath + "./2~3sent/*"):
        with open(fpath, encoding="utf8") as f:
            data = json.load(f)
            passage = data['Meta(Refine)']["passage"]
            # Prefer "summary2"; fall back to "summary1" when it is null.
            summary = data['Annotation']["summary2"]
            if summary is None:
                summary = data['Annotation']["summary1"]
            # The category is the text after the last "." of the folder path.
            df.append({"doc": passage, "label": summary, "cat": catPath.split(".")[-1]})
    df_merge.append(pd.DataFrame(df))
df_train = pd.concat(df_merge).reset_index(drop=True)

In [None]:
# Build the validation frame with the same layout as the training frame.
df_merge = []
for catPath in glob.glob(r".\022.요약문 및 레포트 생성 데이터\01.데이터\2.Validation\라벨링데이터\VL1\*"):
    df = []
    for fpath in glob.glob(catPath + "./2~3sent/*"):
        with open(fpath, encoding="utf8") as f:
            data = json.load(f)
            passage = data['Meta(Refine)']["passage"]
            # Prefer "summary2"; fall back to "summary1" when it is null.
            summary = data['Annotation']["summary2"]
            if summary is None:
                summary = data['Annotation']["summary1"]
            # The category is the text after the last "." of the folder path.
            df.append({"doc": passage, "label": summary, "cat": catPath.split(".")[-1]})
    df_merge.append(pd.DataFrame(df))
df_valid = pd.concat(df_merge).reset_index(drop=True)

In [None]:
# Drop the temporary list of per-category frames, then show (rows, columns) of both splits.
del df_merge
df_train.shape, df_valid.shape

In [None]:
# Preview the first raw training rows.
df_train.head()

In [None]:
# Preview the first raw validation rows.
df_valid.head()

## Preprocessing

In [None]:
def preprocess(text):
    """Keep only Hangul syllables and spaces, then drop one-character words.

    Every character outside ``가-힣`` and the space is removed (not replaced
    by a space); the remaining whitespace-separated words longer than one
    character are re-joined with single spaces.
    """
    hangul_only = re.sub(r"[^가-힣 ]", "", text)
    kept_words = []
    for word in hangul_only.split():
        if len(word) > 1:
            kept_words.append(word)
    return " ".join(kept_words)

# Quick manual check of preprocess() on a short two-sentence string.
sample = "안녕하세요.\n 오랜만입니다."
print(preprocess(sample))

In [None]:
# Label encoder: category name -> integer id, in order of first appearance in the training data.
cat_lbe = pd.Series(range(df_train["cat"].nunique()), index=df_train["cat"].unique())

# Clean both text columns and encode the category, first for train and then for validation.
for frame in (df_train, df_valid):
    for text_col in ("doc", "label"):
        frame[text_col] = frame[text_col].apply(preprocess)
    frame["cat"] = frame["cat"].map(cat_lbe)

In [None]:
# Preview the training rows after cleaning and category encoding.
df_train.head()

In [None]:
# Preview the validation rows after cleaning and category encoding.
df_valid.head()

In [None]:
# Show the category-name -> id mapping.
cat_lbe

In [None]:
# Persist the category mapping to ./cat_lbe.pkl.
pickleIO(cat_lbe, "./cat_lbe.pkl", "w")

## Tokenizing

### Training the SentencePiece tokenizer

In [None]:
# Make sure the output folder exists; opening the corpus file for writing
# would otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs("./dataset", exist_ok=True)

# Corpus for the tokenizer: one text per line, in the order
# train docs, train labels, validation docs, validation labels.
with open("./dataset/spt_train_data.txt", "w", encoding="utf8") as f:
    for column in (df_train["doc"], df_train["label"], df_valid["doc"], df_valid["label"]):
        f.writelines((column.astype("str") + "\n").to_list())

# 16384 pieces plus the four reserved ids (pad=0, unk=1, bos=2, eos=3).
vocab_size = 16384 + 4
spm.SentencePieceTrainer.train(f'--input=./dataset/spt_train_data.txt --model_prefix=tokenizer --model_type=bpe --max_sentence_length=8192 --vocab_size={vocab_size} --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3')
sp = spm.SentencePieceProcessor()
sp.load('tokenizer.model')
# piece -> id lookup table built from the trained model.
vocab = {sp.id_to_piece(i): i for i in range(sp.get_piece_size())}
print("number of vocab", len(vocab))

In [None]:
# np.percentile expects q on a 0-100 scale, so the 95th percentile is q=95
# (q=0.95 reports the 0.95th percentile, i.e. close to the minimum length).
display(np.percentile([len(sp.EncodeAsIds(i)) for i in df_train["doc"]], 95))
display(np.percentile([len(sp.EncodeAsIds(i)) for i in df_train["label"]], 95))

In [None]:
# Fixed token lengths (including the <s> and </s> markers) that documents and
# summaries are truncated or padded to by tokenizing().
feature_max_len = 122
label_max_len = 21

In [None]:
# Shuffle all training rows reproducibly and renumber the index from 0.
df_train = df_train.sample(frac=1, random_state=GLOBAL_SEED).reset_index(drop=True)

In [None]:
# Dictionaries that hold the tokenized tensors ("doc", "label", "cat") of each split.
train_container = {}
valid_container = {}

def tokenizing(x, max_len):
    """Encode ``x`` into exactly ``max_len`` token ids.

    The text is encoded with the global SentencePiece processor, truncated to
    ``max_len - 2`` ids, wrapped in ``<s>`` ... ``</s>`` and right-padded with
    zeros up to ``max_len``.

    Returns:
        A 1-D ``torch.int64`` tensor of length ``max_len``.
    """
    ids = [vocab["<s>"]] + sp.EncodeAsIds(x)[:(max_len-2)] + [vocab["</s>"]]
    token = torch.tensor(ids, dtype=torch.int64)
    n_missing = max_len - len(token)
    if n_missing > 0:
        padding = torch.zeros(n_missing, dtype=torch.int64)
        token = torch.cat([token, padding])
    assert len(token) == max_len
    return token

# Tokenize documents and summaries and store the category ids, first for the
# training split and then for the validation split.
for container, frame in ((train_container, df_train), (valid_container, df_valid)):
    container["doc"] = torch.stack([tokenizing(text, feature_max_len) for text in frame["doc"]])
    container["label"] = torch.stack([tokenizing(text, label_max_len) for text in frame["label"]])
    container["cat"] = torch.tensor(frame["cat"], dtype=torch.int64)

## Define Model & Training

## 

In [None]:
def get_optimizer_params(model, eta, weight_decay):
    """Split the model parameters into two optimizer groups.

    Parameters whose name contains ``"bias"``, ``"LayerNorm.bias"`` or
    ``"LayerNorm.weight"`` go to the second group with ``weight_decay=0.0``;
    all others go to the first group with the given ``weight_decay``. Both
    groups use learning rate ``eta``.

    Returns:
        A list of two parameter-group dictionaries (decayed, not decayed).
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    decayed, exempt = [], []
    for param_name, param in model.named_parameters():
        if any(marker in param_name for marker in no_decay):
            exempt.append(param)
        else:
            decayed.append(param)
    return [
        {'params': decayed, 'lr': eta, 'weight_decay': weight_decay},
        {'params': exempt, 'lr': eta, 'weight_decay': 0.0},
    ]


def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """Build a polynomial-decay learning-rate schedule (power 0.5, final lr 1e-7) with warm-up."""
    schedule_options = dict(
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        power=0.5,
        lr_end=1e-7,
    )
    return get_polynomial_decay_schedule_with_warmup(optimizer, **schedule_options)


class AverageMeter(object):
    """Track the most recent value and the running weighted mean of a metric."""

    def __init__(self, name, fmt=':f'):
        # ``fmt`` is a format-spec suffix (e.g. ":.5f") applied to both numbers in __str__.
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Clear the last value, mean, weighted sum and weight count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Renders as "<name> <last value> (<mean>)".
        template = '{{name}} {{val{0}}} ({{avg{0}}})'.format(self.fmt)
        return template.format(**self.__dict__)

In [None]:
def train_fn(model, dl, criterion, optimizer, scheduler, grad_scaler):
    """Run one training epoch over ``dl`` and return the epoch metrics.

    Each batch is encoded once and then decoded token by token for
    ``label_max_len - 1`` steps. At every step the next decoder input is the
    ground-truth token with probability
    ``generating_params["teacher_forcing_ratio"]``, otherwise the model's own
    prediction. The forward pass runs under CUDA autocast and the update goes
    through ``grad_scaler``. The scheduler is stepped once, after the epoch.

    Reads the globals ``device``, ``generating_params`` and ``label_max_len``.

    Args:
        model: Model exposing ``get_encoder_output`` / ``get_decoder_output``.
        dl: Iterable of ``(doc, label, cat)`` batches.
        criterion: Loss called as ``criterion(logits, target)``.
        optimizer: Optimizer updated after every batch.
        scheduler: Learning-rate scheduler stepped once per call.
        grad_scaler: AMP gradient scaler.

    Returns:
        Dict with ``"loss"`` and ``"accuracy"`` ``AverageMeter`` objects.
    """
    model.train()
    metrics = {
        "loss": AverageMeter("loss", fmt=":.5f"),
        "accuracy": AverageMeter("accuracy", fmt=":.5f"),
    }
    num_layers = generating_params["num_layers"]
    embed_dim = generating_params["embed_dim"]
    n_steps = label_max_len - 1

    # Iterate the loader that was passed in (previously the global train_dl
    # was used and the ``dl`` argument was silently ignored).
    for batch in dl:
        with torch.cuda.amp.autocast():
            doc = batch[0].to(device)
            label = batch[1].to(device)
            cat = batch[2].to(device)
            batch_size, doc_len = doc.shape[0], doc.shape[1]

            encoder_pos = torch.tile(torch.arange(doc_len, dtype=torch.int64).view(1, -1), (batch_size, 1)).to(device)
            # Initial LSTM (hidden, cell) state: a small constant instead of exact zeros.
            last_hc = (
                torch.full((num_layers, batch_size, embed_dim), 1e-7, device=device),
                torch.full((num_layers, batch_size, embed_dim), 1e-7, device=device),
            )
            encoder_output, last_hc = model.get_encoder_output(
                input=doc, input_pos=encoder_pos, input_cat=torch.tile(cat.view(-1, 1), (1, doc_len)), last_hc=last_hc,
            )
            # "Previous decoder output" fed to the first step; created directly
            # as a float tensor instead of an int64 tensor promoted by "+ 1e-7".
            decoder_output = torch.full((batch_size, 1, embed_dim), 1e-7, device=device)
            decoder_input = label[:, [0]]

            loss = 0
            acc = 0
            for i in range(n_steps):
                decoder_pos = torch.full((batch_size, 1), i, dtype=torch.int64, device=device)
                decoder_output, decoder_output_prob, last_hc = model.get_decoder_output(
                    input=decoder_input, input_pos=decoder_pos, input_cat=cat.view(-1, 1),
                    encoder_output=encoder_output, last_hc=last_hc, last_hidden_state=decoder_output
                )
                decoder_output_prob = decoder_output_prob.squeeze(dim=1)
                # argmax over the logits equals argmax over their softmax.
                decoder_output_cls = decoder_output_prob.argmax(dim=-1)

                target = label[:, (i+1)]
                non_pad = target != 0
                # Skip steps where every target is padding (id 0): the masked
                # accuracy mean, and a loss that ignores index 0, would both
                # average over zero elements and turn the batch metrics into NaN.
                if non_pad.any():
                    loss = loss + criterion(decoder_output_prob, target) / n_steps
                    acc = acc + (decoder_output_cls == target)[non_pad].float().mean() / n_steps

                if rnd.random() < generating_params["teacher_forcing_ratio"]:
                    decoder_input = label[:, [(i+1)]]
                else:
                    decoder_input = decoder_output_cls.view(-1, 1)

        # reset gradients to zero
        optimizer.zero_grad()
        # back-propagate the scaled loss
        grad_scaler.scale(loss).backward()
        # unscale the gradients; if they contain infs or NaNs the optimizer
        # step is skipped, otherwise optimizer.step() is called.
        grad_scaler.step(optimizer)
        grad_scaler.update()
        # calculate metrics
        metrics["loss"].update(loss.item())
        metrics["accuracy"].update(acc.item())

    # update scheduler (once per epoch)
    scheduler.step()

    return metrics

def valid_fn(model, dl, criterion):
    """Evaluate ``model`` on ``dl`` without gradient tracking.

    Mirrors the decoding loop of the training function, including the random
    choice between the ground-truth token and the model's prediction as the
    next decoder input.

    NOTE(review): because of that random choice the validation loss is not
    deterministic for fixed weights — confirm this is intended for a score
    that drives checkpoint selection.

    Reads the globals ``device``, ``generating_params`` and ``label_max_len``.

    Args:
        model: Model exposing ``get_encoder_output`` / ``get_decoder_output``.
        dl: Iterable of ``(doc, label, cat)`` batches.
        criterion: Loss called as ``criterion(logits, target)``.

    Returns:
        Dict with ``"loss"`` and ``"accuracy"`` ``AverageMeter`` objects.
    """
    model.eval()
    metrics = {
        "loss": AverageMeter("loss", fmt=":.5f"),
        "accuracy": AverageMeter("accuracy", fmt=":.5f"),
    }
    num_layers = generating_params["num_layers"]
    embed_dim = generating_params["embed_dim"]
    n_steps = label_max_len - 1

    for batch in dl:
        with torch.no_grad():
            doc = batch[0].to(device)
            label = batch[1].to(device)
            cat = batch[2].to(device)
            batch_size, doc_len = doc.shape[0], doc.shape[1]

            encoder_pos = torch.tile(torch.arange(doc_len, dtype=torch.int64).view(1, -1), (batch_size, 1)).to(device)
            # Initial LSTM (hidden, cell) state: a small constant instead of exact zeros.
            last_hc = (
                torch.full((num_layers, batch_size, embed_dim), 1e-7, device=device),
                torch.full((num_layers, batch_size, embed_dim), 1e-7, device=device),
            )
            encoder_output, last_hc = model.get_encoder_output(
                input=doc, input_pos=encoder_pos, input_cat=torch.tile(cat.view(-1, 1), (1, doc_len)), last_hc=last_hc,
            )
            # Created directly as a float tensor instead of an int64 tensor
            # promoted by "+ 1e-7".
            decoder_output = torch.full((batch_size, 1, embed_dim), 1e-7, device=device)
            decoder_input = label[:, [0]]

            loss = 0
            acc = 0
            for i in range(n_steps):
                decoder_pos = torch.full((batch_size, 1), i, dtype=torch.int64, device=device)
                decoder_output, decoder_output_prob, last_hc = model.get_decoder_output(
                    input=decoder_input, input_pos=decoder_pos, input_cat=cat.view(-1, 1),
                    encoder_output=encoder_output, last_hc=last_hc, last_hidden_state=decoder_output
                )
                decoder_output_prob = decoder_output_prob.squeeze(dim=1)
                # argmax over the logits equals argmax over their softmax.
                decoder_output_cls = decoder_output_prob.argmax(dim=-1)

                target = label[:, (i+1)]
                non_pad = target != 0
                # Skip steps where every target is padding (id 0): the masked
                # accuracy mean, and a loss that ignores index 0, would both
                # average over zero elements and turn the batch metrics into NaN.
                if non_pad.any():
                    loss = loss + criterion(decoder_output_prob, target) / n_steps
                    acc = acc + (decoder_output_cls == target)[non_pad].float().mean() / n_steps

                if rnd.random() < generating_params["teacher_forcing_ratio"]:
                    decoder_input = label[:, [(i+1)]]
                else:
                    decoder_input = decoder_output_cls.view(-1, 1)

        # calculate metrics
        metrics["loss"].update(loss.item())
        metrics["accuracy"].update(acc.item())

    return metrics

In [None]:
def do_training(fold, model):
    """Train ``model`` with early stopping and restore its best weights.

    Uses the global loaders ``train_dl`` / ``valid_dl`` and the settings in
    ``CFG``. After every epoch the validation loss is compared with the best
    one so far; only an improving epoch writes the checkpoint
    ``./model_fold{fold}_best.pth``, so the weights reloaded at the end are
    those of the best epoch (previously the file was overwritten every epoch
    and the last epoch was restored).

    Args:
        fold: Identifier used in the checkpoint file name and in the result.
        model: Model to train in place.

    Returns:
        Dict with ``fold`` and the train/valid loss and accuracy of the best epoch.

    Raises:
        RuntimeError: If no epoch produced a validation loss below infinity
            (e.g. the loss was NaN, or ``CFG.epochs`` is 0), so no checkpoint exists.
    """
    # set loss & optimizer
    optimizer_parameters = get_optimizer_params(
        model,
        eta=CFG.eta,
        weight_decay=CFG.weight_decay
    )
    optimizer = AdamW(optimizer_parameters, lr=CFG.eta, weight_decay=CFG.weight_decay)
    # The schedule is counted in epochs: train_fn steps it once per call.
    scheduler = get_scheduler(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=CFG.epochs
    )
    grad_scaler = torch.cuda.amp.GradScaler()
    # Index 0 is the padding id and does not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    checkpoint_path = f"./model_fold{fold}_best.pth"
    best_score = np.inf
    return_score_dic = None
    early_stopping_cnt = 0
    for epoch in range(CFG.epochs):
        epoch_start_time = time.time()

        # training
        train_metrics = train_fn(model, train_dl, criterion, optimizer, scheduler, grad_scaler)
        # evaluation
        valid_metrics = valid_fn(model, valid_dl, criterion)
        score = valid_metrics["loss"].avg

        print("Epoch[{0}/{1}]\n train loss : {2}\n train accuracy : {3}\n valid loss : {4}\n valid accuracy : {5}\n eta : {6}\n Elapsed : {7}\n"
              .format(
                  epoch+1, CFG.epochs,
                  round(train_metrics["loss"].avg, 5), round(train_metrics["accuracy"].avg, 5),
                  round(valid_metrics["loss"].avg, 5), round(valid_metrics["accuracy"].avg, 5),
                  round(optimizer.param_groups[-1]['lr'], 5), round(time.time() - epoch_start_time, 3)
              )
        )

        if score < best_score:
            best_score = score
            # Save only on improvement so the file always holds the best weights.
            torch.save(
                {'model': model.state_dict()},
                checkpoint_path,
            )
            return_score_dic = {
                "fold": fold,
                "train_loss": train_metrics["loss"].avg,
                "valid_loss": valid_metrics["loss"].avg,
                "train_accuracy": train_metrics["accuracy"].avg,
                "valid_accuracy": valid_metrics["accuracy"].avg,
            }
            print("INFO: Found best weight\n\n")
            early_stopping_cnt = 0
        else:
            early_stopping_cnt += 1

        if early_stopping_cnt >= CFG.early_stopping_rounds:
            break

    if return_score_dic is None:
        raise RuntimeError(
            "Validation loss never improved, so no checkpoint was written "
            "(NaN loss or zero epochs?)."
        )

    model.load_state_dict(torch.load(checkpoint_path)["model"])

    return return_score_dic

In [None]:
# Encoder Block
class EncoderRNN(nn.Module):
    """LSTM encoder over token, position and category embeddings.

    The three embeddings are concatenated, projected back to ``embed_dim``,
    passed through a uni-directional, batch-first LSTM (stored under the
    attribute name ``gru``) and refined by a residual feed-forward block.
    The embedding layers themselves are supplied by the caller in ``forward``.
    """

    def __init__(self, input_size, embed_dim, max_len, num_layers=2, dropout_p=0.5):
        super().__init__()
        self.input_size = input_size
        self.embed_dim = embed_dim
        self.max_len = max_len
        self.num_layers = num_layers
        self.dropout_p = dropout_p

        # Width of the concatenated token + position + category embedding.
        concat_dim = embed_dim + (embed_dim // 4) + (embed_dim // 4 // 4)
        self.lin_embed = nn.Linear(concat_dim, embed_dim)
        # Recurrent layer producing the latent sequence (an LSTM despite the name).
        self.gru = nn.LSTM(
            self.embed_dim,
            self.embed_dim,
            num_layers=self.num_layers,
            dropout=self.dropout_p/4,
            bidirectional=False,
            batch_first=True,
        )
        # Feed-forward block applied on top of the LSTM output.
        wide_dim = self.embed_dim * 2
        feed_forward = [
            nn.Dropout(self.dropout_p),
            nn.Linear(self.embed_dim, wide_dim),
            nn.ReLU(),
            nn.Dropout(self.dropout_p),
            nn.Linear(wide_dim, self.embed_dim),
            nn.ReLU(),
        ]
        self.lin = nn.Sequential(*feed_forward)

    def forward(self, input, input_pos, input_cat, last_hc, global_embedding_layer, pos_embedding, cat_embedding):
        """Encode a batch of token ids.

        Returns:
            ``(output, (hn, cn))``: the per-position encoder output and the
            final LSTM hidden/cell states.
        """
        # Look up the three embeddings and join them on the feature axis.
        token_vec = global_embedding_layer(input)
        pos_vec = pos_embedding(input_pos)
        cat_vec = cat_embedding(input_cat)
        embed = self.lin_embed(torch.cat([token_vec, pos_vec, cat_vec], dim=-1))
        # LSTM over the sequence, starting from the supplied (h, c) state.
        output, (hn, cn) = self.gru(embed, last_hc)
        # Residual feed-forward refinement.
        refined = self.lin(output)
        output = refined + output
        return output, (hn, cn)

class DecoderRNN(nn.Module):
    """Single-step LSTM decoder with dot-product attention over the encoder output.

    Each call consumes one token per sequence. The LSTM output (attribute
    ``gru``, an LSTM despite the name) attends over the encoder output; the
    attended context, the refined LSTM output and the previous step's output
    are concatenated, reduced to ``embed_dim`` and mapped to vocabulary logits.
    """

    def __init__(self, input_size, output_size, embed_dim, max_len, num_layers=2, dropout_p=0.5):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        self.max_len = max_len

        # Width of the concatenated token + position + category embedding.
        concat_dim = embed_dim + (embed_dim // 4) + (embed_dim // 4 // 4)
        self.lin_embed = nn.Linear(concat_dim, embed_dim)
        # Recurrent layer updating the latent state from the previous (h, c).
        self.gru = nn.LSTM(
            self.embed_dim,
            self.embed_dim,
            num_layers=self.num_layers,
            dropout=self.dropout_p/4,
            bidirectional=False,
            batch_first=True,
        )
        # Residual feed-forward block on the LSTM output.
        refine_layers = [
            nn.Dropout(dropout_p),
            nn.Linear(self.embed_dim, self.embed_dim * 2),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(self.embed_dim * 2, self.embed_dim),
            nn.ReLU(),
        ]
        self.lin1 = nn.Sequential(*refine_layers)
        # Fusion block: [attention context, LSTM output, previous output] -> embed_dim.
        fusion_layers = [
            nn.Dropout(dropout_p),
            nn.Linear(self.embed_dim * 3, self.embed_dim * 4),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(self.embed_dim * 4, self.embed_dim),
            nn.ReLU(),
        ]
        self.lin2 = nn.Sequential(*fusion_layers)
        self.classifier = nn.Linear(self.embed_dim, self.output_size)

    def forward(self, input, input_pos, input_cat, encoder_output, last_hc, last_hidden_state, global_embedding_layer, pos_embedding, cat_embedding):
        """Decode one step.

        Returns:
            ``(output, output_prob, (hn, cn))``: the fused hidden output, the
            vocabulary logits and the new LSTM hidden/cell states.
        """
        # input is (B, 1): a single token per sequence.
        token_vec = global_embedding_layer(input)
        pos_vec = pos_embedding(input_pos)
        cat_vec = cat_embedding(input_cat)
        embed = self.lin_embed(torch.cat([token_vec, pos_vec, cat_vec], dim=-1))
        # LSTM step from the previous (h, c) state.
        output, (hn, cn) = self.gru(embed, last_hc)
        # Residual feed-forward refinement.
        refined = self.lin1(output)
        output = refined + output
        # Attention scores: (B, 1, EMBED) x (B, EMBED, SEQ) -> (B, 1, SEQ).
        scores = torch.bmm(output, encoder_output.transpose(1, 2))
        attn_weights = F.softmax(scores, dim=-1)
        # Context: (B, 1, SEQ) x (B, SEQ, EMBED) -> (B, 1, EMBED).
        attn_applied = torch.bmm(attn_weights, encoder_output)
        # Fuse attention context, current output and previous step's output.
        fused_input = torch.cat([attn_applied, output, last_hidden_state], dim=-1)
        output = self.lin2(fused_input)
        # Unnormalised vocabulary scores (logits).
        output_prob = self.classifier(output)

        return output, output_prob, (hn, cn)

class Seq2SeqModel(nn.Module):
    """Container that owns the shared embeddings and wires them into the encoder and decoder.

    The token and category embeddings are shared by both blocks; the encoder
    and decoder each get their own position embedding.
    """

    def __init__(self, encoder_block, decoder_block, input_size, embed_dim, feature_max_len,  label_max_len, cat_size):
        super().__init__()
        pos_dim = embed_dim // 4
        cat_dim = embed_dim // 4 // 4
        self.token_embedding = nn.Embedding(input_size, embed_dim)
        self.encoder_pos_embedding = nn.Embedding(feature_max_len, pos_dim)
        self.decoder_pos_embedding = nn.Embedding(label_max_len, pos_dim)
        self.cat_embedding = nn.Embedding(cat_size, cat_dim)
        self.encoder_block = encoder_block
        self.decoder_block = decoder_block

    def get_encoder_output(self, input, input_pos, input_cat, last_hc):
        """Run the encoder block with the shared token/category and encoder position embeddings."""
        embeddings = (self.token_embedding, self.encoder_pos_embedding, self.cat_embedding)
        return self.encoder_block(input, input_pos, input_cat, last_hc, *embeddings)

    def get_decoder_output(self, input, input_pos, input_cat, encoder_output, last_hc, last_hidden_state):
        """Run one decoder step with the shared token/category and decoder position embeddings."""
        embeddings = (self.token_embedding, self.decoder_pos_embedding, self.cat_embedding)
        return self.decoder_block(input, input_pos, input_cat, encoder_output, last_hc, last_hidden_state, *embeddings)

In [None]:
%%time

# Seed first: the layers constructed below draw their initial weights from the seeded generators.
seed_everything(GLOBAL_SEED)

# Model size and decoding settings; also read as a global by the training/validation loops.
generating_params = {
    "embed_dim": 512,
    "num_layers": 2,
    "teacher_forcing_ratio": 0.5,
}

encoder = EncoderRNN(
    input_size=sp.get_piece_size(), embed_dim=generating_params["embed_dim"], max_len=feature_max_len, num_layers=generating_params["num_layers"],
)
decoder = DecoderRNN(
    input_size=sp.get_piece_size(), output_size=sp.get_piece_size(), embed_dim=generating_params["embed_dim"], max_len=label_max_len, num_layers=generating_params["num_layers"],
)
model = Seq2SeqModel(encoder, decoder, input_size=sp.get_piece_size(), embed_dim=generating_params["embed_dim"], feature_max_len=feature_max_len, label_max_len=label_max_len, cat_size=len(cat_lbe))
model.to(device)

# Training batches are shuffled and the last incomplete batch is dropped; validation keeps its order.
train_dl = DataLoader(TensorDataset(train_container["doc"], train_container["label"], train_container["cat"]), batch_size=CFG.batch_size, shuffle=True, drop_last=True)
print("number of iteration :", len(train_dl))
valid_dl = DataLoader(TensorDataset(valid_container["doc"], valid_container["label"], valid_container["cat"]), batch_size=CFG.batch_size, shuffle=False)

# training 
# NOTE: despite its name, best_score receives the metrics dictionary returned by do_training.
best_score = do_training(0, model)

## Inference

In [None]:
class TeamGPT_TextSummarizer():
    """Greedy-decoding summarizer around a trained seq2seq model.

    All state (model, tokenizer, model parameters) is taken from the
    constructor arguments rather than from notebook globals, so an instance
    works with whatever model and tokenizer it was given.

    Args:
        model: Model exposing ``get_encoder_output`` / ``get_decoder_output``;
            switched to eval mode on construction.
        tokenizer: SentencePiece processor used for encoding and decoding.
        generating_params: Dict providing ``"num_layers"`` and ``"embed_dim"``.
        max_len: Fixed token length the input document is padded/truncated to.
        token_length_limit: Upper bound on the number of generated tokens.
    """
    def __init__(self, model, tokenizer, generating_params, max_len, token_length_limit=128):
        self.model = model
        self.tokenizer = tokenizer
        self.generating_params = generating_params
        self.max_len = max_len
        self.token_length_limit =token_length_limit
        self.model.eval()

    def preprocessing(self, text):
        """Keep only Hangul syllables and spaces, then drop one-character words."""
        text = re.sub(r"[^가-힣 ]", "", text)
        text = " ".join([i for i in text.split() if len(i) > 1])
        return text

    def tokenizing(self, text, max_len):
        """Encode ``text`` as ``<s>`` + ids + ``</s>``, zero-padded to ``max_len`` ids."""
        bos_id = self.tokenizer.piece_to_id("<s>")
        eos_id = self.tokenizer.piece_to_id("</s>")
        token = torch.tensor([bos_id] + self.tokenizer.EncodeAsIds(text)[:(max_len-2)] + [eos_id], dtype=torch.int64)
        if len(token) < max_len:
            token = torch.cat([token, torch.zeros(max_len - len(token), dtype=torch.int64)])
        assert len(token) == max_len
        return token

    def _max_decode_steps(self):
        """Return the step budget, capped by the decoder position table when the model exposes one."""
        pos_table = getattr(self.model, "decoder_pos_embedding", None)
        n_positions = getattr(pos_table, "num_embeddings", None)
        if n_positions is None:
            return self.token_length_limit
        # Decoding past the last known position would index outside the embedding table.
        return min(self.token_length_limit, n_positions)

    def summarize(self, text, cat, device):
        """Generate a summary of ``text`` by greedy decoding.

        Args:
            text: Raw document text.
            cat: Integer category id of the document.
            device: Torch device to run on; the model is moved there.

        Returns:
            The decoded summary produced by the tokenizer's ``DecodeIds``.
        """
        text = self.preprocessing(text)
        text = self.tokenizing(text, self.max_len)
        text = text.view(1, -1).to(device)
        cat = torch.tensor([cat], dtype=torch.int64, device=device)
        self.model.to(device)

        num_layers = self.generating_params["num_layers"]
        embed_dim = self.generating_params["embed_dim"]
        bos_id = self.tokenizer.piece_to_id("<s>")
        eos_id = self.tokenizer.piece_to_id("</s>")
        batch_size = len(text)

        output_cls = []
        with torch.no_grad():
            encoder_pos = torch.tile(torch.arange(text.shape[1], dtype=torch.int64).view(1, -1), (text.shape[0], 1)).to(device)
            # Initial LSTM (hidden, cell) state: a small constant instead of exact zeros.
            last_hc = (
                torch.full((num_layers, batch_size, embed_dim), 1e-7, device=device),
                torch.full((num_layers, batch_size, embed_dim), 1e-7, device=device),
            )
            encoder_output, last_hc = self.model.get_encoder_output(
                input=text, input_pos=encoder_pos, input_cat=torch.tile(cat.view(-1, 1), (1, text.shape[-1])), last_hc=last_hc,
            )
            decoder_output = torch.full((batch_size, 1, embed_dim), 1e-7, device=device)
            # Start from <s>, the same first decoder input used during training
            # (the first token of every tokenized label).
            decoder_input = torch.full((batch_size, 1), bos_id, dtype=torch.int64, device=device)
            for i in range(self._max_decode_steps()):
                decoder_pos = torch.full((batch_size, 1), i, dtype=torch.int64, device=device)
                decoder_output, decoder_output_prob, last_hc = self.model.get_decoder_output(
                    input=decoder_input, input_pos=decoder_pos, input_cat=cat.view(-1, 1),
                    encoder_output=encoder_output, last_hc=last_hc, last_hidden_state=decoder_output
                )
                decoder_output_prob = decoder_output_prob.squeeze(dim=1)
                # Greedy choice: argmax over the logits equals argmax over their softmax.
                decoder_output_cls = decoder_output_prob.argmax(dim=-1)
                pred_token = decoder_output_cls.squeeze().item()
                if pred_token == eos_id:
                    break
                decoder_input = decoder_output_cls.view(-1, 1)
                output_cls.append(pred_token)
        return self.tokenizer.DecodeIds(output_cls)

In [None]:
# Reload the raw (unpreprocessed) training data for the inference demo below.
df_merge = []
for catPath in glob.glob(r".\022.요약문 및 레포트 생성 데이터\01.데이터\1.Training\라벨링데이터\TL1\*"):
    df = []
    for fpath in glob.glob(catPath + "./2~3sent/*"):
        with open(fpath, encoding="utf8") as f:
            data = json.load(f)
            passage = data['Meta(Refine)']["passage"]
            # Prefer "summary2"; fall back to "summary1" when it is null.
            summary = data['Annotation']["summary2"]
            if summary is None:
                summary = data['Annotation']["summary1"]
            # The category is the text after the last "." of the folder path.
            df.append({"doc": passage, "label": summary, "cat": catPath.split(".")[-1]})
    df_merge.append(pd.DataFrame(df))
df_train = pd.concat(df_merge).reset_index(drop=True)

In [None]:
# Wrap the trained model for inference and draw ten reproducible sample rows from the raw training data.
summarizer = TeamGPT_TextSummarizer(model, sp, generating_params=generating_params, max_len=feature_max_len, token_length_limit=128)
sample_features = df_train.sample(10, random_state=GLOBAL_SEED)

In [None]:
# Print the reference summary next to the generated one for every sample row.
for idx, value in sample_features.iterrows():
    print("original ->", value["label"])
    # Condition the model on the row's own category id (looked up in the
    # label encoder) instead of always passing category 0.
    cat_id = int(cat_lbe[value["cat"]])
    print("summarized ->", summarizer.summarize(value["doc"], cat_id, device=torch.device("cpu")))
    print("\n")

In [None]:
# Persist the model hyper-parameters to ./model_params.pkl.
pickleIO(generating_params, "./model_params.pkl", "w")