# import

In [None]:
# seeds
import random

# os
import glob, os, sys

import warnings
warnings.filterwarnings('ignore')

#################################################################
# data manipulation
import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_info_columns', 300)
pd.options.display.float_format = '{:,.2f}'.format

import numpy as np
import json

# print
from pprint import pprint

# visualization
import seaborn as sns
sns.set_style("white")

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
mpl.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'NanumGothic'
mpl.rcParams['axes.titlesize'] = 20
mpl.rcParams['axes.labelsize'] = 15
mpl.rcParams['xtick.labelsize'] = 15
mpl.rcParams['ytick.labelsize'] = 15

#################################################################
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score

# torch
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

# transformers
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertModel, Trainer, TrainingArguments

# others
from itertools import chain
import time
from datetime import timedelta, datetime
import copy
from tqdm import tqdm, trange

In [None]:
# Load the sentence-level train/test tables. The train label column 'info'
# is renamed to 'info_' right after loading.
test_df = pd.read_csv('./data/news_test.csv')
train_df = pd.read_csv('./data/news_train.csv').rename(columns={'info': 'info_'})

In [None]:
# Print the shape of the training table and show its first three rows.
print(train_df.shape)
train_df.head(3)

In [None]:
# Print the shape of the test table and show its first three rows.
print(test_df.shape)
test_df.head(3)

# EDA

In [None]:
# Hugging Face model identifier; passed to AutoTokenizer / AutoConfig / AutoModel below.
MODEL = "monologg/koelectra-base-v3-discriminator"

In [None]:
# Load the tokenizer for MODEL and print the ids of its special tokens,
# followed by the tokenizer length.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
vocab = tokenizer.get_vocab()
for special in ("[CLS]", "[SEP]", "[PAD]", "[UNK]"):
    print(f'{special} : {vocab[special]}')

print(len(tokenizer))

In [None]:
# Register '[EOP]' as an additional special token, then print the tokenizer
# length again so the change in size is visible.
tokenizer.add_special_tokens({'additional_special_tokens' : ['[EOP]']})

print(len(tokenizer))

# preprocess

## 뉴스단위 집계, 라벨

In [None]:
def sent_to_news(df):
    """Collapse the sentence rows of one news item into a single record.

    Args:
        df: rows belonging to one news item; must expose ``title`` and
            ``content`` columns.

    Returns:
        A one-element ``pd.Series`` whose ``'news'`` entry is the list
        ``[title_of_first_row, content_0, content_1, ...]``.
    """
    sentences = [df.title.iloc[0], *df.content]
    return pd.Series({'news': sentences})

# Apply sent_to_news to each group of rows sharing the same n_id.
test_news = test_df.groupby('n_id').apply(sent_to_news)

## <font color=red>(토큰 제한하고) ids로</font>

In [None]:
def news_to_ids(l, max_sent_tokens=20):
    """Flatten one news item (a list of sentences) into a single list of token ids.

    Layout of the result: ``[CLS] s1 [EOP] s2 [EOP] ... sN [SEP]`` where each
    ``s_i`` is the encoding of one sentence cut to ``max_sent_tokens`` ids.

    Args:
        l: list of sentence strings (title first, then the body sentences).
        max_sent_tokens: maximum number of ids kept per sentence (default 20).

    Returns:
        list[int] of token ids.

    NOTE(review): ``tokenizer.encode`` is called with its defaults; per the
    transformers documentation this adds the tokenizer's own special tokens
    to every sentence before the cut. That behaviour is kept unchanged here.
    """
    # Look the special ids up instead of hard-coding 2 / 3 / 35000, so the
    # function stays correct if the vocabulary changes.
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    eop_id = tokenizer.convert_tokens_to_ids('[EOP]')

    res = [cls_id]
    for sent in l:
        res += tokenizer.encode(sent)[:max_sent_tokens]
        res.append(eop_id)  # mark the end of every sentence with [EOP]
    if len(l) > 0:
        # The final [EOP] is replaced by [SEP] to mark the end of the news item.
        # Guarded so an empty sentence list does not pop the leading [CLS].
        res.pop()
    res.append(sep_id)

    return res

test_news['tokens_ids'] = test_news.news.apply(news_to_ids)

In [None]:
# The news item with the most tokens has 6741 of them...
test_news.tokens_ids.apply(lambda x : len(x)).max()

## segment_ids

In [None]:
# Ids of the two tokens that close a segment.
EOP_ID = tokenizer.get_vocab()["[EOP]"]
SEP_ID = tokenizer.get_vocab()["[SEP]"]

def ids_to_segments_ids(ids):
    """Build alternating 0/1 segment ids for a token-id sequence.

    Every span that ends on an [EOP] or [SEP] token (the boundary token
    included) forms one segment; segments are labelled 0, 1, 0, 1, ...
    Tokens after the last boundary get no label, so the result can be
    shorter than ``ids`` when the sequence does not end on a boundary.
    """
    segments_ids = []
    last_boundary = -1
    segment_no = 0
    for pos, token_id in enumerate(ids):
        if token_id in (EOP_ID, SEP_ID):
            span = pos - last_boundary
            segments_ids.extend([segment_no % 2] * span)
            last_boundary = pos
            segment_no += 1
    return segments_ids

test_news['segments_ids'] = test_news.tokens_ids.apply(ids_to_segments_ids)

## masks

In [None]:
# Attention mask: one 1 per token id (no padding is added at this stage).
test_news['masks'] = test_news.tokens_ids.apply(lambda x : [1] * len(x))

## eop 인덱스

In [None]:
# NOTE: despite its name, CLS_ID holds the id of the "[EOP]" token.
CLS_ID = tokenizer.get_vocab()["[EOP]"]

def ids_to_cls_idxs(l):
    """Return the positions in ``l`` whose token id equals CLS_ID."""
    positions = []
    for pos, token_id in enumerate(l):
        if token_id == CLS_ID:
            positions.append(pos)
    return positions

test_news['cls_idxs'] = test_news.tokens_ids.apply(ids_to_cls_idxs)

In [None]:
# Distribution of the number of marker positions per news item.
test_news.cls_idxs.apply(lambda x : len(x)).describe() # some items have only one [EOP]

In [None]:
# Sanity check: per-item difference between token-id length and mask length.
(test_news.tokens_ids.apply(lambda x : len(x)) - test_news.masks.apply(lambda x : len(x))).describe()

# divide news

## for electra+rnn

In [None]:
# Shape of the news-level table.
test_news.shape

In [None]:
# News items with at most 512 token ids (the subset used for electra+rnn).
sub_test_news = test_news[test_news.tokens_ids.apply(len) <= 512].copy()

In [None]:
# Shape of the <= 512-token subset.
print(sub_test_news.shape)

In [None]:
# Token-count distribution within the <= 512-token subset.
sub_test_news.tokens_ids.apply(lambda x : len(x)).describe()

## for electra+nsp

In [None]:
# Peek at the sentence-level test table.
test_df.head()

In [None]:
# Index values (n_id) of the news items with more than 512 token ids.
n_ids = test_news[test_news.tokens_ids.apply(len) > 512].index

In [None]:
# Sentence-level rows of test_df whose n_id is in n_ids, re-indexed from 0.
sub_test_news_2 = test_df[test_df.n_id.isin(n_ids)].reset_index(drop=True)

In [None]:
# Shape of the sentence-level subset.
sub_test_news_2.shape

# infer electra_rnn

## helper function

In [None]:
# Helper used to decide how often to print progress inside a loop
def good_update_interval(total_iters, num_desired_updates):
    """Return a rounded interval (in iterations) between progress updates.

    The raw interval ``total_iters / num_desired_updates`` is rounded to a
    power of ten one order below the magnitude of ``total_iters``; the
    result is never 0.
    """
    # Number of digits to round away: (digit count - 1) - 1.
    digits_to_round = len(str(total_iters)) - 2

    raw_interval = total_iters / num_desired_updates
    rounded = int(round(raw_interval, -digits_to_round))

    # An interval of 0 would break the modulo check in callers; use 1 instead.
    return rounded or 1

# 초->시 변환
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns it as a string ``h:mm:ss``
    (``N day(s), h:mm:ss`` for durations of a day or more).

    The value is rounded to the nearest whole second first.
    '''
    # Imported locally on purpose: in this file the module-level name
    # `datetime` is bound to the class by one import and to the module by
    # another, so which one is visible depends on cell execution order.
    from datetime import timedelta

    elapsed_rounded = int(round(elapsed))
    return str(timedelta(seconds=elapsed_rounded))

## model

In [None]:
from transformers import AutoConfig

class CustomBERTModel(nn.Module):
    """Thin wrapper around the pretrained encoder named by MODEL.

    ``forward`` returns only the first element of the encoder output.
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(MODEL)
        self.bert = AutoModel.from_pretrained(MODEL, config=self.config)  # TODO: needs revision
        # Resize the embedding table to 35001 entries
        # (presumably the base vocabulary plus the added [EOP] token - confirm).
        self.bert.resize_token_embeddings(35001)

    def forward(self, ids, segs, mask):
        encoder_outputs = self.bert(
            input_ids=ids,
            token_type_ids=segs,
            attention_mask=mask,
        )
        # First output element: (batch_size, sequence_length, 768) per the original note.
        return encoder_outputs[0]

In [None]:
import torch.nn.functional as F

class LayerNormLSTMCell(nn.LSTMCell):
    """LSTM cell with layer normalisation on the gate pre-activations and on
    the cell state that feeds the output.

    The gate block is split as (input, forget, output, candidate): the first
    ``3 * hidden_size`` columns go through a sigmoid, the rest through tanh.
    """

    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__(input_size, hidden_size, bias)

        self.ln_ih = nn.LayerNorm(4 * hidden_size)
        self.ln_hh = nn.LayerNorm(4 * hidden_size)
        self.ln_ho = nn.LayerNorm(hidden_size)

    def _check_input(self, input):
        """Validate that ``input`` is (batch, input_size)."""
        if input.dim() != 2:
            raise RuntimeError(
                "input must be 2-D (batch, input_size): got {} dims".format(input.dim()))
        if input.size(1) != self.input_size:
            raise RuntimeError(
                "input has inconsistent input_size: got {}, expected {}".format(
                    input.size(1), self.input_size))

    def _check_hidden(self, input, hx, hidden_label=''):
        """Validate that a state tensor is (batch, hidden_size) and matches ``input``."""
        if input.size(0) != hx.size(0):
            raise RuntimeError(
                "Input batch size {} doesn't match hidden{} batch size {}".format(
                    input.size(0), hidden_label, hx.size(0)))
        if hx.size(1) != self.hidden_size:
            raise RuntimeError(
                "hidden{} has inconsistent hidden_size: got {}, expected {}".format(
                    hidden_label, hx.size(1), self.hidden_size))

    def forward(self, input, hidden=None):
        """Run one step.

        Args:
            input: tensor of shape (batch, input_size).
            hidden: optional ``(hx, cx)`` pair, each (batch, hidden_size);
                zeros are used when omitted.

        Returns:
            ``(hy, cy)``: the new hidden and cell states, each (batch, hidden_size).

        Raises:
            RuntimeError: if the input or state shapes are inconsistent.
        """
        # Shape validation is done locally rather than through base-class
        # `check_forward_*` helpers, so it does not depend on them existing
        # in the installed torch version.
        self._check_input(input)
        if hidden is None:
            hx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False)
            cx = input.new_zeros(input.size(0), self.hidden_size, requires_grad=False)
        else:
            hx, cx = hidden
        self._check_hidden(input, hx, '[0]')
        self._check_hidden(input, cx, '[1]')

        gates = self.ln_ih(F.linear(input, self.weight_ih, self.bias_ih)) \
                + self.ln_hh(F.linear(hx, self.weight_hh, self.bias_hh))
        i, f, o = gates[:, :(3 * self.hidden_size)].sigmoid().chunk(3, 1)
        g = gates[:, (3 * self.hidden_size):].tanh()

        cy = (f * cx) + (i * g)
        hy = o * self.ln_ho(cy).tanh()
        return hy, cy

In [None]:
class LayerNormLSTM(nn.Module):
    """Multi-layer, optionally bidirectional LSTM built from LayerNormLSTMCell.

    Inputs are time-major: ``(seq_len, batch, input_size)``.

    NOTE(review): the per-timestep state tables used to be built with
    ``[[None] * k] * seq_len``, which makes every timestep share ONE inner
    list, so each write overwrote all timesteps and every row of the output
    was identical. The tables are now built per timestep. Checkpoints that
    were trained with the aliased version should be re-validated.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, bidirectional=False):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        num_directions = 2 if bidirectional else 1
        # Forward-direction cells; layers above the first consume the
        # (possibly concatenated) output of the layer below.
        self.hidden0 = nn.ModuleList([
            LayerNormLSTMCell(input_size=(input_size if layer == 0 else hidden_size * num_directions),
                              hidden_size=hidden_size, bias=bias)
            for layer in range(num_layers)
        ])

        if self.bidirectional:
            # Backward-direction cells, one per layer.
            self.hidden1 = nn.ModuleList([
                LayerNormLSTMCell(input_size=(input_size if layer == 0 else hidden_size * num_directions),
                                  hidden_size=hidden_size, bias=bias)
                for layer in range(num_layers)
            ])

    def forward(self, input, hidden=None):
        """Run the LSTM over a whole sequence.

        Args:
            input: tensor of shape (seq_len, batch, input_size); time-major only.
            hidden: optional ``(hx, cx)`` pair, each of shape
                (num_layers * num_directions, batch, hidden_size). Zeros are
                used when omitted.

        Returns:
            ``(y, (hy, cy))`` where ``y`` is the top-layer output per
            timestep, (seq_len, batch, num_directions * hidden_size), and
            ``hy`` / ``cy`` stack the final hidden / cell state of every
            layer and direction.
        """
        seq_len, batch_size, _ = input.size()  # supports TxNxH only
        num_directions = 2 if self.bidirectional else 1
        num_states = self.num_layers * num_directions
        if hidden is None:
            hx = input.new_zeros(num_states, batch_size, self.hidden_size, requires_grad=False)
            cx = input.new_zeros(num_states, batch_size, self.hidden_size, requires_grad=False)
        else:
            hx, cx = hidden

        # One independent inner list per timestep (no aliasing).
        ht = [[None] * num_states for _ in range(seq_len)]
        ct = [[None] * num_states for _ in range(seq_len)]

        if self.bidirectional:
            xs = input
            final_h, final_c = [], []
            for l, (layer0, layer1) in enumerate(zip(self.hidden0, self.hidden1)):
                l0, l1 = 2 * l, 2 * l + 1
                h0, c0, h1, c1 = hx[l0], cx[l0], hx[l1], cx[l1]
                for t in range(seq_len):
                    # Forward direction consumes timestep t ...
                    h0, c0 = layer0(xs[t], (h0, c0))
                    ht[t][l0], ct[t][l0] = h0, c0
                    # ... while the backward direction consumes the mirrored one.
                    rt = seq_len - 1 - t
                    h1, c1 = layer1(xs[rt], (h1, c1))
                    ht[rt][l1], ct[rt][l1] = h1, c1
                # The last state each direction produced is its final state
                # (for the backward direction that is the one stored at t=0).
                final_h.extend([h0, h1])
                final_c.extend([c0, c1])
                xs = [torch.cat((h[l0], h[l1]), dim=1) for h in ht]
            y = torch.stack(xs)
            hy = torch.stack(final_h)
            cy = torch.stack(final_c)
        else:
            h, c = hx, cx
            for t, x in enumerate(input):
                for l, layer in enumerate(self.hidden0):
                    ht[t][l], ct[t][l] = layer(x, (h[l], c[l]))
                    x = ht[t][l]  # output of this layer feeds the next one
                h, c = ht[t], ct[t]
            y = torch.stack([h[-1] for h in ht])
            hy = torch.stack(ht[-1])
            cy = torch.stack(ct[-1])

        return y, (hy, cy)

In [None]:
class RNNEncoder(nn.Module):
    """Scores each position of a sequence with a LayerNormLSTM and a linear head.

    ``hidden_size`` is the total width across directions; it is divided by
    the number of directions to obtain the per-direction LSTM size.
    """

    def __init__(self, bidirectional, num_layers, input_size,
                 hidden_size, dropout=0.0):
        super().__init__()
        num_directions = 2 if bidirectional else 1
        assert hidden_size % num_directions == 0
        per_direction_size = hidden_size // num_directions

        self.rnn = LayerNormLSTM(
            input_size=input_size,
            hidden_size=per_direction_size,
            num_layers=num_layers,
            bidirectional=bidirectional)

        self.wo = nn.Linear(num_directions * per_direction_size, 1, bias=True)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid score per position of ``x``.

        ``x`` is transposed on its first two axes before the LSTM and back
        afterwards. The LSTM output (after dropout) is added to the
        transposed input, so the two must have the same width.
        """
        time_major = x.transpose(0, 1)
        rnn_out, _ = self.rnn(time_major)
        with_residual = self.dropout(rnn_out) + time_major
        batch_major = with_residual.transpose(0, 1)

        return self.sigmoid(self.wo(batch_major)).squeeze(-1)

In [None]:
from torch.nn.init import xavier_uniform_

class Summarizer(nn.Module):
    """Encoder (CustomBERTModel) followed by an RNNEncoder that scores the
    hidden states gathered at the positions given by ``clss``."""

    def __init__(self):
        super().__init__()
        self.bert = CustomBERTModel()
        self.encoder = RNNEncoder(
            bidirectional=True,
            num_layers=1,
            input_size=self.bert.config.hidden_size,
            hidden_size=768,  # args.rnn_size
            dropout=0.1,  # args.dropout
        )
        # Xavier-initialise only the parameters with more than one dimension.
        for weight in self.encoder.parameters():
            if weight.dim() <= 1:
                continue
            xavier_uniform_(weight)

    def forward(self, x, segs, mask, clss):
        """Return the encoder scores for the positions listed in ``clss``.

        ``clss.squeeze(0)`` drops a leading size-1 dimension and the same
        positions are then used for every row of the batch; the callers in
        this file pass one example at a time.
        """
        token_states = self.bert(x, segs, mask)  # (b, len, hidden)
        positions = clss.squeeze(0)
        picked_states = token_states[:, positions, :]  # [b, eop len, hidden]
        return self.encoder(picked_states)  # [b, eop len]

In [None]:
# Instantiate the electra+rnn model.
model = Summarizer()

In [None]:
# load_model:
model_name = '1231_06:17_20_0.9987.pth'

# Load onto the CPU first so the checkpoint can be read regardless of the
# device it was saved from; the model is moved to its target device later.
checkpoint_path = os.path.join('./models/morenews_electra_rnn_without_cleaning', model_name)
state = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(state['model'])

print('model loaded')

In [None]:
import torch

print('\nLoading model to GPU...')
device = torch.device('cuda:3')
# Query the name of the device actually used; without an argument the
# current default device would be reported instead.
print('  GPU:', torch.cuda.get_device_name(device))
desc = model.to(device)

print('    DONE.')

## infer

In [None]:
# Peek at the rows that will be fed to the electra+rnn model.
sub_test_news.head()

In [None]:
# One tensor per news item, each with a leading dimension of size 1 added.
pt_inputs_t = [torch.tensor(row).unsqueeze(0) for row in sub_test_news.tokens_ids]
pt_token_type_ids_t = [torch.tensor(row).unsqueeze(0) for row in sub_test_news.segments_ids]
pt_clss_t = [torch.tensor(row).unsqueeze(0) for row in sub_test_news.cls_idxs]
pt_masks_t = [torch.tensor(row).unsqueeze(0) for row in sub_test_news.masks]

In [None]:
# Inference over the <= 512-token news items, one item per forward pass.
model.eval()

# Per-item score arrays, in the order of pt_inputs_t.
predictions = []

# How often (in steps) to print a progress line.
update_interval_eval = good_update_interval(total_iters=len(pt_inputs_t),
                                       num_desired_updates=10)

# Start of the timing window.
t0 = time.time()

n_items = len(pt_inputs_t)
item_tensors = zip(pt_inputs_t, pt_token_type_ids_t, pt_masks_t, pt_clss_t)
for step, (inputs_cpu, type_ids_cpu, masks_cpu, clss_cpu) in enumerate(item_tensors):

    # Periodic progress report (skipped on the very first step).
    if step != 0 and step % update_interval_eval == 0:
        elapsed = format_time(time.time() - t0)

        # Average seconds spent per step so far, extrapolated to the rest.
        sec_per_step = (time.time() - t0) / step
        remaining = format_time(sec_per_step * (n_items - step))

        print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, n_items, elapsed, remaining))

    # Move this item's tensors to the target device.
    b_inputs = inputs_cpu.to(device)
    b_token_type_ids = type_ids_cpu.to(device)
    b_masks = masks_cpu.to(device)
    b_clss = clss_cpu.to(device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(b_inputs, b_token_type_ids, b_masks, b_clss)

    # Drop the leading size-1 dimension and keep the scores as a NumPy array.
    logits = outputs.squeeze(0).detach().cpu().numpy()
    predictions.append(logits)

# All scores joined into one flat array.
predictions_flatten = np.concatenate(predictions, axis=0)

In [None]:
# Flatten the per-item scores into (sentence id, label) pairs.
# Sentence ids are '<n_id>_<k>' with k starting at 1.
id_ = []
info_ = []
for n_id, n_predictions in zip(sub_test_news.index, predictions) :
    # Round the scores to the nearest label and store them as plain ints so
    # the 'info' column is integer-typed rather than float.
    n_preds = np.round(n_predictions).astype(int)
    for i, pred in enumerate(n_preds) :
        # f-string so a non-string n_id cannot raise on concatenation.
        id_.append(f'{n_id}_{i+1}')
        info_.append(int(pred))

In [None]:
# Sentence-level predictions of the electra+rnn model.
electra_infer = pd.DataFrame({'id' : id_, 'info' : info_})

# infer electra+nsp

## model

In [None]:
# Build the sequence-classification model (2 labels) from the pretrained
# KoELECTRA discriminator. This rebinds the name `model`.
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification

# Config object with the output configured for two-way classification.
config = AutoConfig.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=2)

print('Config type:', str(type(config)), '\n')

# Pretrained weights plus the classification head described by `config`.
model = AutoModelForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator",
    config=config)

print('\nModel type:', str(type(model)))

In [None]:
# Maximum token length per encoded (title, content) pair
max_len = 300

# Batch size
batch_size = 16

In [None]:
# load_model:
model_name = '1211_18_26_4_0.9977.pth'

# Load onto the CPU first so the checkpoint can be read regardless of the
# device it was saved from; the model is moved to its target device later.
checkpoint_path = os.path.join('./models/koelectra', model_name)
state = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(state['model'])

print('model loaded')

In [None]:
import torch

print('\nLoading model to GPU...')
device = torch.device('cuda:3')
# Query the name of the device actually used; without an argument the
# current default device would be reported instead.
print('  GPU:', torch.cuda.get_device_name(device))
desc = model.to(device)

print('    DONE.')

## infer

In [None]:
# Batch-building function
def make_smart_batches_pred(title_samples, content_samples, batch_size):
    """Tokenize (title, content) pairs and group them into padded batches.

    Samples are NOT sorted by length: batches are cut from the inputs in
    their original order so predictions line up with the input rows.

    Args:
        title_samples: sequence of title strings.
        content_samples: sequence of content strings, paired with the titles.
        batch_size: number of samples per batch; must be at least 1.

    Returns:
        ``(py_inputs, py_attn_masks)``: two lists with one tensor per batch.
        Each sequence in a batch is padded with the tokenizer's pad id up to
        the longest sequence of that batch; the mask is 1 for real tokens
        and 0 for padding.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive batch size can never consume the sample list.
        raise ValueError('batch_size must be >= 1, got {}'.format(batch_size))

    print('Creating Smart Batches from {:,} examples with batch size {:,}...\n'.format(len(title_samples), batch_size))

    # ==============================
    #    Tokenize & truncate (no padding)
    # ==============================

    print('Tokenizing {:,} samples...'.format(len(title_samples)))

    update_interval = good_update_interval(total_iters=len(title_samples), num_desired_updates=10)

    full_input_ids = []
    for title, content in zip(title_samples, content_samples):

        if len(full_input_ids) % update_interval == 0:
            print('  Tokenized {:,} samples.'.format(len(full_input_ids)))

        # Encode the pair, truncated to max_len, without padding.
        input_ids = tokenizer.encode(title, content,              # Text to encode.
                                    add_special_tokens=True, # Do add specials.
                                    max_length=max_len,      # Do Truncate!
                                    truncation=True,         # Do Truncate!
                                    padding=False)           # DO NOT pad.
        full_input_ids.append(input_ids)

    print('DONE.')
    print('{:>10,} samples\n'.format(len(full_input_ids)))

    # =========================
    #      Select Batches
    # =========================

    # Input order is kept (no sorting) so predictions match the test rows.
    print('{:>10,} samples without sorting for prediction\n'.format(len(full_input_ids)))

    print('Creating batches of size {:}...'.format(batch_size))

    update_interval = good_update_interval(total_iters=len(full_input_ids), num_desired_updates=10)

    # Slice consecutive windows instead of repeatedly deleting from the
    # front of a list, which copies the remainder on every batch.
    batch_ordered_sentences = []
    for start in range(0, len(full_input_ids), batch_size):

        if batch_ordered_sentences and len(batch_ordered_sentences) % update_interval == 0:
            print('  Selected {:,} batches.'.format(len(batch_ordered_sentences)))

        batch_ordered_sentences.append(full_input_ids[start:start + batch_size])

    print('\n  DONE - Selected {:,} batches.\n'.format(len(batch_ordered_sentences)))

    # =========================
    #        Add Padding
    # =========================

    print('Padding out sequences within each batch...')

    py_inputs = []
    py_attn_masks = []
    pad_id = tokenizer.pad_token_id

    for batch_inputs in batch_ordered_sentences:

        # Longest sequence in this batch decides the padded width.
        max_size = max(len(sen) for sen in batch_inputs)

        batch_padded_inputs = []
        batch_attn_masks = []
        for sen in batch_inputs:
            num_pads = max_size - len(sen)
            batch_padded_inputs.append(sen + [pad_id] * num_pads)
            batch_attn_masks.append([1] * len(sen) + [0] * num_pads)

        py_inputs.append(torch.tensor(batch_padded_inputs))
        py_attn_masks.append(torch.tensor(batch_attn_masks))

    print('  DONE.')

    # Final model inputs
    return (py_inputs, py_attn_masks)

In [None]:
# Peek at the first sentence-level row used for the electra+nsp model.
sub_test_news_2.head(1)

In [None]:
# Title and content columns as arrays, paired row by row.
test_title = sub_test_news_2['title'].values
test_content = sub_test_news_2['content'].values

In [None]:
# Build the test-set batches
(py_inputs, py_attn_masks) = make_smart_batches_pred(test_title, test_content, batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(test_title)))

# Per-batch logit arrays, in batch order.
predictions = []

# How often (in batches) to print a progress line.
update_interval = good_update_interval(total_iters=len(py_inputs), num_desired_updates=10)

# Start of the timing window.
t0 = time.time()

# Prediction mode.
model.eval()

n_batches = len(py_inputs)
for step, (batch_ids, batch_mask) in enumerate(zip(py_inputs, py_attn_masks)):

    # Periodic progress report (skipped on the very first batch).
    if step != 0 and step % update_interval == 0:
        elapsed = format_time(time.time() - t0)

        # Average seconds spent per batch so far, extrapolated to the rest.
        sec_per_step = (time.time() - t0) / step
        remaining = format_time(sec_per_step * (n_batches - step))

        print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, n_batches, elapsed, remaining))

    # Move the batch to the target device.
    b_input_ids = batch_ids.to(device)
    b_input_mask = batch_mask.to(device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # First output element holds the logits; keep them as a NumPy array.
    logits = outputs[0].detach().cpu().numpy()
    predictions.append(logits)

print('    DONE.')

In [None]:
# Combine the results across the batches (rebinds `predictions` from a list
# of per-batch arrays to one array).
predictions = np.concatenate(predictions, axis=0)

# Choose the label with the highest score as our prediction.
preds = np.argmax(predictions, axis=1).flatten()

In [None]:
# How many sentences were assigned to each label.
pd.Series(preds).value_counts()

In [None]:
# Attach the predicted labels and keep only the id and info columns.
sub_test_news_2['info'] = preds
electra_infer_2 = sub_test_news_2[['id', 'info']]

In [None]:
# Display the electra+nsp predictions.
electra_infer_2

In [None]:
# Stack the predictions of both models and show the combined shape.
infer = pd.concat([electra_infer, electra_infer_2])
infer.shape

In [None]:
# Write the predictions in the row order of test_df. `.loc` raises KeyError
# if any id from test_df is missing from `infer`.
# Create the output directory first so the write cannot fail on a missing folder.
os.makedirs('./submission', exist_ok=True)
infer.set_index('id').loc[test_df['id']].reset_index().to_csv('./submission/cut_token_no_cleaned_data.csv', index = False)