In [None]:
import glob
import os, sys
import random

import warnings
warnings.filterwarnings('ignore')

# data manipulation
import pandas as pd
# pd.set_option('display.max_columns', 200)
# pd.set_option('display.max_rows', 100)
# pd.set_option('display.max_info_columns', 250)

import numpy as np

# visualization
import seaborn as sns

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Global matplotlib defaults for every figure in this notebook: the NanumGothic
# font family, an ASCII hyphen for negative tick labels, and size-15 text for
# titles, axis labels and tick labels.
mpl.rcParams.update({
    'axes.unicode_minus': False,
    'font.family': 'NanumGothic',
    'axes.titlesize': 15,
    'axes.labelsize': 15,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
})

# others
from itertools import chain
import time
import datetime
from datetime import timedelta, datetime
import copy
from tqdm import tqdm, trange

# others2
import torch
import torch.nn as nn

from torch.nn import CrossEntropyLoss, MSELoss
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertModel, Trainer, TrainingArguments

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score

In [None]:
# Read the labelled training split and the unlabelled test split from ./data.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/news_train.csv', './data/news_test.csv')
)
train.head(3)

In [None]:
# Preview the first three rows of the test frame.
test.head(3)

In [None]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train.shape, test.shape, sep='\n')

## Inspect Training samples

In [None]:
import textwrap
import random

# Long article bodies are wrapped at 80 characters for readability.
wrapper = textwrap.TextWrapper(width=80)

# Print four randomly drawn training rows: label, `ord` value, then the
# title and content joined by ' | '.
for i in range(4):

    # Draw a row number in [0, len(train)); it is used as an index label below.
    j = random.choice(range(len(train)))

    label_value = train['info'][j]
    ord_value = train['ord'][j]

    print('==== Label: {:} ===='.format(label_value))
    print('==== Ord: {:} ===='.format(ord_value))

    title_and_content = [train['title'][j], train['content'][j]]
    print(wrapper.fill(' | '.join(title_and_content)))
    print('')

## Preprocessing and Cleaning

In [None]:
import re
# import emoji
from soynlp.normalizer import repeat_normalize

# emojis = ''.join(emoji.UNICODE_EMOJI.keys())
# pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
# Matches http:// and https:// URLs: an optional "www.", a host of 1-256
# allowed characters, a dot plus a 1-6 character suffix, then any trailing
# path / query characters. Used by `clean` to strip links from the text.
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x):
    """Normalize one raw text field (title or content).

    Steps, in order:
      1. remove URLs matched by the module-level `url_pattern`;
      2. decode a fixed set of HTML entities to their literal characters;
      3. replace the literal token '000님' with '익명';
      4. strip leading/trailing whitespace;
      5. collapse repeated characters with soynlp's `repeat_normalize`
         (num_repeats=2).

    Args:
        x: the text to clean (a `str`).

    Returns:
        The cleaned string.
    """
    x = url_pattern.sub('', x)

    # `str.replace` returns a NEW string, so each result has to be assigned
    # back; calling it as a bare statement leaves `x` unchanged.
    # '&amp;' is decoded last so that an escaped entity such as '&amp;lt;'
    # becomes the literal text '&lt;' instead of being unescaped twice to '<'.
    replacements = (
        ('&nbsp;', ' '),
        ('&quot;', '"'),
        ('&ldquo;', '"'),
        ('&rdquo;', '"'),
        ('&#039;', "'"),
        ('&#39;', "'"),
        ('&#035;', '#'),
        ('&#35;', '#'),
        ('&apos;', "'"),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&amp;', '&'),
        ('000님', '익명'),
    )
    for old, new in replacements:
        x = x.replace(old, new)

    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

In [None]:
# Run the same cleaning over both text columns of both splits.
for text_column in ('title', 'content'):
    train[text_column] = train[text_column].apply(clean)
    test[text_column] = test[text_column].apply(clean)

In [None]:
# One seed for every random number generator used in this notebook (Python,
# NumPy, PyTorch CPU and all CUDA devices) so that runs are repeatable.
seed_val = 42

for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

## Train/Dev Split for Validation

In [None]:
# Pull the text columns (and the `info` label column of the training frame)
# out as plain arrays for splitting and batching.
train_title, train_content, train_labels = (
    train[column].values for column in ('title', 'content', 'info')
)

test_title, test_content = (
    test[column].values for column in ('title', 'content')
)

In [None]:
# Hold out a stratified 10% of the LABELLED data for validation.
# Note: the X_test_* / y_test names refer to this held-out part of `train`,
# not to the unlabelled `test` frame.
(X_train_title, X_test_title,
 X_train_content, X_test_content,
 y_train, y_test) = train_test_split(
    train_title,
    train_content,
    train_labels,
    test_size=.1,
    random_state=seed_val,
    stratify=train_labels,
)

# Check the sizes of both parts.
print(len(X_train_title), len(X_test_title))
print(len(y_train), len(y_test))

### Helper function

In [None]:
# Helper for choosing how often to print progress inside a loop.
def good_update_interval(total_iters, num_desired_updates):
    """Return a round-number interval between progress messages.

    The exact spacing `total_iters / num_desired_updates` is rounded to a
    precision derived from the number of digits in `total_iters`, truncated
    to an int, and replaced by 1 if the result is 0.
    """
    digit_count = len(str(total_iters))

    # Precision handed to round(): 2 - digits. It is negative for numbers with
    # three or more digits (rounds to tens, hundreds, ...) and positive for
    # single-digit totals (keeps one decimal before truncation).
    precision = 2 - digit_count

    interval = int(round(total_iters / num_desired_updates, precision))

    # A zero interval would break the `step % interval` checks in the callers.
    return interval or 1

# 초->시 변환
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string h:mm:ss
    (prefixed with "N day(s), " once the duration reaches 24 hours).

    The value is rounded to the nearest whole second first.
    '''
    # Import the module locally under a private alias. The notebook's first
    # cell runs `from datetime import timedelta, datetime`, which rebinds the
    # global name `datetime` to the class; depending on which cell ran last,
    # the global `datetime.timedelta` lookup would then raise AttributeError.
    import datetime as _dt

    elapsed_rounded = int(round(elapsed))
    return str(_dt.timedelta(seconds=elapsed_rounded))

## Load Pre-trained Model and Tokenizer

In [None]:
# Load the pre-trained tokenizer and model

from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

pretrained_name = "monologg/koelectra-base-v3-discriminator"

# Tokenizer matching the checkpoint. The batching helpers and the inference
# check further down read this module-level `tokenizer`; without this line a
# fresh kernel fails with NameError as soon as batching starts.
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

# Load the Config object, with an output configured for classification.
config = AutoConfig.from_pretrained(pretrained_name, num_labels=2)
print('Config type:', str(type(config)), '\n')

# Load the pre-trained model for classification, passing in the `config` from above.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=pretrained_name,
    config=config)

print('\nModel type:', str(type(model)))

In [None]:
# Hyperparameters shared by batching and training:
#   max_len    - maximum number of tokens per encoded title+content pair
#   batch_size - number of samples per batch
#   epochs     - number of passes over the training split
max_len, batch_size, epochs = 300, 16, 10

In [None]:
# Shell escape: run `nvidia-smi` and show its output in the notebook.
!nvidia-smi

In [None]:
import torch

print('\nLoading model to GPU...')

# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# raising on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    print('  GPU:', torch.cuda.get_device_name())
else:
    print('  GPU: not available, running on CPU')

# `model.to` returns the module; binding it to a name keeps the notebook from
# echoing the whole model repr as the cell output.
desc = model.to(device)

print('    DONE.')

In [None]:
# NOTE(review): newer transformers releases deprecate their own AdamW in
# favour of torch.optim.AdamW; confirm against the installed version before
# upgrading.
from transformers import AdamW

optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # 1e-5, 2e-5, 3e-5, 5e-5
                  eps = 1e-8)

from transformers import get_linear_schedule_with_warmup

# Total number of training steps is [number of batches] x [number of epochs].
# Ceiling division: `len // batch_size + 1` counts one batch too many whenever
# the sample count is an exact multiple of batch_size, which would size the
# schedule for steps that never run.
batches = (len(X_train_title) + batch_size - 1) // batch_size
total_steps = batches * epochs

# Create the learning rate scheduler: linear warm-up over the first epoch's
# worth of steps, then linear decay over the remaining steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = batches, # default 0
                                            num_training_steps = total_steps)

# Smart Batch for Training Set

In [None]:
# Batch construction for labelled data
def make_smart_batches(title_samples, content_samples, labels, batch_size):
    '''
    Build length-bucketed ("smart") batches from labelled samples.

    Each (title, content) pair is tokenized without padding and truncated to
    the notebook-level `max_len`; the samples are sorted by token count;
    batches are then cut at random positions of the sorted list, and every
    batch is padded only up to its own longest sequence.

    Uses the notebook-level `tokenizer` and `max_len`.

    Returns a tuple (py_inputs, py_attn_masks, py_labels); each element is a
    list with one tensor per batch.
    '''

    print('Creating Smart Batches from {:,} examples with batch size {:,}...\n'.format(len(title_samples), batch_size))

    # ------------------------------------------------------------------
    # Step 1: tokenize and truncate every sample (no padding yet)
    # ------------------------------------------------------------------
    print('Tokenizing {:,} samples...'.format(len(labels)))

    report_every = good_update_interval(total_iters=len(labels), num_desired_updates=10)

    encoded = []
    for n_done, (title, content) in enumerate(zip(title_samples, content_samples)):

        if n_done % report_every == 0:
            print('  Tokenized {:,} samples.'.format(n_done))

        # Encode title and content as a sentence pair, truncated to max_len.
        encoded.append(
            tokenizer.encode(title, content,
                             add_special_tokens=True,
                             max_length=max_len,
                             truncation=True,
                             padding=False)
        )

    print('DONE.')
    print('{:>10,} samples\n'.format(len(encoded)))

    # ------------------------------------------------------------------
    # Step 2: sort by length and cut batches at random offsets
    # ------------------------------------------------------------------
    pool = list(zip(encoded, labels))
    pool.sort(key=lambda pair: len(pair[0]))

    print('{:>10,} samples after sorting\n'.format(len(pool)))

    grouped_ids = []
    grouped_labels = []

    print('Creating batches of size {:}...'.format(batch_size))

    report_every = good_update_interval(total_iters=len(pool), num_desired_updates=10)

    # Keep carving batches out of the pool until it is empty.
    while pool:

        n_groups = len(grouped_ids)
        if n_groups != 0 and n_groups % report_every == 0:
            print('  Selected {:,} batches.'.format(n_groups))

        take = min(batch_size, len(pool))

        # Random start position; the slice holds neighbours of similar length
        # because the pool is sorted.
        start = random.randint(0, len(pool) - take)
        chunk = pool[start:start + take]

        grouped_ids.append([ids for ids, _ in chunk])
        grouped_labels.append([label for _, label in chunk])

        # Remove the chosen samples so they are not picked again.
        del pool[start:start + take]

    print('\n  DONE - Selected {:,} batches.\n'.format(len(grouped_ids)))

    # ------------------------------------------------------------------
    # Step 3: pad each batch to its own maximum length
    # ------------------------------------------------------------------
    print('Padding out sequences within each batch...')

    py_inputs = []
    py_attn_masks = []
    py_labels = []

    for ids_group, label_group in zip(grouped_ids, grouped_labels):

        longest = max(len(ids) for ids in ids_group)

        # Pad token ids on the right; the mask is 1 for real tokens, 0 for pads.
        padded_rows = [ids + [tokenizer.pad_token_id] * (longest - len(ids))
                       for ids in ids_group]
        mask_rows = [[1] * len(ids) + [0] * (longest - len(ids))
                     for ids in ids_group]

        py_inputs.append(torch.tensor(padded_rows))
        py_attn_masks.append(torch.tensor(mask_rows))
        py_labels.append(torch.tensor(label_group))

    print('  DONE.')

    # Final model inputs
    return (py_inputs, py_attn_masks, py_labels)

# Train

In [None]:
# We'll store a number of quantities such as training loss and timings.
training_stats = []

# Update every `update_interval` batches.
update_interval = good_update_interval(total_iters=batches, num_desired_updates=5)

# Measure the total training time for the whole run.
total_t0 = time.time()

# Best validation accuracy seen so far; a checkpoint is saved only on improvement.
best_score = 0

# Checkpoint directory. Created up front so that torch.save cannot fail on a
# missing folder after an expensive epoch has already been trained.
model_dir = './models'
os.makedirs(model_dir, exist_ok=True)

# Average loss per epoch, kept for plotting.
# NOTE: `val_loss` only receives an entry on epochs where evaluation runs.
train_loss = []
val_loss = []

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    # Re-create the batches every epoch so that batch composition and order
    # are re-randomized by `make_smart_batches`.
    (py_inputs, py_attn_masks, py_labels) = make_smart_batches(X_train_title, X_train_content, y_train, batch_size)

    print('Training on {:,} batches...'.format(len(py_inputs)))

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step in range(0, len(py_inputs)):

        # Progress update every `update_interval` batches.
        if step % update_interval == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            # Estimate the time remaining from the average seconds per step so far.
            sec_per_step = (time.time() - t0) / step
            remaining_sec = sec_per_step * (len(py_inputs) - step)
            remaining = format_time(remaining_sec)

            # Report progress.
            print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, len(py_inputs), elapsed, remaining))

        # Copy the current training batch to the device using the `to` method.
        b_input_ids = py_inputs[step].to(device)
        b_input_mask = py_attn_masks[step].to(device)
        b_labels = py_labels[step].to(device)

        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()

        # Forward pass. Because labels are provided, the first two outputs are
        # the loss and the logits.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # Index positionally instead of tuple-unpacking the call: when the model
        # returns a dict-like ModelOutput, unpacking iterates over its KEYS and
        # would bind the strings 'loss' / 'logits'. Indexing works for both a
        # plain tuple and a ModelOutput.
        loss, logits = outputs[0], outputs[1]

        # Accumulate the training loss over all batches so the epoch average
        # can be computed at the end; `.item()` extracts the Python number.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0, to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters using the computed gradients.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(py_inputs)
    train_loss.append(avg_train_loss)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )

    # ========================================
    #               Evaluation
    # ========================================

    # Evaluate on every second epoch (epochs 2, 4, ...); change the modulus to
    # evaluate more or less often (1~5 recommended).
    if epoch_i % 2 == 1:
        print('Predicting labels for {:,} test sentences...'.format(len(y_test)))

        # Put model in evaluation mode
        model.eval()

        # Tracking variables
        predictions, true_labels = [], []

        # Smart batches for the held-out validation split
        (py_inputs, py_attn_masks, py_labels) = make_smart_batches(X_test_title, X_test_content, y_test, batch_size)

        # Choose an interval on which to print progress updates.
        update_interval_eval = good_update_interval(total_iters=len(py_inputs),
                                                    num_desired_updates=10)

        # Measure elapsed time.
        t0 = time.time()

        # Reset the total validation loss for this epoch.
        total_val_loss = 0

        # For each batch of validation data...
        for step in range(0, len(py_inputs)):

            # Progress update every `update_interval_eval` batches.
            if step % update_interval_eval == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)

                # Estimate the time remaining from the average seconds per step so far.
                sec_per_step = (time.time() - t0) / step
                remaining_sec = sec_per_step * (len(py_inputs) - step)
                remaining = format_time(remaining_sec)

                # Report progress.
                print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, len(py_inputs), elapsed, remaining))

            # Copy the batch to the device.
            b_input_ids = py_inputs[step].to(device)
            b_input_mask = py_attn_masks[step].to(device)
            b_labels = py_labels[step].to(device)

            # No gradients are needed for evaluation; this saves memory and time.
            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            # Positional indexing for the same reason as in the training step.
            loss, logits = outputs[0], outputs[1]

            total_val_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Store predictions and true labels
            predictions.append(logits)
            true_labels.append(label_ids)

        # Calculate the average val loss over all of the batches.
        avg_val_loss = total_val_loss / len(py_inputs)
        val_loss.append(avg_val_loss)

        # Combine the results across the batches.
        predictions = np.concatenate(predictions, axis=0)
        true_labels = np.concatenate(true_labels, axis=0)

        # Choose the label with the highest score as our prediction.
        preds = np.argmax(predictions, axis=1).flatten()

        print(classification_report(true_labels, preds))

        acc = accuracy_score(true_labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='weighted')

        print('accuracy', acc)
        print('f1(weighted)',  f1)
        print('precision', precision)
        print('recall', recall)
        print("")
        print("Average validation loss: {0:.2f}".format(avg_val_loss))

        # Save a checkpoint only when validation accuracy improves.
        if acc > best_score:
            best_score = acc

            state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
            }

            # File name: <MMDD_HH:MM>_<epoch>_<accuracy>.pth
            now = time.strftime('%m%d_%H:%M')
            torch.save(state, os.path.join(model_dir, '_'.join([now,
                                                                str(epoch_i+1),
                                                                str(round(acc,4))]) + '.pth'))
            print('model saved')

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Validation runs only on every second epoch (the `epoch_i % 2 == 1` check in
# the training loop), so `val_loss` is shorter than `train_loss`. Plot each
# series against the epoch it belongs to; plotting both against their list
# index would draw the validation points at the wrong epochs.
eval_every = 2  # keep in sync with the modulus used in the training loop

train_epochs = range(1, len(train_loss) + 1)
val_epochs = range(eval_every, eval_every * len(val_loss) + 1, eval_every)

plt.plot(train_epochs, train_loss, 'g*', label='train')
plt.plot(val_epochs, val_loss, 'ro', label='validation')
plt.xlabel('epoch')
plt.ylabel('average loss')
plt.legend()
plt.show()

# Smart Batch for Prediction

In [None]:
# Batch construction for unlabelled data (inference)
def make_smart_batches_pred(title_samples, content_samples, batch_size):
    '''
    Build padded inference batches while keeping the input order.

    Each (title, content) pair is tokenized without padding and truncated to
    the notebook-level `max_len`. The samples are NOT sorted, so that
    predictions line up with the input rows; consecutive samples are grouped
    into batches and each batch is padded to its own longest sequence.

    Uses the notebook-level `tokenizer` and `max_len`.

    Returns a tuple (py_inputs, py_attn_masks); each element is a list with
    one tensor per batch.
    '''
    print('Creating Smart Batches from {:,} examples with batch size {:,}...\n'.format(len(title_samples), batch_size))

    # ------------------------------------------------------------------
    # Step 1: tokenize and truncate every sample (no padding yet)
    # ------------------------------------------------------------------
    print('Tokenizing {:,} samples...'.format(len(title_samples)))

    report_every = good_update_interval(total_iters=len(title_samples), num_desired_updates=10)

    encoded = []
    for n_done, (title, content) in enumerate(zip(title_samples, content_samples)):

        if n_done % report_every == 0:
            print('  Tokenized {:,} samples.'.format(n_done))

        # Encode title and content as a sentence pair, truncated to max_len.
        encoded.append(
            tokenizer.encode(title, content,
                             add_special_tokens=True,
                             max_length=max_len,
                             truncation=True,
                             padding=False)
        )

    print('DONE.')
    print('{:>10,} samples\n'.format(len(encoded)))

    # ------------------------------------------------------------------
    # Step 2: group consecutive samples into batches
    # ------------------------------------------------------------------

    # No sorting here: the original order of the test cases must be preserved.
    pending = list(encoded)

    print('{:>10,} samples without sorting for prediction\n'.format(len(pending)))

    ordered_batches = []

    print('Creating batches of size {:}...'.format(batch_size))

    report_every = good_update_interval(total_iters=len(pending), num_desired_updates=10)

    # Take batches from the front of the list until nothing is left.
    while pending:

        n_batches = len(ordered_batches)
        if n_batches != 0 and n_batches % report_every == 0:
            print('  Selected {:,} batches.'.format(n_batches))

        take = min(batch_size, len(pending))

        # Always take from the front so the batches follow the input order.
        ordered_batches.append(pending[:take])
        del pending[:take]

    print('\n  DONE - Selected {:,} batches.\n'.format(len(ordered_batches)))

    # ------------------------------------------------------------------
    # Step 3: pad each batch to its own maximum length
    # ------------------------------------------------------------------
    print('Padding out sequences within each batch...')

    py_inputs = []
    py_attn_masks = []

    for ids_group in ordered_batches:

        longest = max(len(ids) for ids in ids_group)

        # Pad token ids on the right; the mask is 1 for real tokens, 0 for pads.
        padded_rows = [ids + [tokenizer.pad_token_id] * (longest - len(ids))
                       for ids in ids_group]
        mask_rows = [[1] * len(ids) + [0] * (longest - len(ids))
                     for ids in ids_group]

        py_inputs.append(torch.tensor(padded_rows))
        py_attn_masks.append(torch.tensor(mask_rows))

    print('  DONE.')

    # Final model inputs
    return (py_inputs, py_attn_masks)

## Load best model

In [None]:
# load_model:
# File name of the checkpoint to restore; it must exist under ./models.
model_name = '1211_18:26_4_0.9977.pth'

# `map_location=device` lets a checkpoint that was saved from a GPU be restored
# on whatever device this session uses.
# NOTE: torch.load unpickles the file, which can execute arbitrary code; only
# load checkpoints that you created yourself.
state = torch.load(os.path.join('./models', model_name), map_location=device)
model.load_state_dict(state['model'])
if optimizer is not None:
    optimizer.load_state_dict(state['optimizer'])
if scheduler is not None:
    scheduler.load_state_dict(state['scheduler'])
print('model loaded')

# Prediction for unlabeled test set

In [None]:
# Build the batches for the unlabelled test set (input order is preserved).
(py_inputs, py_attn_masks) = make_smart_batches_pred(test_title, test_content, batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(test_title)))

# One array of logits per batch, appended in batch order.
predictions = []

# How often (in batches) to print a progress line.
update_interval = good_update_interval(total_iters=len(py_inputs), num_desired_updates=10)

# Start of the prediction run, for the elapsed / remaining estimates.
t0 = time.time()

# Evaluation mode for inference.
model.eval()

total_batches = len(py_inputs)

for step, (batch_ids, batch_mask) in enumerate(zip(py_inputs, py_attn_masks)):

    # Periodic progress report (never on the very first batch).
    if step != 0 and step % update_interval == 0:
        elapsed = format_time(time.time() - t0)

        # Average seconds per batch so far, extrapolated to the batches left.
        sec_per_step = (time.time() - t0) / step
        remaining = format_time(sec_per_step * (total_batches - step))

        print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, total_batches, elapsed, remaining))

    # Move the batch to the device.
    b_input_ids = batch_ids.to(device)
    b_input_mask = batch_mask.to(device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # Without labels the first output holds the logits; keep them as a CPU array.
    predictions.append(outputs[0].detach().cpu().numpy())

print('    DONE.')

In [None]:
# Combine the per-batch logits into one array with one row per test sample.
# The result goes into a NEW name: rebinding `predictions` itself would make
# this cell fail on a second run, because concatenating the rows of an
# already-combined 2-D array yields a 1-D array and `argmax(axis=1)` then raises.
all_logits = np.concatenate(predictions, axis=0)

# Choose the label with the highest score as our prediction.
preds = np.argmax(all_logits, axis=1).flatten()

In [None]:
# Count how many test samples were assigned to each predicted label.
pd.Series(preds).value_counts()

In [None]:
# Decode every padded id row back into text, batch by batch, so predictions
# can be checked against what the model actually received.
input_strings = [
    tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(id_row))
    for batch_tensor in py_inputs
    for id_row in np.array(batch_tensor)
]

In [None]:
# Show full cell contents so the decoded inputs are not cut off in the display.
pd.options.display.max_colwidth = None

# Decoded model inputs side by side with their predicted labels.
check_inference = pd.DataFrame(dict(content=input_strings, info=preds))

check_inference.head(3)

In [None]:
# Look at the test frame again before shaping it into the submission format.
test.head(3)

In [None]:
# Attach the predicted labels to the test frame, then keep only the two
# columns that go into the submission file.
test['info'] = preds
submission = test.loc[:, ['id', 'info']]

In [None]:
# Preview the first three rows of the submission frame.
submission.head(3)

In [None]:
# Put the experiment description and the date in the file name every time a
# file is generated, so that results from different runs are not mixed up.
prediction_path = './prediction/KoElecV3_withtitleSEP_len300_lr1e5_201211.csv'

# Create the output folder if needed; to_csv does not create missing parent
# directories and would otherwise raise after the whole prediction run.
os.makedirs(os.path.dirname(prediction_path), exist_ok=True)

submission.to_csv(prediction_path, index=False)