In [1]:
# 셀 1: 데이터 다운로드 및 압축 해제
import os

os.chdir('/content')

!wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000373/data/data.tar.gz
!tar -xzf data.tar.gz
!rm data.tar.gz
!ls -la

--2025-12-02 00:38:21--  https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000373/data/data.tar.gz
Resolving aistages-api-public-prod.s3.amazonaws.com (aistages-api-public-prod.s3.amazonaws.com)... 3.5.185.3, 3.5.185.19, 3.5.185.124, ...
Connecting to aistages-api-public-prod.s3.amazonaws.com (aistages-api-public-prod.s3.amazonaws.com)|3.5.185.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4265259 (4.1M) [binary/octet-stream]
Saving to: ‘data.tar.gz’


2025-12-02 00:38:24 (2.29 MB/s) - ‘data.tar.gz’ saved [4265259/4265259]

total 20
drwxr-xr-x 1 root root  4096 Dec  2 00:38 .
drwxr-xr-x 1 root root  4096 Dec  2 00:24 ..
drwxr-xr-x 4 root root  4096 Nov 20 14:30 .config
drwxr-xr-x 2  501 staff 4096 Apr 22  2025 data
drwxr-xr-x 1 root root  4096 Nov 20 14:30 sample_data


In [2]:
# 셀 2: 라이브러리 설치
!pip install transformers datasets torch kobart rouge_score konlpy

# Mecab 설치
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab_light_220429.sh
%cd /content

[31mERROR: Could not find a version that satisfies the requirement kobart (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for kobart[0m[31m
[0mCloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 138, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 138 (delta 26), reused 22 (delta 8), pack-reused 91 (from 1)[K
Receiving objects: 100% (138/138), 1.72 MiB | 3.96 MiB/s, done.
Resolving deltas: 100% (65/65), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.0 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m118.9 MB/s[0m eta [36m0:00:0

In [3]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=67139ce1bb47fe823280a6b4b06602859b814eef801548c033755be80e9f5f48
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [4]:
# R-Drop
import torch
import torch.nn.functional as F
from transformers import Seq2SeqTrainer

class RDropTrainer(Seq2SeqTrainer):
    def __init__(self, alpha=0.7, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # 같은 입력을 두 번 통과 (Dropout이 다르게 적용됨)
        outputs1 = model(**inputs)
        outputs2 = model(**inputs)

        # CE Loss 평균
        ce_loss = (outputs1.loss + outputs2.loss) / 2

        # KL Divergence Loss
        logits1 = outputs1.logits.view(-1, outputs1.logits.size(-1))
        logits2 = outputs2.logits.view(-1, outputs2.logits.size(-1))

        kl_loss = F.kl_div(
            F.log_softmax(logits1, dim=-1),
            F.softmax(logits2, dim=-1),
            reduction='batchmean'
        ) + F.kl_div(
            F.log_softmax(logits2, dim=-1),
            F.softmax(logits1, dim=-1),
            reduction='batchmean'
        )

        loss = ce_loss + self.alpha * kl_loss / 2

        return (loss, outputs1) if return_outputs else loss

In [5]:
# 셀 3: 모델 학습 및 ROUGE 평가
import pandas as pd
import numpy as np
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
from rouge_score import rouge_scorer
from konlpy.tag import Mecab
from tqdm import tqdm

# 경로 설정
DATA_PATH = '/content/data'

# 데이터 로드
train_df = pd.read_csv(f'{DATA_PATH}/train.csv')
dev_df = pd.read_csv(f'{DATA_PATH}/dev.csv')

# 모델 & 토크나이저
model_name = "digit82/kobart-summarization"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Mecab 토크나이저
mecab = Mecab()
def tokenize_ko(text):
    return ' '.join(mecab.morphs(text))

# Dataset 전처리
def preprocess(examples):
    inputs = tokenizer(examples['dialogue'], max_length=1024, truncation=True, padding='max_length')
    targets = tokenizer(examples['summary'], max_length=128, truncation=True, padding='max_length')
    inputs['labels'] = targets['input_ids']
    return inputs

train_dataset = Dataset.from_pandas(train_df).map(preprocess, batched=True)
eval_dataset = Dataset.from_pandas(dev_df).map(preprocess, batched=True)

# ROUGE 계산 함수
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
    r1, r2, rL = [], [], []

    for pred, label in zip(decoded_preds, decoded_labels):
        pred_tokenized = tokenize_ko(pred)
        label_tokenized = tokenize_ko(label)
        scores = scorer.score(label_tokenized, pred_tokenized)
        r1.append(scores['rouge1'].fmeasure)
        r2.append(scores['rouge2'].fmeasure)
        rL.append(scores['rougeL'].fmeasure)

    return {
        'rouge1': np.mean(r1),
        'rouge2': np.mean(r2),
        'rougeL': np.mean(rL),
        'rouge_total': np.mean(r1) + np.mean(r2) + np.mean(rL),
    }

# 학습 설정
training_args = Seq2SeqTrainingArguments(
    output_dir="./kobart-summary-rdrop",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True,
    logging_steps=100,
)

# 사용
trainer = RDropTrainer(
    alpha=5.0,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 학습 시작
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/109 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

You passed `num_labels=3` which is incompatible to the `id2label` map of length `2`.


pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

Map:   0%|          | 0/12457 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

  super().__init__(*args, **kwargs)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maboutexo04[0m ([33maboutexo04-korea-open-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rouge Total
1,0.5496,0.414833,0.752271,0.623728,0.685056,2.061054
2,0.4462,0.395591,0.750668,0.627022,0.689842,2.067532
3,0.4276,0.38854,0.741665,0.620688,0.692166,2.054518


TrainOutput(global_step=9345, training_loss=0.5130397274125509, metrics={'train_runtime': 1261.243, 'train_samples_per_second': 29.63, 'train_steps_per_second': 7.409, 'total_flos': 2.278646118088704e+16, 'train_loss': 0.5130397274125509, 'epoch': 3.0})

In [6]:
# 로컬 저장
model.save_pretrained("./alpha50-lr3e5")
tokenizer.save_pretrained("./alpha50-lr3e5")
print("로컬 저장 완료!")

# Google Drive 백업
from google.colab import drive
drive.mount('/content/drive')

# 여기 경로도 맞춰야 함!
!cp -r ./alpha50-lr3e5 /content/drive/MyDrive/
print("Google Drive 백업 완료!")

로컬 저장 완료!
Mounted at /content/drive
Google Drive 백업 완료!


In [None]:
# 셀 4: Dev 전체 평가
model.eval()
model.to("cuda")

def summarization(dialogue: str) -> str:
    inputs = tokenizer(dialogue, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
    outputs = model.generate(
        inputs["input_ids"],
        max_length=128,
        num_beams=8,
        early_stopping=True,
        length_penalty=2.0,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
r1, r2, rL = [], [], []

for idx, row in tqdm(dev_df.iterrows(), total=len(dev_df)):
    pred = summarization(row['dialogue'])
    pred_tokenized = tokenize_ko(pred)
    gold_tokenized = tokenize_ko(row['summary'])
    scores = scorer.score(gold_tokenized, pred_tokenized)
    r1.append(scores['rouge1'].fmeasure)
    r2.append(scores['rouge2'].fmeasure)
    rL.append(scores['rougeL'].fmeasure)

print(f"\n=== Dev 최종 결과 ===")
print(f"ROUGE-1: {np.mean(r1):.4f}")
print(f"ROUGE-2: {np.mean(r2):.4f}")
print(f"ROUGE-L: {np.mean(rL):.4f}")
print(f"Total: {np.mean(r1) + np.mean(r2) + np.mean(rL):.4f}")

100%|██████████| 499/499 [11:14<00:00,  1.35s/it]


=== Dev 최종 결과 ===
ROUGE-1: 0.7811
ROUGE-2: 0.6745
ROUGE-L: 0.7234
Total: 2.1790





In [7]:
model.eval()
model.to("cuda")

def summarization(dialogue: str) -> str:
    inputs = tokenizer(dialogue, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
    outputs = model.generate(
        inputs["input_ids"],
        max_length=128,
        num_beams=8,
        early_stopping=True,
        length_penalty=2.0,

    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# 셀 5: Test 추론 및 제출 파일 생성
from datetime import datetime

test_df = pd.read_csv(f'{DATA_PATH}/test.csv')
summaries = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    summaries.append(summarization(row['dialogue']))

output = pd.DataFrame({
    "fname": test_df['fname'],
    "summary": summaries,
})

RESULT_PATH = '/content/results'
os.makedirs(RESULT_PATH, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output.to_csv(f'{RESULT_PATH}/rdrop_alpha50_lr3e5_beams8_lengthpenalty20_{timestamp}.csv', index=False)
print(f"저장 완료: {RESULT_PATH}/rdrop_alpha50_lr3e5_beams8_lengthpenalty20_{timestamp}.csv")

100%|██████████| 499/499 [11:29<00:00,  1.38s/it]

저장 완료: /content/results/rdrop_alpha50_lr3e5_beams8_lengthpenalty20_20251202_011647.csv





In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 나중에 다시 사용
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/kobart-summary-best")
tokenizer = PreTrainedTokenizerFast.from_pretrained("/content/drive/MyDrive/kobart-summary-best")
model.to("cuda")
model.eval()