In [3]:
# Check the runtime environment: use the GPU when CUDA is available,
# otherwise fall back to the CPU, and report which one was chosen.
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"사용 장치: {device}")

사용 장치: cuda


In [2]:
"""
Description: Fold 2 베스트 모델(Teacher)을 활용하여 Pseudo-Labeling 수행 후 Student 모델 학습
Best Score: 0.802 (Public Leaderboard)
Dependency: best_model_fold2.pt
"""

import pandas as pd
import numpy as np
import torch
import gc
import re
import os
import random
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_cosine_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm

# ==========================================
# 1. Environment settings (the values from the earlier successful run)
# ==========================================
# Hugging Face checkpoint name used for the tokenizer, the teacher and the student.
MODEL_NAME = "klue/bert-base"
# Passed to the tokenizer as max_length (inputs are truncated / padded to this size).
MAX_LEN = 256
# Batch size of the student training DataLoader (inference loaders use 32).
BATCH_SIZE = 16
STUDENT_EPOCHS = 4          # key setting: 4 epochs
CONFIDENCE_THRESHOLD = 0.70 # key setting: minimum teacher softmax probability for a pseudo-label
TEACHER_PATH = "best_model_fold2.pt" # the saved checkpoint that scored 0.776

# Device every model and batch is moved to.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def seed_everything(seed):
    """Seed every random number generator used by this script.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above, so keep it off.
    torch.backends.cudnn.benchmark = False

seed_everything(42)

# ==========================================
# 2. 데이터 준비 (전처리는 동일)
# ==========================================
def clean_text(text):
    """Strip disallowed characters and mark line breaks with ' [SEP] '.

    Keeps Hangul syllables (가-힣), ASCII letters and digits, the punctuation
    ``? ! . ,`` and whitespace; every other character is deleted. Each
    newline that remains is then replaced by the literal ' [SEP] '.

    Args:
        text: The raw conversation string.

    Returns:
        The cleaned, single-line string.
    """
    disallowed = re.compile(r'[^가-힣a-zA-Z0-9?!\.,\s]')
    kept = disallowed.sub('', text)
    # Splitting on newlines and re-joining is the same as replacing each one.
    return ' [SEP] '.join(kept.split('\n'))

# Tokenizer for the MODEL_NAME checkpoint; the same instance is passed to every
# ConversationDataset created below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class ConversationDataset(Dataset):
    """Map-style dataset that tokenizes one conversation per item.

    Each item is a dict with 'input_ids' and 'attention_mask' (the first row
    of the tokenizer output) and, when labels were supplied, a 'labels'
    entry holding the label as a ``torch.long`` tensor.

    Args:
        texts: Indexable collection of conversations; each is passed through ``str``.
        labels: Optional indexable collection of class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt', ...)``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=256):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            return_tensors='pt',
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            add_special_tokens=True,
        )
        # The tokenizer returns a batch of one; keep only that first row.
        sample = {key: encoded[key][0] for key in ('input_ids', 'attention_mask')}
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Load the data
train_df = pd.read_csv('train.csv')
normal_df = pd.read_csv('normal_conversation.csv')
test_df = pd.read_csv('test.csv')

# Merge and preprocess
# Give the extra rows idx values that continue right after the largest train idx,
# keep only the three columns used here, and append them to the training frame.
normal_df['idx'] = range(train_df['idx'].max() + 1, train_df['idx'].max() + 1 + len(normal_df))
normal_df = normal_df[['idx', 'class', 'conversation']]
train_df = pd.concat([train_df, normal_df], ignore_index=True)

# Apply the same text cleaning to the train and test conversations.
train_df['conversation'] = train_df['conversation'].apply(clean_text)
test_df['conversation'] = test_df['conversation'].apply(clean_text)
# Class-name -> integer id mapping (five classes, ids 0-4).
label_dict = {'협박 대화':0, '갈취 대화':1, '직장 내 괴롭힘 대화':2, '기타 괴롭힘 대화':3, '일반 대화':4}
# NOTE(review): assumes every 'class' value (including those coming from
# normal_conversation.csv) is a key of label_dict; Series.map turns any
# unmapped value into NaN - confirm against the data.
train_df['class'] = train_df['class'].map(label_dict)

# ==========================================
# 3. [Teacher] Load the saved model & run pseudo-labeling
# ==========================================
print(f"📂 저장된 Teacher 모델({TEACHER_PATH}) 로드 중...")

teacher_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)
# map_location remaps tensors that were saved from a GPU run onto the current
# device, so the checkpoint also loads on a CPU-only machine.
teacher_model.load_state_dict(torch.load(TEACHER_PATH, map_location=device))
teacher_model.to(device)
teacher_model.eval()

test_ds = ConversationDataset(test_df['conversation'].tolist(), labels=None, tokenizer=tokenizer, max_len=MAX_LEN)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

# Test conversations the teacher is confident about, with its predicted class ids.
pseudo_texts = []
pseudo_labels = []

print(f"🏷️ Pseudo-Labeling 시작 (Threshold: {CONFIDENCE_THRESHOLD})...")

with torch.no_grad():
    # Run inference batch by batch and filter by confidence. The loader is not
    # shuffled, so consecutive batches line up with consecutive slices of
    # test_texts.
    test_texts = test_df['conversation'].tolist()
    batch_start = 0

    for batch in tqdm(test_loader):
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = teacher_model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        # Highest class probability and its class index for every row.
        max_probs, preds = torch.max(probs, dim=-1)

        max_probs = max_probs.cpu().numpy()
        preds = preds.cpu().numpy()

        # Texts that belong to the current batch.
        batch_end = batch_start + len(max_probs)
        current_texts = test_texts[batch_start:batch_end]

        # Keep only predictions at or above the confidence threshold.
        for text, prob, pred in zip(current_texts, max_probs, preds):
            if prob >= CONFIDENCE_THRESHOLD:
                pseudo_texts.append(text)
                pseudo_labels.append(pred)

        batch_start = batch_end

print(f"📈 확보된 추가 데이터: {len(pseudo_texts)}개")
# Release the teacher before the student is created to lower peak memory use.
del teacher_model
gc.collect()
torch.cuda.empty_cache()

# ==========================================
# 4. [Student] Retrain on all data (4 epochs)
# ==========================================
print(f"🎓 Student 모델 학습 시작 (Epochs: {STUDENT_EPOCHS})...")

# Merge the labelled training set with the pseudo-labelled test conversations.
final_X = train_df['conversation'].tolist() + pseudo_texts
# Pin the dtype so that an empty pseudo-label list (a float64 array by default)
# cannot turn the whole label array into floats.
final_y = np.concatenate([train_df['class'].values, np.array(pseudo_labels, dtype=np.int64)])

train_ds = ConversationDataset(final_X, final_y, tokenizer, MAX_LEN)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

# The student starts from the pretrained checkpoint, not from the teacher weights.
student_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)
student_model.to(device)

optimizer = AdamW(student_model.parameters(), lr=5e-5)
total_steps = len(train_loader) * STUDENT_EPOCHS
# Cosine decay with the first 10% of steps used for warm-up.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(total_steps*0.1), num_training_steps=total_steps)

# Mixed precision only makes sense on CUDA. torch.amp replaces the deprecated
# torch.cuda.amp entry points; disabling it off-GPU matches their fallback.
use_amp = device.type == "cuda"
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

student_model.train()
for epoch in range(STUDENT_EPOCHS):
    total_loss = 0  # running sum of batch losses for this epoch (not reported)
    loop = tqdm(train_loader, desc=f"Ep {epoch+1}")
    for batch in loop:
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items()}
        with torch.amp.autocast("cuda", enabled=use_amp):
            outputs = student_model(**inputs)
            # The batch contains 'labels', so the model returns the loss itself.
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # NOTE(review): the scheduler advances on every batch, including
        # batches where GradScaler skips the optimizer step (inf/NaN grads).
        scheduler.step()
        # Read the loss once: .item() forces a host/device synchronisation.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

# ==========================================
# 5. Create the final submission file
# ==========================================
print("🏁 최종 추론 중...")
student_model.eval()
# Unshuffled loader so predictions stay in the same order as test_df rows.
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)
final_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = student_model(**inputs)
        # Predicted class id = index of the largest logit in each row.
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        final_preds.extend(preds)

# One row per test conversation: its idx and the predicted class id.
submission = pd.DataFrame({'idx': test_df['idx'], 'target': final_preds})
filename = "submission_reproduced_0.802.csv"
submission.to_csv(filename, index=False)
print(f"🎉 0.802점 재현 파일 생성 완료: {filename}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

📂 저장된 Teacher 모델(best_model_fold2.pt) 로드 중...


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: klue/bert-base
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on you

🏷️ Pseudo-Labeling 시작 (Threshold: 0.7)...


  0%|          | 0/16 [00:00<?, ?it/s]

📈 확보된 추가 데이터: 431개
🎓 Student 모델 학습 시작 (Epochs: 4)...


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: klue/bert-base
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on you

Ep 1:   0%|          | 0/353 [00:00<?, ?it/s]

  with autocast():


Ep 2:   0%|          | 0/353 [00:00<?, ?it/s]

Ep 3:   0%|          | 0/353 [00:00<?, ?it/s]

Ep 4:   0%|          | 0/353 [00:00<?, ?it/s]

🏁 최종 추론 중...


  0%|          | 0/16 [00:00<?, ?it/s]

🎉 0.802점 재현 파일 생성 완료: submission_reproduced_0.802.csv
