In [None]:
import pandas as pd
import numpy as np
import torch
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import os

# Config
# Hugging Face model id handed to SentenceTransformer below.
MODEL_NAME = "dragonkue/BGE-m3-ko"
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Directory that receives the cached embeddings, label tensors and the
# pickled label encoder written by process_split.
SAVE_DIR = "./cached_embeddings"
# exist_ok=True keeps a re-run of this cell from failing on an existing folder.
os.makedirs(SAVE_DIR, exist_ok=True)

# Load SentenceTransformer
model = SentenceTransformer(MODEL_NAME, device=DEVICE)
# NOTE(review): presumably inputs longer than 512 tokens are truncated by the
# library — confirm against the sentence-transformers documentation.
model.max_seq_length = 512

# Shared LabelEncoder: a single instance is used for every split, so the
# split processed with fit_label=True defines the class ids for the others.
label_encoder = LabelEncoder()

def process_split(split_name, csv_path, fit_label=False):
    """Embed one CSV split and cache its embeddings and encoded labels.

    Reads ``csv_path``, encodes the ``Question`` column with the module-level
    SentenceTransformer ``model`` and maps the ``Doc_index`` column (with the
    literal ``.txt`` removed) to integer ids through the shared
    ``label_encoder``. Results are written to ``SAVE_DIR`` as
    ``{split_name}_emb.pt`` and ``{split_name}_label.pt``.

    Args:
        split_name: Prefix used for the two output file names.
        csv_path: Path of a CSV file with ``Question`` and ``Doc_index`` columns.
        fit_label: When true, fit ``label_encoder`` on this split and pickle it
            to ``SAVE_DIR/label_encoder.pkl``; otherwise only transform with
            the already-fitted encoder.

    Returns:
        None. The function works through file output and a progress print.
    """
    frame = pd.read_csv(csv_path)
    questions = frame["Question"].tolist()
    doc_ids = frame["Doc_index"].str.replace(".txt", "", regex=False)

    if not fit_label:
        # Reuse the mapping learned on the fitted split.
        label_tensor = torch.tensor(label_encoder.transform(doc_ids))
    else:
        label_tensor = torch.tensor(label_encoder.fit_transform(doc_ids))
        encoder_path = os.path.join(SAVE_DIR, "label_encoder.pkl")
        with open(encoder_path, "wb") as handle:
            pickle.dump(label_encoder, handle)

    # Encode on DEVICE, then move the result to the CPU before saving.
    with torch.no_grad():
        encoded = model.encode(questions, convert_to_tensor=True, device=DEVICE)
    embeddings = encoded.cpu()

    # Persist both tensors next to each other under the split's prefix.
    emb_path = os.path.join(SAVE_DIR, f"{split_name}_emb.pt")
    label_path = os.path.join(SAVE_DIR, f"{split_name}_label.pt")
    torch.save(embeddings, emb_path)
    torch.save(label_tensor, label_path)
    print(f"{split_name} 저장 완료: {len(questions)} samples")

# Process train, then test. Order matters: the train call fits the shared
# label encoder, and the test call only transforms with it.
process_split(split_name="train", csv_path="./data/csv/train.csv", fit_label=True)
process_split(split_name="test", csv_path="./data/csv/test.csv", fit_label=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ train 저장 완료: 16575 samples
✅ test 저장 완료: 430 samples


In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pickle
import os
import numpy as np
import random

# Config
# Same directory string as SAVE_DIR in the embedding cell.
CACHE_DIR = "./cached_embeddings"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# One model is trained per seed and saved as ./mlp_seed{seed}.pt.
SEEDS = [42, 123, 2025]
EPOCHS = 40
BATCH_SIZE = 8

# Load cached embeddings and labels
X = torch.load(os.path.join(CACHE_DIR, "train_emb.pt"))
y = torch.load(os.path.join(CACHE_DIR, "train_label.pt"))
# NOTE(review): pickle.load can execute arbitrary code from the file. This is
# acceptable only because label_encoder.pkl is written by the embedding cell of
# this notebook; do not point CACHE_DIR at untrusted data.
with open(os.path.join(CACHE_DIR, "label_encoder.pkl"), "rb") as f:
    le = pickle.load(f)

# Dataset class
class QDataset(Dataset):
    """Pair a feature container with a label container, item by item.

    ``X`` and ``y`` are stored as given; item ``idx`` is the tuple
    ``(X[idx], y[idx])`` and the dataset length is ``len(y)``.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        # The label container defines how many samples exist.
        return len(self.y)

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target

# MLP definition
class MLP(nn.Module):
    """Feed-forward classifier mapping an embedding to class logits.

    Layout of ``self.net`` (an ``nn.Sequential``), in order:
    Linear(input_dim, 512) -> BatchNorm1d -> ReLU -> Dropout(0.3) ->
    Linear(512, 256) -> BatchNorm1d -> ReLU -> Dropout(0.3) ->
    Linear(256, 128) -> ReLU -> Linear(128, num_classes).

    Args:
        input_dim: Size of the last dimension of the input.
        num_classes: Number of logits produced per sample.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        stack = []
        width_in = input_dim
        # Two regularised hidden blocks, created in order so the positional
        # indices inside the Sequential (and thus state_dict keys) stay fixed.
        for width_out in (512, 256):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.BatchNorm1d(width_out))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(0.3))
            width_in = width_out
        # Final hidden layer without normalisation/dropout, then the output layer.
        stack.append(nn.Linear(width_in, 128))
        stack.append(nn.ReLU())
        stack.append(nn.Linear(128, num_classes))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (unnormalised) class logits for ``x``."""
        return self.net(x)

# 🔁 Train one MLP per seed; each run keeps the weights of its best
# validation epoch and writes them to ./mlp_seed{seed}.pt.
for seed in SEEDS:
    print(f"\n🔁 Training model with seed {seed}...")

    # Fix the RNGs used by torch, numpy and the stdlib for this run.
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Stratified 80/20 train/validation split, re-drawn for every seed.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # BatchNorm1d raises in training mode when a batch holds a single sample.
    # Drop the trailing batch only in that exact case, so every other dataset
    # size trains on all samples as before.
    drop_single_tail = len(y_train) % BATCH_SIZE == 1
    train_loader = DataLoader(
        QDataset(X_train, y_train),
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=drop_single_tail,
    )
    val_loader = DataLoader(QDataset(X_val, y_val), batch_size=BATCH_SIZE)

    # Fresh model, loss and optimizer for this seed.
    mlp = MLP(X.shape[1], len(le.classes_)).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)

    best_acc = 0.0
    best_state = None  # CPU snapshot of the weights that produced best_acc
    for epoch in range(EPOCHS):
        # ----- TRAIN -----
        mlp.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            logits = mlp(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)

        # ----- VALIDATION -----
        mlp.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                logits = mlp(xb)
                preds = torch.argmax(logits, dim=1)
                correct += (preds == yb).sum().item()
                total += yb.size(0)
        val_acc = correct / total

        # state_dict() hands out references to the live parameters, so the
        # tensors are cloned; otherwise later epochs would overwrite the
        # snapshot in place.
        if best_state is None or val_acc > best_acc:
            best_acc = val_acc
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in mlp.state_dict().items()
            }

        print(f"[Seed {seed} | Epoch {epoch+1}] Loss: {avg_train_loss:.4f} | Val Acc: {val_acc:.2%}")

    # Save the best-epoch weights so the file matches the accuracy reported
    # below (previously the last-epoch weights were written). With EPOCHS == 0
    # there is no snapshot, so fall back to the freshly initialised model.
    save_path = f"./mlp_seed{seed}.pt"
    torch.save(best_state if best_state is not None else mlp.state_dict(), save_path)
    print(f"Saved model with seed {seed} → {save_path} (Best Val Acc: {best_acc:.2%})")



🔁 Training model with seed 42...
[Seed 42 | Epoch 1] Loss: 4.5436 | Val Acc: 38.19%
[Seed 42 | Epoch 2] Loss: 2.9110 | Val Acc: 53.18%
[Seed 42 | Epoch 3] Loss: 2.0993 | Val Acc: 59.85%
[Seed 42 | Epoch 4] Loss: 1.7387 | Val Acc: 63.80%
[Seed 42 | Epoch 5] Loss: 1.5254 | Val Acc: 65.01%
[Seed 42 | Epoch 6] Loss: 1.3815 | Val Acc: 67.33%
[Seed 42 | Epoch 7] Loss: 1.2768 | Val Acc: 68.48%
[Seed 42 | Epoch 8] Loss: 1.1911 | Val Acc: 69.89%
[Seed 42 | Epoch 9] Loss: 1.1390 | Val Acc: 69.26%
[Seed 42 | Epoch 10] Loss: 1.0754 | Val Acc: 70.83%
[Seed 42 | Epoch 11] Loss: 1.0290 | Val Acc: 71.43%
[Seed 42 | Epoch 12] Loss: 0.9754 | Val Acc: 71.89%
[Seed 42 | Epoch 13] Loss: 0.9475 | Val Acc: 70.80%
[Seed 42 | Epoch 14] Loss: 0.9211 | Val Acc: 72.22%
[Seed 42 | Epoch 15] Loss: 0.8914 | Val Acc: 72.16%
[Seed 42 | Epoch 16] Loss: 0.8711 | Val Acc: 71.89%
[Seed 42 | Epoch 17] Loss: 0.8403 | Val Acc: 72.70%
[Seed 42 | Epoch 18] Loss: 0.8199 | Val Acc: 73.12%
[Seed 42 | Epoch 19] Loss: 0.8055 | Val

In [None]:
import os
import pickle

import torch
import torch.nn as nn
from tqdm import tqdm

# Config
CACHE_DIR = "./cached_embeddings"
MODEL_PATHS = ["mlp_seed42.pt", "mlp_seed123.pt", "mlp_seed2025.pt"]
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the cached test embeddings and their encoded labels.
X_test = torch.load(os.path.join(CACHE_DIR, "test_emb.pt"))
y_test = torch.load(os.path.join(CACHE_DIR, "test_label.pt"))

# Load label encoder (for inverse_transform if needed).
# NOTE(review): pickle.load can execute arbitrary code; only load a
# label_encoder.pkl that this notebook produced itself.
encoder_path = os.path.join(CACHE_DIR, "label_encoder.pkl")
with open(encoder_path, "rb") as f:
    le = pickle.load(f)

# Model architecture (must be the same MLP structure the checkpoints were
# trained with, so that load_state_dict finds matching parameter names)
class MLP(nn.Module):
    """Feed-forward classifier mapping an embedding to class logits.

    Layout of ``self.net`` (an ``nn.Sequential``), in order:
    Linear(input_dim, 512) -> BatchNorm1d -> ReLU -> Dropout(0.3) ->
    Linear(512, 256) -> BatchNorm1d -> ReLU -> Dropout(0.3) ->
    Linear(256, 128) -> ReLU -> Linear(128, num_classes).

    Args:
        input_dim: Size of the last dimension of the input.
        num_classes: Number of logits produced per sample.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        stack = []
        width_in = input_dim
        # Two hidden blocks of Linear -> BatchNorm -> ReLU -> Dropout; the
        # positional order fixes the "net.<index>" parameter names.
        for width_out in (512, 256):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.BatchNorm1d(width_out))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(0.3))
            width_in = width_out
        # Final hidden layer and output projection.
        stack.append(nn.Linear(width_in, 128))
        stack.append(nn.ReLU())
        stack.append(nn.Linear(128, num_classes))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (unnormalised) class logits for ``x``."""
        return self.net(x)

# Load models: rebuild one MLP per checkpoint file, restore its weights and
# put it on DEVICE in evaluation mode.
models = []
for ckpt_path in MODEL_PATHS:
    model = MLP(X_test.shape[1], len(le.classes_))
    weights = torch.load(ckpt_path, map_location=DEVICE)
    model.load_state_dict(weights)
    # Module.to() and Module.eval() both return the module itself.
    models.append(model.to(DEVICE).eval())

# Ensemble inference: sum the logits of every loaded model and count a hit
# whenever the true label is among the three highest-scoring classes.
recall_hits = 0
total = len(y_test)

with torch.no_grad():
    for start in tqdm(range(0, total, 16), desc="Ensemble Recall@3"):
        stop = start + 16
        xb = X_test[start:stop].to(DEVICE)
        labels = y_test[start:stop]

        # Accumulate raw logits across ensemble members.
        logits_sum = torch.zeros((xb.size(0), len(le.classes_)), device=DEVICE)
        for model in models:
            logits_sum += model(xb)

        top3 = torch.topk(logits_sum, k=3, dim=1).indices.cpu()

        # Compare each row's three candidates with its label in one tensor op;
        # a row is a hit when any of the three matches.
        row_hit = (top3 == labels.unsqueeze(1)).any(dim=1)
        recall_hits += int(row_hit.sum().item())

print(f"\nEnsemble Recall@3: {recall_hits}/{total} = {recall_hits / total:.2%}")


Ensemble Recall@3: 100%|██████████| 27/27 [00:00<00:00, 712.85it/s]


✅ Ensemble Recall@3: 412/430 = 95.81%



