In [2]:
from google.colab import drive
import json
drive.mount('/content/drive')

Mounted at /content/drive


### 모델 학습 코드

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import json
import joblib
from tqdm import tqdm
from torch.optim import AdamW

# ✅ 설정
model_name = "klue/roberta-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 데이터 로딩, 질문과 답변을 합쳐 입력 텍스트 생성
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    raw = json.load(f)
df = pd.DataFrame(raw)
df["text"] = df["question"].str.strip() + " " + df["answer"].str.strip()

# ✅ 라벨 인코딩, category를 숫자 라벨로 변환
le = LabelEncoder()
df["label"] = le.fit_transform(df["category"])
joblib.dump(le, "category_label_encoder.pkl")

# ✅ Tokenizer 로드
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ 커스텀 Dataset,토큰화된 입력과 레이블 반환
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# ✅ 데이터셋 나누기
train_texts = df["text"].tolist()
train_labels = df["label"].tolist()
dataset = TextDataset(train_texts, train_labels)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

# ✅ 모델 정의
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(le.classes_)
).to(device)

# ✅ 옵티마이저 및 손실 함수 설정
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# ✅ 학습 루프,
model.train()
epochs = 5

# 에폭을 반복하며 배치별로 학습,로스 출력
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

# ✅ 학습된 모델, 토크나이저 저장
model.save_pretrained("roberta-category-manual")
tokenizer.save_pretrained("roberta-category-manual")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1903/1903 [02:39<00:00, 11.93it/s]


[Epoch 1] Loss: 797.2398


100%|██████████| 1903/1903 [02:39<00:00, 11.94it/s]


[Epoch 2] Loss: 981.8512


100%|██████████| 1903/1903 [02:39<00:00, 11.97it/s]


[Epoch 3] Loss: 1745.1392


100%|██████████| 1903/1903 [02:39<00:00, 11.95it/s]


[Epoch 4] Loss: 1927.0037


100%|██████████| 1903/1903 [02:39<00:00, 11.94it/s]


[Epoch 5] Loss: 1917.4106


('roberta-category-manual/tokenizer_config.json',
 'roberta-category-manual/special_tokens_map.json',
 'roberta-category-manual/vocab.txt',
 'roberta-category-manual/added_tokens.json',
 'roberta-category-manual/tokenizer.json')

### 추론 코드

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib
import torch.nn.functional as F

# ✅ 저장된 모델, 토크나이저, 라벨 인코더 불러오기
model_path = "roberta-category-manual"  # 너가 저장한 경로
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
label_encoder = joblib.load("category_label_encoder.pkl")

model.eval()

# ✅ 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ✅ 예측 함수
def predict_category(question: str) -> str:
    # 입력 전처리
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)

    # 추론
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        pred_label = torch.argmax(probs, dim=1).item()

    # 라벨 디코딩
    category = label_encoder.inverse_transform([pred_label])[0]
    return category

# ✅ 예시
question = "복학을 하려면 어떤 절차를 밟아야 하나요?"
predicted_category = predict_category(question)
print(f"📌 예측된 category: {predicted_category}")


📌 예측된 category: 학사


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# ✅ 모델 평가 모드로 전환
model.eval()

# ✅ 전체 추론용 데이터 (텍스트 & 정답 라벨)
texts = df["text"].tolist()
true_labels = df["label"].tolist()

pred_labels = []

# ✅ 배치로 추론
batch_size = 32
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        pred_labels.extend(preds)

# ✅ classification report 출력
print(classification_report(true_labels, pred_labels, target_names=le.classes_))


              precision    recall  f1-score   support

          국제       0.00      0.00      0.00       263
         동아리       0.00      0.00      0.00       138
          일반       0.40      0.12      0.18      3808
          장학       0.00      0.00      0.00      1019
          진로       0.00      0.00      0.00       307
          학사       0.65      0.95      0.77      9686

    accuracy                           0.63     15221
   macro avg       0.18      0.18      0.16     15221
weighted avg       0.51      0.63      0.54     15221



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 병렬 분류 모델

In [4]:
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
import joblib

# 🔹 데이터 불러오기
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    data = json.load(f)
df = pd.DataFrame(data)
df["text"] = df["question"].str.strip() + " " + df["answer"].str.strip()

# 🔹 라벨 인코딩
cat_le = LabelEncoder()
subcat_le = LabelEncoder()
df["cat_label"] = cat_le.fit_transform(df["category"])
df["subcat_label"] = subcat_le.fit_transform(df["subcategory"])

joblib.dump(cat_le, "category_label_encoder.pkl")
joblib.dump(subcat_le, "subcategory_label_encoder.pkl")


['subcategory_label_encoder.pkl']

In [None]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset

tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

#커스텀 Dataset 정의
class MultiOutputDataset(Dataset):
    def __init__(self, texts, cat_labels, subcat_labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)
        self.cat_labels = cat_labels
        self.subcat_labels = subcat_labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["cat_label"] = torch.tensor(self.cat_labels[idx])
        item["subcat_label"] = torch.tensor(self.subcat_labels[idx])
        return item

    def __len__(self):
        return len(self.cat_labels)


In [None]:
from transformers import AutoModel
import torch.nn as nn

class RobertaMultiHeadClassifier(nn.Module):
    def __init__(self, model_name, num_cat_classes, num_subcat_classes):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.cat_head = nn.Linear(self.backbone.config.hidden_size, num_cat_classes)
        self.subcat_head = nn.Linear(self.backbone.config.hidden_size, num_subcat_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        cat_logits = self.cat_head(pooled)
        subcat_logits = self.subcat_head(pooled)
        return cat_logits, subcat_logits


In [None]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim import AdamW
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 🔹 데이터 준비
dataset = MultiOutputDataset(df["text"].tolist(), df["cat_label"].tolist(), df["subcat_label"].tolist())
loader = DataLoader(dataset, batch_size=8, shuffle=True)

# 🔹 모델
model = RobertaMultiHeadClassifier("klue/roberta-base", len(cat_le.classes_), len(subcat_le.classes_)).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# 🔹 학습
model.train()
for epoch in range(3):
    total_loss = 0
    for batch in tqdm(loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        cat_labels = batch["cat_label"].to(device)
        subcat_labels = batch["subcat_label"].to(device)

        optimizer.zero_grad()
        cat_logits, subcat_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        cat_loss = F.cross_entropy(cat_logits, cat_labels)
        subcat_loss = F.cross_entropy(subcat_logits, subcat_labels)
        loss = cat_loss + subcat_loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1903/1903 [02:42<00:00, 11.70it/s]


[Epoch 1] Loss: 2561.1003


100%|██████████| 1903/1903 [02:42<00:00, 11.71it/s]


[Epoch 2] Loss: 1813.7088


100%|██████████| 1903/1903 [02:42<00:00, 11.73it/s]

[Epoch 3] Loss: 1430.0917





In [None]:
torch.save(model.state_dict(), "multihead_model.pt")

#### 추론

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
import joblib

# ✅ 모델 로드
model = RobertaMultiHeadClassifier(
    model_name="klue/roberta-base",
    num_cat_classes=len(cat_le.classes_),
    num_subcat_classes=len(subcat_le.classes_)
)
model.load_state_dict(torch.load("multihead_model.pt", map_location="cpu"))
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ✅ 토크나이저 & 인코더
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
cat_le = joblib.load("category_label_encoder.pkl")
subcat_le = joblib.load("subcategory_label_encoder.pkl")

# ✅ 추론 함수
def predict_category_and_subcategory(question: str, answer: str = ""):
    text = question.strip() + " " + answer.strip()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)

    with torch.no_grad():
        cat_logits, subcat_logits = model(**inputs)
        cat_pred = torch.argmax(F.softmax(cat_logits, dim=1), dim=1).item()
        subcat_pred = torch.argmax(F.softmax(subcat_logits, dim=1), dim=1).item()

    category = cat_le.inverse_transform([cat_pred])[0]
    subcategory = subcat_le.inverse_transform([subcat_pred])[0]
    return category, subcategory


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
question = "교환학생 신청은 어디서 하나요?"
answer = "교무처 국제팀에서 공지 확인 가능합니다."

cat, subcat = predict_category_and_subcategory(question, answer)
print(f"📌 예측된 category: {cat}")
print(f"📌 예측된 subcategory: {subcat}")


📌 예측된 category: 국제
📌 예측된 subcategory: 국제교류(교환학생,어학연수)


### 훈련된 모델로 실제 데이터 전체를 태깅

In [None]:
import torch
import json
from tqdm import tqdm
import pandas as pd
from transformers import AutoTokenizer
import joblib
from torch.nn.functional import softmax

# ✅ 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 모델 로드
model = RobertaMultiHeadClassifier(
    model_name="klue/roberta-base",
    num_cat_classes=6,  # 수정: 카테고리 수
    num_subcat_classes=20  # 수정: 서브카테고리 수
)
model.load_state_dict(torch.load("multihead_model.pt", map_location=device))
model.to(device)
model.eval()

# ✅ 토크나이저 & 인코더 로드
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
cat_le = joblib.load("category_label_encoder.pkl")
subcat_le = joblib.load("subcategory_label_encoder.pkl")

# ✅ 데이터 로드
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ 전체 분류 수행
for item in tqdm(data):
    q, a = item["question"].strip(), item["answer"].strip()
    text = q + " " + a

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
    #기울기 계산 비활성화로 추론 최적화,로짓 -> softmax -> argma: 확률이 가장 높은 클래스를 예측값으로 선택
    with torch.no_grad():
        cat_logits, subcat_logits = model(**inputs)
        cat_pred = torch.argmax(softmax(cat_logits, dim=1), dim=1).item()
        subcat_pred = torch.argmax(softmax(subcat_logits, dim=1), dim=1).item()

    #정수 예측값을 실제 문자열 라벨로 변환하여 딕셔너리에 추가
    item["predicted_category"] = cat_le.inverse_transform([cat_pred])[0]
    item["predicted_subcategory"] = subcat_le.inverse_transform([subcat_pred])[0]

# ✅ 결과 저장
with open("/content/drive/MyDrive/data_with_predicted_cat_subcat.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ 예측 결과 저장 완료 → data_with_predicted_cat_subcat.json")


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 15221/15221 [02:32<00:00, 99.89it/s] 


✅ 예측 결과 저장 완료 → data_with_predicted_cat_subcat.json


In [3]:
import json
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm
import joblib
import torch.nn.functional as F

# 설정
model_name = "klue/roberta-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 🔹 예측 결과 데이터 로드
with open("/content/drive/MyDrive/data_with_predicted_cat_subcat.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 🔹 잘못 분류된 샘플만 추출
wrong_preds = [
    item for item in data
    if item["category"] != item["predicted_category"]
    or item["subcategory"] != item["predicted_subcategory"]
]

# 🔹 잘못된 예측 데이터프레임 변환
df_wrong = pd.DataFrame(wrong_preds)
df_wrong["text"] = df_wrong["question"].str.strip() + " " + df_wrong["answer"].str.strip()

# 🔹 원본 정답 데이터 불러오기
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    original_data = json.load(f)

df_orig = pd.DataFrame(original_data)
df_orig["text"] = df_orig["question"].str.strip() + " " + df_orig["answer"].str.strip()

# 🔹 라벨 인코더 로드
cat_le = joblib.load("category_label_encoder.pkl")
subcat_le = joblib.load("subcategory_label_encoder.pkl")

df_wrong["cat_label"] = cat_le.transform(df_wrong["category"])
df_wrong["subcat_label"] = subcat_le.transform(df_wrong["subcategory"])
df_orig["cat_label"] = cat_le.transform(df_orig["category"])
df_orig["subcat_label"] = subcat_le.transform(df_orig["subcategory"])

# 🔹 합치고 중복 제거
df_all = pd.concat([df_orig, df_wrong]).drop_duplicates(subset=["text", "cat_label", "subcat_label"])

# 🔹 토크나이저
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🔹 Dataset 클래스
class MultiOutputDataset(Dataset):
    def __init__(self, texts, cat_labels, subcat_labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)
        self.cat_labels = cat_labels
        self.subcat_labels = subcat_labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["cat_label"] = torch.tensor(self.cat_labels[idx])
        item["subcat_label"] = torch.tensor(self.subcat_labels[idx])
        return item

    def __len__(self):
        return len(self.cat_labels)

# 🔹 DataLoader 준비
dataset = MultiOutputDataset(df_all["text"].tolist(), df_all["cat_label"].tolist(), df_all["subcat_label"].tolist())
loader = DataLoader(dataset, batch_size=8, shuffle=True)

# 🔹 모델 정의
class RobertaMultiHeadClassifier(nn.Module):
    def __init__(self, model_name, num_cat_classes, num_subcat_classes):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.cat_head = nn.Linear(self.backbone.config.hidden_size, num_cat_classes)
        self.subcat_head = nn.Linear(self.backbone.config.hidden_size, num_subcat_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        cat_logits = self.cat_head(pooled)
        subcat_logits = self.subcat_head(pooled)
        return cat_logits, subcat_logits

# 🔹 학습 시작
model = RobertaMultiHeadClassifier(model_name, len(cat_le.classes_), len(subcat_le.classes_)).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

for epoch in range(3):
    total_loss = 0
    for batch in tqdm(loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        cat_labels = batch["cat_label"].to(device)
        subcat_labels = batch["subcat_label"].to(device)

        optimizer.zero_grad()
        cat_logits, subcat_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        cat_loss = F.cross_entropy(cat_logits, cat_labels)
        subcat_loss = F.cross_entropy(subcat_logits, subcat_labels)
        loss = cat_loss + subcat_loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

# 🔹 모델 저장
torch.save(model.state_dict(), "/content/drive/MyDrive/retrained_multihead_model.pt")
print("✅ 재학습 완료 및 저장")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1900/1900 [02:42<00:00, 11.68it/s]


[Epoch 1] Loss: 2598.9127


100%|██████████| 1900/1900 [02:41<00:00, 11.79it/s]


[Epoch 2] Loss: 1889.9362


100%|██████████| 1900/1900 [02:41<00:00, 11.79it/s]


[Epoch 3] Loss: 1649.3179
✅ 재학습 완료 및 저장


In [4]:
import torch
import json
import pandas as pd
from transformers import AutoTokenizer
import joblib
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, classification_report

# ✅ 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 모델 로드
model = RobertaMultiHeadClassifier(
    model_name="klue/roberta-base",
    num_cat_classes=len(cat_le.classes_),
    num_subcat_classes=len(subcat_le.classes_)
)
model.load_state_dict(torch.load("/content/drive/MyDrive/retrained_multihead_model.pt", map_location=device))
model.to(device)
model.eval()

# ✅ 인코더 & 토크나이저
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
cat_le = joblib.load("category_label_encoder.pkl")
subcat_le = joblib.load("subcategory_label_encoder.pkl")

# ✅ 평가용 데이터 로드
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ 예측 수행
y_cat_true, y_cat_pred = [], []
y_subcat_true, y_subcat_pred = [], []

for item in data:
    text = item["question"].strip() + " " + item["answer"].strip()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)

    with torch.no_grad():
        cat_logits, subcat_logits = model(**inputs)
        cat_pred = torch.argmax(F.softmax(cat_logits, dim=1), dim=1).item()
        subcat_pred = torch.argmax(F.softmax(subcat_logits, dim=1), dim=1).item()

    # 정답 레이블
    y_cat_true.append(cat_le.transform([item["category"]])[0])
    y_cat_pred.append(cat_pred)

    y_subcat_true.append(subcat_le.transform([item["subcategory"]])[0])
    y_subcat_pred.append(subcat_pred)

# ✅ 정확도 출력
print("🔹 [Category] Accuracy:", accuracy_score(y_cat_true, y_cat_pred))
print("🔹 [Subcategory] Accuracy:", accuracy_score(y_subcat_true, y_subcat_pred))

# ✅ 상세 리포트 (optional)
print("\n📋 Category Classification Report:")
print(classification_report(y_cat_true, y_cat_pred, target_names=cat_le.classes_))

print("\n📋 Subcategory Classification Report:")
print(classification_report(y_subcat_true, y_subcat_pred, target_names=subcat_le.classes_))


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔹 [Category] Accuracy: 0.9366664476709808
🔹 [Subcategory] Accuracy: 0.8370672097759674

📋 Category Classification Report:
              precision    recall  f1-score   support

          국제       0.62      0.97      0.76       263
         동아리       0.98      0.97      0.97       138
          일반       0.96      0.82      0.88      3808
          장학       0.98      0.93      0.95      1019
          진로       0.91      0.83      0.87       307
          학사       0.94      0.99      0.96      9686

    accuracy                           0.94     15221
   macro avg       0.90      0.92      0.90     15221
weighted avg       0.94      0.94      0.94     15221


📋 Subcategory Classification Report:
                  precision    recall  f1-score   support

           IT서비스       0.79      0.42      0.55       140
              교직       0.93      0.90      0.91       296
국제(외국인유학생,한국어연수)       0.00      0.00      0.00        40
 국제교류(교환학생,어학연수)       0.53      0.99      0.69       223
      

### (잘못 분류된 데이터 + 원본 데이터로) 재학습 모델로 예측->JSON 저장

In [5]:
import torch
import json
from tqdm import tqdm
from transformers import AutoTokenizer
import joblib
import torch.nn.functional as F

# ✅ 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 토크나이저 & 라벨 인코더 로드
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
cat_le = joblib.load("category_label_encoder.pkl")
subcat_le = joblib.load("subcategory_label_encoder.pkl")

# ✅ 모델 정의 (클래스 필요 시 함께 정의)
class RobertaMultiHeadClassifier(torch.nn.Module):
    def __init__(self, model_name, num_cat_classes, num_subcat_classes):
        super().__init__()
        from transformers import AutoModel
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(0.1)
        self.cat_head = torch.nn.Linear(self.backbone.config.hidden_size, num_cat_classes)
        self.subcat_head = torch.nn.Linear(self.backbone.config.hidden_size, num_subcat_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        cat_logits = self.cat_head(pooled)
        subcat_logits = self.subcat_head(pooled)
        return cat_logits, subcat_logits

# ✅ 모델 로드
model = RobertaMultiHeadClassifier(
    model_name="klue/roberta-base",
    num_cat_classes=len(cat_le.classes_),
    num_subcat_classes=len(subcat_le.classes_)
)
model.load_state_dict(torch.load("/content/drive/MyDrive/retrained_multihead_model.pt", map_location=device))
model.to(device)
model.eval()

# ✅ 예측 대상 데이터 로드
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ 예측 수행
for item in tqdm(data):
    q, a = item["question"].strip(), item["answer"].strip()
    text = q + " " + a

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)

    with torch.no_grad():
        cat_logits, subcat_logits = model(**inputs)
        cat_pred = torch.argmax(F.softmax(cat_logits, dim=1), dim=1).item()
        subcat_pred = torch.argmax(F.softmax(subcat_logits, dim=1), dim=1).item()

    item["predicted_category"] = cat_le.inverse_transform([cat_pred])[0]
    item["predicted_subcategory"] = subcat_le.inverse_transform([subcat_pred])[0]

# ✅ 결과 저장
with open("/content/drive/MyDrive/data_with_predicted_retrained.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ 저장 완료 → data_with_predicted_retrained.json")


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 15221/15221 [02:26<00:00, 104.09it/s]


✅ 저장 완료 → data_with_predicted_retrained.json


In [6]:
import json

# ✅ 데이터 로드
with open("/content/drive/MyDrive/data_with_predicted_retrained.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ 통계 변수 초기화
only_category_diff = 0
only_subcategory_diff = 0
both_diff = 0
all_correct = 0

# ✅ 비교 및 카운팅
for item in data:
    cat_true = item.get("category")
    cat_pred = item.get("predicted_category")
    subcat_true = item.get("subcategory")
    subcat_pred = item.get("predicted_subcategory")

    cat_diff = (cat_true != cat_pred)
    subcat_diff = (subcat_true != subcat_pred)

    if cat_diff and subcat_diff:
        both_diff += 1
    elif cat_diff and not subcat_diff:
        only_category_diff += 1
    elif subcat_diff and not cat_diff:
        only_subcategory_diff += 1
    else:
        all_correct += 1

# ✅ 결과 출력
print(f"❌ category만 다름: {only_category_diff}개")
print(f"❌ subcategory만 다름: {only_subcategory_diff}개")
print(f"❌ 둘 다 다름: {both_diff}개")
print(f"✅ 모두 일치: {all_correct}개")


❌ category만 다름: 130개
❌ subcategory만 다름: 1646개
❌ 둘 다 다름: 834개
✅ 모두 일치: 12611개


### 오답+정답 혼합 재학습 코드 (retrained_multihead_model.pt 이어서 학습)

In [14]:
import json
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm
import joblib
import torch.nn.functional as F

# ✅ 설정
model_name = "klue/roberta-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 데이터 로드
with open("/content/drive/MyDrive/data_with_predicted_retrained.json", "r", encoding="utf-8") as f:
    pred_data = json.load(f)

# ✅ 오답 샘플만 추출
wrong_samples = [
    item for item in pred_data
    if item["category"] != item["predicted_category"]
    or item["subcategory"] != item["predicted_subcategory"]
]

# ✅ 원본 정답 데이터 로드
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    original_data = json.load(f)

# ✅ DataFrame 변환 및 전처리
df_wrong = pd.DataFrame(wrong_samples)
df_orig = pd.DataFrame(original_data)

df_wrong["text"] = df_wrong["question"].str.strip() + " " + df_wrong["answer"].str.strip()
df_orig["text"] = df_orig["question"].str.strip() + " " + df_orig["answer"].str.strip()

# ✅ 라벨 인코더 로드
cat_le = joblib.load("category_label_encoder.pkl")
subcat_le = joblib.load("subcategory_label_encoder.pkl")

df_wrong["cat_label"] = cat_le.transform(df_wrong["category"])
df_wrong["subcat_label"] = subcat_le.transform(df_wrong["subcategory"])
df_orig["cat_label"] = cat_le.transform(df_orig["category"])
df_orig["subcat_label"] = subcat_le.transform(df_orig["subcategory"])

# ✅ 오답 + 정답 병합, 중복 제거
df_all = pd.concat([df_orig, df_wrong]).drop_duplicates(subset=["text", "cat_label", "subcat_label"])

# ✅ Dataset 클래스
class MultiOutputDataset(Dataset):
    def __init__(self, texts, cat_labels, subcat_labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)
        self.cat_labels = cat_labels
        self.subcat_labels = subcat_labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["cat_label"] = torch.tensor(self.cat_labels[idx])
        item["subcat_label"] = torch.tensor(self.subcat_labels[idx])
        return item

    def __len__(self):
        return len(self.cat_labels)

# ✅ 토크나이저
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ DataLoader
dataset = MultiOutputDataset(df_all["text"].tolist(), df_all["cat_label"].tolist(), df_all["subcat_label"].tolist())
loader = DataLoader(dataset, batch_size=8, shuffle=True)

# ✅ 모델 클래스 정의
class RobertaMultiHeadClassifier(nn.Module):
    def __init__(self, model_name, num_cat_classes, num_subcat_classes):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.cat_head = nn.Linear(self.backbone.config.hidden_size, num_cat_classes)
        self.subcat_head = nn.Linear(self.backbone.config.hidden_size, num_subcat_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        cat_logits = self.cat_head(pooled)
        subcat_logits = self.subcat_head(pooled)
        return cat_logits, subcat_logits

# ✅ 기존 모델 로드 (이어 학습)
model = RobertaMultiHeadClassifier(model_name, len(cat_le.classes_), len(subcat_le.classes_)).to(device)
model.load_state_dict(torch.load("/content/drive/MyDrive/retrained_multihead_model.pt", map_location=device))
optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

# ✅ 학습 루프
for epoch in range(3):
    total_loss = 0
    for batch in tqdm(loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        cat_labels = batch["cat_label"].to(device)
        subcat_labels = batch["subcat_label"].to(device)

        optimizer.zero_grad()
        cat_logits, subcat_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        cat_loss = F.cross_entropy(cat_logits, cat_labels)
        subcat_loss = F.cross_entropy(subcat_logits, subcat_labels)
        # ✅ 가중치 조정 적용
        loss = 0.3 * cat_loss + 0.7 * subcat_loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

# ✅ 저장
torch.save(model.state_dict(), "/content/drive/MyDrive/retrained_multihead_model_v2.pt")
print("✅ 오답 + 정답 혼합 재학습 완료 및 저장")


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1900/1900 [02:41<00:00, 11.80it/s]


[Epoch 1] Loss: 816.8387


100%|██████████| 1900/1900 [02:41<00:00, 11.80it/s]


[Epoch 2] Loss: 766.4871


100%|██████████| 1900/1900 [02:41<00:00, 11.78it/s]


[Epoch 3] Loss: 2362.5431
✅ 오답 + 정답 혼합 재학습 완료 및 저장


In [4]:
import json
from sklearn.metrics import accuracy_score

# ✅ 1. 재학습 전 예측 결과 로드
with open("/content/drive/MyDrive/data_with_predicted_cat_subcat.json", "r", encoding="utf-8") as f:
    data_before = json.load(f)

# ✅ 2. 재학습 후 예측 결과 로드
with open("/content/drive/MyDrive/data_with_predicted_retrained.json", "r", encoding="utf-8") as f:
    data_after = json.load(f)

# ✅ 3. 비교용 리스트 초기화
cat_labels_true = []
cat_preds_before = []
cat_preds_after = []

subcat_labels_true = []
subcat_preds_before = []
subcat_preds_after = []

both_correct_before = 0
both_correct_after = 0

# ✅ 4. 비교 반복
for b, a in zip(data_before, data_after):
    cat_true = b["category"]
    subcat_true = b["subcategory"]

    cat_pred_before = b["predicted_category"]
    cat_pred_after = a["predicted_category"]

    subcat_pred_before = b["predicted_subcategory"]
    subcat_pred_after = a["predicted_subcategory"]

    cat_labels_true.append(cat_true)
    cat_preds_before.append(cat_pred_before)
    cat_preds_after.append(cat_pred_after)

    subcat_labels_true.append(subcat_true)
    subcat_preds_before.append(subcat_pred_before)
    subcat_preds_after.append(subcat_pred_after)

    if cat_true == cat_pred_before and subcat_true == subcat_pred_before:
        both_correct_before += 1
    if cat_true == cat_pred_after and subcat_true == subcat_pred_after:
        both_correct_after += 1

total = len(cat_labels_true)

# ✅ 5. 정확도 계산 및 출력
cat_acc_before = accuracy_score(cat_labels_true, cat_preds_before)
cat_acc_after = accuracy_score(cat_labels_true, cat_preds_after)

subcat_acc_before = accuracy_score(subcat_labels_true, subcat_preds_before)
subcat_acc_after = accuracy_score(subcat_labels_true, subcat_preds_after)

print(f"🎯 Category Accuracy - BEFORE: {cat_acc_before:.4f} → AFTER: {cat_acc_after:.4f}")
print(f"🎯 Subcategory Accuracy - BEFORE: {subcat_acc_before:.4f} → AFTER: {subcat_acc_after:.4f}")
print(f"✅ 둘 다 맞은 비율 - BEFORE: {both_correct_before}/{total} ({both_correct_before/total:.4f})")
print(f"✅ 둘 다 맞은 비율 - AFTER:  {both_correct_after}/{total} ({both_correct_after/total:.4f})")


🎯 Category Accuracy - BEFORE: 0.9508 → AFTER: 0.9367
🎯 Subcategory Accuracy - BEFORE: 0.8457 → AFTER: 0.8371
✅ 둘 다 맞은 비율 - BEFORE: 12746/15221 (0.8374)
✅ 둘 다 맞은 비율 - AFTER:  12611/15221 (0.8285)


### subcategory만 틀린 샘플만 골라서 재학습

In [5]:
with open("/content/drive/MyDrive/data_with_predicted_retrained.json", "r", encoding="utf-8") as f:
    data = json.load(f)

wrong_subcat_only = [
    item for item in data
    if item["category"] == item["predicted_category"]
    and item["subcategory"] != item["predicted_subcategory"]
]

In [9]:
import pandas as pd
import joblib

df_wrong = pd.DataFrame(wrong_subcat_only)
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    original_data = json.load(f)
df_orig = pd.DataFrame(original_data)

df_wrong["text"] = df_wrong["question"].str.strip() + " " + df_wrong["answer"].str.strip()
df_orig["text"] = df_orig["question"].str.strip() + " " + df_orig["answer"].str.strip()

cat_le = joblib.load("category_label_encoder.pkl")
subcat_le = joblib.load("subcategory_label_encoder.pkl")

df_wrong["cat_label"] = cat_le.transform(df_wrong["category"])
df_wrong["subcat_label"] = subcat_le.transform(df_wrong["subcategory"])
df_orig["cat_label"] = cat_le.transform(df_orig["category"])
df_orig["subcat_label"] = subcat_le.transform(df_orig["subcategory"])

df_all = pd.concat([df_orig, df_wrong]).drop_duplicates(subset=["text", "cat_label", "subcat_label"])


In [12]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch
from torch.optim import AdamW
import torch.nn.functional as F
from tqdm import tqdm

# ✅ 설정
model_name = "klue/roberta-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 토크나이저
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ 커스텀 Dataset
class MultiOutputDataset(Dataset):
    def __init__(self, texts, cat_labels, subcat_labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)
        self.cat_labels = cat_labels
        self.subcat_labels = subcat_labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["cat_label"] = torch.tensor(self.cat_labels[idx])
        item["subcat_label"] = torch.tensor(self.subcat_labels[idx])
        return item

    def __len__(self):
        return len(self.cat_labels)

# ✅ 모델 클래스
class RobertaMultiHeadClassifier(nn.Module):
    def __init__(self, model_name, num_cat_classes, num_subcat_classes):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.cat_head = nn.Linear(self.backbone.config.hidden_size, num_cat_classes)
        self.subcat_head = nn.Linear(self.backbone.config.hidden_size, num_subcat_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        cat_logits = self.cat_head(pooled)
        subcat_logits = self.subcat_head(pooled)
        return cat_logits, subcat_logits

# ✅ 모델 불러오기 (기존 모델 이어서)
model = RobertaMultiHeadClassifier(model_name, len(cat_le.classes_), len(subcat_le.classes_)).to(device)
model.load_state_dict(torch.load("/content/drive/MyDrive/retrained_multihead_model.pt", map_location=device))
model.train()

# ✅ DataLoader 준비
dataset = MultiOutputDataset(df_all["text"].tolist(), df_all["cat_label"].tolist(), df_all["subcat_label"].tolist())
loader = DataLoader(dataset, batch_size=8, shuffle=True)

# ✅ 옵티마이저 및 학습 루프
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    total_loss = 0
    for batch in tqdm(loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        cat_labels = batch["cat_label"].to(device)
        subcat_labels = batch["subcat_label"].to(device)

        optimizer.zero_grad()
        cat_logits, subcat_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        cat_loss = F.cross_entropy(cat_logits, cat_labels)
        subcat_loss = F.cross_entropy(subcat_logits, subcat_labels)
        loss = 0.3 * cat_loss + 0.7 * subcat_loss  # ✅ subcategory 중심 학습

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

# ✅ 모델 저장
torch.save(model.state_dict(), "/content/drive/MyDrive/retrained_subcat_focus.pt")
print("✅ subcategory만 틀린 데이터로 재학습 완료")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1900/1900 [02:43<00:00, 11.59it/s]


[Epoch 1] Loss: 837.2330


100%|██████████| 1900/1900 [02:42<00:00, 11.68it/s]


[Epoch 2] Loss: 751.8111


100%|██████████| 1900/1900 [02:42<00:00, 11.67it/s]


[Epoch 3] Loss: 681.6081
✅ subcategory만 틀린 데이터로 재학습 완료


### 학습 후 추론

In [8]:
import torch
import json
from tqdm import tqdm
from transformers import AutoTokenizer
import joblib
import torch.nn.functional as F

# ✅ 디바이스
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 로드: 인코더, 토크나이저
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
cat_le = joblib.load("category_label_encoder.pkl")
subcat_le = joblib.load("subcategory_label_encoder.pkl")

# ✅ 모델 클래스 정의
class RobertaMultiHeadClassifier(torch.nn.Module):
    def __init__(self, model_name, num_cat_classes, num_subcat_classes):
        super().__init__()
        from transformers import AutoModel
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(0.1)
        self.cat_head = torch.nn.Linear(self.backbone.config.hidden_size, num_cat_classes)
        self.subcat_head = torch.nn.Linear(self.backbone.config.hidden_size, num_subcat_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)
        cat_logits = self.cat_head(pooled)
        subcat_logits = self.subcat_head(pooled)
        return cat_logits, subcat_logits

# ✅ 모델 로드
model = RobertaMultiHeadClassifier(
    model_name="klue/roberta-base",
    num_cat_classes=len(cat_le.classes_),
    num_subcat_classes=len(subcat_le.classes_)
)
model.load_state_dict(torch.load("/content/drive/MyDrive/retrained_subcat_focus.pt", map_location=device))
model.to(device)
model.eval()

# ✅ 전체 데이터 불러오기
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ 예측 수행
for item in tqdm(data):
    q, a = item["question"].strip(), item["answer"].strip()
    text = q + " " + a

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)

    with torch.no_grad():
        cat_logits, subcat_logits = model(**inputs)
        cat_pred = torch.argmax(F.softmax(cat_logits, dim=1), dim=1).item()
        subcat_pred = torch.argmax(F.softmax(subcat_logits, dim=1), dim=1).item()

    item["predicted_category"] = cat_le.inverse_transform([cat_pred])[0]
    item["predicted_subcategory"] = subcat_le.inverse_transform([subcat_pred])[0]

# ✅ 결과 저장
with open("/content/drive/MyDrive/data_with_predicted_subcat_focus.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ 저장 완료 → data_with_predicted_subcat_focus.json")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 15221/15221 [02:31<00:00, 100.21it/s]


✅ 저장 완료 → data_with_predicted_subcat_focus.json


### 재학습 후 정확도 비교

In [15]:
import json
from sklearn.metrics import accuracy_score

# ✅ 1. 재학습 전 결과
with open("/content/drive/MyDrive/data_with_predicted_cat_subcat.json", "r", encoding="utf-8") as f:
    data_before = json.load(f)

# ✅ 2. subcat 재학습 후 결과
with open("/content/drive/MyDrive/data_with_predicted_subcat_focus.json", "r", encoding="utf-8") as f:
    data_after = json.load(f)

# ✅ 비교용 리스트
cat_labels_true = []
cat_preds_before, cat_preds_after = [], []
subcat_labels_true = []
subcat_preds_before, subcat_preds_after = [], []
both_correct_before = 0
both_correct_after = 0

# ✅ 정답 vs 예측 비교
for b, a in zip(data_before, data_after):
    cat_true = b["category"]
    subcat_true = b["subcategory"]

    cat_pred_before = b["predicted_category"]
    cat_pred_after = a["predicted_category"]

    subcat_pred_before = b["predicted_subcategory"]
    subcat_pred_after = a["predicted_subcategory"]

    cat_labels_true.append(cat_true)
    cat_preds_before.append(cat_pred_before)
    cat_preds_after.append(cat_pred_after)

    subcat_labels_true.append(subcat_true)
    subcat_preds_before.append(subcat_pred_before)
    subcat_preds_after.append(subcat_pred_after)

    if cat_true == cat_pred_before and subcat_true == subcat_pred_before:
        both_correct_before += 1
    if cat_true == cat_pred_after and subcat_true == subcat_pred_after:
        both_correct_after += 1

total = len(cat_labels_true)

# ✅ 정확도 출력
cat_acc_before = accuracy_score(cat_labels_true, cat_preds_before)
cat_acc_after = accuracy_score(cat_labels_true, cat_preds_after)
subcat_acc_before = accuracy_score(subcat_labels_true, subcat_preds_before)
subcat_acc_after = accuracy_score(subcat_labels_true, subcat_preds_after)

print(f"🎯 Category Accuracy - BEFORE: {cat_acc_before:.4f} → AFTER: {cat_acc_after:.4f}")
print(f"🎯 Subcategory Accuracy - BEFORE: {subcat_acc_before:.4f} → AFTER: {subcat_acc_after:.4f}")
print(f"✅ 둘 다 맞은 비율 - BEFORE: {both_correct_before}/{total} ({both_correct_before/total:.4f})")
print(f"✅ 둘 다 맞은 비율 - AFTER:  {both_correct_after}/{total} ({both_correct_after/total:.4f})")


🎯 Category Accuracy - BEFORE: 0.9508 → AFTER: 0.9647
🎯 Subcategory Accuracy - BEFORE: 0.8457 → AFTER: 0.9009
✅ 둘 다 맞은 비율 - BEFORE: 12746/15221 (0.8374)
✅ 둘 다 맞은 비율 - AFTER:  13693/15221 (0.8996)


In [17]:
import json

# ✅ 데이터 로드
with open("/content/drive/MyDrive/data_with_predicted_subcat_focus.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ 통계 변수 초기화
only_category_diff = 0
only_subcategory_diff = 0
both_diff = 0
all_correct = 0

# ✅ 비교 및 카운팅
for item in data:
    cat_true = item.get("category")
    cat_pred = item.get("predicted_category")
    subcat_true = item.get("subcategory")
    subcat_pred = item.get("predicted_subcategory")

    cat_diff = (cat_true != cat_pred)
    subcat_diff = (subcat_true != subcat_pred)

    if cat_diff and subcat_diff:
        both_diff += 1
    elif cat_diff and not subcat_diff:
        only_category_diff += 1
    elif subcat_diff and not cat_diff:
        only_subcategory_diff += 1
    else:
        all_correct += 1

# ✅ 결과 출력
print(f"❌ category만 다름: {only_category_diff}개")
print(f"❌ subcategory만 다름: {only_subcategory_diff}개")
print(f"❌ 둘 다 다름: {both_diff}개")
print(f"✅ 모두 일치: {all_correct}개")

❌ category만 다름: 20개
❌ subcategory만 다름: 991개
❌ 둘 다 다름: 517개
✅ 모두 일치: 13693개


In [21]:
import json

# 🔹 데이터 로드
with open("/content/drive/MyDrive/data_with_predicted_subcat_focus.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 🔹 question 앞에 [subcategory] 붙이기
for item in data:
    subcat = item["predicted_subcategory"] if "predicted_subcategory" in item else item["subcategory"]
    if not item["question"].strip().startswith(f"[{subcat}]"):
        item["question"] = f"[{subcat}]{item['question'].strip()}"

# 🔹 저장
with open("/content/drive/MyDrive/data_with_predicted_final_tagged.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ '[subcategory]'가 question 앞에 추가된 결과 저장 완료 → data_with_predicted_final_tagged.json")


✅ '[subcategory]'가 question 앞에 추가된 결과 저장 완료 → data_with_predicted_final_tagged.json
