In [6]:
# Mount Google Drive at /content/drive; the checkpoint, dataset and exported
# artifacts used by later cells are read from / written to /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
!pip install transformers onnx



### 모델&토크나이저 로드 + state_dict 적용

In [8]:
import torch

# 1. Load the original checkpoint (treated below as a flat name -> tensor mapping).
#    weights_only=True restricts unpickling to tensors and plain containers, so the
#    load cannot execute arbitrary pickled code and behaves the same across
#    PyTorch versions regardless of the default for this flag.
state_dict = torch.load(
    "/content/drive/MyDrive/retrained_subcat_focus.pt",
    map_location="cpu",
    weights_only=True,
)

# 2. Keep only the encoder ("backbone.") and sub-category head ("subcat_head.")
#    entries; the head is renamed to "classifier." to match the export model.
new_state_dict = {}
for k, v in state_dict.items():
    if k.startswith("backbone."):
        new_state_dict[k] = v
    elif k.startswith("subcat_head."):
        # count=1: rewrite only the leading prefix, never a later occurrence of
        # the same substring inside the parameter name.
        new_key = k.replace("subcat_head.", "classifier.", 1)
        new_state_dict[new_key] = v

# Fail here with a clear message instead of silently saving an empty file that
# would only break later when the weights are loaded into the model.
if not new_state_dict:
    raise ValueError(
        "checkpoint contains no 'backbone.' or 'subcat_head.' keys; "
        "check the checkpoint layout before exporting"
    )

# 3. Save the filtered / renamed weights.
torch.save(new_state_dict, "/content/drive/MyDrive/subcat_classifier_clean.pt")


In [9]:
import torch.nn as nn
from transformers import AutoModel

class SubcatOnlyRobertaClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear sub-category head.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
        num_subcat_classes: output size of the linear classification head.
    """

    def __init__(self, model_name, num_subcat_classes):
        super().__init__()
        # Attribute names (and their registration order) define the state_dict
        # keys "backbone.*" / "classifier.*", so they must stay as they are.
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.1)
        hidden_size = self.backbone.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_subcat_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits computed from the first token's hidden state."""
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis is used as the pooled representation.
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.classifier(self.dropout(first_token_state))


In [10]:
# Build the export model; the head has 20 outputs.
model = SubcatOnlyRobertaClassifier("klue/roberta-base", num_subcat_classes=20)

# weights_only=True: the file is a plain dict of tensors, so restrict unpickling
# to tensors/containers instead of allowing arbitrary pickled objects.
state_dict = torch.load(
    "/content/drive/MyDrive/subcat_classifier_clean.pt",
    map_location="cpu",
    weights_only=True,
)
# load_state_dict is strict by default: any missing or unexpected key raises, so
# this line completing without an error means every parameter name matched.
model.load_state_dict(state_dict)
# Switch to inference mode (disables dropout); as the cell's last expression the
# returned module is also displayed.
model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SubcatOnlyRobertaClassifier(
  (backbone): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

### ONNX export

In [11]:
from transformers import AutoTokenizer

# Tokenize one sample sentence; it only serves as the example input for tracing.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
dummy_input = tokenizer("장학금 신청은 언제?", return_tensors="pt")

# Export the classifier to ONNX.
# Both the batch axis (0) and the sequence axis (1) of the inputs are declared
# dynamic. With only axis 0 dynamic, the exported graph would be fixed to the
# token count of the sample sentence, and inference on text of any other length
# would be rejected with a shape mismatch.
torch.onnx.export(
    model,
    (dummy_input["input_ids"], dummy_input["attention_mask"]),
    "subcat_model.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        # logits come from the first token only, so just the batch axis varies.
        "logits": {0: "batch_size"}
    },
    opset_version=17
)


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

### ONNX 양자화

In [12]:
!pip install onnxruntime onnxruntime-tools



In [13]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# File paths: FP32 model produced by the export step, and the quantized output.
model_fp32 = "subcat_model.onnx"
model_quant = "subcat_model_quant.onnx"

# Run dynamic quantization over the whole model with signed 8-bit weights
# (QuantType.QUInt8 is the unsigned alternative noted by the original author).
quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QInt8)

print(f"✅ 양자화된 모델 저장 완료: {model_quant}")




✅ 양자화된 모델 저장 완료: subcat_model_quant.onnx


#### 양자화 결과 비교

In [14]:
import os

# Report the on-disk size (in MB) of the model before and after quantization.
# "After " carries a trailing space so the colons line up in the output.
for tag, onnx_path in (("Before", model_fp32), ("After ", model_quant)):
    print(f"{tag}: {os.path.getsize(onnx_path) / 1024 / 1024:.2f} MB")

Before: 420.04 MB
After : 105.75 MB


In [55]:
from sklearn.preprocessing import LabelEncoder
import joblib
import pandas as pd
import json

# Read the labelled dataset from Drive and put it in a DataFrame.
with open("/content/drive/MyDrive/data_with_category.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Labels come from the "subcategory" column (per the original author's note,
# the same column that was used at training time).
subcategory_list = df["subcategory"].tolist()

# fit() returns the fitted encoder itself, so construction and fitting are chained.
label_encoder = LabelEncoder().fit(subcategory_list)

# Persist the encoder to Drive; as the cell's last expression, the return value
# of joblib.dump is displayed.
joblib.dump(label_encoder, "/content/drive/MyDrive/subcategory_label_encoder.pkl")


['/content/drive/MyDrive/subcategory_label_encoder.pkl']

In [56]:
import joblib
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer
import onnxruntime
import numpy as np

# ✅ Setup: tokenizer, ONNX Runtime session (CPU provider) for the quantized
# model in the working directory, and the label encoder saved on Drive.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
session = onnxruntime.InferenceSession("subcat_model_quant.onnx", providers=["CPUExecutionProvider"])
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code —
# only load encoder files you produced yourself.
label_encoder = joblib.load("/content/drive/MyDrive/subcategory_label_encoder.pkl")  # ← be sure to verify this path

# API definition
app = FastAPI()

# Request body for POST /predict: a single field holding the text to classify.
class Question(BaseModel):
    text: str  # passed to the tokenizer as-is by predict()

@app.post("/predict")
def predict(question: Question):
    # Classify one question: tokenize -> ONNX inference -> argmax -> decode label.
    # Returns the input text, the winning class index and its decoded label.

    # Tokenize straight into NumPy arrays, which is what the ONNX session consumes.
    encoded = tokenizer(question.text, return_tensors="np", padding=True, truncation=True)

    # Feed names must match the input_names used when the model was exported.
    feeds = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }

    # Passing None requests every model output; the first one holds the logits.
    logits = session.run(None, feeds)[0]

    # Single request -> take row 0 of the per-row argmax; int() turns the NumPy
    # integer into a plain Python int for the JSON response.
    pred_index = int(logits.argmax(axis=-1)[0])

    # Map the class index back to its original label.
    pred_label = label_encoder.inverse_transform([pred_index])[0]

    return {
        "input": question.text,
        "predicted_index": pred_index,
        "predicted_label": pred_label
    }


In [15]:
import shutil

# Source: the quantized ONNX file in the Colab working directory.
local_path = "subcat_model_quant.onnx"
# Destination: the same file name under Google Drive.
drive_path = "/content/drive/MyDrive/subcat_model_quant.onnx"

# Copy the file; the destination path returned by shutil.copy is displayed as
# the cell output.
shutil.copy(src=local_path, dst=drive_path)


'/content/drive/MyDrive/subcat_model_quant.onnx'