In [None]:
import zipfile
import os

zip_path = "/content/drive/MyDrive/Colab Notebooks/Trained_file_zip_folder/finetuned_data(6).zip"
extract_path = "/content/finetuned_data(6)"  # temporary directory the archive is unpacked into (model used for fine-tuning)

# Unpack every member of the archive under extract_path.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_path)

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn

class TokenIntentJointModel(nn.Module):
    """Shared encoder with two linear heads: per-token slot tags and a sentence intent.

    Args:
        model_name: Identifier or path passed to ``AutoModel.from_pretrained``.
        num_slot_labels: Output size of the token-level (slot) classifier.
        num_intent_labels: Output size of the sentence-level (intent) classifier.
    """

    def __init__(self, model_name, num_slot_labels, num_intent_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.slot_classifier = nn.Linear(hidden_size, num_slot_labels)
        self.intent_classifier = nn.Linear(hidden_size, num_intent_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None, intent_label=None):
        """Run the encoder and both heads.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the encoder.
            labels: Optional slot label ids; positions equal to -100 are
                skipped by ``nn.CrossEntropyLoss`` (its default ignore_index).
            intent_label: Optional intent label ids, one per sequence.

        Returns:
            Dict with ``loss`` (the sum of whichever losses could be computed,
            or the integer 0 when no labels were given), ``slot_logits`` and
            ``intent_logits``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_states = outputs.last_hidden_state

        # Not every encoder exposes a pooled output (ELECTRA-style models do
        # not), so fall back to the hidden state of the first token instead of
        # failing on a missing attribute.
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            pooled = hidden_states[:, 0]

        sequence_output = self.dropout(hidden_states)
        pooled_output = self.dropout(pooled)

        slot_logits = self.slot_classifier(sequence_output)
        intent_logits = self.intent_classifier(pooled_output)

        loss = 0  # stays a plain 0 when neither label argument is provided
        loss_fct = nn.CrossEntropyLoss()
        if labels is not None:
            # Flatten (batch, seq, num_labels) -> (batch * seq, num_labels).
            loss += loss_fct(slot_logits.view(-1, slot_logits.shape[-1]), labels.view(-1))
        if intent_label is not None:
            loss += loss_fct(intent_logits, intent_label)
        return {"loss": loss, "slot_logits": slot_logits, "intent_logits": intent_logits}


In [None]:
# Model preparation: load the pretrained KoELECTRA tokenizer.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

def preprocess_input(examples, label2id, intent2id):
    """Tokenize pre-split words and attach slot and intent label ids.

    Args:
        examples: Batch mapping with "tokens" (word lists), "tags" (one tag
            per word) and "intent" (one intent name per example).
        label2id: Mapping from slot tag name to integer id.
        intent2id: Mapping from intent name to integer id.

    Returns:
        The tokenizer output, extended with "labels" and "intent_label".
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    def _aligned_ids(batch_index, word_tags):
        # Only the first sub-token of each word carries the word's tag id;
        # special/padding positions and continuation sub-tokens get -100.
        word_ids = encoded.word_ids(batch_index=batch_index)
        return [
            label2id[word_tags[current]]
            if current is not None and current != previous
            else -100
            for previous, current in zip([None, *word_ids], word_ids)
        ]

    encoded["labels"] = [
        _aligned_ids(index, word_tags)
        for index, word_tags in enumerate(examples["tags"])
    ]
    encoded["intent_label"] = [intent2id[name] for name in examples["intent"]]
    return encoded

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

In [None]:
# 최초 학습용 코드
import os
import json
import torch
import zipfile
import shutil
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    DataCollatorForTokenClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, f1_score

# ✅ Settings
os.environ["WANDB_DISABLED"] = "true"
model_name = "monologg/koelectra-base-v3-discriminator"
label_list = ['B-DIRECTION', 'B-LINE', 'B-ROUTE', 'B-STATION', 'B-TRANSPORT-BUS', 'B-TRANSPORT-SUBWAY', 'O']
intent_list = ['arrival_bus', 'arrival_subway', 'congestion', 'other']

# Name <-> id lookup tables; ids follow the order of the lists above.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}
id2intent = dict(enumerate(intent_list))
intent2id = {name: idx for idx, name in id2intent.items()}

# ✅ File paths
train_path = "./electra_slot_tagging_data_20000.json"  # data used for training
eval_path = "./electra_slot_tagging_data_2500.json"    # held-out data used for evaluation

# ✅ Load the data
with open(train_path, encoding='utf-8') as train_file:
    train_data = json.load(train_file)
with open(eval_path, encoding='utf-8') as eval_file:
    eval_data = json.load(eval_file)

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

# ✅ Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ 전처리 함수
def preprocess(examples):
    """Tokenize a batch of pre-split words and add slot/intent label ids.

    Reads the "tokens", "tags" and "intent" columns of ``examples`` and uses
    the module-level ``tokenizer``, ``label2id`` and ``intent2id``.

    Returns:
        The tokenizer output with two extra entries: "labels" (slot ids, one
        list per example) and "intent_label" (one id per example).
    """
    batch = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    aligned = []
    for row, tags in enumerate(examples["tags"]):
        last_seen = None
        row_labels = []
        for word_id in batch.word_ids(batch_index=row):
            # A position is labelled only when it opens a new word; special
            # tokens (word_id None) and continuation sub-tokens get -100.
            opens_word = word_id is not None and word_id != last_seen
            row_labels.append(label2id[tags[word_id]] if opens_word else -100)
            last_seen = word_id
        aligned.append(row_labels)

    batch["labels"] = aligned
    batch["intent_label"] = list(map(intent2id.__getitem__, examples["intent"]))
    return batch

# ✅ Apply preprocessing
train_tokenized = train_dataset.map(preprocess, batched=True)
eval_tokenized = eval_dataset.map(preprocess, batched=True)

# ✅ Model definitions
slot_model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
)
intent_model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(intent_list), id2label=id2intent, label2id=intent2id
)

# ✅ Training parameters
# Everything the two runs have in common; only the output directory and the
# number of epochs differ between them.
_common_training_kwargs = dict(
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    logging_strategy="steps",   # log by step count ...
    logging_steps=50,           # ... every 50 steps
    eval_strategy="epoch",      # evaluate once per epoch
    save_strategy="epoch",      # write a checkpoint once per epoch
    save_total_limit=3,         # keep at most 3 checkpoints
    report_to="none",
    fp16=torch.cuda.is_available(),
)

slot_args = TrainingArguments(
    output_dir="./checkpoints/slot",    # slot checkpoint directory
    num_train_epochs=7,
    **_common_training_kwargs,
)

intent_args = TrainingArguments(
    output_dir="./checkpoints/intent",  # intent checkpoint directory
    num_train_epochs=5,
    **_common_training_kwargs,
)


# ✅ Intent datasets: drop the slot labels and expose the intent id as "labels"
intent_train = train_tokenized.remove_columns("labels")
intent_train = intent_train.rename_column("intent_label", "labels")
intent_eval = eval_tokenized.remove_columns("labels")
intent_eval = intent_eval.rename_column("intent_label", "labels")

# ✅ 평가지표
def compute_metrics(eval_preds):
    """Return accuracy and macro-averaged F1 for a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of ``logits``.
    """
    logits, labels = eval_preds
    predicted_ids = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "f1": f1_score(labels, predicted_ids, average="macro"),
    }

# ✅ Trainer definitions
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the old keyword triggers the warning on the Trainer(...) calls and is
# scheduled for removal).
slot_trainer = Trainer(
    model=slot_model,
    args=slot_args,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    processing_class=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

intent_trainer = Trainer(
    model=intent_model,
    args=intent_args,
    train_dataset=intent_train,
    eval_dataset=intent_eval,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

# ✅ Start training
print("🔥 슬롯 태깅 학습 시작")
slot_trainer.train()

print("🔥 인텐트 분류 학습 시작")
intent_trainer.train()

# ✅ Save the models
saving_model_name = "checkpoint_HugeData(new20000)"  # name of the output directory / archive
output_dir = f"./{saving_model_name}"
# Each artifact goes into its own sub-directory of output_dir.
slot_model.save_pretrained(f"{output_dir}/slot")
intent_model.save_pretrained(f"{output_dir}/intent")
tokenizer.save_pretrained(f"{output_dir}/tokenizer")

# ✅ zip 압축
def zipdir(source_dir, zip_filename):
    """Write every file under ``source_dir`` into a deflate-compressed zip.

    Archive member names are relative to ``source_dir``, so the top-level
    folder itself is not part of the archive.

    Args:
        source_dir: Directory whose files are archived (walked recursively).
        zip_filename: Path of the zip file to create (overwritten if present).
    """
    # The archive already exists on disk while the tree is being walked; if it
    # lives inside source_dir it must be skipped or it would be added to itself.
    archive_abspath = os.path.abspath(zip_filename)
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(source_dir):
            for file in files:
                filepath = os.path.join(root, file)
                if os.path.abspath(filepath) == archive_abspath:
                    continue
                arcname = os.path.relpath(filepath, start=source_dir)
                zipf.write(filepath, arcname)

zip_path = f"{output_dir}.zip"
zipdir(source_dir=output_dir, zip_filename=zip_path)
print(f"✅ 압축 완료: {zip_path}")

# ✅ Copy the archive to Google Drive (the local zip is left in place)
drive_save_path = f"/content/drive/MyDrive/Colab Notebooks/Trained_file_zip_folder/{saving_model_name}.zip"
shutil.copy(src=zip_path, dst=drive_save_path)
print(f"✅ 드라이브 저장 완료: {drive_save_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  slot_trainer = Trainer(
  intent_trainer = Trainer(


🔥 슬롯 태깅 학습 시작


Epoch,Training Loss,Validation Loss
1,0.0011,0.001789
2,0.0036,0.000431
3,0.0002,0.000521
4,0.0001,2.7e-05
5,0.0,0.000331
6,0.0,2.1e-05
7,0.0,4.9e-05


🔥 인텐트 분류 학습 시작


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0004,0.017706,0.9976,0.997297
2,0.0213,0.001108,0.9996,0.9996
3,0.0,6e-06,1.0,1.0
4,0.0,2e-06,1.0,1.0
5,0.0,1e-06,1.0,1.0


✅ 압축 완료: ./checkpoint_HugeData(new20000).zip
✅ 드라이브 저장 완료: /content/drive/MyDrive/Colab Notebooks/Trained_file_zip_folder/checkpoint_HugeData(new20000).zip


# 파인 튜닝용 코드
# 아래 코드를 사용하여 파인튜닝할것

In [None]:
import os
import json
import torch
import zipfile
import shutil
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    DataCollatorForTokenClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# ✅ Settings
os.environ["WANDB_DISABLED"] = "true"
model_name = "monologg/koelectra-base-v3-discriminator"
label_list = ['B-DIRECTION', 'B-LINE', 'B-ROUTE', 'B-STATION', 'B-TRANSPORT-BUS', 'B-TRANSPORT-SUBWAY', 'O']
intent_list = ['arrival_bus', 'arrival_subway', 'congestion', 'other']

# Lookup tables in both directions; ids are the positions in the lists above.
label2id = dict(zip(label_list, range(len(label_list))))
id2label = {index: tag for tag, index in label2id.items()}
intent2id = dict(zip(intent_list, range(len(intent_list))))
id2intent = {index: intent for intent, index in intent2id.items()}

# ✅ Paths
train_path = "./electra_slot_tagging_data_finetune7_300.json"  # data used for training
eval_path = "./electra_slot_tagging_data_finetune7_150.json"   # held-out data used for evaluation

# ✅ Load the data
with open(train_path, encoding='utf-8') as train_fp:
    train_data = json.load(train_fp)
with open(eval_path, encoding='utf-8') as eval_fp:
    eval_data = json.load(eval_fp)

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

# ✅ Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ 전처리 함수
def preprocess(examples):
    """Tokenize pre-split words and align slot tags to the sub-tokens.

    Uses the module-level ``tokenizer``, ``label2id`` and ``intent2id``.
    ``examples`` must provide "tokens", "tags" and "intent".

    Returns:
        The tokenizer output with "labels" (slot ids per position, -100 where
        no label applies) and "intent_label" (one id per example) added.
    """
    encodings = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    slot_labels = []
    for batch_index, tags in enumerate(examples["tags"]):
        word_ids = encodings.word_ids(batch_index=batch_index)
        # Pair every position with the word id of the position before it.
        preceding = [None] + word_ids[:-1]
        slot_labels.append([
            # Special tokens and non-initial sub-tokens are masked with -100.
            -100 if (current is None or current == before) else label2id[tags[current]]
            for before, current in zip(preceding, word_ids)
        ])

    encodings["labels"] = slot_labels
    encodings["intent_label"] = [intent2id[name] for name in examples["intent"]]
    return encodings

# ✅ Apply preprocessing
train_tokenized = train_dataset.map(preprocess, batched=True)
eval_tokenized = eval_dataset.map(preprocess, batched=True)

# ✅ Load the previously trained models that fine-tuning starts from
slot_model = AutoModelForTokenClassification.from_pretrained(
    "/content/finetuned_data(6)/slot",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
intent_model = AutoModelForSequenceClassification.from_pretrained(
    "/content/finetuned_data(6)/intent",
    num_labels=len(intent_list),
    id2label=id2intent,
    label2id=intent2id,
)


# ✅ Training settings
def _finetune_args(output_dir):
    """Build the TrainingArguments shared by both fine-tuning runs."""
    return TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=8,
        learning_rate=2e-6,
        logging_strategy="steps",   # log every `logging_steps` steps
        logging_steps=10,
        eval_strategy="epoch",      # evaluate once per epoch
        save_strategy="epoch",      # write a checkpoint once per epoch
        save_total_limit=3,         # keep at most 3 checkpoints
        report_to="none",
        fp16=torch.cuda.is_available(),
    )


slot_args = _finetune_args("./checkpoints/slot-finetune")
intent_args = _finetune_args("./checkpoints/intent-finetune")

# ✅ Intent datasets: drop the slot labels and expose the intent id as "labels"
intent_train = train_tokenized.remove_columns("labels")
intent_train = intent_train.rename_column("intent_label", "labels")
intent_eval = eval_tokenized.remove_columns("labels")
intent_eval = intent_eval.rename_column("intent_label", "labels")

# ✅ 인텐트 평가지표
def compute_metrics_intent(eval_preds):
    """Score intent predictions: accuracy plus macro-averaged F1.

    ``eval_preds`` unpacks into ``(logits, labels)``; the prediction is the
    argmax over the last axis of the logits.
    """
    logits, gold = eval_preds
    guessed = logits.argmax(-1)
    accuracy = accuracy_score(gold, guessed)
    macro_f1 = f1_score(gold, guessed, average="macro")
    return {"accuracy": accuracy, "f1": macro_f1}

# ✅ 슬롯 평가지표
def compute_metrics_slot(p):
    """Compute token-level slot metrics, ignoring positions labelled -100.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-position
            class scores (argmax is taken over axis 2); ``labels`` holds the
            gold ids, with -100 marking positions to skip.

    Returns:
        Dict with "accuracy" and macro-averaged "precision", "recall", "f1".
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Boolean mask selects the scored positions in row-major order, the same
    # order the nested Python loop would visit them.
    scored = labels != -100
    true_labels = labels[scored]
    pred_labels = predictions[scored]

    return {
        "accuracy": accuracy_score(true_labels, pred_labels),
        "precision": precision_score(true_labels, pred_labels, average="macro", zero_division=0),
        "recall": recall_score(true_labels, pred_labels, average="macro", zero_division=0),
        # zero_division=0 matches precision/recall above, so a class that is
        # never predicted does not raise an undefined-metric warning here only.
        "f1": f1_score(true_labels, pred_labels, average="macro", zero_division=0),
    }

# ✅ Trainers
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the old keyword triggers the warning on the Trainer(...) calls and is
# scheduled for removal).
slot_trainer = Trainer(
    model=slot_model,
    args=slot_args,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    processing_class=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics_slot,
)

intent_trainer = Trainer(
    model=intent_model,
    args=intent_args,
    train_dataset=intent_train,
    eval_dataset=intent_eval,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics_intent,
)

# ✅ Start fine-tuning
print("🔥 슬롯 태깅 파인튜닝 시작")
slot_trainer.train()

print("🔥 인텐트 분류 파인튜닝 시작")
intent_trainer.train()

# ✅ Save the models
saving_model_name = "finetuned_data(7)a"  # name of the output directory / archive
output_dir = f"./{saving_model_name}"
# Each artifact goes into its own sub-directory of output_dir.
slot_model.save_pretrained(f"{output_dir}/slot")
intent_model.save_pretrained(f"{output_dir}/intent")
tokenizer.save_pretrained(f"{output_dir}/tokenizer")

# ✅ zip 압축
def zipdir(source_dir, zip_filename):
    """Archive all files below ``source_dir`` into ``zip_filename``.

    Members are stored deflate-compressed under paths relative to
    ``source_dir`` (the directory itself is not a path component).
    """
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _subfolders, names in os.walk(source_dir):
            for name in names:
                on_disk = os.path.join(folder, name)
                archive.write(on_disk, os.path.relpath(on_disk, start=source_dir))

zip_path = f"{output_dir}.zip"
zipdir(source_dir=output_dir, zip_filename=zip_path)
print(f"✅ 압축 완료: {zip_path}")

# ✅ Copy the archive to Google Drive (the local zip is left in place)
drive_save_path = f"/content/drive/MyDrive/Colab Notebooks/Trained_file_zip_folder/{saving_model_name}.zip"
shutil.copy(src=zip_path, dst=drive_save_path)
print(f"✅ 드라이브 저장 완료: {drive_save_path}")

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

  slot_trainer = Trainer(
  intent_trainer = Trainer(


🔥 슬롯 태깅 파인튜닝 시작


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0406,0.090226,0.99005,0.982993,0.971751,0.975607
2,0.0484,0.081379,0.989055,0.979485,0.969857,0.973271
3,0.0352,0.073692,0.989055,0.979485,0.969857,0.973271
4,0.0267,0.069501,0.99005,0.981107,0.972682,0.975791
5,0.0197,0.068076,0.99005,0.981107,0.972682,0.975791


🔥 인텐트 분류 파인튜닝 시작


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.8641,0.571489,0.904762,0.702278
2,0.6336,0.075437,0.961905,0.727868
3,0.2789,0.007751,1.0,1.0
4,0.2301,0.003413,1.0,1.0
5,0.0024,0.000842,1.0,1.0


✅ 압축 완료: ./finetuned_data(7)a.zip
✅ 드라이브 저장 완료: /content/drive/MyDrive/Colab Notebooks/Trained_file_zip_folder/finetuned_data(7)a.zip


# 문장 테스트용 코드

In [None]:
# 모델 불러오는 코드
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
import os

# ✅ Model paths (sub-directories of the extracted/saved model folder)
path = "finetuned_data(6)"
slot_model_path = f"./{path}/slot"
intent_model_path = f"./{path}/intent"
tokenizer_path = f"./{path}/tokenizer"

# ✅ Label lists (index in the list == class id)
label_list = ['B-DIRECTION', 'B-LINE', 'B-ROUTE', 'B-STATION', 'B-TRANSPORT-BUS', 'B-TRANSPORT-SUBWAY', 'O']
intent_list = ['arrival_bus', 'arrival_subway', 'congestion', 'other']

# ✅ Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Load the tokenizer and both models.
# Each model is read from disk exactly once; loading them a second time only
# repeated the disk read and the device transfer.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Model 1: slot tagging (token classification)
slot_model = AutoModelForTokenClassification.from_pretrained(slot_model_path).to(device)
slot_model.eval()

# Model 2: intent classification (sequence classification)
intent_model = AutoModelForSequenceClassification.from_pretrained(intent_model_path).to(device)
intent_model.eval()

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

def predict(sentence, tokenizer, slot_model, intent_model, label_list, intent_list):
    """Print the predicted intent and per-word slot tags for one sentence.

    Args:
        sentence: Raw input text; it is split into words on whitespace.
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            output must provide ``word_ids``.
        slot_model: Token-classification model whose output has ``.logits``.
        intent_model: Sequence-classification model whose output has ``.logits``.
        label_list: Slot tag names indexed by class id.
        intent_list: Intent names indexed by class id.

    Returns:
        None. Results are written to stdout. Both models are moved to the
        selected device and switched to eval mode as a side effect.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Split the sentence into words (whitespace separated).
    words = sentence.strip().split()

    # Tokenize word by word so sub-tokens can be mapped back to words.
    tokenized = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    )

    # Keep the tokenizer's own output object (word_ids is needed below) and
    # only move its tensors to the device.
    for k in tokenized:
        tokenized[k] = tokenized[k].to(device)

    # Models that were just trained are still in train mode; eval() turns
    # dropout off so repeated calls give the same result.
    slot_model.to(device)
    slot_model.eval()
    intent_model.to(device)
    intent_model.eval()

    with torch.no_grad():
        # Intent prediction
        intent_logits = intent_model(**tokenized).logits
        intent_probs = F.softmax(intent_logits, dim=1)[0]
        intent_pred_id = torch.argmax(intent_probs).item()
        intent_label = intent_list[intent_pred_id]
        intent_score = intent_probs[intent_pred_id].item()

        # Slot tagging prediction
        slot_logits = slot_model(**tokenized).logits  # (1, seq_len, num_labels)
        slot_probs = F.softmax(slot_logits, dim=2)[0]
        slot_preds = torch.argmax(slot_probs, dim=1).tolist()
        slot_scores = slot_probs[range(len(slot_preds)), slot_preds].tolist()

    word_ids = tokenized.word_ids(batch_index=0)

    print(f"\n🟦 문장: {sentence}")
    print(f"🔸 예측 인텐트: {intent_label}  (score: {intent_score:.4f})")
    print(f"🔸 슬롯 태깅:")

    # A word takes the prediction of its first sub-token. Positions that
    # belong to no word (word_id None) are skipped. Token ids are deliberately
    # not compared against the special-token ids: that list includes the
    # unknown token, so a word starting with an unknown sub-token would lose
    # its tag.
    word_to_tag = {}
    for idx, word_id in enumerate(word_ids):
        if word_id is None:
            continue
        if word_id not in word_to_tag:
            pred_id = slot_preds[idx]
            score = slot_scores[idx]
            word_to_tag[word_id] = (label_list[pred_id], score)

    for i, word in enumerate(words):
        if i in word_to_tag:
            tag, score = word_to_tag[i]
            print(f"   {word:10} → {tag:20} (score: {score:.4f})")
        else:
            # Words cut off by truncation have no sub-token and thus no tag.
            print(f"   {word:10} → [NO TAG]")

    print()

In [None]:
# Hand-written test sentences; each comment describes what the case is meant to probe.
sentences = [
    # 1) Mixed direction / special characters; contains the "express" keyword
    "가락시장 정류소에서 급행 9502번 언제 와?",

    # 2) Alternative wording for "stop"; congestion question
    "센텀시티 정거장 주변에 지하철 빡빡해?",

    # 3) Extra information in parentheses; polite "is there ...?" ending
    #    NOTE(review): the sentence below contains no parentheses — confirm the intended case.
    "서울대입구역에서 관악구청 방향 버스 있나요?",

    # 4) Two stations and a line mentioned together; arrow special character; congestion
    #    NOTE(review): the sentence below contains no arrow character — confirm the intended case.
    "잠실새내역으로 가는 잠실역의 2호선 열차는 지금 얼마나 혼잡해?",

    # 5) Abbreviated direction word; arrival-time inquiry
    "나 인천역인데, 광화문쪽으로 가는 172번은 아직 멀었어?",

    # 6) Compound route request (transfer guidance)
    "드림랜드입구 정류소에서 내려서 상계역까지 가는 방법 알려줘",

    # 7) Airport bus; first-departure time query
    "주공10단지에서 탈 김포공항행 공항버스의 첫차 시간 좀",

    # 9) Express subway transfer guidance question (numbering skips 8)
    "노량진역에서 내려서 9호선 급행 타려면 어디로 가야 해?",

    # 10) Dialect-style ending; "bound for" wording plus negative confirmation
    "춘천역앞 정류장 우리쪽 방면 410번 버스 아직 안 왔지?"
]



# Run every sentence through predict, which prints the intent and slot tags.
for sentence in sentences:
    predict(sentence, tokenizer, slot_model, intent_model, label_list, intent_list)


🟦 문장: 가락시장 정류소에서 급행 9502번 언제 와?
🔸 예측 인텐트: arrival_bus  (score: 0.9954)
🔸 슬롯 태깅:
   가락시장       → B-STATION            (score: 1.0000)
   정류소에서      → O                    (score: 1.0000)
   급행         → B-TRANSPORT-SUBWAY   (score: 1.0000)
   9502번      → B-ROUTE              (score: 1.0000)
   언제         → O                    (score: 1.0000)
   와?         → O                    (score: 1.0000)


🟦 문장: 센텀시티 정거장 주변에 지하철 빡빡해?
🔸 예측 인텐트: other  (score: 0.9907)
🔸 슬롯 태깅:
   센텀시티       → B-STATION            (score: 0.9998)
   정거장        → O                    (score: 1.0000)
   주변에        → O                    (score: 1.0000)
   지하철        → B-TRANSPORT-SUBWAY   (score: 1.0000)
   빡빡해?       → O                    (score: 1.0000)


🟦 문장: 서울대입구역에서 관악구청 방향 버스 있나요?
🔸 예측 인텐트: arrival_bus  (score: 1.0000)
🔸 슬롯 태깅:
   서울대입구역에서   → B-STATION            (score: 1.0000)
   관악구청       → B-DIRECTION          (score: 1.0000)
   방향         → O                    (score: 1.0000)
   버스         → B-TRANSP

In [None]:
import zipfile
import os

# shutil is needed for the Drive copy below; imported here so this cell also
# runs on its own, without relying on an import made by an earlier cell.
import shutil

# Folder to archive
folder_to_zip = r"finetuned_data(7)d"
# Path of the zip file that will be created
zip_output_path = folder_to_zip + ".zip"

# os.walk yields nothing for a missing folder, which would silently produce an
# empty archive and copy it to Drive; fail early instead.
if not os.path.isdir(folder_to_zip):
    raise FileNotFoundError(f"Folder to zip not found: {folder_to_zip}")

# Create the zip file
with zipfile.ZipFile(zip_output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(folder_to_zip):
        for file in files:
            file_path = os.path.join(root, file)
            # Member paths are taken relative to the folder's parent, so the
            # top-level folder name is kept inside the archive.
            arcname = os.path.relpath(file_path, os.path.dirname(folder_to_zip))
            zipf.write(file_path, arcname)

print(f"압축 완료: {zip_output_path}")

# ✅ Copy the archive to Google Drive (the local zip is left in place).
# The file name is derived from zip_output_path so it cannot drift from
# folder_to_zip.
drive_save_path = f"/content/drive/MyDrive/Colab Notebooks/Trained_file_zip_folder/{os.path.basename(zip_output_path)}"
shutil.copy(zip_output_path, drive_save_path)
print(f"✅ 드라이브 저장 완료: {drive_save_path}")

압축 완료: finetuned_data(7)d.zip
✅ 드라이브 저장 완료: /content/drive/MyDrive/Colab Notebooks/Trained_file_zip_folder/finetuned_data(7)d.zip
