In [1]:
import os
import sys

# Treat the current working directory as the project root and make the
# project's source, script and model folders importable.
project_root = os.getcwd()
sys.path.extend(
    os.path.join(project_root, folder_name)
    for folder_name in ("src", "scripts", "model")
)

In [2]:
import json
from sentence_transformers import InputExample

# Input files: one JSON dialogue per line, plus a JSON file of preference pairs.
jsonl_path = "Dataset/AppenBanking/all.jsonl"
prefs_path = "Dataset/AppenBanking/preference_pairs.json"

# utterance_id -> utterance text, for every turn that has both fields.
utterance_map = {}
# Texts of the turns that carry a non-null "theme_label".
theme_utterances_set = set()

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line at the end of the
        # file); json.loads would raise JSONDecodeError on them.
        line = line.strip()
        if not line:
            continue
        dialogue = json.loads(line)
        for turn in dialogue.get("turns", []):
            utt_id = turn.get("utterance_id")
            utt_text = turn.get("utterance")
            # NOTE(review): this truthiness check also drops falsy ids such
            # as 0 or "" — presumably ids are non-empty strings; confirm.
            if utt_id and utt_text:
                utterance_map[utt_id] = utt_text
                if turn.get("theme_label") is not None:
                    theme_utterances_set.add(utt_text)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the preference annotations and pull out the "should_link" id pairs.
with open(prefs_path, "r", encoding="utf-8") as f:
    prefs = json.load(f)

should_link_pairs = prefs.get("should_link", [])

# Resolve ids to texts, keeping only pairs where both ids are known.
resolved_pairs = [
    (utterance_map[left_id], utterance_map[right_id])
    for left_id, right_id in should_link_pairs
    if left_id in utterance_map and right_id in utterance_map
]
text_1 = [left_text for left_text, _ in resolved_pairs]
text_2 = [right_text for _, right_text in resolved_pairs]

print(f"✅ should_link 기반 positive 쌍 수: {len(text_1)}")

✅ should_link 기반 positive 쌍 수: 164


In [4]:
# Wrap each linked (text_1[i], text_2[i]) pair in an InputExample.
positive_examples = [
    InputExample(texts=[first_text, second_text])
    for first_text, second_text in zip(text_1, text_2)
]

In [5]:
print(f"🧹 SimCSE 자기쌍으로 추가할 발화 수: {len(theme_utterances_set)}")

# Build (utt, utt) self-pairs. Iterate in sorted order rather than raw set
# order: the iteration order of a set of strings can differ between
# interpreter sessions (hash randomization), which would make any later
# seeded shuffle/split non-reproducible.
# Assumes every element is a str (so they are mutually comparable) — TODO confirm.
self_pair_examples = [
    InputExample(texts=[utt, utt]) for utt in sorted(theme_utterances_set)
]

🧹 SimCSE 자기쌍으로 추가할 발화 수: 1632


In [6]:
# Full example pool: linked positive pairs first, then the self-pairs.
train_examples = [*positive_examples, *self_pair_examples]
print(f"✅ 최종 SimCSE 학습쌍 수: {len(train_examples)}")

✅ 최종 SimCSE 학습쌍 수: 1796


In [7]:
import random

# Shuffle a copy instead of `train_examples` itself: shuffling in place made
# this cell non-idempotent (re-running it shuffled an already-shuffled list
# and produced a different split). With a copy, every run starts from the
# same input order, provided `train_examples` is built deterministically.
shuffled_examples = list(train_examples)

# Fix the seed for reproducibility.
random.seed(42)
random.shuffle(shuffled_examples)

# 80% train, 10% validation; everything left over (including rounding
# remainders) goes to the test split.
total = len(shuffled_examples)
n_train = int(total * 0.8)
n_val = int(total * 0.1)

# Split
train_data = shuffled_examples[:n_train]
val_data = shuffled_examples[n_train:n_train + n_val]
test_data = shuffled_examples[n_train + n_val:]

print(f"✅ Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

✅ Train: 1436, Val: 179, Test: 181


In [19]:
from datasets import Dataset

def input_examples_to_dict_list(data):
    """Convert example objects into a list of anchor/positive row dicts.

    Args:
        data: Iterable of objects exposing a ``texts`` sequence with at
            least two items.

    Returns:
        A list with one ``{"anchor": texts[0], "positive": texts[1]}`` dict
        per input example, in input order.
    """
    rows = []
    for example in data:
        anchor_text = example.texts[0]
        positive_text = example.texts[1]
        rows.append({"anchor": anchor_text, "positive": positive_text})
    return rows

# Build one Dataset per split from the anchor/positive row dicts.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_list(input_examples_to_dict_list(split_examples))
    for split_examples in (train_data, val_data, test_data)
)


In [20]:
# Sanity check: show the first training row (a dict with "anchor" and "positive" texts).
print(train_dataset[0])

{'anchor': 'Yeah, I need to check my balance on my checking.', 'positive': 'Yeah, I need to check my balance on my checking.'}


In [15]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MatryoshkaLoss
from sentence_transformers.losses import MultipleNegativesRankingLoss

# Pretrained sentence encoder that will be fine-tuned.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Inner loss that MatryoshkaLoss wraps.
base_loss = MultipleNegativesRankingLoss(model=model)

matryoshka_loss = MatryoshkaLoss(
    loss=base_loss,
    model=model,
    n_dims_per_step=-1,                        # train all dimensions at every step
    matryoshka_dims=[768, 512, 256, 128, 64],  # embedding sizes to train
    matryoshka_weights=[1, 1, 1, 1, 1],        # equal weight for each dimension
)

In [23]:
import torch
from sentence_transformers import SentenceTransformerTrainingArguments

# Enable reduced-precision modes only when the current device supports them.
# Hard-coding bf16=True / tf32=True makes argument construction fail on
# machines without such a GPU.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
# TF32 needs an Ampere-or-newer GPU, i.e. compute capability major >= 8.
use_tf32 = cuda_available and torch.cuda.get_device_capability()[0] >= 8

args = SentenceTransformerTrainingArguments(
    output_dir="output/mpnet-matryoshka",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    logging_steps=10,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best eval_loss at the end of training.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    batch_sampler="no_duplicates",  # intended for MultipleNegativesRankingLoss
    bf16=use_bf16,  # used only when the GPU supports it
    tf32=use_tf32,
)

In [24]:
from sentence_transformers import SentenceTransformerTrainer

# Wire the model, the Matryoshka loss, the training arguments and the
# train/validation datasets into a single trainer.
trainer = SentenceTransformerTrainer(
    args=args,
    model=model,
    loss=matryoshka_loss,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [25]:
# Run training; as the last expression of the cell, the returned value is
# shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5106,2.502309
2,0.5097,2.40027
3,0.1417,2.416077


TrainOutput(global_step=135, training_loss=0.39243021452868426, metrics={'train_runtime': 67.5933, 'train_samples_per_second': 63.734, 'train_steps_per_second': 1.997, 'total_flos': 0.0, 'train_loss': 0.39243021452868426, 'epoch': 3.0})

In [26]:
# Save the model via the trainer (same path as the output_dir passed to the training arguments).
trainer.save_model("output/mpnet-matryoshka")