# 1. Library Implementation

In [None]:
%%capture
!pip install -qU sentence-transformers
!pip install -q faiss-cpu
!pip install -qU underthesea
import json
import random
import numpy as np

from pprint import pprint
from sentence_transformers import CrossEncoder, InputExample, util
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import KFold

import torch, gc
import shutil
import os

In [None]:
print("Phiên bản của Sentence-transformers")
!pip show sentence-transformers

In [None]:
import torch
# Report whether PyTorch can see a CUDA device; the per-device details below
# are only queried when CUDA is available.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Number of GPUs:", torch.cuda.device_count())
    # Index of the currently selected device, followed by that device's name.
    print("Current GPU index:", torch.cuda.current_device())
    print("GPU name:", torch.cuda.get_device_name(torch.cuda.current_device()))

In [None]:
import logging
import traceback
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.cross_encoder import losses

logging.getLogger("transformers").setLevel(logging.ERROR)

# 2. Loading corpus

In [None]:
# =============================================
# II. XỬ LÝ DỮ LIỆU (QA Dataset)
# =============================================
# 2a. Load the raw QA corpus: corpus.json
from underthesea import word_tokenize
from transformers import AutoTokenizer
# Log the start of corpus loading.
# NOTE(review): no logging configuration is visible in this notebook, so an
# INFO-level record on the root logger may not be displayed — confirm.
logging.info("Read my QA training dataset")
# Parse the raw QA corpus (UTF-8 JSON) from the Kaggle input directory into `raw_data`.
with open('/kaggle/input/stock-dataset/corpus.json', 'r',encoding ='utf-8') as f:
    raw_data = json.load(f)
# The printed label is Vietnamese for "Initial number of samples".
print(f"Số mẫu ban đầu: {len(raw_data)}")

# 2b. Tokenizing dataset
def tokenize_data(dataset):
    """Word-segment the question/answer text of every sample.

    For each sample, the 'question' field is segmented into the new 'anchor'
    field and the 'answer' field into the new 'positive' field, using
    ``word_tokenize(..., 'text')``. Samples are modified in place.

    Args:
        dataset: Sequence of dict-like samples, each with 'question' and
            'answer' keys.

    Returns:
        The same ``dataset`` object, so the result can be assigned by the caller.
    """
    for sample in dataset:
        # Read from the current sample (not from the container itself).
        sample['anchor'] = word_tokenize(sample['question'], 'text')
        sample['positive'] = word_tokenize(sample['answer'], 'text')
    # Show one processed example, but only when there is something to show.
    if dataset:
        pprint(dataset[0])
    return dataset
#raw_data = tokenize_data(raw_data)        

In [None]:
# 2d. Exclude exact duplicate samples
def deduplicate_by_text(data, keys=("anchor", "positive")):
    """Drop samples whose key fields duplicate an earlier sample.

    Two samples are considered duplicates when, for every field in ``keys``,
    their values are equal after stripping surrounding whitespace and
    lower-casing. The first occurrence is kept and input order is preserved.

    Args:
        data: Iterable of dict-like samples containing string values for ``keys``.
        keys: Field names that together form the duplicate-detection key.

    Returns:
        A new list holding the unique samples (the original objects, not copies).
    """
    # Label is Vietnamese for "Before deduplication: N samples".
    print(f"Trước khi loại trùng: {len(data)} mẫu")
    seen = set()
    unique_data = []
    for sample in data:
        key = tuple(sample[k].strip().lower() for k in keys)
        if key not in seen:
            seen.add(key)
            unique_data.append(sample)
    # Label is Vietnamese for "After deduplication"; report the size of the
    # de-duplicated list, not of the input.
    print(f"\nSau khi loại trùng: {len(unique_data)} mẫu")
    return unique_data

#unique_data = deduplicate_by_text(raw_data)
#with open('corpus_processed.json', 'w', encoding='utf-8') as f:
#    json.dump(unique_data, f, ensure_ascii=False, indent=2)

In [None]:
# Load the preprocessed corpus JSON and select the 'train' split so `dataset`
# is a single Dataset rather than a DatasetDict.
dataset = load_dataset("json", data_files='/kaggle/input/stock-dataset/dataset/finetune_sentenceTransformer/corpus_processed.json')['train'] # DatasetDict => Dataset
# Label is Vietnamese for "Dataset loaded".
print(f"\n Dataset đã load: {dataset}")
# Convert to pandas to inspect the 'level' column.
df = dataset.to_pandas()
# Label is Vietnamese for "Question distribution by level".
print("\nPhân phối câu hỏi theo level:")
# normalize=True reports relative frequencies instead of raw counts.
print(df['level'].value_counts(normalize=True))

# 3. Loading model

In [None]:
%%capture
# 2. Load a model to finetune with 3. (Optional) model card data
from sentence_transformers.util import cos_sim
# model = SentenceTransformer(
#     "bkai-foundation-models/vietnamese-bi-encoder",
#     model_card_data=SentenceTransformerModelCardData(
#         language="vi",
#         license="apache-2.0",
#         model_name="Vietnames-Biencoder finetuned on domain stock",
#     )
# )

# model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
# Base bi-encoder to fine-tune; the alternatives above are left commented out.
# NOTE(review): the checkpoint name does not indicate multilingual support while
# the corpus text appears to be Vietnamese — confirm this is the intended base model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# 4. Define a loss function
# Multiple-negatives ranking loss over cosine similarity with scale 20.0.
# This `loss` name is not referenced again in the visible notebook; the training
# cell constructs its own loss objects.
loss= MultipleNegativesRankingLoss(model=model, similarity_fct=cos_sim, scale=20.0)

# 4. Loss function

## MNRLoss

## TripletLoss

In [None]:
from __future__ import annotations

from collections.abc import Iterable
from enum import Enum
from typing import Any

import torch.nn.functional as F
from torch import Tensor, nn

from sentence_transformers.SentenceTransformer import SentenceTransformer
from sentence_transformers.util import pairwise_cos_sim, pairwise_euclidean_sim, pairwise_manhattan_sim


class TripletDistanceMetric(Enum):
    """Distance callables selectable as the metric for the triplet loss.

    Each attribute is a two-argument callable applied to a pair of embedding
    tensors (e.g. anchor and positive).
    """

    # NOTE(review): plain functions assigned in an Enum body are presumably kept
    # as ordinary callables rather than Enum members, which is what allows
    # TripletLoss to call them directly — confirm.

    # One minus the pairwise cosine similarity.
    COSINE = lambda x, y: 1 - pairwise_cos_sim(x, y)
    # NOTE(review): the two helpers below are named `*_sim`; if they return a
    # similarity (e.g. a negated distance) the sign would be inverted for use as
    # a distance — verify against sentence_transformers.util before using them.
    EUCLIDEAN = lambda x, y: pairwise_euclidean_sim(x, y)
    MANHATTAN = lambda x, y: pairwise_manhattan_sim(x, y)


class TripletLoss(nn.Module):
    """Triplet loss over (anchor, positive, negative) sentence embeddings.

    The loss for one triplet is
    ``relu(d(anchor, positive) - d(anchor, negative) + triplet_margin)``,
    averaged over the batch, where ``d`` is ``distance_metric``.
    """

    def __init__(
        self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin: float = 5
    ) -> None:
        """Store the model and the loss hyper-parameters.

        Args:
            model: Model called on each sentence-feature dict to produce a
                "sentence_embedding" entry.
            distance_metric: Callable taking two embedding tensors and returning
                their pairwise distance.
            triplet_margin: Margin added to ``d(a, p) - d(a, n)`` before the ReLU.
        """
        super().__init__()
        self.model = model
        self.distance_metric = distance_metric
        self.triplet_margin = triplet_margin

    def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor:
        """Embed each input column with the model and return the triplet loss.

        Args:
            sentence_features: One feature dict per column, in
                (anchor, positive, negative) order.
            labels: Forwarded to ``compute_loss_from_embeddings``, which does not read it.

        Returns:
            Scalar loss tensor.
        """
        embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]

        return self.compute_loss_from_embeddings(embeddings, labels)

    def compute_loss_from_embeddings(self, embeddings: list[Tensor], labels: Tensor) -> Tensor:
        """
        Compute the triplet loss from precomputed embeddings.

        Args:
            embeddings: Exactly three embedding tensors, in
                (anchor, positive, negative) order.
            labels: Unused; kept for signature compatibility with ``forward``.

        Returns:
            Mean over the batch of ``relu(d_pos - d_neg + triplet_margin)``.

        Raises:
            ValueError: If ``embeddings`` does not contain exactly three items
                (raised by the unpacking below).
        """
        rep_anchor, rep_pos, rep_neg = embeddings
        distance_pos = self.distance_metric(rep_anchor, rep_pos)
        distance_neg = self.distance_metric(rep_anchor, rep_neg)

        # A triplet only contributes when the positive is not yet closer than
        # the negative by at least the margin.
        losses = F.relu(distance_pos - distance_neg + self.triplet_margin)
        return losses.mean()

    def get_config_dict(self) -> dict[str, Any]:
        """Return the loss configuration as a plain dict.

        The metric is reported as ``"TripletDistanceMetric.<NAME>"`` when it is
        one of the attributes of ``TripletDistanceMetric``; otherwise its
        ``__name__`` is used, falling back to ``repr`` for callables that have
        no ``__name__`` (e.g. ``functools.partial`` objects).
        """
        distance_metric_name = getattr(self.distance_metric, "__name__", repr(self.distance_metric))
        for name, value in vars(TripletDistanceMetric).items():
            # Identity is what is meant here: is this the very callable stored on the enum class?
            if value is self.distance_metric:
                distance_metric_name = f"TripletDistanceMetric.{name}"
                break

        return {"distance_metric": distance_metric_name, "triplet_margin": self.triplet_margin}

    @property
    def citation(self) -> str:
        """BibTeX entry for the paper associated with this loss."""
        return """
@misc{hermans2017defense,
    title={In Defense of the Triplet Loss for Person Re-Identification},
    author={Alexander Hermans and Lucas Beyer and Bastian Leibe},
    year={2017},
    eprint={1703.07737},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}
"""

# 5. Building hard Training set

In [None]:
# =============================================
## III. Building hard Training set
# =============================================
# 2a. Generate hard dataset
import torch
import gc
from sentence_transformers.util import mine_hard_negatives
from datasets import Dataset
from typing import Literal, Union

def generate_hard_negatives_dataset(
    dataset: Union[Dataset, list[dict]],
    model,
    anchor_column_name: str = "anchor",
    positive_column_name: str = "positive",
    num_negatives: int = 1,
    range_min: int = 5,
    range_max: int = 300,
    max_score: float = 0.8,
    margin: float = 0.05,
    relative_margin: float = 0.1,
    sampling_strategy: Literal["top", "random"] = "random",
    batch_size: int = 128,
    as_triplets: bool = True,
    use_faiss: bool = True,
    verbose: bool = True,
):
    """
    Generate a hard-negatives dataset for training a Bi-Encoder or Cross-Encoder.

    Thin wrapper around ``mine_hard_negatives``: every mining argument is
    forwarded unchanged.

    Args:
        dataset: Input data, a Hugging Face Dataset or a list of dicts holding
            the anchor / positive fields.
        model: SentenceTransformer (already trained or being fine-tuned) used for mining.
        anchor_column_name: Name of the anchor column (e.g. the question).
        positive_column_name: Name of the positive column (e.g. the correct answer).
        num_negatives: Number of negatives to mine per anchor.
        range_min: Lower bound of the candidate search range.
        range_max: Upper bound of the candidate search range.
        max_score: Maximum anchor-candidate similarity for a candidate to be
            accepted as a negative.
        margin: Required gap between anchor-positive and anchor-negative scores.
        relative_margin: Relative variant of ``margin``.
        sampling_strategy: "top" (prefer the hardest candidates) or "random"
            (sample among the valid candidates).
        batch_size: Batch size used when computing embeddings.
        as_triplets: If True, produce (anchor, positive, negative) rows;
            otherwise the non-triplet layout of ``mine_hard_negatives``.
        use_faiss: Whether to use FAISS to speed up retrieval.
        verbose: If True, print the number of mined samples and one example.

    Returns:
        The object returned by ``mine_hard_negatives``.
    """

    # Free cached GPU memory and collect garbage before the memory-heavy mining
    # step, regardless of `verbose` (verbosity should only affect printing).
    torch.cuda.empty_cache()
    gc.collect()

    hard_dataset = mine_hard_negatives(
        dataset=dataset,
        model=model,
        anchor_column_name=anchor_column_name,
        positive_column_name=positive_column_name,
        num_negatives=num_negatives,
        range_min=range_min,
        range_max=range_max,
        max_score=max_score,
        margin=margin,
        relative_margin=relative_margin,
        sampling_strategy=sampling_strategy,
        batch_size=batch_size,
        as_triplets=as_triplets,
        use_faiss=use_faiss,
    )

    if verbose:
        print(f"Generated {len(hard_dataset)} hard training samples.")
        # Only show an example when mining produced at least one row.
        if len(hard_dataset) > 0:
            from pprint import pprint
            pprint(hard_dataset[0])

    return hard_dataset
#hard_dataset_random = generate_hard_negatives_dataset(dataset,model,'anchor','positive',num_negatives=1)

In [None]:
# 3a. Split hard dataset into train, validation set
from datasets import Dataset, ClassLabel, DatasetDict
import os


def prepare_train_valid_test_split(
    hard_dataset,
    original_dataset,
    test_size_fixed=2000,
    val_ratio=0.2,
    seed=42,
    output_dir="/kaggle/working/output_splits",
):
    """Split the hard-negatives dataset into train / validation / test sets.

    The 'level' column is attached from ``original_dataset`` (joined on
    'anchor'). A fixed-size test set is drawn with an (almost) equal number of
    rows per level, and the remaining rows are split into train and validation
    with stratification on 'level'. The three splits (including 'level') are
    written as JSON arrays to ``output_dir``.

    Args:
        hard_dataset: Dataset with an 'anchor' column, exposing ``to_pandas()``.
        original_dataset: Dataset with 'anchor' and 'level' columns.
        test_size_fixed: Total number of rows in the test set.
        val_ratio: Fraction of the non-test rows assigned to validation.
        seed: Base random seed for sampling and splitting.
        output_dir: Directory that receives train_set.json, validation_set.json
            and test_set.json.

    Returns:
        Tuple ``(train_set, validation_set, test_set)`` with the 'level' column removed.

    Raises:
        ValueError: If there are no rows/levels to split, or a level has fewer
            rows than its share of the test set.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Step 1: attach `level` to every hard sample (first level seen per anchor).
    df_hard = hard_dataset.to_pandas()
    df_source = original_dataset.to_pandas()[["anchor", "level"]]
    df_source_dedup = df_source.drop_duplicates(subset="anchor", keep="first")
    df_merged = df_hard.merge(df_source_dedup, on="anchor", how="left")

    # Step 2: encode `level` as strings so it can be cast to a ClassLabel later.
    unique_levels = sorted(set(original_dataset["level"]))
    class_label = ClassLabel(names=[str(l) for l in unique_levels])
    df_merged["level"] = df_merged["level"].astype(str)

    # Step 3: draw a fixed-size test set with the same number of rows per level;
    # the first `remainder` levels get one extra row.
    levels = sorted(df_merged["level"].unique())
    if not levels:
        raise ValueError("Cannot split an empty dataset: no 'level' values found.")
    n_per_class, remainder = divmod(test_size_fixed, len(levels))

    df_test_parts = []
    for i, level in enumerate(levels):
        sample_n = n_per_class + (1 if i < remainder else 0)
        level_rows = df_merged[df_merged["level"] == level]
        if sample_n > len(level_rows):
            raise ValueError(
                f"Level {level!r} has {len(level_rows)} rows but {sample_n} are required for the test set."
            )
        df_test_parts.append(level_rows.sample(n=sample_n, random_state=seed + i))

    # Keep the original row labels until the sampled rows have been removed from
    # the pool; resetting the index first would drop the wrong rows and leak
    # test samples into train/validation.
    df_test_sampled = pd.concat(df_test_parts)
    df_remaining = df_merged.drop(index=df_test_sampled.index).reset_index(drop=True)
    df_test = df_test_sampled.reset_index(drop=True)

    # Step 4: stratified train/validation split of the remaining rows;
    # `val_ratio` is the validation fraction.
    df_train, df_valid = train_test_split(
        df_remaining,
        test_size=val_ratio,
        stratify=df_remaining["level"],
        random_state=seed,
    )

    # Step 5: convert back to Dataset objects and cast `level` to ClassLabel.
    def convert(df):
        df = df.reset_index(drop=True)
        dset = Dataset.from_pandas(df)
        dset = dset.map(lambda x: {"level": str(x["level"])})
        return dset.cast_column("level", class_label)

    train_set = convert(df_train)
    validation_set = convert(df_valid)
    test_set = convert(df_test)

    # Print summary statistics for each split.
    print(f"Train size: {(train_set)}")
    print(train_set.to_pandas()["level"].value_counts(normalize=True,sort=True), "\n")

    print(f"Validation size: {(validation_set)}")
    print(validation_set.to_pandas()["level"].value_counts(normalize=True,sort=True), "\n")

    print(f"Test size: {(test_set)}")
    print(test_set.to_pandas()["level"].value_counts(normalize=True,sort=False))

    # Step 6: save each split as one pretty-printed JSON array (not JSON Lines).
    def save_to_json(dataset, path):
        import json
        df = dataset.to_pandas()
        records = df.to_dict(orient="records")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)

    os.makedirs(output_dir, exist_ok=True)

    save_to_json(train_set, f"{output_dir}/train_set.json")
    save_to_json(validation_set, f"{output_dir}/validation_set.json")
    save_to_json(test_set, f"{output_dir}/test_set.json")

    return train_set.remove_columns("level"), validation_set.remove_columns("level"), test_set.remove_columns("level")
#train_set, validation_set, test_set = prepare_train_valid_test_split(hard_dataset_random, dataset)

In [None]:
from datasets import load_dataset

# Load the previously exported test / validation / train splits from the Kaggle
# input directory. Each JSON file is read as a single 'train' split and its
# 'level' column is removed before use.
# NOTE(review): the "margin0.05" folder name presumably refers to the margin used
# when mining the hard negatives — confirm.
test_set = load_dataset("json", data_files="/kaggle/input/stock-dataset/dataset/finetune_sentenceTransformer/margin0.05/test_set.json")['train'].remove_columns("level")
print(test_set)

validation_set = load_dataset("json", data_files="/kaggle/input/stock-dataset/dataset/finetune_sentenceTransformer/margin0.05/validation_set.json")['train'].remove_columns("level")
print(validation_set)

train_set = load_dataset("json", data_files="/kaggle/input/stock-dataset/dataset/finetune_sentenceTransformer/margin0.05/train_set.json")['train'].remove_columns("level")
print(train_set)

# 6. TripletEvaluator

In [None]:
import torch
import gc
from sentence_transformers.evaluation import TripletEvaluator
from typing import List, Dict

def evaluate_with_multiple_margins(
    model,
    dataset: dict,
    margins: List[float],
    name_prefix: str = "eval",
    batch_size: int = 128,
    show_progress_bar: bool = False,
    verbose: bool = True
) -> Dict[float, float]:
    """Run a TripletEvaluator on the same triplets once per margin.

    Args:
        model: Model passed to each evaluator call.
        dataset: Mapping-like object with "anchor", "positive" and "negative" columns.
        margins: Margins to evaluate, in order.
        name_prefix: Prefix of each evaluator name (``"{name_prefix}_margin_{margin}"``).
        batch_size: Batch size forwarded to the evaluator.
        show_progress_bar: Forwarded to the evaluator.
        verbose: If True, print each margin together with its evaluation result.

    Returns:
        Dict mapping each margin to whatever the evaluator call returned for it.
        NOTE(review): depending on the installed sentence-transformers version
        that value may be a metrics dict rather than a single float — confirm.
    """
    torch.cuda.empty_cache()
    gc.collect()

    # Read the three columns once instead of once per margin.
    anchors = dataset["anchor"]
    positives = dataset["positive"]
    negatives = dataset["negative"]

    results = {}

    for margin in margins:
        evaluator = TripletEvaluator(
            anchors=anchors,
            positives=positives,
            negatives=negatives,
            name=f"{name_prefix}_margin_{margin}",
            margin=margin,
            batch_size=batch_size,
            show_progress_bar=show_progress_bar
        )
        accuracy = evaluator(model)
        results[margin] = accuracy
        if verbose:
            print(margin, accuracy)
    return results

In [None]:
from sentence_transformers.evaluation import TripletEvaluator
def triplet_evaluator(dataset,name:str,margin:float,batch_size=128):
    """Build a TripletEvaluator over the triplet columns of ``dataset``.

    Args:
        dataset: Mapping-like object with "anchor", "positive" and "negative" columns.
        name: Name given to the evaluator.
        margin: Margin forwarded to the evaluator.
        batch_size: Batch size forwarded to the evaluator.

    Returns:
        The constructed TripletEvaluator (progress bar disabled).
    """
    # Release cached GPU memory and run garbage collection before building it.
    torch.cuda.empty_cache()
    gc.collect()

    evaluator_options = dict(
        anchors=dataset["anchor"],
        positives=dataset["positive"],
        negatives=dataset["negative"],
        name=name,
        margin=margin,
        batch_size=batch_size,
        show_progress_bar=False,
    )
    return TripletEvaluator(**evaluator_options)

# 7. Training Arguments

In [None]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import BatchSamplers

def traing_argument(
    name: str,
    num_epochs: int,
    batch_size: int,
    accumulation: int,
    learning_rate: float,
    warmup_ratio: float = 0.1,
    weight_decay: float = 0.01,
    max_grad_norm: float = 1.0,
    lr_scheduler_type: str = "linear",
):
    """Build the SentenceTransformerTrainingArguments for one training run.

    Prints the learning rate, the effective batch size
    (``batch_size * accumulation``) and the scheduler type, then returns the
    arguments object. Checkpoints go to ``models/{name}``; evaluation and
    checkpointing happen once per epoch and the best model (lowest
    ``eval_loss``) is reloaded at the end.

    Args:
        name: Run name; also the sub-directory of ``models/`` used for output.
        num_epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
        accumulation: Used for both gradient and eval accumulation steps.
        learning_rate: Learning rate.
        warmup_ratio: Warm-up ratio.
        weight_decay: Weight decay.
        max_grad_norm: Gradient clipping norm.
        lr_scheduler_type: Learning-rate scheduler type.

    Returns:
        The configured SentenceTransformerTrainingArguments instance.
    """
    print(f"Learning_rate: {learning_rate}\nBatch_size: {batch_size*accumulation}\nScheduler: {lr_scheduler_type}")

    # Optimisation and batching settings.
    optimisation = {
        "num_train_epochs": num_epochs,
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "learning_rate": learning_rate,
        "warmup_ratio": warmup_ratio,
        "weight_decay": weight_decay,
        "max_grad_norm": max_grad_norm,
        "gradient_accumulation_steps": accumulation,
        "eval_accumulation_steps": accumulation,
        "fp16": True,
        "bf16": False,
        "batch_sampler": BatchSamplers.NO_DUPLICATES,
        "lr_scheduler_type": lr_scheduler_type,
    }

    # Checkpointing and logging settings.
    bookkeeping = {
        "save_strategy": "epoch",
        "save_total_limit": 1,
        "logging_steps": 10,
        "logging_dir": "logs",
        "logging_strategy": "steps",
    }

    # Per-epoch evaluation and best-model selection on eval loss (lower is better).
    model_selection = {
        "eval_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "eval_loss",
        "greater_is_better": False,
    }

    # Reporting and reproducibility settings.
    reporting = {
        "report_to": ["tensorboard"],
        "disable_tqdm": False,
        "seed": 42,
        "run_name": name,
    }

    return SentenceTransformerTrainingArguments(
        output_dir=f"models/{name}",
        **optimisation,
        **bookkeeping,
        **model_selection,
        **reporting,
    )

# 8. Trainer

In [None]:
# 7. Create a trainer & train
import traceback
from transformers import EarlyStoppingCallback
def trainer(model,
            training_args,
            train_data,
            eval_data,
            loss_fn,
            evaluator,
            use_triplet:bool=True,
            early_stopping_patience: int = 2
    ):
    """Create a SentenceTransformerTrainer with early stopping.

    Args:
        model: Model to train.
        training_args: Training arguments passed to the trainer.
        train_data: Training dataset; expected to carry a "negative" column
            when hard negatives are used.
        eval_data: Evaluation dataset with the same columns as ``train_data``.
        loss_fn: Loss passed to the trainer.
        evaluator: Evaluator passed to the trainer.
        use_triplet: If True, train on the datasets as given. If False, the
            "negative" column is removed from both datasets so only the
            remaining columns (in-batch negatives) are used.
        early_stopping_patience: Patience of the early-stopping callback.

    Returns:
        The configured (not yet started) SentenceTransformerTrainer.
    """
    torch.cuda.empty_cache()
    gc.collect()

    print(f"Loss: {loss_fn}")

    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=early_stopping_patience,
        early_stopping_threshold=5e-4
    )

    if not use_triplet:
        print("Inbatch-negatives")
        # Strip the mined negatives and make sure the stripped datasets are the
        # ones actually handed to the trainer.
        if "negative" in train_data.column_names:
            train_data = train_data.remove_columns("negative")
        if "negative" in eval_data.column_names:
            eval_data = eval_data.remove_columns("negative")
    else:
        print("Triplet")

    # Use a distinct local name so the function's own name is not shadowed.
    st_trainer = SentenceTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        loss=loss_fn,
        evaluator=evaluator,
        callbacks=[early_stopping]
    )
    return st_trainer

# Training

In [None]:
# Build the validation-set triplet evaluator (margin 0.1) and print its result
# for the current state of `model`.
dev_evaluator = triplet_evaluator(dataset=validation_set,name='validation',margin=0.1)
print(dev_evaluator(model))

In [None]:
# Evaluate the current `model` on the test triplets across a sweep of margins.
margins_to_test = [0.05, 0.1,0.15, 0.2, 0.3,0.4,0.5,0.6,0.7,0.8]
results = evaluate_with_multiple_margins(model=model,dataset=test_set,margins=margins_to_test,name_prefix="test_set")

In [None]:
from sentence_transformers.util import cos_sim

# model = SentenceTransformer("bkai-foundation-models/vietnamese-bi-encoder",
#     model_card_data=SentenceTransformerModelCardData(language="vi",license="apache-2.0",model_name="Vietnames-Biencoder finetuned on domain stock",)
# )
# model_mpnet = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
# model_miniLM = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Candidate losses; only `triplet_loss` is passed to the trainer below.
mnr_loss = MultipleNegativesRankingLoss(model=model,similarity_fct=cos_sim,scale=20.0)
triplet_loss = TripletLoss(model=model,distance_metric=TripletDistanceMetric.COSINE,triplet_margin=0.2)

args = traing_argument(
    name="biencoder_StockVN",
    num_epochs=25,
    batch_size=64,
    accumulation=2,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    lr_scheduler_type="cosine"
)
# Bind the trainer instance to its own name: assigning it to `trainer` would
# overwrite the `trainer()` factory function and make this cell fail on re-run.
st_trainer = trainer(model,
                     args,
                     train_set,
                     validation_set,
                     triplet_loss,
                     dev_evaluator,
                     use_triplet=True)
try:
    st_trainer.train()
    best_model_dir = st_trainer.state.best_model_checkpoint
    if best_model_dir:
        # Use the same separator-stripped base for the archive and for the
        # reported path so the printed location is where the zip really is.
        archive_base = best_model_dir.rstrip(os.sep)
        zip_output = archive_base + ".zip"
        shutil.make_archive(base_name=archive_base, format='zip', root_dir=best_model_dir)
        print(f"✅ Đã nén model tốt nhất tại: {zip_output}")
    else:
        print("⚠️ Không tìm thấy checkpoint tốt nhất!")
except Exception:
    # Best-effort cell: keep the notebook running, but show the full traceback
    # rather than only the exception message.
    traceback.print_exc()

In [None]:
# Repeat the margin sweep on the test triplets, this time after the training
# cell has run, so the results can be compared with the pre-training sweep.
margins_to_test = [0.05, 0.1,0.15, 0.2, 0.3,0.4,0.5,0.6,0.7,0.8]
results = evaluate_with_multiple_margins(model=model,dataset=test_set,margins=margins_to_test,name_prefix="test_set")