# Imports

In [1]:
import os

import torch
from datasets import load_dataset, Dataset # Kept for user's potential future use
from peft import LoraConfig, TaskType, get_peft_model # Ensure LoraConfig and TaskType are imported
from sentence_transformers import SentenceTransformer, InputExample, losses, SentenceTransformerModelCardData
from sentence_transformers import (
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, InformationRetrievalEvaluator
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.metrics.pairwise import cosine_similarity # For inference test

  from .autonotebook import tqdm as notebook_tqdm


# 1. Configuration & Setup

In [None]:
# --- Model and data locations ---
base_model_id = "Alibaba-NLP/gte-multilingual-base"
new_model_name = "gte-finance-model" # Path for the final fine-tuned model
train_csv_file = "train_df.csv"
eval_csv_file = "eval_df.csv"
# Load through the configured paths (not repeated literals) so that editing
# train_csv_file / eval_csv_file above cannot leave these two objects pointing
# at stale files.
train_df = load_dataset("csv", data_files=train_csv_file)
eval_df = load_dataset("csv", data_files=eval_csv_file)


# --- LoRA Parameters ---
lora_r = 16          # LoRA attention dimension (rank)
lora_alpha = 32      # Alpha parameter for LoRA scaling
lora_dropout = 0.1   # Dropout probability for LoRA layers
# Names of the sub-modules that receive LoRA weights; they must match the
# module names of the base model's architecture.
lora_target_modules = ["qkv_proj", "o_proj", "up_gate_proj", "down_proj"]

# --- Training Parameters ---
training_output_dir = f"{new_model_name}_training_checkpoints" # Directory for checkpoints
num_train_epochs = 4    # Adjust as needed
train_batch_size = 8    # Adjust based on your VRAM
eval_batch_size = 16    # Adjust based on your VRAM
learning_rate = 2e-5
warmup_ratio = 0.1      # Percentage of training steps for warmup
model_max_length = 512  # Max sequence length applied to the model before training

# Echo the effective configuration so a run's output records what was used.
print("=== LoRA Fine-tuning for Sentence Embedding Models (from CSV - No Error Handling) ===")
print(f"Base model: {base_model_id}")
print(f"Output model name: {new_model_name}")
print(f"Training data CSV: {train_csv_file}")
print(f"Evaluation data CSV: {eval_csv_file}")
print(f"LoRA r: {lora_r}, LoRA alpha: {lora_alpha}, LoRA dropout: {lora_dropout}")
print(f"LoRA Target modules: {lora_target_modules} (CRITICAL: Verify these!)")
print(f"Max sequence length: {model_max_length}")
print("=================================================================================\n")

# 2. Load and Prepare Dataset from CSV

In [None]:
def prepare_input_examples_for_evaluator(dataset, stage="evaluator_data_prep", max_row_warnings=5):
    """
    Build (query, corpus) InputExample pairs from a collection of row mappings.

    Each row is read with ``.get('query')`` and ``.get('corpus')``. Rows where
    either value is None are skipped, and non-string values are converted with
    ``str()``. Every example that is created carries ``label=1.0``.

    Args:
        dataset: Sized, indexable collection of dict-like rows (a Hugging Face
            Dataset in this notebook), or None.
        stage: Label inserted into the printed messages.
        max_row_warnings: Upper bound on the number of per-row warnings printed
            for each kind of problem. Anything beyond it is reported once in a
            summary line, so a missing column does not print one line per row.

    Returns:
        list: The InputExample objects created; empty when `dataset` is None,
        empty, or when no row has both values.
    """
    input_examples = []
    if dataset is None or len(dataset) == 0:
        print(f"No data provided or dataset is empty for {stage}.")
        return input_examples

    query_col_name = 'query'
    corpus_col_name = 'corpus'

    # Inspect the first row only: it is a cheap early hint that a column is absent.
    sample_item = dataset[0]
    for col_name in (query_col_name, corpus_col_name):
        if col_name not in sample_item:
            print(f"Warning: Column '{col_name}' not found in {stage} data sample. Ensure your CSV has this column. Current columns: {list(sample_item.keys())}")

    skipped_rows = 0
    converted_rows = 0
    for item_index, item in enumerate(dataset):
        query_text = item.get(query_col_name) # Use .get() for safety if columns might be missing in some rows
        corpus_text = item.get(corpus_col_name)

        if query_text is None or corpus_text is None:
            skipped_rows += 1
            if skipped_rows <= max_row_warnings:
                print(f"Warning: Skipping item at index {item_index} in {stage} due to missing '{query_col_name}' or '{corpus_col_name}'.")
            continue

        if not isinstance(query_text, str) or not isinstance(corpus_text, str):
            converted_rows += 1
            if converted_rows <= max_row_warnings:
                print(f"Warning: Item at index {item_index} in {stage} data has non-string content. Query: '{str(query_text)[:50]}...', Corpus: '{str(corpus_text)[:50]}...'. Attempting to convert to string.")
            query_text = str(query_text)
            corpus_text = str(corpus_text)

        input_examples.append(InputExample(texts=[query_text, corpus_text], label=1.0))

    # One-line summaries replace the per-row warnings that were suppressed above.
    if skipped_rows > max_row_warnings:
        print(f"Warning: Skipped {skipped_rows} items in total in {stage} due to missing '{query_col_name}' or '{corpus_col_name}' ({skipped_rows - max_row_warnings} warnings suppressed).")
    if converted_rows > max_row_warnings:
        print(f"Warning: Converted {converted_rows} items with non-string content in total in {stage} ({converted_rows - max_row_warnings} warnings suppressed).")

    print(f"Created {len(input_examples)} InputExamples for {stage} from {len(dataset)} raw items.")
    return input_examples

print("Loading and preparing dataset from CSV files...")

# Columns the rest of the notebook reads by name.
required_columns = ("query", "corpus")

# Load training data - script will fail here if file not found or format is incorrect
train_dataset_dict = load_dataset("csv", data_files=train_csv_file)
train_data_from_csv = train_dataset_dict['train'] # This is a Hugging Face Dataset object
print(f"Successfully loaded training data from {train_csv_file}. Number of examples: {len(train_data_from_csv)}")
if len(train_data_from_csv) == 0: # Check if the loaded HF dataset is empty
    raise ValueError(f"Training data loaded from {train_csv_file} is empty. Ensure the file contains data and has 'query' and 'corpus' columns.")

# Fail fast with a readable message instead of an obscure error later on.
# NOTE(review): the trainer is handed this dataset as-is, so any extra CSV
# columns travel with it - confirm the file holds only the intended columns.
missing_train_columns = [name for name in required_columns if name not in train_data_from_csv.column_names]
if missing_train_columns:
    raise ValueError(
        f"Training data loaded from {train_csv_file} is missing required column(s) {missing_train_columns}. "
        f"Found columns: {train_data_from_csv.column_names}"
    )

# Load evaluation data - script will fail here if file not found or format is incorrect
eval_dataset_dict = load_dataset("csv", data_files=eval_csv_file)
eval_data_from_csv = eval_dataset_dict['train'] # This is a Hugging Face Dataset object
print(f"Successfully loaded evaluation data from {eval_csv_file}. Number of examples: {len(eval_data_from_csv)}")
if len(eval_data_from_csv) == 0: # Check if the loaded HF dataset is empty
    # An empty evaluation set is tolerated: only a warning is printed here.
    print(f"Warning: Evaluation data loaded from {eval_csv_file} is empty. Trainer evaluation might be skipped or may cause issues.")

# Prepare data specifically for the custom evaluator
eval_input_examples_for_custom_evaluator = prepare_input_examples_for_evaluator(eval_data_from_csv)

# 3. Initialize Model, Add LoRA Adapter

In [None]:
print(f"\nInitializing Sentence Transformer model: {base_model_id}")

# Describe the LoRA adapter from the values chosen in the configuration cell.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
)

# Load the base model, cap its sequence length, then attach the adapter.
model = SentenceTransformer(base_model_id, trust_remote_code=True)
model.max_seq_length = model_max_length
model.add_adapter(peft_config)

print("LoRA adapter added to the SentenceTransformer model (or underlying transformer).")

# 4. Setup Training Arguments, Loss, Evaluator, and Trainer

In [None]:
print("\nSetting up training components...")

# Evaluation is only possible when the evaluation split actually contains rows.
perform_evaluation = eval_data_from_csv is not None and len(eval_data_from_csv) > 0

train_loss = MultipleNegativesRankingLoss(model=model)

# The evaluation pairs are all positives. A similarity-correlation evaluator
# needs gold scores that vary (a correlation against a constant is undefined),
# so the pairs are scored as a retrieval task instead: each query should rank
# its own corpus passage(s) above every other passage.
evaluator = None
if eval_input_examples_for_custom_evaluator: # Check the list prepared for the custom evaluator
    ir_query_ids = {}      # query text  -> query id (deduplicates repeated queries)
    ir_doc_ids = {}        # corpus text -> document id (deduplicates repeated passages)
    ir_relevant_docs = {}  # query id    -> set of relevant document ids
    for example in eval_input_examples_for_custom_evaluator:
        query_text, corpus_text = example.texts
        query_id = ir_query_ids.setdefault(query_text, f"q{len(ir_query_ids)}")
        doc_id = ir_doc_ids.setdefault(corpus_text, f"d{len(ir_doc_ids)}")
        ir_relevant_docs.setdefault(query_id, set()).add(doc_id)

    ir_queries = {query_id: text for text, query_id in ir_query_ids.items()}
    ir_corpus = {doc_id: text for text, doc_id in ir_doc_ids.items()}

    print(f"Setting up InformationRetrievalEvaluator with {len(ir_queries)} queries and {len(ir_corpus)} corpus passages.")
    evaluator = InformationRetrievalEvaluator(
        queries=ir_queries,
        corpus=ir_corpus,
        relevant_docs=ir_relevant_docs,
        batch_size=eval_batch_size,
        name="financial_qa_csv_custom_eval", # Unique name for the evaluator
        show_progress_bar=False,
        write_csv=True,
    )
else:
    print("No data for InformationRetrievalEvaluator or eval_data_from_csv was empty. Skipping custom evaluator setup.")

args = SentenceTransformerTrainingArguments(
    output_dir=training_output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size, # This will be used if evaluation is performed
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    # MultipleNegativesRankingLoss treats the other samples of a batch as
    # negatives, so duplicate texts inside one batch would act as false negatives.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # The evaluation strategy defaults to "no", in which case neither
    # eval_dataset nor evaluator is ever run; enable it when there is eval data.
    eval_strategy="epoch" if perform_evaluation else "no",
    run_name=f"{os.path.basename(base_model_id)}-lora-finetune-csv-noerr-{lora_r}",
    # Logging and checkpoint saving keep their library defaults.
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_data_from_csv, # Pass the Hugging Face Dataset object
    eval_dataset=eval_data_from_csv if perform_evaluation else None, # Pass the Hugging Face Dataset object
    loss=train_loss,
    evaluator=evaluator, # Pass the custom evaluator if created
)

# 5. Training

In [None]:
# Run the training loop configured on `trainer`; the two prints bracket the
# call so the start and end of training are visible in the cell output.
print("\nStarting training...")
trainer.train()
print("Training completed!")

# 6. Save Final Model

In [None]:
# The final model is written to a directory named after `new_model_name`.
final_model_path = new_model_name
print(f"\nSaving fine-tuned SentenceTransformer model to: {final_model_path}")
os.makedirs(final_model_path, exist_ok=True)
model.save(
    path=final_model_path,
    model_name=os.path.basename(final_model_path),
)
print(f"Model saved to {final_model_path}")
# Both model loads in this notebook pass trust_remote_code=True, so the hint
# shows the same flag rather than a call that omits it.
print(f"You can load this model using: SentenceTransformer('{final_model_path}', trust_remote_code=True)")

# Additional: Reload and Inspect the Saved Model

In [3]:
# Reload the model from the 'gte-finance-model' directory (the same value as
# `new_model_name` in the configuration cell) and print its module structure.
# Note that this rebinds the name `model`.
model = SentenceTransformer('gte-finance-model', trust_remote_code=True)
print(model)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: PeftModelForFeatureExtraction 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [12]:
# Raise the sequence-length cap on the reloaded model to 1024 tokens.
model.max_seq_length = 1024

In [13]:
# NOTE(review): this encodes an empty list of sentences, which looks like a
# leftover smoke test - confirm whether a real sample input was intended here.
model.encode([])

SentenceTransformer(
  (0): Transformer({'max_seq_length': 1024, 'do_lower_case': False}) with Transformer model: PeftModelForFeatureExtraction 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)