In [None]:
!pip install pandas
!pip install datasets
!pip install sentence_transformers
!pip install transformers[torch]
!pip install gdown

In [14]:
import gdown

# Replace the URL below with your Google Drive shareable link
drive_url = "https://drive.google.com/uc?id=1YmdITHIV6BNwVQOEN-dHF8dBwO_CKCF5"

# Replace 'dataset.csv' with your desired filename
output_file = "output_dataset.csv"

# Download the file
gdown.download(drive_url, output_file, quiet=False)

print(f"File downloaded as: {output_file}")

Downloading...
From (original): https://drive.google.com/uc?id=1YmdITHIV6BNwVQOEN-dHF8dBwO_CKCF5
From (redirected): https://drive.google.com/uc?id=1YmdITHIV6BNwVQOEN-dHF8dBwO_CKCF5&confirm=t&uuid=4201e367-06cd-4845-b67b-18318b746b62
To: /workspace/output_dataset.csv
100%|██████████| 687M/687M [00:06<00:00, 103MB/s]  

File downloaded as: output_dataset.csv





In [17]:
import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CosineSimilarityLoss
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def finetune_embedding_model_v3(
    dataset_path,
    model_name="sentence-transformers/all-mpnet-base-v2",
    output_model_path="fine-tuned-embedding-model-v3",
    batch_size=32,
    epochs=3,
    learning_rate=1e-5
):
    """
    Fine-tunes a pre-trained embedding model using a dataset of query and lyric pairs
    (with a numeric 'label' column). Follows the Sentence Transformers v3 approach.

    Args:
        dataset_path (str): Path to the dataset CSV containing query, full_lyrics, and label.
        model_name (str): Name of the pre-trained Sentence Transformers model to finetune.
        output_model_path (str): Directory where the fine-tuned model will be saved.
        batch_size (int): Batch size for training.
        epochs (int): Number of training epochs.
        learning_rate (float): Learning rate for the optimizer.

    Returns:
        None. Saves the fine-tuned model to the specified output path.
    """

    # -------------------------
    # 1. Load & preprocess data
    # -------------------------
    df = pd.read_csv(dataset_path)
    # Ensure columns exist
    required_columns = ["query", "full_lyrics", "label"]
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # Convert label to float if necessary
    df["label"] = df["label"].astype(float)
    df = df.sample(n=40000, random_state=42)

    # Split into train/dev
    train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)

    train_df = train_df.dropna(subset=["query", "full_lyrics", "label"])
    eval_df = eval_df.dropna(subset=["query", "full_lyrics", "label"])

    # Convert pandas DFs to huggingface Dataset objects.
    # We want them in the format:  (text1, text2, label)
    # where text1= "query", text2="full_lyrics", label="label"
    train_dataset = Dataset.from_pandas(train_df[["query", "full_lyrics", "label"]])
    eval_dataset = Dataset.from_pandas(eval_df[["query", "full_lyrics", "label"]])
    train_dataset = train_dataset.remove_columns(["__index_level_0__"])
    eval_dataset = eval_dataset.remove_columns(["__index_level_0__"])

    print(train_dataset)
    print(eval_dataset)

    # ---------------
    # 2. Load a model
    # ---------------
    # This is your base pretrained model that you'll be fine-tuning
    model = SentenceTransformer(model_name)

    # -----------------
    # 3. Define a loss
    # -----------------
    # CosineSimilarityLoss expects (text1, text2, label) where label is a float similarity score
    loss = CosineSimilarityLoss(model)

    # ----------------------------------
    # 4. (Optional) Create an evaluator
    # ----------------------------------
    # If your labels are 0/1 or some floating scale, an EmbeddingSimilarityEvaluator can help track the model’s
    # performance on the dev set. Just ensure your labels make sense as a "similarity" measure.
    # For binary 0/1, you can still treat 1 => "similar" and 0 => "not similar", but
    # be aware that you'll get typical correlation metrics, etc.
    dev_evaluator = EmbeddingSimilarityEvaluator(
        sentences1=eval_dataset["query"],
        sentences2=eval_dataset["full_lyrics"],
        scores=eval_dataset["label"],
        main_similarity=SimilarityFunction.COSINE,
        name="my-dev-set",
    )
    dev_evaluator(model)
    # You can run dev_evaluator(model) manually if you’d like a baseline before training.

    # -----------------------------------------
    # 5. Specify training arguments and trainer
    # -----------------------------------------
    training_args = SentenceTransformerTrainingArguments(
        output_dir=output_model_path,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        # Optional but useful
        eval_strategy="steps",      # Evaluate every N steps
        eval_steps=225,                  # How often to run dev_evaluator
        save_strategy="steps",
        save_steps=225,
        warmup_steps=337,
        save_total_limit=2,
        logging_steps=100,
        fp16=True,  # Set to False if your GPU doesn’t support FP16
        # run_name="my-st-v3-run"  # If you have W&B or another logging tool
    )

    # Create the trainer
    trainer = SentenceTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,   # For eval loss
        loss=loss,
        evaluator=dev_evaluator,     # For additional metrics
    )

    # -----------------
    # 6. Train the model
    # -----------------
    print("Starting fine-tuning...")
    total_steps = (len(train_dataset) // batch_size) * epochs
    progress_bar = tqdm(total=total_steps, desc="Training Progress")

    # If you want to manually loop over epochs:
    for epoch in range(epochs):
        trainer.train()  # This runs one full epoch by default
        progress_bar.update(len(train_dataset) // batch_size)

    progress_bar.close()

    # Alternatively (and more simply):
    # trainer.train()   # would do all epochs automatically, but you'd lose the custom progress_bar above

    # -------------------------
    # 7. Evaluate final metrics
    # -------------------------
    if dev_evaluator is not None:
        print("Final evaluation on dev set:")
        print(dev_evaluator(model))

    # ---------------------------
    # 8. Save the fine-tuned model
    # ---------------------------
    model.save_pretrained(output_model_path)
    print(f"Model fine-tuned and saved to {output_model_path}")

In [18]:
finetune_embedding_model_v3(
    dataset_path = "output_dataset.csv"
)

Dataset({
    features: ['query', 'full_lyrics', 'label'],
    num_rows: 35892
})
Dataset({
    features: ['query', 'full_lyrics', 'label'],
    num_rows: 3992
})


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting fine-tuning...



Training Progress:   0%|          | 0/3363 [00:00<?, ?it/s][A

Step,Training Loss,Validation Loss,My-dev-set Pearson Cosine,My-dev-set Spearman Cosine
225,0.0639,0.05824,0.479995,0.369713
450,0.0556,0.052005,0.516648,0.344231
675,0.05,0.051278,0.559059,0.35991
900,0.0489,0.049911,0.562503,0.359318
1125,0.05,0.049428,0.583235,0.367651
1350,0.0467,0.048122,0.574608,0.360213
1575,0.0488,0.047949,0.574154,0.355734
1800,0.0452,0.047649,0.5661,0.348593
2025,0.0475,0.047472,0.56656,0.355099
2250,0.0477,0.047465,0.58528,0.361725


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]


Training Progress:  33%|███▎      | 1121/3363 [24:48<49:36,  1.33s/it][A

Step,Training Loss,Validation Loss,My-dev-set Pearson Cosine,My-dev-set Spearman Cosine
225,0.0381,0.04823,0.550775,0.342356
450,0.0371,0.048285,0.550373,0.336176
675,0.0362,0.052202,0.551306,0.336336
900,0.0354,0.048426,0.549791,0.334867
1125,0.036,0.047905,0.574196,0.348126
1350,0.0297,0.047115,0.558874,0.334133
1575,0.0323,0.05111,0.538228,0.330184
1800,0.0302,0.048626,0.542434,0.327884
2025,0.0342,0.047887,0.55166,0.337767
2250,0.0334,0.048656,0.557036,0.337676



Training Progress:  67%|██████▋   | 2242/3363 [49:37<24:48,  1.33s/it][A

Step,Training Loss,Validation Loss,My-dev-set Pearson Cosine,My-dev-set Spearman Cosine
225,0.0254,0.048839,0.54705,0.327964
450,0.0217,0.050662,0.535847,0.327927
675,0.0239,0.055982,0.519462,0.318151
900,0.0238,0.048238,0.526264,0.314759
1125,0.0251,0.050834,0.547683,0.329524
1350,0.0183,0.048799,0.529846,0.318289
1575,0.0201,0.052959,0.523394,0.318562
1800,0.0203,0.04963,0.529122,0.318791
2025,0.0238,0.048937,0.53486,0.320914
2250,0.0226,0.050623,0.530334,0.320643



Training Progress: 100%|██████████| 3363/3363 [1:14:28<00:00,  1.33s/it][A


Final evaluation on dev set:
{'my-dev-set_pearson_cosine': 0.5330700861787097, 'my-dev-set_spearman_cosine': 0.3202504036973425}
Model fine-tuned and saved to fine-tuned-embedding-model-v3
