# Install packages

In [None]:
# InstallSet-fit
!python -m pip install setfit
!python -m pip install setfit[optuna]
! pip install evaluate



In [None]:
# Install Neptune to track performance, together with the other packages listed on the line below
%pip install -U neptune transformers[torch,sklearn] datasets evaluate scipy

In [None]:
import neptune
# Project name in "workspace/project" form — presumably intended for neptune.init_run; confirm
project = "dducl/Dissertation-SETFIT"

In [None]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from transformers.integrations import NeptuneCallback
#from getpass import getpass

In [None]:
import os
from getpass import getpass

# Start the Neptune run used for experiment tracking.
# The project name comes from the `project` variable defined in an earlier cell.
# The API token is taken from the NEPTUNE_API_TOKEN environment variable when it is
# set, otherwise it is prompted for interactively, so no credential is ever stored
# in the notebook source.
run = neptune.init_run(
    project=project,
    api_token=os.environ.get("NEPTUNE_API_TOKEN") or getpass("Neptune API token: "),
)

In [None]:
# Create a Neptune callback bound to `run` so training data can be logged to it
# NOTE(review): neptune_callback is not passed to any trainer in the cells of this
# notebook — confirm where it is meant to be attached.
neptune_callback = NeptuneCallback(
    run=run,
    log_checkpoints='best',
)

# Data loading and transformation

In [None]:
# Read the train and test parquet files into one dataset with two named splits
data_files = {
    'train': '/content/drive/MyDrive/Dissertation/Data/train_df.parquet',
    'test': '/content/drive/MyDrive/Dissertation/Data/test_df.parquet',
}
dataset = load_dataset("parquet", data_files=data_files)

In [None]:
# For each split: drop the '__index_level_0__' column, then rename the text and
# label columns to the names used for modelling ("text" and "label").
train_ds = (
    dataset['train']
    .remove_columns('__index_level_0__')
    .rename_column("processed_text", "text")
    .rename_column("label_cat", "label")
)
test_ds = (
    dataset['test']
    .remove_columns('__index_level_0__')
    .rename_column("processed_text", "text")
    .rename_column("label_cat", "label")
)

# Baseline SetFit model

In [None]:
# Baseline model: the sentence-transformers all-MiniLM-L6-v2 checkpoint
model_base = SetFitModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Weighted F1 metric used to score the trainers
import evaluate
import numpy as np

f1_metric = evaluate.load("f1")


def compute_metrics(y_pred, y_test):
    """Score predictions against references with weighted-average F1.

    Args:
        y_pred: predicted labels.
        y_test: reference labels.

    Returns:
        Whatever ``f1_metric.compute`` returns; the recorded evaluation output
        in this notebook shows a dict with the single key ``'f1'``.
    """
    return f1_metric.compute(predictions=y_pred, references=y_test, average="weighted")

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [None]:
# Trainer for the baseline model, scored with compute_metrics
trainer_base = SetFitTrainer(
    model=model_base,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    column_mapping={"text": "text", "label": "label"},
    metric=compute_metrics,
    loss_class=CosineSimilarityLoss,
    num_iterations=20,  # pair-generation iterations for contrastive learning
    num_epochs=2,
    batch_size=16,
)

In [None]:
# Fine-tune the baseline model with the settings given to trainer_base
trainer_base.train()

Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 10440
  Num epochs = 2
  Total optimization steps = 1306
  Total train batch size = 16


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/653 [00:00<?, ?it/s]

Iteration:   0%|          | 0/653 [00:00<?, ?it/s]

In [None]:
# Evaluate the baseline trainer; its metric is compute_metrics (weighted F1).
# Scores of 0.74 and 0.78 were noted here previously, labelled "accuracy" —
# TODO confirm which metric those figures refer to.
metrics = trainer_base.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.7594877344877344}

# Legal SetFit model

In [None]:
# Law-related SetFit checkpoint (the identifier looks like a Hugging Face Hub repo id)
model_legal = SetFitModel.from_pretrained("mitra-mir/setfit-model-Feb11-Misinformation-on-Law")

In [None]:
# Trainer for the legal model; same settings as the baseline trainer, only the model differs
trainer_legal = SetFitTrainer(
    model=model_legal,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    column_mapping={"text": "text", "label": "label"},
    metric=compute_metrics,
    loss_class=CosineSimilarityLoss,
    num_iterations=20,  # pair-generation iterations for contrastive learning
    num_epochs=2,
    batch_size=16,
)

In [None]:
# Fine-tune the legal model with the settings given to trainer_legal
trainer_legal.train()

In [None]:
# Evaluate the legal trainer; its metric is compute_metrics (weighted F1)
trainer_legal.evaluate()

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.8346681096681097}

# Save trained SetFit models

In [None]:
# Save the trained baseline SetFit model.
# Uses the public save_pretrained entry point instead of the private _save_pretrained hook.
save_directory = '/content/drive/MyDrive/Dissertation/SETFIT/base'
trainer_base.model.save_pretrained(save_directory=save_directory)

In [None]:
# Save the trained legal SetFit model.
# Uses the public save_pretrained entry point instead of the private _save_pretrained hook.
save_directory = '/content/drive/MyDrive/Dissertation/SETFIT/law'
trainer_legal.model.save_pretrained(save_directory=save_directory)

# Hyperparameter search

In [None]:
from setfit import SetFitModel
# Directories of the two saved SetFit models.
# NOTE(review): these assignments rebind trainer_base / trainer_legal from the
# SetFitTrainer objects created earlier to plain path strings. trainer_legal is
# read by model_init_legal; model_init_base spells the base path out literally.
trainer_base = '/content/drive/MyDrive/Dissertation/SETFIT/base'
trainer_legal = '/content/drive/MyDrive/Dissertation/SETFIT/law'

In [None]:
# Weighted F1 metric (repeats the definition given earlier in the notebook)
import evaluate
import numpy as np

f1_metric = evaluate.load("f1")


def compute_metrics(y_pred, y_test):
    """Score predictions against references with weighted-average F1.

    Args:
        y_pred: predicted labels.
        y_test: reference labels.

    Returns:
        Whatever ``f1_metric.compute`` returns for the given labels.
    """
    return f1_metric.compute(predictions=y_pred, references=y_test, average="weighted")

## Base SetFit hyperparameter search (adapted from tomaarsen, 2023)
Source: https://github.com/huggingface/setfit

In [None]:
# Model factory for the base-model hyperparameter search
def model_init_base(params):
    """Load the saved base SetFit model with a head configured from ``params``.

    Args:
        params: dict of trial hyperparameters, or None. Only "max_iter"
            (default 100) and "solver" (default "liblinear") are read.

    Returns:
        The result of ``SetFitModel.from_pretrained`` for the saved base
        model directory, called with the two values as ``head_params``.
    """
    trial_params = params or {}
    head_params = {
        "max_iter": trial_params.get("max_iter", 100),
        "solver": trial_params.get("solver", "liblinear"),
    }
    return SetFitModel.from_pretrained(
        "/content/drive/MyDrive/Dissertation/SETFIT/base",
        head_params=head_params,
    )

def hp_space(trial):  # Training parameters
    """Draw one set of hyperparameters from ``trial``.

    Args:
        trial: object exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical``.

    Returns:
        dict with keys learning_rate, num_epochs, batch_size, seed,
        num_iterations, max_iter and solver, in that order.
    """
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    space["num_epochs"] = trial.suggest_int("num_epochs", 1, 2)
    space["batch_size"] = trial.suggest_categorical("batch_size", [4, 8, 16, 32, 64])
    space["seed"] = trial.suggest_int("seed", 1, 40)
    space["num_iterations"] = trial.suggest_categorical("num_iterations", [5, 10, 15])
    space["max_iter"] = trial.suggest_int("max_iter", 50, 300)
    space["solver"] = trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"])
    return space

In [None]:
# Hyperparameter search for the base model, followed by a final training run
# with the best trial's hyperparameters.
# metric=compute_metrics makes each trial's score the weighted F1 used by the
# baseline trainers, rather than the trainer's default metric.
# NOTE(review): eval_dataset is the test split, so the best trial is selected on
# the same data used for reporting — consider a separate validation split.
trainer_base_hypersearch = SetFitTrainer(
    train_dataset=train_ds,
    eval_dataset=test_ds,
    model_init=model_init_base,
    metric=compute_metrics,
    column_mapping={"text": "text", "label": "label"},
)
best_run = trainer_base_hypersearch.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=20)
# Rebuild the model with the winning hyperparameters, then train it
trainer_base_hypersearch.apply_hyperparameters(best_run.hyperparameters, final_model=True)
trainer_base_hypersearch.train()

In [None]:
# Save the base model produced by the hyperparameter search.
# Uses the public save_pretrained entry point instead of the private _save_pretrained hook.
save_directory = '/content/drive/MyDrive/Dissertation/SETFIT/base_hyp'
trainer_base_hypersearch.model.save_pretrained(save_directory=save_directory)

## Legal SetFit hyperparameter search

In [None]:
# Model factory for the legal-model hyperparameter search
def model_init_legal(params):
    """Load the saved legal SetFit model with a head configured from ``params``.

    Args:
        params: dict of trial hyperparameters, or None. Only "max_iter"
            (default 100) and "solver" (default "liblinear") are read.

    Returns:
        The result of ``SetFitModel.from_pretrained`` for ``trainer_legal``,
        called with the two values as ``head_params``.
    """
    trial_params = params or {}
    head_params = {
        "max_iter": trial_params.get("max_iter", 100),
        "solver": trial_params.get("solver", "liblinear"),
    }
    # trainer_legal is a module-level name; an earlier cell assigns it the
    # saved legal model's directory path.
    return SetFitModel.from_pretrained(trainer_legal, head_params=head_params)

def hp_space(trial):  # Training parameters
    """Draw one set of hyperparameters from ``trial``.

    Repeats the ``hp_space`` defined for the base-model search, with the same
    search space.

    Args:
        trial: object exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical``.

    Returns:
        dict with keys learning_rate, num_epochs, batch_size, seed,
        num_iterations, max_iter and solver, in that order.
    """
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    space["num_epochs"] = trial.suggest_int("num_epochs", 1, 2)
    space["batch_size"] = trial.suggest_categorical("batch_size", [4, 8, 16, 32, 64])
    space["seed"] = trial.suggest_int("seed", 1, 40)
    space["num_iterations"] = trial.suggest_categorical("num_iterations", [5, 10, 15])
    space["max_iter"] = trial.suggest_int("max_iter", 50, 300)
    space["solver"] = trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"])
    return space

In [None]:
# Hyperparameter search for the legal model, followed by a final training run
# with the best trial's hyperparameters.
# metric=compute_metrics makes each trial's score the weighted F1 used by the
# baseline trainers, rather than the trainer's default metric.
# NOTE(review): eval_dataset is the test split, so the best trial is selected on
# the same data used for reporting — consider a separate validation split.
trainer_legal_hypersearch = SetFitTrainer(
    train_dataset=train_ds,
    eval_dataset=test_ds,
    model_init=model_init_legal,
    metric=compute_metrics,
    column_mapping={"text": "text", "label": "label"},
)
best_run = trainer_legal_hypersearch.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=20)
# Rebuild the model with the winning hyperparameters, then train it
trainer_legal_hypersearch.apply_hyperparameters(best_run.hyperparameters, final_model=True)
trainer_legal_hypersearch.train()

[I 2023-07-25 15:09:55,278] A new study created in memory with name: no-name-8bd195c1-77ae-44f8-869a-9af82d8c6154
Trial: {'learning_rate': 2.0579903115962447e-05, 'num_epochs': 2, 'batch_size': 64, 'seed': 28, 'num_iterations': 5, 'max_iter': 58, 'solver': 'lbfgs'}
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2610
  Num epochs = 2
  Total optimization steps = 82
  Total train batch size = 64


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/41 [00:00<?, ?it/s]

Iteration:   0%|          | 0/41 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[I 2023-07-25 16:10:44,378] Trial 0 finished with value: 0.8181818181818182 and parameters: {'learning_rate': 2.0579903115962447e-05, 'num_epochs': 2, 'batch_size': 64, 'seed': 28, 'num_iterations': 5, 'max_iter': 58, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8181818181818182.
Trial: {'learning_rate': 2.072307282862749e-06, 'num_epochs': 2, 'batch_size': 4, 'seed': 19, 'num_iterations': 5, 'max_iter': 75, 'solver': 'lbfgs'}
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2610
  Num epochs = 2
  Total optimization steps = 1306
  Total train batch size = 4


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/653 [00:00<?, ?it/s]

Iteration:   0%|          | 0/653 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[I 2023-07-25 17:13:36,853] Trial 1 finished with value: 0.8484848484848485 and parameters: {'learning_rate': 2.072307282862749e-06, 'num_epochs': 2, 'batch_size': 4, 'seed': 19, 'num_iterations': 5, 'max_iter': 75, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8484848484848485.
Trial: {'learning_rate': 5.380556433677042e-06, 'num_epochs': 2, 'batch_size': 8, 'seed': 39, 'num_iterations': 15, 'max_iter': 149, 'solver': 'newton-cg'}
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/15 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 7830
  Num epochs = 2
  Total optimization steps = 1958
  Total train batch size = 8


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/979 [00:00<?, ?it/s]

Iteration:   0%|          | 0/979 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[I 2023-07-25 19:53:34,431] Trial 2 finished with value: 0.8636363636363636 and parameters: {'learning_rate': 5.380556433677042e-06, 'num_epochs': 2, 'batch_size': 8, 'seed': 39, 'num_iterations': 15, 'max_iter': 149, 'solver': 'newton-cg'}. Best is trial 2 with value: 0.8636363636363636.
Trial: {'learning_rate': 4.695136801584497e-06, 'num_epochs': 1, 'batch_size': 4, 'seed': 2, 'num_iterations': 5, 'max_iter': 137, 'solver': 'liblinear'}
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2610
  Num epochs = 1
  Total optimization steps = 653
  Total train batch size = 4


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/653 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[I 2023-07-25 20:24:24,420] Trial 3 finished with value: 0.8484848484848485 and parameters: {'learning_rate': 4.695136801584497e-06, 'num_epochs': 1, 'batch_size': 4, 'seed': 2, 'num_iterations': 5, 'max_iter': 137, 'solver': 'liblinear'}. Best is trial 2 with value: 0.8636363636363636.
Trial: {'learning_rate': 4.637083834518471e-06, 'num_epochs': 1, 'batch_size': 8, 'seed': 8, 'num_iterations': 15, 'max_iter': 52, 'solver': 'lbfgs'}
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/15 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 7830
  Num epochs = 1
  Total optimization steps = 979
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/979 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[I 2023-07-25 21:43:53,066] Trial 4 finished with value: 0.8636363636363636 and parameters: {'learning_rate': 4.637083834518471e-06, 'num_epochs': 1, 'batch_size': 8, 'seed': 8, 'num_iterations': 15, 'max_iter': 52, 'solver': 'lbfgs'}. Best is trial 2 with value: 0.8636363636363636.
Trial: {'learning_rate': 3.117056995205337e-05, 'num_epochs': 2, 'batch_size': 64, 'seed': 14, 'num_iterations': 5, 'max_iter': 104, 'solver': 'lbfgs'}
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 2610
  Num epochs = 2
  Total optimization steps = 82
  Total train batch size = 64


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/41 [00:00<?, ?it/s]

In [None]:
# Save the legal model produced by the hyperparameter search.
# Uses the public save_pretrained entry point instead of the private _save_pretrained hook.
save_directory = '/content/drive/MyDrive/Dissertation/SETFIT/law_hyp'
trainer_legal_hypersearch.model.save_pretrained(save_directory=save_directory)