In [None]:
!pip install datasets
!pip install -U sentence-transformers[onnx-gpu]
!pip install onnx onnxruntime-gpu

In [None]:
from google.colab import drive
drive.mount('/content/My_Drive')
%cd "/content/My_Drive/MyDrive/2025 Spring/15642"

In [None]:
import logging
import traceback
import copy
import os
import random
import torch
from datasets import load_dataset
from collections import Counter
import numpy as np
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator
from sentence_transformers.cross_encoder.losses import ListMLELoss
from sentence_transformers.cross_encoder.trainer import CrossEncoderTrainer
from sentence_transformers.cross_encoder.training_args import CrossEncoderTrainingArguments
import time
from sentence_transformers import export_dynamic_quantized_onnx_model
from datetime import datetime
from transformers import BertForSequenceClassification, AutoTokenizer

In [None]:
import os
os.environ["WANDB_API_KEY"] = "***REMOVED***"
import wandb
wandb.login()

In [None]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


## Initial Config

In [59]:
model_name = "tomaarsen/reranker-MiniLM-L12-gooaq-bce" # 33.4M model trained on gooaq, converged at the end of the 1st epoch, Nanomsmarco R100 Mrr@10 about 0.53, precision F32
# tomaarsen/reranker-msmarco-MiniLM-L12-H384-uncased-lambdaloss # 33.4M model trained on MS MARCO, converged at the beginning of the 1st epoch, precision F32, Nanomsmarco R100 Mrr@10 about 0.530746
# cross-encoder/ms-marco-MiniLM-L6-v2 # 22.7M SOTA model trained on MS MARCO, converged at the beginning, Nanomsmarco R100 Mrr@10 about 0.54, precision F32, base model microsoft/MiniLM-L12-H384-uncased

# Set the log level to INFO to get more information
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
NUM = 64
train_batch_size = NUM
eval_batch_size = NUM
mini_batch_size = NUM
num_epochs = 1
max_docs = None
respect_input_order = True  # Whether to respect the original order of documents

#Retrieve Time
TIME = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# 1. Define our CrossEncoder model
model = CrossEncoder(model_name, num_labels=1)

# print(model.model)

# # Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model max length:", model.max_length)
print("Model num labels:", model.num_labels)
print("Model is on device:", next(model.parameters()).device)

# 2. Load the MS MARCO dataset: https://huggingface.co/datasets/microsoft/ms_marco
logging.info("Read train dataset")
dataset = load_dataset("microsoft/ms_marco", "v1.1", split="train")

def listwise_mapper(batch, max_docs: int | None = 10):
    processed_queries = []
    processed_docs = []
    processed_labels = []

    for query, passages_info in zip(batch["query"], batch["passages"]):
        # Extract passages and labels
        passages = passages_info["passage_text"]
        labels = passages_info["is_selected"]

        # Pair passages with labels and sort descending by label (positives first)
        paired = sorted(zip(passages, labels), key=lambda x: x[1], reverse=True)

        # Separate back to passages and labels
        sorted_passages, sorted_labels = zip(*paired) if paired else ([], [])

        # Filter queries without any positive labels
        if max(sorted_labels) < 1.0:
            continue

        # Truncate to max_docs
        if max_docs is not None:
            sorted_passages = list(sorted_passages[:max_docs])
            sorted_labels = list(sorted_labels[:max_docs])

        processed_queries.append(query)
        processed_docs.append(sorted_passages)
        processed_labels.append(sorted_labels)

    return {
        "query": processed_queries,
        "docs": processed_docs,
        "labels": processed_labels,
    }

# Create a dataset with a "query" column with strings, a "docs" column with lists of strings,
# and a "labels" column with lists of floats
dataset = dataset.map(
    lambda batch: listwise_mapper(batch=batch, max_docs=max_docs),
    batched=True,
    remove_columns=dataset.column_names,
    desc="Processing listwise samples",
)

dataset = dataset.train_test_split(test_size=1_000)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
logging.info(train_dataset)

# 3. Define our training loss
loss = ListMLELoss(model, mini_batch_size=mini_batch_size, respect_input_order=respect_input_order)



Model max length: 512
Model num labels: 1
Model is on device: cuda:0


In [60]:
# 4. Define the evaluator. We use the CENanoBEIREvaluator, which is a light-weight evaluator for English reranking
evaluator = CrossEncoderNanoBEIREvaluator(dataset_names=["msmarco"], batch_size=eval_batch_size)
# evaluator(model)

# 5. Define the training arguments
short_model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
run_name = f"reranker-msmarco-v1.1-{short_model_name}-listmle-{TIME}"



In [61]:
args = CrossEncoderTrainingArguments(
    output_dir=f"models/{run_name}",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=True,  # Set to True if you have a GPU that supports BF16
    load_best_model_at_end=True,
    metric_for_best_model="eval_NanoBEIR_R100_mean_ndcg@10",
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=250,
    logging_first_step=True,
    run_name=run_name,
    seed=12,
    save_on_each_node=True
)



In [62]:
def profiling_evaluator(model, cpu: bool = False):
    """
    Run evaluator(model) and report:
      - wall-clock time
      - (if GPU) peak *new* GPU memory in MB
    """
    # 1) pick device
    device = torch.device("cpu") if cpu else torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # 2) GPU: warm up & snapshot baseline
    if device.type == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.synchronize()               # wait for any inflight ops
        start_alloc = torch.cuda.memory_allocated()
        torch.cuda.reset_peak_memory_stats()

    # 3) time the evaluation
    t0 = time.time()
    results = evaluator(model)
    if device.type == "cuda":
        torch.cuda.synchronize()
    t1 = time.time()

    # 4) report
    print("\n" + "-"*50)
    print(f"Evaluator wall time: {t1 - t0:.2f} seconds")

    if device.type == "cuda":
        peak_alloc = torch.cuda.max_memory_allocated()
        used_bytes = peak_alloc - start_alloc
        used_mb    = used_bytes / (1024**2)
        print(f"Peak *new* GPU memory: {used_mb:.2f} MB")
    else:
        print("Running on CPU—skipping GPU memory stats.")
    print("-"*50 + "\n")

    return results

## Pruning Pipeline

### Unstructured Pruning Trainer Config

In [63]:
import torch.nn.functional as F
class PruningTrainer(CrossEncoderTrainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_steps=50
        self.weight_dropout=0.2
        self.max_pruning_steps=500
        self.step_counter = 0
        self.masks = []
        self.prune_step = 1
        for blk in self.model.model.bert.encoder.layer:
            w = blk.intermediate.dense.weight
            mask = torch.ones_like(w, dtype=torch.bool, device=w.device)
            self.masks.append(mask)
        self.total = sum(p.numel() for p in self.model.model.parameters())


    def training_step(self, model: CrossEncoder, inputs: dict, batch_size: int) -> torch.Tensor:
        # run the normal forward/backward/optimizer.step()
        loss = super().training_step(model, inputs, batch_size)

        self.step_counter += 1
        with torch.no_grad():
            # Re‑apply the existing mask so zeroed weights stay zero
            for blk, mask in zip(model.model.bert.encoder.layer, self.masks):
                blk.intermediate.dense.weight.data.mul_(mask)

            # For each eval step, expand the mask by dropping a random subset of **remaining** weights
            if self.step_counter % self.eval_steps == 0 and self.step_counter <= self.max_pruning_steps:
                print(f"— Pruning at step {self.step_counter} —")
                new_masks = []
                if self.prune_step % 2 != 0:
                    for blk, old_mask in zip(model.model.bert.encoder.layer[:6], self.masks[:6]):
                        w = blk.intermediate.dense.weight
                        # randomly dropout only on the **currently unpruned** positions
                        keep_prob = 1.0 - self.weight_dropout
                        random_tensor = torch.rand_like(w)
                        drop_mask = (random_tensor < keep_prob)  # True = keep, False = drop
                        combined_mask = old_mask & drop_mask
                        # apply combined_mask
                        w.data.mul_(combined_mask)
                        new_masks.append(combined_mask)

                    for i in range(len(new_masks)):
                        self.masks[i] = new_masks[i]
                else:
                    for blk, old_mask in zip(model.model.bert.encoder.layer[6:], self.masks[6:]):
                        w = blk.intermediate.dense.weight
                        # randomly dropout only on the **currently unpruned** positions
                        keep_prob = 1.0 - self.weight_dropout
                        random_tensor = torch.rand_like(w)
                        drop_mask = (random_tensor < keep_prob)  # True = keep, False = drop
                        combined_mask = old_mask & drop_mask
                        # apply it
                        w.data.mul_(combined_mask)
                        new_masks.append(combined_mask)

                    for i in range(len(new_masks)):
                        self.masks[i+6] = new_masks[i]


            if self.step_counter % self.eval_steps == 0:
                # total   = sum(p.numel() for p in model.parameters())
                nonzero = sum(torch.count_nonzero(p).item() for p in model.parameters())
                print(f"Total params: {self.total:,},   non-zero: {nonzero:,}")
                print(f"Sparsity: {100 * (1 - nonzero / self.total):.2f}%")

        self.prune_step += 1
        return loss


In [None]:
# 6. Create the trainer & start training
trainer = PruningTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=evaluator,

)
trainer.train()
torch.cuda.empty_cache()
# 8. Save the final model
TIME = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
try:
    save_model_name = f"./models/random_mask_{TIME}"
    torch.save(model, f"{save_model_name}.pt")
    final_output_dir = save_model_name
    model.save_pretrained(final_output_dir)
except:
    print("Saving Failure")


In [67]:
profiling_evaluator(model)


--------------------------------------------------
Evaluator wall time: 3.96 seconds
Peak *new* GPU memory: 132.96 MB
--------------------------------------------------



{'NanoMSMARCO_R100_map': 0.49747865280923753,
 'NanoMSMARCO_R100_mrr@10': 0.4883015873015873,
 'NanoMSMARCO_R100_ndcg@10': 0.567446414649366,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.49747865280923753,
 'NanoBEIR_R100_mean_mrr@10': 0.4883015873015873,
 'NanoBEIR_R100_mean_ndcg@10': 0.567446414649366,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

### Structured Pruning

In [34]:
import torch
import torch.nn as nn
from sentence_transformers.cross_encoder.trainer import CrossEncoderTrainer
from sentence_transformers.cross_encoder.CrossEncoder import CrossEncoder

class RandomPruningTrainer(CrossEncoderTrainer):
    def __init__(
        self,
        *args,
        keep_ratio: float = 0.95,
        prune_every: int = 50,
        max_prune_rounds: int = 4,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        assert 0.0 < keep_ratio <= 1.0
        self.keep_ratio        = keep_ratio
        self.prune_every       = prune_every
        self.max_prune_rounds  = max_prune_rounds
        self.step_counter      = 0
        self.prune_round       = 0
        # model config
        cfg = self.model.model.config
        self.hidden_size       = cfg.hidden_size
        self.intermediate_size = cfg.intermediate_size  # original MLP size
        self.total_param       = sum(p.numel() for p in self.model.parameters())

    def training_step(self, model: CrossEncoder, inputs: dict, batch_size: int) -> torch.Tensor:
        # Standard forward/backward/optimizer
        loss = super().training_step(model, inputs, batch_size)
        self.step_counter += 1

        # Check if it's time to prune
        if (self.step_counter % self.prune_every == 0
            and self.prune_round < self.max_prune_rounds):

            print(f"— Random pruning round {self.prune_round+1}/{self.max_prune_rounds} at step {self.step_counter} —")
            hf = model.model  # BertForSequenceClassification

            # Determine which half to prune this round
            num_layers = len(hf.bert.encoder.layer)
            half = num_layers // 2
            start = 0 if (self.prune_round % 2 == 0) else half
            end   = start + half

            # Apply random structured pruning to selected layers
            for idx in range(start, end):
                layer = hf.bert.encoder.layer[idx]
                w_int = layer.intermediate.dense.weight.data  # [intermediate, hidden]
                n_neurons = w_int.size(0)
                k_keep = int(self.keep_ratio * n_neurons)
                if k_keep < 1:
                    continue

                # Randomly select k_keep neuron indices
                perm = torch.randperm(n_neurons, device=w_int.device)
                keep_idx = perm[:k_keep].sort().values

                # Rebuild intermediate.dense: hidden -> k_keep
                new_int = nn.Linear(self.hidden_size, k_keep,
                                    bias=layer.intermediate.dense.bias is not None)
                new_int.weight.data = w_int[keep_idx]
                if layer.intermediate.dense.bias is not None:
                    bias = layer.intermediate.dense.bias.data
                    new_int.bias.data = bias[keep_idx]
                layer.intermediate.dense = new_int

                # Rebuild output.dense: k_keep -> hidden
                w_out = layer.output.dense.weight.data  # [hidden, intermediate]
                new_out = nn.Linear(k_keep, self.hidden_size,
                                    bias=layer.output.dense.bias is not None)
                new_out.weight.data = w_out[:, keep_idx]
                if layer.output.dense.bias is not None:
                    new_out.bias.data = layer.output.dense.bias.data
                layer.output.dense = new_out

            # Log total parameters after pruning
            total_params = sum(p.numel() for p in model.parameters())
            print(f"Total params now: {total_params:,}")
            print(f"Sparsity: {100 * (1 - total_params / self.total_param):.2f}%")

            self.prune_round += 1

        return loss


In [None]:
# 6. Create the trainer & start training
trainer = RandomPruningTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=evaluator,

)
trainer.train()
torch.cuda.empty_cache()
# 8. Save the final model
TIME = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
try:
    save_model_name = f"./models/sp_{TIME}"
    torch.save(model, f"{save_model_name}.pt")
    final_output_dir = save_model_name
    model.save_pretrained(final_output_dir)
except:
    print("Saving Failure")


### Structured Pruning - L1 Norm

In [8]:
import torch
import torch.nn as nn
from sentence_transformers.cross_encoder.trainer import CrossEncoderTrainer
from sentence_transformers.cross_encoder.CrossEncoder import CrossEncoder

class StructuredPruningL1Trainer(CrossEncoderTrainer):
    def __init__(
        self,
        *args,
        keep_ratio: float = 0.95,        # fraction of neurons to keep globally per round
        prune_every: int = 300,           # prune every N steps
        max_prune_rounds: int = 4,       # total number of pruning rounds
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        assert 0.0 < keep_ratio <= 1.0
        self.keep_ratio       = keep_ratio
        self.prune_every      = prune_every
        self.max_prune_rounds = max_prune_rounds
        self.step_counter     = 0
        self.prune_round      = 0
        # model config
        cfg = self.model.model.config
        self.hidden_size       = cfg.hidden_size
        self.intermediate_size = cfg.intermediate_size
        self.total_param       = sum(p.numel() for p in self.model.parameters())

    def training_step(self, model: CrossEncoder, inputs: dict, batch_size: int) -> torch.Tensor:
        loss = super().training_step(model, inputs, batch_size)
        self.step_counter += 1

        # time to prune?
        if (self.step_counter % self.prune_every == 0
            and self.prune_round < self.max_prune_rounds):
            print(f"— Pruning round {self.prune_round+1} at step {self.step_counter} —")
            hf = model.model
            num_layers = len(hf.bert.encoder.layer)
            half = num_layers // 2

            # decide half based on round parity: odd rounds -> first half, even -> second half
            if self.prune_round % 2 == 0:
                layer_idxs = range(0, half)
            else:
                layer_idxs = range(half, num_layers)

            # Collect L1 scores only for selected layers
            scores = []  # (layer_idx, neuron_idx, score)
            for layer_idx in layer_idxs:
                layer = hf.bert.encoder.layer[layer_idx]
                w_int = layer.intermediate.dense.weight.data  # [inter, hidden]
                neuron_scores = w_int.abs().sum(dim=1)
                for n_idx, score in enumerate(neuron_scores.tolist()):
                    scores.append((layer_idx, n_idx, score))

            # Determine cutoff for this slice
            scores_sorted = sorted(scores, key=lambda x: x[2])
            total_neurons = len(scores_sorted)
            num_keep = int(self.keep_ratio * total_neurons)
            keep_set = set((l, n) for l, n, _ in scores_sorted[-num_keep:])

            # Rebuild only selected layers
            for layer_idx in layer_idxs:
                layer = hf.bert.encoder.layer[layer_idx]
                # neurons to keep in this layer
                kept = sorted(n for (l, n) in keep_set if l == layer_idx)
                if not kept:
                    continue
                k = len(kept)
                # rebuild intermediate.dense
                w_int = layer.intermediate.dense.weight.data
                new_int = nn.Linear(self.hidden_size, k,
                                    bias=layer.intermediate.dense.bias is not None)
                new_int.weight.data = w_int[kept]
                if layer.intermediate.dense.bias is not None:
                    new_int.bias.data = layer.intermediate.dense.bias.data[kept]
                layer.intermediate.dense = new_int
                # rebuild output.dense
                w_out = layer.output.dense.weight.data
                new_out = nn.Linear(k, self.hidden_size,
                                    bias=layer.output.dense.bias is not None)
                new_out.weight.data = w_out[:, kept]
                if layer.output.dense.bias is not None:
                    new_out.bias.data = layer.output.dense.bias.data
                layer.output.dense = new_out

            # log removal in this round and cumulative sparsity
            new_total = sum(p.numel() for p in model.parameters())
            removed = self.total_param - new_total
            sparsity = 100.0 * removed / self.total_param
            print(f"   params remaining: {new_total:,} (removed {removed:,}, sparsity {sparsity:.2f}% )")

            self.prune_round += 1

        return loss


In [9]:
trainer = StructuredPruningL1Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=evaluator,

)
trainer.train()
torch.cuda.empty_cache()

# 8. Save the final model
TIME = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
try:
    save_model_name = f"./models/sp_L1_{TIME}"
    torch.save(model, f"{save_model_name}.pt")
    final_output_dir = save_model_name
    model.save_pretrained(final_output_dir)
except:
    print("Saving Failure")

Step,Training Loss,Validation Loss,Nanomsmarco R100 Map,Nanomsmarco R100 Mrr@10,Nanomsmarco R100 Ndcg@10,Nanomsmarco R100 Base Map,Nanomsmarco R100 Base Mrr@10,Nanomsmarco R100 Base Ndcg@10,Nanobeir R100 Mean Map,Nanobeir R100 Mean Mrr@10,Nanobeir R100 Mean Ndcg@10,Nanobeir R100 Mean Base Map,Nanobeir R100 Mean Base Mrr@10,Nanobeir R100 Mean Base Ndcg@10
1,18.651,18.528578,0.431997,0.420524,0.502198,0.489577,0.4775,0.540426,0.431997,0.420524,0.502198,0.489577,0.4775,0.540426
2,18.651,18.508196,0.431997,0.420524,0.502198,0.489577,0.4775,0.540426,0.431997,0.420524,0.502198,0.489577,0.4775,0.540426
3,18.651,18.465065,0.432031,0.420524,0.502198,0.489577,0.4775,0.540426,0.432031,0.420524,0.502198,0.489577,0.4775,0.540426
4,18.651,18.40262,0.432027,0.420524,0.502198,0.489577,0.4775,0.540426,0.432027,0.420524,0.502198,0.489577,0.4775,0.540426
5,18.651,18.321585,0.432027,0.420524,0.502198,0.489577,0.4775,0.540426,0.432027,0.420524,0.502198,0.489577,0.4775,0.540426
6,18.651,18.217079,0.432085,0.420524,0.502198,0.489577,0.4775,0.540426,0.432085,0.420524,0.502198,0.489577,0.4775,0.540426
7,18.651,18.097054,0.432785,0.42119,0.50281,0.489577,0.4775,0.540426,0.432785,0.42119,0.50281,0.489577,0.4775,0.540426


KeyboardInterrupt: 

## Quantize INT-8

In [40]:
model = CrossEncoder(model_name, num_labels=1)
model.save_pretrained("./models/model_fp32")

In [41]:
model = CrossEncoder("tomaarsen/reranker-MiniLM-L12-gooaq-bce", backend="onnx")



*************** EP Error ***************
EP Error /onnxruntime_src/onnxruntime/python/onnxruntime_pybind_state.cc:505 void onnxruntime::python::RegisterTensorRTPluginsAsCustomOps(PySessionOptions&, const onnxruntime::ProviderOptions&) Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported.
 when using ['TensorrtExecutionProvider', 'CUDAExecutionProvider']
Falling back to ['CUDAExecutionProvider', 'CPUExecutionProvider'] and retrying.
****************************************




In [42]:
export_dynamic_quantized_onnx_model(model, "avx512_vnni", "./models/model_fp32")
# adjust accordingly
model_int8 = CrossEncoder("./models/model_fp32",
                          backend="onnx",
                          model_kwargs={"file_name": "onnx/model_qint8_avx512_vnni.onnx"},)

The ONNX file model_qint8_avx512_vnni.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


*************** EP Error ***************
EP Error /onnxruntime_src/onnxruntime/python/onnxruntime_pybind_state.cc:505 void onnxruntime::python::RegisterTensorRTPluginsAsCustomOps(PySessionOptions&, const onnxruntime::ProviderOptions&) Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported.
 when using ['TensorrtExecutionProvider', 'CUDAExecutionProvider']
Falling back to ['CUDAExecutionProvider', 'CPUExecutionProvider'] and retrying.
****************************************


In [43]:
model_int8.to("cuda")
eval_result = profiling_evaluator(model=model_int8)
print("=" * 15 + "INT_8" + "=" * 15)
eval_result


--------------------------------------------------
Evaluator wall time: 100.19 seconds
Peak *new* GPU memory: 0.18 MB
--------------------------------------------------



{'NanoMSMARCO_R100_map': 0.4125383393409709,
 'NanoMSMARCO_R100_mrr@10': 0.39785714285714285,
 'NanoMSMARCO_R100_ndcg@10': 0.4764925075110537,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.4125383393409709,
 'NanoBEIR_R100_mean_mrr@10': 0.39785714285714285,
 'NanoBEIR_R100_mean_ndcg@10': 0.4764925075110537,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

In [20]:
model_int8.save_pretrained("./models/model_int8")

## Profile Pruning Result

In [44]:
org_model = CrossEncoder("tomaarsen/reranker-MiniLM-L12-gooaq-bce")

In [66]:
eval_result = profiling_evaluator(model=org_model)
print("-----------------------------------")
eval_result


--------------------------------------------------
Evaluator wall time: 3.98 seconds
Peak *new* GPU memory: 132.96 MB
--------------------------------------------------

-----------------------------------


{'NanoMSMARCO_R100_map': 0.4319972197140892,
 'NanoMSMARCO_R100_mrr@10': 0.4205238095238096,
 'NanoMSMARCO_R100_ndcg@10': 0.5021975259243594,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.4319972197140892,
 'NanoBEIR_R100_mean_mrr@10': 0.4205238095238096,
 'NanoBEIR_R100_mean_ndcg@10': 0.5021975259243594,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

In [11]:
model_pruned = CrossEncoder("./prune_result")

In [12]:
eval_result = profiling_evaluator(model=model_pruned)
eval_result


--------------------------------------------------
Evaluator wall time: 3.88 seconds
Peak *new* GPU memory: 132.96 MB
--------------------------------------------------



{'NanoMSMARCO_R100_map': 0.3370697432658286,
 'NanoMSMARCO_R100_mrr@10': 0.31622222222222224,
 'NanoMSMARCO_R100_ndcg@10': 0.36475284664158353,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.3370697432658286,
 'NanoBEIR_R100_mean_mrr@10': 0.31622222222222224,
 'NanoBEIR_R100_mean_ndcg@10': 0.36475284664158353,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

In [13]:
model_pruned = CrossEncoder("./models/unstructured_Lp")
eval_result = profiling_evaluator(model=model_pruned)
eval_result


--------------------------------------------------
Evaluator wall time: 3.96 seconds
Peak *new* GPU memory: 132.96 MB
--------------------------------------------------



{'NanoMSMARCO_R100_map': 0.3370697432658286,
 'NanoMSMARCO_R100_mrr@10': 0.31622222222222224,
 'NanoMSMARCO_R100_ndcg@10': 0.36475284664158353,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.3370697432658286,
 'NanoBEIR_R100_mean_mrr@10': 0.31622222222222224,
 'NanoBEIR_R100_mean_ndcg@10': 0.36475284664158353,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

In [18]:
pruned_cross: CrossEncoder = torch.load("./models/pruned_L1_v3.pt", weights_only=False)
results = profiling_evaluator(pruned_cross)
results


--------------------------------------------------
Evaluator wall time: 3.90 seconds
Peak *new* GPU memory: 128.96 MB
--------------------------------------------------



{'NanoMSMARCO_R100_map': 0.41964578138168795,
 'NanoMSMARCO_R100_mrr@10': 0.40404761904761904,
 'NanoMSMARCO_R100_ndcg@10': 0.47975764876280935,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.41964578138168795,
 'NanoBEIR_R100_mean_mrr@10': 0.40404761904761904,
 'NanoBEIR_R100_mean_ndcg@10': 0.47975764876280935,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

In [36]:
pruned_cross: CrossEncoder = torch.load("./models/sp_2025-04-30 21:11:37.pt", weights_only=False)
results = profiling_evaluator(pruned_cross)
results


--------------------------------------------------
Evaluator wall time: 3.96 seconds
Peak *new* GPU memory: 122.17 MB
--------------------------------------------------



{'NanoMSMARCO_R100_map': 0.49662572356799695,
 'NanoMSMARCO_R100_mrr@10': 0.4818571428571428,
 'NanoMSMARCO_R100_ndcg@10': 0.534606804298613,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.49662572356799695,
 'NanoBEIR_R100_mean_mrr@10': 0.4818571428571428,
 'NanoBEIR_R100_mean_ndcg@10': 0.534606804298613,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

In [19]:
pruned_cross: CrossEncoder = torch.load("./models/sp_L1_2025-04-30 19:14:17.pt", weights_only=False)
results = profiling_evaluator(pruned_cross)
results


--------------------------------------------------
Evaluator wall time: 3.92 seconds
Peak *new* GPU memory: 128.96 MB
--------------------------------------------------



{'NanoMSMARCO_R100_map': 0.1919384794449809,
 'NanoMSMARCO_R100_mrr@10': 0.17188888888888887,
 'NanoMSMARCO_R100_ndcg@10': 0.23948594355135802,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.1919384794449809,
 'NanoBEIR_R100_mean_mrr@10': 0.17188888888888887,
 'NanoBEIR_R100_mean_ndcg@10': 0.23948594355135802,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

In [45]:
model_fp16 = copy.deepcopy(org_model)
model_bf16 = copy.deepcopy(org_model)

model_fp16 = model_fp16.half()
model_bf16 = model_bf16.to(torch.bfloat16)

In [52]:
eval_result = profiling_evaluator(model=model_fp16)
print("-----------------------------------")
eval_result


--------------------------------------------------
Evaluator wall time: 3.23 seconds
Peak *new* GPU memory: 65.57 MB
--------------------------------------------------

-----------------------------------


{'NanoMSMARCO_R100_map': 0.4269972197140892,
 'NanoMSMARCO_R100_mrr@10': 0.4271904761904761,
 'NanoMSMARCO_R100_ndcg@10': 0.5045274715625427,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.4269972197140892,
 'NanoBEIR_R100_mean_mrr@10': 0.4271904761904761,
 'NanoBEIR_R100_mean_ndcg@10': 0.5045274715625427,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}

In [54]:
eval_result = profiling_evaluator(model=model_bf16)
print("-----------------------------------")
eval_result


--------------------------------------------------
Evaluator wall time: 3.14 seconds
Peak *new* GPU memory: 65.57 MB
--------------------------------------------------

-----------------------------------


{'NanoMSMARCO_R100_map': 0.33455546869093944,
 'NanoMSMARCO_R100_mrr@10': 0.38126984126984126,
 'NanoMSMARCO_R100_ndcg@10': 0.47764309146703093,
 'NanoMSMARCO_R100_base_map': 0.4895766320756843,
 'NanoMSMARCO_R100_base_mrr@10': 0.4775,
 'NanoMSMARCO_R100_base_ndcg@10': 0.5404259879670522,
 'NanoBEIR_R100_mean_map': 0.33455546869093944,
 'NanoBEIR_R100_mean_mrr@10': 0.38126984126984126,
 'NanoBEIR_R100_mean_ndcg@10': 0.47764309146703093,
 'NanoBEIR_R100_mean_base_map': 0.4895766320756843,
 'NanoBEIR_R100_mean_base_mrr@10': 0.4775,
 'NanoBEIR_R100_mean_base_ndcg@10': 0.5404259879670522}