In [1]:
import geoopt as g
import torch
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from hierarchy_transformers import HierarchyTransformer
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, EvalPrediction


In [2]:
# semantic text similarity
sts_b = load_dataset("glue", "stsb")

In [3]:
# tokenizer for the model
tokenizer_HiT = AutoTokenizer.from_pretrained("Hierarchy-Transformers/HiT-MiniLM-L12-WordNetNoun")
tokenizer_allMini = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L12-v2")

In [4]:
# Tokenize the datasets

def process_samples(samples, tokenizer):
    # Tokenize the text via the tokenizer
    return tokenizer(samples["sentence1"], samples["sentence2"], padding=True, truncation=True, return_tensors="pt")

sts_b_HT = sts_b.map(lambda x: process_samples(x, tokenizer_HiT), batched=True)
sts_b_LM = sts_b.map(lambda x: process_samples(x, tokenizer_allMini), batched=True)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [5]:
# Convert to PyTorch tensors

def format_to_torch(dataset, label_key): 
    dataset = dataset.remove_columns(["idx"])
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', label_key])
    return dataset

def normalize_labels(example):
	example['label'] = example['label'] / 5.0
	return example

In [6]:
sts_b_LM_formatted = format_to_torch(sts_b_LM['validation'], "label").map(normalize_labels)
sts_b_HT_formatted = format_to_torch(sts_b_HT['validation'], "label").map(normalize_labels)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [7]:
# Load the models

hit = HierarchyTransformer.from_pretrained("Hierarchy-Transformers/HiT-MiniLM-L12-WordNetNoun")
elm = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L12-v2")

In [8]:
elm.to('mps')
hit.to('mps')

HierarchyTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [25]:
def compute_metrics(evaluation_prediction):
    logits, labels = evaluation_prediction
    predictions = np.argmax(logits, axis=1) # classification
    accuracy = np.mean(predictions == labels)
    return {"accuracy": accuracy}

def evaluate_model(model, dataset, batch_size=16):
    dataloader = DataLoader(dataset, batch_size=batch_size)
    model.eval()
    device = torch.device("mps" if torch.mps.is_available() else "cpu")

    model.to(device)

    all_logits = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits

            all_logits.append(logits)
            all_labels.append(batch["labels"]).cpu()
    
    all_logits = torch.cat(all_logits, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

In [26]:
# Evaluate Hyperbolic Model
print("Hyperbolic Model Performance:")
print("STS-B:", evaluate_model(hit, sts_b_HT_formatted))

# Evaluate Euclidean Model
print("\nEuclidean Model Performance:")
print("STS-B:", evaluate_model(elm, sts_b_LM_formatted))

Hyperbolic Model Performance:


TypeError: SentenceTransformer.forward() missing 1 required positional argument: 'input'

In [27]:
from torch.utils.data import DataLoader
import numpy as np
from sentence_transformers import SentenceTransformer

def evaluate_model(model, dataset, batch_size=16, is_regression=False):
    """
    Evaluate a SentenceTransformer model on a dataset.
    - `is_regression`: Set to True for STS-B where labels are continuous scores.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size)

    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            sentences1 = batch["sentence1"]
            sentences2 = batch["sentence2"]
            labels = batch["labels"].numpy()  # Extract labels

            # Compute sentence embeddings
            embeddings1 = model.encode(sentences1, convert_to_tensor=True)
            embeddings2 = model.encode(sentences2, convert_to_tensor=True)

            # Compute cosine similarity
            similarity_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2).cpu().numpy()

            all_predictions.extend(similarity_scores)
            all_labels.extend(labels)

    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)

    # Compute evaluation metric
    if is_regression:  # STS-B task (regression)
        from scipy.stats import pearsonr
        metric = {"pearson_correlation": pearsonr(all_predictions, all_labels)[0]}
    else:  # Classification task (SNLI, QQP)
        predictions = (all_predictions > 0.5).astype(int)  # Binary classification threshold
        accuracy = np.mean(predictions == all_labels)
        metric = {"accuracy": accuracy}

    return metric

In [33]:
sts_b['validation']['label']

[5.0,
 4.75,
 5.0,
 2.4000000953674316,
 2.75,
 2.615000009536743,
 5.0,
 2.3329999446868896,
 3.75,
 5.0,
 3.200000047683716,
 1.5829999446868896,
 5.0,
 5.0,
 4.908999919891357,
 0.800000011920929,
 2.4000000953674316,
 5.0,
 4.0,
 0.6359999775886536,
 3.0,
 1.7139999866485596,
 3.200000047683716,
 2.1670000553131104,
 1.0,
 1.9170000553131104,
 4.25,
 3.0,
 1.0,
 0.6000000238418579,
 2.5999999046325684,
 5.0,
 4.599999904632568,
 5.0,
 4.800000190734863,
 3.799999952316284,
 5.0,
 5.0,
 4.199999809265137,
 1.399999976158142,
 3.5999999046325684,
 2.799999952316284,
 1.600000023841858,
 3.0,
 1.399999976158142,
 0.25,
 0.25,
 0.0,
 4.0,
 4.5,
 0.5,
 3.799999952316284,
 4.800000190734863,
 5.0,
 0.25,
 1.2000000476837158,
 0.6000000238418579,
 0.800000011920929,
 3.799999952316284,
 0.0,
 3.5,
 4.5,
 2.799999952316284,
 3.799999952316284,
 3.799999952316284,
 0.0,
 4.0,
 4.25,
 2.812000036239624,
 4.25,
 3.0,
 1.0,
 3.75,
 0.0,
 0.4000000059604645,
 4.0,
 2.799999952316284,
 3.75,
 1.

In [36]:
from sentence_transformers import SentenceTransformer

# Convert dataset to proper format
def prepare_sentence_transformer_dataset(dataset):
    return [{"sentence1": s1, "sentence2": s2, "labels": label} 
            for s1, s2, label in zip(dataset["sentence1"], dataset["sentence2"], dataset["label"])]

# Prepare datasets
sts_b_data = prepare_sentence_transformer_dataset(sts_b["validation"])

# Load SentenceTransformer models
hit_model = SentenceTransformer("Hierarchy-Transformers/HiT-MiniLM-L12-WordNetNoun")
elm_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

# Evaluate models
print("Hyperbolic Model Performance:")
print("STS-B:", evaluate_model(hit_model, sts_b_data, is_regression=True))

print("\nEuclidean Model Performance:")
print("STS-B:", evaluate_model(elm_model, sts_b_data, is_regression=True))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Hyperbolic Model Performance:
STS-B: {'pearson_correlation': np.float64(0.7610805140114083)}

Euclidean Model Performance:
STS-B: {'pearson_correlation': np.float64(0.876909302337823)}
