# Overview

We evaluated LLMs outputs by using SemScore in [Arena Leaderbaord with SemScore](https://www.kaggle.com/code/aisuko/arena-leaderboard-with-semscore) which is computed sentences similarity of the output of the LLMs. In this notebook, we we going to evaluate models and dataset using SemScore after and during training

In [1]:
%%capture
!pip install transformers==4.38.2
!pip install datasets==2.18.0

In [2]:
import os
import torch
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

os.environ["MODEL"]="sentence-transformers/all-mpnet-base-v2"
os.environ["DATASET"]="g-ronimo/oasst2_top1_en"

torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading Model and Calculating Semantic Similarity

Here we will use `sentence-transformers/all-mpnet-base-v2`.

In [3]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# mean pooling take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings=model_output[0] # first element to model_output contains all token embeddings
    input_mask_expanded=attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings*input_mask_expanded,1)/torch.clamp(input_mask_expanded.sum(1), min=1e-9)

tokenizer=AutoTokenizer.from_pretrained(os.getenv("MODEL"))

encoder=AutoModel.from_pretrained(os.getenv("MODEL"))
encoder.max_seq_length=5000
encoder.to("cuda")

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [4]:
torch.manual_seed(2024)

sentences=["apple","orange","car"]

encoded_input=tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to('cuda')

# compute token embeddings
with torch.no_grad():
    model_output=encoder(**encoded_input)

sentence_embeddings=mean_pooling(model_output, encoded_input['attention_mask'])

# normalize embeddings
sentence_embeddings=F.normalize(sentence_embeddings, p=2, dim=1)

# To obtain the cosine similarity, we calculate the dot product of the normalized vectors
for i in range(0, len(sentences)):
    print(sentences[0], sentences[i], (sentence_embeddings[0] @ sentence_embeddings[i]).item())

apple apple 0.9999998807907104
apple orange 0.40115001797676086
apple car 0.31371933221817017


# Evaluating Model on Any Dataset

In [5]:
import torch.nn as nn
from tqdm import tqdm

def emb_mean_pooling(embedding, attention_mask):
    token_embeddings=embedding[0]
    input_mask_expanded=attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings*input_mask_expanded,1)/torch.clamp(input_mask_expanded.sum(1),min=1e-9)


def get_embeddings(sentences, b_size=8):
    embeddings=torch.tensor([], device='cuda')
    batches=[sentences[i:i+b_size] for i in range(0, len(sentences), b_size)]
    
    for s in batches:
        encoded_input=tokenizer(s, padding=True, truncation=True, return_tensors='pt').to('cuda')
        with torch.no_grad():
            output=encoder(**encoded_input)
        batch_embeddings=emb_mean_pooling(output, encoded_input['attention_mask'])
        embeddings=torch.cat((embeddings, batch_embeddings), dim=0)
    return embeddings


def nn_cos(emd1,emd2=None):
    cos=nn.CosineSimilarity(dim=1, eps=1e-6)
    if emd2 is None:
        num_samples=emd1.shape[0]
        similarities=[[0 for i in range(num_samples)] for f in range(num_samples)]
        for row in tqdm(range(num_samples)):
            similarities[row][0:row+1]=cos(emd1[row].repeat(row+1,1), emd1[0:row+1]).tolist()
        return similarities
    else:
        return cos(emd_ans, emd_ref).tolist()


words=['lemon', 'orange', 'car','money']
embds=get_embeddings(words)
similarity=nn_cos(embds)
print(similarity)

100%|██████████| 4/4 [00:00<00:00, 1523.68it/s]

[[1.0, 0, 0, 0], [0.5340331792831421, 1.0, 0, 0], [0.2909420132637024, 0.30266571044921875, 1.0, 0], [0.22813823819160461, 0.19676585495471954, 0.3411044478416443, 1.0]]





In [6]:
from datasets import load_dataset

dataset=load_dataset(os.getenv("DATASET"), split="train")
dataset=dataset.shuffle(seed=42).select(range(10))
dataset

Downloading readme:   0%|          | 0.00/604 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 5.66M/5.66M [00:00<00:00, 9.75MB/s]


Generating train split:   0%|          | 0/5419 [00:00<?, ? examples/s]

Dataset({
    features: ['conversation'],
    num_rows: 10
})

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

os.environ["TINYLLAMA"]="TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer=AutoTokenizer.from_pretrained(os.getenv("TINYLLAMA"), use_fast=False)
model=AutoModelForCausalLM.from_pretrained(os.getenv("TINYLLAMA"), device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
model.device

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

device(type='cuda', index=0)

We use TinyLlama to generate answers, and create a list to save the reference answers. We will calcualte the semantic similarity of these two answers.

In [8]:
answers_ref, answers_pred = [], []

for conversation in dataset["conversation"]:
    for i, msg in enumerate(conversation):
        if msg["role"] == "assistant": continue
        partial_conv = conversation[0:i+1]
        prompt_tok = tokenizer.apply_chat_template(
            partial_conv, 
            add_generation_prompt = True, 
            return_tensors = "pt"
        ).to("cuda")
        answer_tok = model.generate(
            prompt_tok, 
            eos_token_id = tokenizer.eos_token_id,
            max_new_tokens = 2000, 
        )
        answer = tokenizer.decode(
            answer_tok[0][len(prompt_tok[0]):], 
            skip_special_tokens = True
        )
        answers_pred.append(answer) 
        answers_ref.append(conversation[i+1]["content"])

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3

In [None]:
from statistics import mean

emd_ans=get_embeddings(answers_pred)
emd_ref=get_embeddings(answers_ref)

similarities=nn_cos(emd_ans, emd_ref)

for i, result in enumerate(similarities):
    print(f"question {i}:{similarities[i]}")

print("avg, similarity pred. vs. ref.:", mean(similarities), "\n")

# Acknowledge

* https://medium.com/@geronimo7/semscore-evaluating-llms-with-semantic-similarity-2abf5c2fadb9
* https://github.com/geronimi73/semscore/blob/main/nb_blog_part2_eval-trained.ipynb