# Overview

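We are going to implement an Arena leaderboard with SemScore: each model's answers are compared with GPT-4's answers from the same conversations using the cosine similarity of their sentence embeddings.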

In [1]:
%%capture
!pip install transformers==4.38.2
!pip install accelerate==0.27.2
!pip install datasets==2.18.0
!pip install peft==0.9.0
!pip install bitsandbytes==0.42.0
!pip install sentence-transformers==2.5.1

In [2]:
import os
import torch
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Fetch the Hugging Face token from the Kaggle secrets client instead of hardcoding it,
# then log in to the Hugging Face Hub with it.
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

# Notebook configuration, kept in environment variables.
# NOTE(review): MODEL_NAME is set to an empty string and is not read in any visible cell — confirm it is still needed.
os.environ["MODEL_NAME"] = ""
os.environ["DATASET"]="lmsys/chatbot_arena_conversations"

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading the Dataset

In [3]:
from datasets import load_dataset

# Load the "train" split of the dataset named by the DATASET environment variable.
dataset=load_dataset(os.getenv("DATASET"), split="train")
# Bare expression: shows the dataset summary as the cell output.
dataset

Downloading readme:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 41.6M/41.6M [00:05<00:00, 7.05MB/s]


Generating train split:   0%|          | 0/33000 [00:00<?, ? examples/s]

Dataset({
    features: ['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn', 'anony', 'language', 'tstamp', 'openai_moderation', 'toxic_chat_tag'],
    num_rows: 33000
})

# Conversations per Model

In [4]:
# Tally how many times each model appears in the dataset, counting both the
# "model_a" and the "model_b" slot of every row.
model_conv_count={}
for row in dataset:
    for side in ("model_a","model_b"):
        name=row[side]
        model_conv_count[name]=model_conv_count.get(name, 0)+1
model_conv_count

{'chatglm-6b': 3322,
 'koala-13b': 5573,
 'oasst-pythia-12b': 4890,
 'alpaca-13b': 4453,
 'vicuna-13b': 5931,
 'dolly-v2-12b': 2786,
 'stablelm-tuned-alpha-7b': 2795,
 'llama-13b': 2009,
 'fastchat-t5-3b': 3210,
 'gpt-3.5-turbo': 4654,
 'gpt-4': 4217,
 'RWKV-4-Raven-14B': 3682,
 'claude-v1': 3927,
 'mpt-7b-chat': 2854,
 'palm-2': 2955,
 'claude-instant-v1': 2626,
 'vicuna-7b': 2869,
 'wizardlm-13b': 1116,
 'gpt4all-13b-snoozy': 1097,
 'guanaco-33b': 1034}

# Extracting Conversations in Which GPT-4 Gave One of the Two Answers

In [5]:
from tqdm import tqdm

# Model whose answers serve as the reference for every comparison.
reference_model="gpt-4"
# Maps each other model's name to two lists filled in the same order:
#   "answers_model" - that model's assistant answers
#   "answers_ref"   - the reference model's assistant answers from the same rows
answers={}

for judgement in tqdm(dataset):
    # Compare whole model names. Testing for a substring of the two concatenated
    # names would also accept names that merely contain the reference name
    # (e.g. "gpt-4-turbo") or a match spanning the join between the two names,
    # and the wrong side would then be treated as the reference.
    if reference_model not in (judgement["model_a"], judgement["model_b"]):
        continue
    # get answers for the reference model and the other model
    reference_label, other_label=("a","b") if judgement["model_a"]==reference_model else ("b","a")
    answers_ref=[msg["content"] for msg in judgement[f"conversation_{reference_label}"] if msg["role"]=="assistant"]
    answers_other=[msg["content"] for msg in judgement[f"conversation_{other_label}"] if msg["role"]=="assistant"]

    # store answers in the answers dict, keyed by the non-reference model
    other_model=judgement[f"model_{other_label}"]
    if other_model not in answers:
        answers[other_model]=dict(answers_model=[], answers_ref=[])
    answers[other_model]["answers_model"].extend(answers_other)
    answers[other_model]["answers_ref"].extend(answers_ref)


100%|██████████| 33000/33000 [00:10<00:00, 3213.21it/s]


In [6]:
import pandas as pd

# Summarise how many answers were collected per model, largest first.
data={
    "Model": answers.keys(),
    "num_answers": [len(entry["answers_model"]) for entry in answers.values()],
}

df=pd.DataFrame(data).sort_values(by="num_answers", ascending=False)
df.head()

Unnamed: 0,Model,num_answers
0,vicuna-13b,448
3,gpt-3.5-turbo,436
2,koala-13b,417
1,oasst-pythia-12b,395
10,claude-v1,366


In [9]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model on the CUDA device.
model=SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
# Cap the model's maximum sequence length at 200.
# NOTE(review): longer inputs are presumably truncated by the library — confirm.
model.max_seq_length=200
# Bare expression: shows which device the model is on.
model.device

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

device(type='cuda', index=0)

In [22]:
import torch
from sentence_transformers import util
from sentence_transformers.util import normalize_embeddings


def get_embeddings(sentences):
    """Encode `sentences` with the notebook-level `model` and return the result as a tensor.

    The embeddings are returned exactly as `model.encode` produces them; no
    extra normalisation is applied here.
    """
    return model.encode(sentences, convert_to_tensor=True)


def get_similarities(emd1, emd2):
    """Return the cosine-similarity scores of `emd1` against `emd2`, as computed by `util.pytorch_cos_sim`."""
    return util.pytorch_cos_sim(emd1, emd2)


def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings over dim 1, counting only unmasked positions.

    Needs a tokenizer: `attention_mask` is the mask that comes with the
    tokenized input.

    Args:
        model_output: indexable whose first element holds all token embeddings.
        attention_mask: mask that is expanded to the shape of the token
            embeddings; positions with value 0 do not contribute.

    Returns:
        The mask-weighted sum of the token embeddings divided by the
        (clamped) mask sum.
    """
    token_embeddings=model_output[0]
    mask=attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum=(token_embeddings*mask).sum(dim=1)
    # Clamp so a fully masked row divides by a tiny value instead of zero.
    token_count=mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum/token_count


# Sanity check on one answer pair: embed the answer at index 6 for vicuna-13b
# and the reference answer at the same index, then print their similarity.
vicuna_answers=answers["vicuna-13b"]
emd_ans=get_embeddings(vicuna_answers["answers_model"][6])
emd_ref=get_embeddings(vicuna_answers["answers_ref"][6])

similarities=get_similarities(emd_ans, emd_ref)
print(similarities)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[0.9201]], device='cuda:0')


In [47]:
from statistics import mean
import torch.nn as nn

def nn_cos(emd1,emd2):
    """Return the cosine similarities between `emd1` and `emd2` as a Python list.

    The similarity is taken along dim 1, so for two 2-D tensors of the same
    shape entry i compares row i of `emd1` with row i of `emd2`.

    Args:
        emd1: first tensor of embeddings.
        emd2: second tensor of embeddings.

    Returns:
        The similarity values converted with `.tolist()`.
    """
    cos=nn.CosineSimilarity(dim=1, eps=1e-6)
    # Compare the arguments themselves rather than the notebook globals
    # `emd_ans` / `emd_ref`, so the function works for any pair of inputs.
    return cos(emd1, emd2).tolist()

# Embed and normalise every vicuna-13b answer and the reference answer stored at
# the same index, then print the mean of the per-pair cosine similarities.
vicuna_answers=answers["vicuna-13b"]
emd_ans=normalize_embeddings(get_embeddings(vicuna_answers["answers_model"]))
emd_ref=normalize_embeddings(get_embeddings(vicuna_answers["answers_ref"]))

print(mean(nn_cos(emd_ans, emd_ref)))

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

0.7271350175724365


In [48]:
from sentence_transformers.util import cos_sim

# Similarity matrix between all answer embeddings and all reference embeddings.
# NOTE(review): presumably entry [i, j] compares answer i with reference j, so the
# diagonal holds the matched pairs — confirm against the cos_sim documentation.
cos_scores=cos_sim(emd_ans, emd_ref)
print(cos_scores)

tensor([[ 0.8245,  0.0437,  0.0380,  ...,  0.0598,  0.0732, -0.0286],
        [ 0.0250,  0.9121,  0.3769,  ...,  0.0310,  0.0327,  0.0995],
        [-0.0018,  0.2686,  0.7807,  ...,  0.0469,  0.1310,  0.1541],
        ...,
        [ 0.0264,  0.0147,  0.0789,  ...,  0.9540,  0.0722,  0.0518],
        [ 0.0917,  0.0592,  0.1052,  ...,  0.0482,  0.7334,  0.0316],
        [-0.0721,  0.0493,  0.1054,  ...,  0.0535,  0.0528,  0.9089]],
       device='cuda:0')


# Calculating Similarity for all Answers

In [None]:
# Names of all models that were compared against the reference model.
models=list(answers.keys())
# Collects the similarity results; still empty because the loop below is a placeholder.
models_similarites=[]

# The loop variable must not be called `model`: that name is bound to the
# SentenceTransformer that get_embeddings() reads, and rebinding it to a string
# would break every later call to get_embeddings().
for model_name in tqdm(models):
    # TODO: compute the similarity for model_name and append it to models_similarites.
    pass