# Overview

We are going to implement an Arena leaderboard with SemScore.

In [1]:
%%capture
!pip install transformers==4.38.2
!pip install accelerate==0.27.2
!pip install datasets==2.18.0
!pip install peft==0.9.0
!pip install bitsandbytes==0.42.0
!pip install sentence-transformers==2.5.1

In [2]:
import os
import torch
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Log in to the Hugging Face Hub with the token held by the Kaggle secrets client,
# so the token never appears in the notebook itself.
user_secrets = UserSecretsClient()
login(
    token=user_secrets.get_secret("HUGGINGFACE_TOKEN"),
)

# Notebook-wide settings are shared through environment variables.
# MODEL_NAME is left empty here; DATASET names the dataset loaded further below.
os.environ.update(
    {
        "MODEL_NAME": "",
        "DATASET": "lmsys/chatbot_arena_conversations",
    }
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading the Dataset

In [3]:
from datasets import load_dataset

# Load the "train" split of the dataset named by the DATASET environment variable.
dataset_name = os.environ.get("DATASET")
dataset = load_dataset(dataset_name, split="train")
dataset

Downloading readme:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 41.6M/41.6M [00:16<00:00, 2.47MB/s]


Generating train split:   0%|          | 0/33000 [00:00<?, ? examples/s]

Dataset({
    features: ['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn', 'anony', 'language', 'tstamp', 'openai_moderation', 'toxic_chat_tag'],
    num_rows: 33000
})

# Conversations per Model

In [6]:
# Count how many conversations each model took part in, on either side (a or b).
model_conv_count = {}
for row in dataset:
    for side in ("model_a", "model_b"):
        name = row[side]
        # dict.get supplies 0 the first time a model name is seen.
        model_conv_count[name] = model_conv_count.get(name, 0) + 1
model_conv_count

{'chatglm-6b': 3322,
 'koala-13b': 5573,
 'oasst-pythia-12b': 4890,
 'alpaca-13b': 4453,
 'vicuna-13b': 5931,
 'dolly-v2-12b': 2786,
 'stablelm-tuned-alpha-7b': 2795,
 'llama-13b': 2009,
 'fastchat-t5-3b': 3210,
 'gpt-3.5-turbo': 4654,
 'gpt-4': 4217,
 'RWKV-4-Raven-14B': 3682,
 'claude-v1': 3927,
 'mpt-7b-chat': 2854,
 'palm-2': 2955,
 'claude-instant-v1': 2626,
 'vicuna-7b': 2869,
 'wizardlm-13b': 1116,
 'gpt4all-13b-snoozy': 1097,
 'guanaco-33b': 1034}

# Extracting Conversations in Which GPT-4 Gave One of the Two Answers

In [12]:
from tqdm import tqdm

reference_model = "gpt-4"
# Maps each non-reference model name to two parallel lists:
#   "answers_model" - that model's assistant replies
#   "answers_ref"   - the reference model's replies from the same conversations
answers = {}

for judgement in tqdm(dataset):
    # Compare model names exactly. Concatenating both names and using `in` would be
    # a substring test: it would also accept any model whose name merely contains
    # the reference name, and the side selection below would then wrongly treat
    # side "b" as the reference.
    if reference_model not in (judgement["model_a"], judgement["model_b"]):
        continue

    # Work out which side ("a" or "b") the reference model answered on.
    if judgement["model_a"] == reference_model:
        reference_label, other_label = "a", "b"
    else:
        reference_label, other_label = "b", "a"

    # Keep only the assistant turns of each conversation.
    answers_ref = [
        msg["content"]
        for msg in judgement[f"conversation_{reference_label}"]
        if msg["role"] == "assistant"
    ]
    answers_other = [
        msg["content"]
        for msg in judgement[f"conversation_{other_label}"]
        if msg["role"] == "assistant"
    ]

    # Store the answers in the answers dict, keyed by the non-reference model.
    other_model = judgement[f"model_{other_label}"]
    if other_model not in answers:
        answers[other_model] = dict(answers_model=[], answers_ref=[])

    # Append pairwise so both lists always stay index-aligned, even if one
    # conversation were to contain more assistant turns than the other.
    for answer_other, answer_ref in zip(answers_other, answers_ref):
        answers[other_model]["answers_model"].append(answer_other)
        answers[other_model]["answers_ref"].append(answer_ref)


100%|██████████| 33000/33000 [00:08<00:00, 3738.18it/s]


In [15]:
import pandas as pd

# Summarise how many answers were collected per model, largest first.
data = {
    "Model": list(answers),
    "num_answers": [len(entry["answers_model"]) for entry in answers.values()],
}

df = (
    pd.DataFrame(data)
    .sort_values(by=["num_answers"], ascending=False)
)
df.head()

Unnamed: 0,Model,num_answers
0,vicuna-13b,448
3,gpt-3.5-turbo,436
2,koala-13b,417
1,oasst-pythia-12b,395
10,claude-v1,366


In [None]:
def 