- Training notebook: https://www.kaggle.com/code/ravaghi/wsdm-cup-gemma-2-9b-4-bit-qlora-training

# Imports and configs

In [1]:
!pip install accelerate peft bitsandbytes transformers trl unsloth seaborn 
!pip install --upgrade 'optree>=0.13.0'



In [87]:
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.base import clone
from concurrent.futures import ThreadPoolExecutor
from timeit import default_timer as timer
from peft import PeftModel
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from scipy.special import logit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import joblib
import torch
import json
import gc
import os

warnings.filterwarnings('ignore')

In [3]:
import kagglehub

gemma_2_9b_4bit_it_unsloth_transformers_default_1_path = kagglehub.model_download('leimeng46/gemma-2-9b-4bit-it-unsloth/Transformers/default/1')
wsdm_cup_gemma_2_9b_4_bit_qlora_path = kagglehub.dataset_download('ravaghi/wsdm-cup-gemma-2-9b-4-bit-qlora')

In [4]:
class CFG:
    train_path = '../data/train.parquet'
    test_path = '../data/train.parquet'
    sample_sub_path = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv'

    data_path = '/kaggle/input/wsdm-cup-gemma-2-9b-4-bit-qlora'

    gemma_dir = gemma_2_9b_4bit_it_unsloth_transformers_default_1_path + "/gemma-2-9b-it-4bit-unsloth_old"
    lora_dir = wsdm_cup_gemma_2_9b_4_bit_qlora_path + "/gemma2-9b-4bit/gemma-2-9b-it-bnb-4bit-3072-8/checkpoint-2900"
    
    max_length = 3072
    batch_size = 4

    target = 'winner'
    n_folds = 5
    seed = 42

    char_vectorizer_params = {
        'analyzer': "char",
        "lowercase": False,
        "max_df": 0.605,
        "max_features": 331,
        "min_df": 0.075,
        "ngram_range": (1, 3),
        "strip_accents": "unicode"
    }

    word_vectorizer_params = {
        "analyzer": "word",
        "lowercase": True,
        "max_df": 0.985,
        "max_features": 769,
        "min_df": 0.01,
        "ngram_range": (1, 2),
        "strip_accents": "unicode"
    }

# Gemma-2 9b 4-bit

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
test = pd.read_parquet(CFG.test_path).fillna('')
train, test = train_test_split(test, test_size=0.2, random_state=1)
val, test = train_test_split(test, test_size=0.5, random_state=1)
train = pd.concat((train, val))

In [7]:
if len(test) > 10_000:
    time_limit = int(3600 * 12) 
else:
    time_limit = int(3600 * 4.75)

## Tokenizing

In [8]:
def tokenize(tokenizer, texts, max_length=CFG.max_length):
    texts = [t for t in texts]

    tokenized = tokenizer(texts, max_length=max_length, truncation=True)
    
    return tokenized['input_ids'], tokenized['attention_mask']

In [9]:
CFG.gemma_dir

'/home/ya.pristalov/.cache/kagglehub/models/leimeng46/gemma-2-9b-4bit-it-unsloth/Transformers/default/1/gemma-2-9b-it-4bit-unsloth_old'

In [10]:
tokenizer = GemmaTokenizerFast.from_pretrained(CFG.gemma_dir)


tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

In [11]:
test

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
7419,271ba404fc25609b8ceb6f37c3375f278c42e2479514ea...,Улучшь сообщение: Добрый вечер! \nЭто Алексей ...,"Добрый вечер!\n\nМеня зовут Алексей Королев, я...","Добрый вечер!\n\nЯ, Алексей Королев, заместите...",model_b,qwen-max-0919,c4ai-aya-expanse-32b,Russian
39112,ce7be103ac8f040accc9b8ec421a6cbf78189c503671df...,i have a notification icon in my webpage. when...,Certainly! To implement a notification system ...,Here's a simple example of how you can achieve...,model_b,qwen2.5-72b-instruct,llama-3.1-405b-instruct-bf16,English
33946,b39725e0a7992eb0c5f34542f960727a4fd306741568fb...,I have a raspberry pi that is booted from USB....,**Hotplugging the SD Card on a Raspberry Pi**\...,"No, you generally **cannot** safely hotplug th...",model_a,llama-3.1-8b-instruct,gemini-1.5-pro-exp-0827,English
35392,bb33f84e8e85da9e41fa271f6210fdffba0400d725a04c...,"Imagine video on YouTube ""Asking people what i...",Here are some imagined YouTube comments relati...,Here are some comments that might appear under...,model_a,gemini-1.5-flash-002,gemma-2-27b-it,English
10254,364c8146a6a236fbdfa34492dba10405a0e448d3062316...,generate a wine description,Please provide some context! To generate a co...,**Wine Description:**\n\n**Name:** Sunset on t...,model_b,gemini-1.5-flash-8b-001,llama-3.1-70b-instruct,English
...,...,...,...,...,...,...,...,...
26506,8bd008cebcfd4a9e504f0dbca5a5664d412ca9b4c4045f...,"""La existencia de diferencias con respecto a c...",La presencia de variaciones en los espectros d...,La presencia de diferencias según el gas utili...,model_a,gpt-4-1106-preview,gpt-4o-2024-08-06,Spanish
4869,198aa3635a6fc9f6a7d069e82cc9cddcbc783e3a0e83e8...,Thế nào là tham nhũng?\n\n Tham nhũng là hà...,"Theo Khoản 1, Điều 353 Bộ luật Hình sự năm 201...",**Câu 1: Thế nào là tham nhũng?**\n\n**Đáp án ...,model_b,claude-3-opus-20240229,o1-mini,Vietnamese
16686,584414c294899b255a9576ade65032610ca1937cffcad4...,Which number is greater: 9.11 or 9.9?\nIs 9.11...,Let's compare the two numbers step by step:\n\...,9.11 is greater than 9.9.,model_a,grok-2-2024-08-13,deepseek-v2.5,English
28451,95fb0a8dabf14619a9564334dc390cbf170a0e09905086...,```\nto order the special call commission for ...,This is a transcript of a Kentucky Fish and Wi...,This transcript details a special call commiss...,model_b,gemini-1.5-pro-002,gemini-1.5-flash-8b-001,English


In [12]:
for col in ['prompt', 'response_a', 'response_b']:
    test[col] = test[col].fillna('')
    text_list = []
    if col == "prompt":
        max_no = 512
        s_no = 255
        e_no = -256
    else:
        max_no = 3072
        s_no = 1535
        e_no = -1536
    for text in tqdm(test[col]):
        encoded = tokenizer(text, return_offsets_mapping=True)
        if len(encoded['input_ids']) > max_no:
            start_idx, end_idx = encoded['offset_mapping'][s_no]
            new_text = text[:end_idx]
            start_idx, end_idx = encoded['offset_mapping'][e_no]
            new_text = new_text + "\n(snip)\n" + text[start_idx:]
            text = new_text
        text_list.append(text)
    test[col] = text_list

100%|██████████| 4844/4844 [00:01<00:00, 3318.35it/s]
100%|██████████| 4844/4844 [00:02<00:00, 1689.52it/s]
100%|██████████| 4844/4844 [00:02<00:00, 1679.58it/s]


In [13]:
data_prompt = pd.DataFrame()
data_response_a = pd.DataFrame()
data_response_b = pd.DataFrame()

data_prompt["id"] = test["id"]
data_response_a["id"] = test["id"]
data_response_b["id"] = test["id"]

data_prompt["input_ids"], data_prompt["attention_mask"] = tokenize(tokenizer, test["prompt"])
data_response_a["input_ids"], data_response_a["attention_mask"] = tokenize(tokenizer, test["response_a"])
data_response_b["input_ids"], data_response_b["attention_mask"] = tokenize(tokenizer, test["response_b"])

data_prompt["length"] = data_prompt["input_ids"].apply(len)
data_response_a["length"] = data_response_a["input_ids"].apply(len)
data_response_b["length"] = data_response_b["input_ids"].apply(len)

# aug_data = pd.DataFrame()
# aug_data["id"] = test["id"]
# # swap response_a & response_b
# aug_data['input_ids'], aug_data['attention_mask'] = tokenize(tokenizer, test["prompt"], test["response_b"], test["response_a"])
# aug_data["length"] = aug_data["input_ids"].apply(len)

In [14]:
data_prompt

Unnamed: 0,id,input_ids,attention_mask,length
7419,271ba404fc25609b8ceb6f37c3375f278c42e2479514ea...,"[2, 235774, 52696, 15243, 162247, 235292, 1345...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",115
39112,ce7be103ac8f040accc9b8ec421a6cbf78189c503671df...,"[2, 235252, 791, 476, 21244, 6480, 575, 970, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",73
33946,b39725e0a7992eb0c5f34542f960727a4fd306741568fb...,"[2, 235285, 791, 476, 79801, 2846, 674, 603, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21
35392,bb33f84e8e85da9e41fa271f6210fdffba0400d725a04c...,"[2, 41911, 3569, 611, 12526, 664, 100617, 1461...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",101
10254,364c8146a6a236fbdfa34492dba10405a0e448d3062316...,"[2, 18506, 476, 10058, 5966, 1]","[1, 1, 1, 1, 1, 1]",6
...,...,...,...,...
26506,8bd008cebcfd4a9e504f0dbca5a5664d412ca9b4c4045f...,"[2, 235281, 2841, 71765, 581, 88614, 632, 3906...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",123
4869,198aa3635a6fc9f6a7d069e82cc9cddcbc783e3a0e83e8...,"[2, 946, 235650, 30976, 5536, 54915, 3555, 893...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",411
16686,584414c294899b255a9576ade65032610ca1937cffcad4...,"[2, 13033, 1758, 603, 6561, 235292, 235248, 23...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",408
28451,95fb0a8dabf14619a9564334dc390cbf170a0e09905086...,"[2, 1917, 108, 511, 2184, 573, 3186, 2409, 112...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",517


## Model

In [15]:
CFG.gemma_dir

'/home/ya.pristalov/.cache/kagglehub/models/leimeng46/gemma-2-9b-4bit-it-unsloth/Transformers/default/1/gemma-2-9b-it-4bit-unsloth_old'

In [16]:
model = Gemma2ForSequenceClassification.from_pretrained(
    CFG.gemma_dir,
    device_map=torch.device("cuda"),
    use_cache=False,
)

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /home/ya.pristalov/.cache/kagglehub/models/leimeng46/gemma-2-9b-4bit-it-unsloth/Transformers/default/1/gemma-2-9b-it-4bit-unsloth_old and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model = PeftModel.from_pretrained(model, CFG.lora_dir)

In [18]:
model.eval()

model.base_model.model.score = torch.nn.Identity()

## Inference

In [19]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size, max_length=CFG.max_length):
    all_embeddings = []
    
    for start_idx in tqdm(range(0, len(df), batch_size)):
        end_idx = min(start_idx + batch_size, len(df))
        tmp = df.iloc[start_idx:end_idx]
        input_ids = tmp["input_ids"].to_list()
        attention_mask = tmp["attention_mask"].to_list()
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {"input_ids": input_ids, "attention_mask": attention_mask},
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        outputs = model(**inputs.to(device))
        
        embeddings = outputs.logits.cpu()
        
        all_embeddings.extend(embeddings.tolist())
    
    return all_embeddings

In [20]:
global_timer = timer()

In [21]:
data_prompt

Unnamed: 0,id,input_ids,attention_mask,length
7419,271ba404fc25609b8ceb6f37c3375f278c42e2479514ea...,"[2, 235774, 52696, 15243, 162247, 235292, 1345...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",115
39112,ce7be103ac8f040accc9b8ec421a6cbf78189c503671df...,"[2, 235252, 791, 476, 21244, 6480, 575, 970, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",73
33946,b39725e0a7992eb0c5f34542f960727a4fd306741568fb...,"[2, 235285, 791, 476, 79801, 2846, 674, 603, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",21
35392,bb33f84e8e85da9e41fa271f6210fdffba0400d725a04c...,"[2, 41911, 3569, 611, 12526, 664, 100617, 1461...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",101
10254,364c8146a6a236fbdfa34492dba10405a0e448d3062316...,"[2, 18506, 476, 10058, 5966, 1]","[1, 1, 1, 1, 1, 1]",6
...,...,...,...,...
26506,8bd008cebcfd4a9e504f0dbca5a5664d412ca9b4c4045f...,"[2, 235281, 2841, 71765, 581, 88614, 632, 3906...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",123
4869,198aa3635a6fc9f6a7d069e82cc9cddcbc783e3a0e83e8...,"[2, 946, 235650, 30976, 5536, 54915, 3555, 893...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",411
16686,584414c294899b255a9576ade65032610ca1937cffcad4...,"[2, 13033, 1758, 603, 6561, 235292, 235248, 23...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",408
28451,95fb0a8dabf14619a9564334dc390cbf170a0e09905086...,"[2, 1917, 108, 511, 2184, 573, 3186, 2409, 112...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",517


In [22]:
data_response_a

Unnamed: 0,id,input_ids,attention_mask,length
7419,271ba404fc25609b8ceb6f37c3375f278c42e2479514ea...,"[2, 97520, 87759, 235341, 109, 230962, 206746,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",165
39112,ce7be103ac8f040accc9b8ec421a6cbf78189c503671df...,"[2, 94638, 235341, 1887, 7133, 476, 21244, 181...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1071
33946,b39725e0a7992eb0c5f34542f960727a4fd306741568fb...,"[2, 688, 18730, 44505, 9599, 573, 12596, 8518,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",520
35392,bb33f84e8e85da9e41fa271f6210fdffba0400d725a04c...,"[2, 4858, 708, 1009, 41483, 12526, 4703, 20081...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",554
10254,364c8146a6a236fbdfa34492dba10405a0e448d3062316...,"[2, 5958, 3658, 1009, 4807, 235341, 139, 1469,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",219
...,...,...,...,...
26506,8bd008cebcfd4a9e504f0dbca5a5664d412ca9b4c4045f...,"[2, 2841, 44313, 581, 221822, 659, 1454, 66693...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",110
4869,198aa3635a6fc9f6a7d069e82cc9cddcbc783e3a0e83e8...,"[2, 43502, 89031, 10591, 235248, 235274, 23526...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",132
16686,584414c294899b255a9576ade65032610ca1937cffcad4...,"[2, 5331, 235303, 235256, 12711, 573, 1378, 59...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",480
28451,95fb0a8dabf14619a9564334dc390cbf170a0e09905086...,"[2, 1596, 603, 476, 44399, 576, 476, 27353, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",676


In [23]:
embeddings_prompt = inference(
    data_prompt,
    model, 
    torch.device('cuda'),
    CFG.batch_size
)

100%|██████████| 1211/1211 [04:53<00:00,  4.13it/s]


In [24]:
embeddings_response_a = inference(
    data_response_a,
    model, 
    torch.device('cuda'),
    CFG.batch_size
)

100%|██████████| 1211/1211 [15:20<00:00,  1.32it/s]


In [25]:
embeddings_response_b = inference(
    data_response_b,
    model, 
    torch.device('cuda'),
    CFG.batch_size
)

100%|██████████| 1211/1211 [15:08<00:00,  1.33it/s]


In [31]:
len(embeddings_response_b), len(embeddings_response_b[0])

(4844, 3584)

In [60]:
test.reset_index(drop=True, inplace=True)
data_prompt.reset_index(drop=True, inplace=True)

In [None]:
emb_dfs = []

for emb_set in [embeddings_prompt, embeddings_response_a, embeddings_response_b]:
    emb_df = pd.DataFrame(emb_set)
    print(emb_df.shape)
    emb_df['id'] = data_prompt['id'].copy()
    print(emb_df['id'])
    print(emb_df.shape)
    print(test.shape)
    print(len(test['id'].unique()))
    emb_df = emb_df.merge(test[['id', 'winner']], on='id').drop(columns=['id'])
    print(emb_df.shape)
    emb_dfs.append(emb_df)

In [62]:
emb_dfs[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3575,3576,3577,3578,3579,3580,3581,3582,3583,winner
0,0.107483,-0.930664,0.054596,-0.483398,-0.805176,0.009636,0.315918,0.177368,-0.340576,0.777832,...,0.547363,-0.068542,-0.007957,-0.245850,-0.229004,-0.431641,-0.071167,-0.099731,-0.494629,model_b
1,-0.160034,-1.169922,0.297363,-0.796387,-0.667480,0.345215,0.525879,0.639160,-0.140015,0.815918,...,0.479980,-0.142578,-0.152954,-0.061340,0.095642,-0.202393,-0.052582,0.040009,-0.464844,model_b
2,0.057953,-1.083008,-0.072876,-0.604980,-0.728516,0.129150,0.370605,0.601074,-0.077698,0.855957,...,0.430908,-0.101807,-0.187622,-0.217041,-0.050629,-0.273438,0.047638,-0.410156,-0.151855,model_a
3,-0.000818,-0.786621,0.043152,-0.747070,-0.606934,0.215210,0.124207,0.132324,-0.137207,0.751465,...,0.439697,-0.171143,-0.067871,-0.043213,-0.001268,-0.340820,-0.115173,0.055389,-0.543945,model_a
4,-0.131958,-1.006836,0.197021,-1.519531,-0.822754,-0.229004,0.531250,0.994141,-0.240479,1.251953,...,0.140381,-0.168335,-0.367432,0.119934,0.386719,-0.543945,0.352051,0.599609,-0.366211,model_b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4839,-0.010765,-1.017578,0.164185,-0.657227,-0.732422,0.410156,0.496826,0.323730,-0.162720,0.904785,...,0.289062,-0.124390,0.009621,-0.238403,0.453613,-0.353516,0.237427,-0.204468,-0.274170,model_a
4840,-0.142456,-0.332764,0.410889,-0.698242,-0.638672,0.357910,0.484863,0.084229,-0.342773,0.673828,...,0.465820,0.097412,-0.052124,-0.156250,-0.023239,-0.314453,0.029175,0.185547,-0.309814,model_b
4841,-0.075623,-0.357910,0.222534,-0.459961,-0.767090,0.473633,0.684082,0.058533,-0.296631,0.671387,...,0.384766,0.118958,-0.181763,-0.339844,-0.048096,-0.186646,-0.260254,0.256836,-0.527344,model_a
4842,-0.045135,-0.321045,0.162231,-1.043945,-0.562500,0.374512,0.181152,0.311768,-0.075012,0.728027,...,0.211426,-0.129883,-0.082336,0.205933,0.105652,-0.185669,-0.012146,0.441895,-0.542969,model_b


In [30]:
emb_dfs[1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3575,3576,3577,3578,3579,3580,3581,3582,3583,winner
0,-0.056152,-0.392090,0.374756,-0.440674,-0.629395,0.320068,0.730957,0.213989,-0.045593,0.625000,...,0.422363,0.135864,-0.046112,-0.385498,-0.217163,-0.268311,-0.080017,0.195068,-0.554199,model_a
1,0.063293,-0.368896,0.447998,-0.394043,-0.545410,0.210815,0.559082,0.040192,-0.263916,0.562988,...,0.500488,0.129517,-0.031738,-0.289551,-0.220337,-0.028839,-0.294189,0.172852,-0.396484,model_b
2,-0.086792,-0.345459,0.380615,-0.349365,-0.500000,0.382812,0.575195,0.227539,-0.131836,0.684570,...,0.404297,0.138184,0.020035,-0.313477,-0.322266,-0.135498,-0.247681,0.263916,-0.500488,model_a
3,-0.029343,-0.197632,0.378906,-0.022171,-0.942383,0.300293,0.728516,0.207886,-0.248047,0.628418,...,0.592285,0.032043,-0.057709,-0.436768,-0.322510,-0.329590,-0.185791,0.092896,-0.472900,model_a
4,-0.178589,-0.769043,0.210083,-0.603516,-0.742188,0.433594,0.994141,0.583984,-0.272705,0.706543,...,0.432861,0.027786,-0.140991,-0.198120,-0.073792,-0.469238,0.151978,-0.182983,-0.242065,model_a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,0.084473,-0.485596,0.543945,-0.171509,-0.766602,0.349609,0.793457,0.099121,-0.290039,0.632812,...,0.500000,0.125732,-0.031250,-0.312988,-0.188843,-0.326172,-0.115112,0.100403,-0.552734,model_b
505,0.070923,-0.566895,0.302490,-0.293945,-0.723633,0.157349,0.723145,0.019608,-0.335205,0.679688,...,0.472900,0.041992,-0.102356,-0.254150,-0.237427,-0.197388,-0.086365,0.057465,-0.485352,model_b
506,-0.097473,-0.497070,0.210815,-0.364990,-0.842773,0.514160,0.591797,0.260742,-0.093018,0.746582,...,0.329834,0.055878,0.025772,-0.357422,-0.197632,-0.398926,-0.159302,-0.004623,-0.452881,model_b
507,-0.060852,-0.546387,0.578125,-0.260254,-0.851562,0.314697,0.758301,0.037354,-0.177979,0.751953,...,0.536133,0.290039,-0.034393,-0.411621,-0.198364,-0.236816,-0.191040,0.238647,-0.417969,model_a


In [51]:
test

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
7419,271ba404fc25609b8ceb6f37c3375f278c42e2479514ea...,Улучшь сообщение: Добрый вечер! \nЭто Алексей ...,"Добрый вечер!\n\nМеня зовут Алексей Королев, я...","Добрый вечер!\n\nЯ, Алексей Королев, заместите...",model_b,qwen-max-0919,c4ai-aya-expanse-32b,Russian
39112,ce7be103ac8f040accc9b8ec421a6cbf78189c503671df...,i have a notification icon in my webpage. when...,Certainly! To implement a notification system ...,Here's a simple example of how you can achieve...,model_b,qwen2.5-72b-instruct,llama-3.1-405b-instruct-bf16,English
33946,b39725e0a7992eb0c5f34542f960727a4fd306741568fb...,I have a raspberry pi that is booted from USB....,**Hotplugging the SD Card on a Raspberry Pi**\...,"No, you generally **cannot** safely hotplug th...",model_a,llama-3.1-8b-instruct,gemini-1.5-pro-exp-0827,English
35392,bb33f84e8e85da9e41fa271f6210fdffba0400d725a04c...,"Imagine video on YouTube ""Asking people what i...",Here are some imagined YouTube comments relati...,Here are some comments that might appear under...,model_a,gemini-1.5-flash-002,gemma-2-27b-it,English
10254,364c8146a6a236fbdfa34492dba10405a0e448d3062316...,generate a wine description,Please provide some context! To generate a co...,**Wine Description:**\n\n**Name:** Sunset on t...,model_b,gemini-1.5-flash-8b-001,llama-3.1-70b-instruct,English
...,...,...,...,...,...,...,...,...
26506,8bd008cebcfd4a9e504f0dbca5a5664d412ca9b4c4045f...,"""La existencia de diferencias con respecto a c...",La presencia de variaciones en los espectros d...,La presencia de diferencias según el gas utili...,model_a,gpt-4-1106-preview,gpt-4o-2024-08-06,Spanish
4869,198aa3635a6fc9f6a7d069e82cc9cddcbc783e3a0e83e8...,Thế nào là tham nhũng?\n\n Tham nhũng là hà...,"Theo Khoản 1, Điều 353 Bộ luật Hình sự năm 201...",**Câu 1: Thế nào là tham nhũng?**\n\n**Đáp án ...,model_b,claude-3-opus-20240229,o1-mini,Vietnamese
16686,584414c294899b255a9576ade65032610ca1937cffcad4...,Which number is greater: 9.11 or 9.9?\nIs 9.11...,Let's compare the two numbers step by step:\n\...,9.11 is greater than 9.9.,model_a,grok-2-2024-08-13,deepseek-v2.5,English
28451,95fb0a8dabf14619a9564334dc390cbf170a0e09905086...,```\nto order the special call commission for ...,This is a transcript of a Kentucky Fish and Wi...,This transcript details a special call commiss...,model_b,gemini-1.5-pro-002,gemini-1.5-flash-8b-001,English


In [65]:
emb_dfs[0].drop(columns='winner', inplace=True)
emb_dfs[1].drop(columns='winner', inplace=True)

In [69]:
all_embeddings = pd.concat(emb_dfs, axis=1)
all_embeddings.columns = range(len(all_embeddings.columns))
all_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10743,10744,10745,10746,10747,10748,10749,10750,10751,10752
0,0.107483,-0.930664,0.054596,-0.483398,-0.805176,0.009636,0.315918,0.177368,-0.340576,0.777832,...,0.503418,-0.152954,-0.050507,-0.290527,-0.316895,-0.117920,-0.150024,0.077271,-0.394043,model_b
1,-0.160034,-1.169922,0.297363,-0.796387,-0.667480,0.345215,0.525879,0.639160,-0.140015,0.815918,...,0.437256,0.148193,-0.164795,-0.390869,-0.201416,-0.016983,-0.364746,0.133057,-0.474854,model_b
2,0.057953,-1.083008,-0.072876,-0.604980,-0.728516,0.129150,0.370605,0.601074,-0.077698,0.855957,...,0.588379,0.235962,-0.103027,-0.507812,-0.341797,-0.223999,-0.101074,0.145874,-0.498535,model_a
3,-0.000818,-0.786621,0.043152,-0.747070,-0.606934,0.215210,0.124207,0.132324,-0.137207,0.751465,...,0.552734,0.217407,-0.144287,-0.280273,-0.154053,-0.214355,-0.159302,0.155640,-0.494629,model_a
4,-0.131958,-1.006836,0.197021,-1.519531,-0.822754,-0.229004,0.531250,0.994141,-0.240479,1.251953,...,0.447266,0.134766,-0.016479,-0.283691,-0.373535,-0.330566,-0.203003,0.064392,-0.593262,model_b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4839,-0.010765,-1.017578,0.164185,-0.657227,-0.732422,0.410156,0.496826,0.323730,-0.162720,0.904785,...,0.305420,0.101746,-0.030777,-0.475098,-0.149658,-0.283936,-0.095947,0.027481,-0.299561,model_a
4840,-0.142456,-0.332764,0.410889,-0.698242,-0.638672,0.357910,0.484863,0.084229,-0.342773,0.673828,...,0.551270,0.087097,-0.166260,-0.210449,-0.225464,0.052917,-0.136841,0.188354,-0.246216,model_b
4841,-0.075623,-0.357910,0.222534,-0.459961,-0.767090,0.473633,0.684082,0.058533,-0.296631,0.671387,...,0.227051,0.204224,0.111328,-0.211548,0.237671,-0.657715,0.210571,0.273682,-0.159180,model_a
4842,-0.045135,-0.321045,0.162231,-1.043945,-0.562500,0.374512,0.181152,0.311768,-0.075012,0.728027,...,0.464844,0.210205,-0.069336,-0.237915,-0.180054,-0.226440,-0.280029,0.299805,-0.596680,model_b


In [82]:
train_eval_X, test_eval_X, train_eval_y, test_eval_y = train_test_split(all_embeddings.iloc[:, :-1], all_embeddings.iloc[:, -1], test_size=0.5)

In [72]:
clf = CatBoostClassifier(verbose=100, eval_metric='Accuracy')

clf.fit(train_eval_X, train_eval_y, eval_set=(test_eval_X, test_eval_y))

Learning rate set to 0.039428
0:	learn: 0.6156069	test: 0.5681255	best: 0.5681255 (0)	total: 175ms	remaining: 2m 54s
100:	learn: 0.8897605	test: 0.6354253	best: 0.6432700 (61)	total: 18.8s	remaining: 2m 47s
200:	learn: 0.9611891	test: 0.6391412	best: 0.6449216 (187)	total: 38.1s	remaining: 2m 31s
300:	learn: 0.9942197	test: 0.6440958	best: 0.6486375 (269)	total: 57.3s	remaining: 2m 13s
400:	learn: 1.0000000	test: 0.6461602	best: 0.6486375 (269)	total: 1m 16s	remaining: 1m 54s
500:	learn: 1.0000000	test: 0.6374897	best: 0.6494633 (449)	total: 1m 35s	remaining: 1m 35s
600:	learn: 1.0000000	test: 0.6374897	best: 0.6494633 (449)	total: 1m 55s	remaining: 1m 16s
700:	learn: 1.0000000	test: 0.6399670	best: 0.6494633 (449)	total: 2m 14s	remaining: 57.3s
800:	learn: 1.0000000	test: 0.6395541	best: 0.6494633 (449)	total: 2m 33s	remaining: 38.2s
900:	learn: 1.0000000	test: 0.6395541	best: 0.6494633 (449)	total: 2m 53s	remaining: 19s
999:	learn: 1.0000000	test: 0.6420314	best: 0.6494633 (449)	tota

<catboost.core.CatBoostClassifier at 0x7fd7e6657a10>

In [83]:
mapping = {'model_b': 0, 'model_a': 1}

train_eval_y = train_eval_y.apply(lambda el: mapping[el])
test_eval_y = test_eval_y.apply(lambda el: mapping[el])

In [84]:
train_eval_y.value_counts()

10752
0    1233
1    1189
Name: count, dtype: int64

In [88]:
clf = RidgeClassifier()

clf.fit(train_eval_X, train_eval_y)

preds = clf.predict(test_eval_X)
print(preds)
print(classification_report(preds, test_eval_y))

[0 0 0 ... 0 1 1]
              precision    recall  f1-score   support

           0       0.55      0.55      0.55      1224
           1       0.54      0.54      0.54      1198

    accuracy                           0.55      2422
   macro avg       0.55      0.55      0.55      2422
weighted avg       0.55      0.55      0.55      2422



In [None]:
clf = LGBMClassifier()

clf.fit(train_eval_X, train_eval_y)

preds = clf.predict(test_eval_X)
print(preds)
print(classification_report(preds, test_eval_y))

[LightGBM] [Info] Number of positive: 1189, number of negative: 1233
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.367738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2741104
[LightGBM] [Info] Number of data points in the train set: 2422, number of used features: 10752
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.490917 -> initscore=-0.036338
[LightGBM] [Info] Start training from score -0.036338
[0 0 1 ... 1 1 0]
              precision    recall  f1-score   support

           0       0.67      0.63      0.65      1300
           1       0.60      0.64      0.62      1122

    accuracy                           0.63      2422
   macro avg       0.63      0.63      0.63      2422
weighted avg       0.64      0.63      0.63      2422

