In [22]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier
from lightgbm import early_stopping,log_evaluation, LGBMClassifier

from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM, AutoModel
from sentence_transformers import SentenceTransformer
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt

from tqdm import tqdm
tqdm.pandas()

In [3]:
path="../data/"
train = pd.read_parquet(path+"train.parquet")

train, test = train_test_split(train, test_size=0.2, random_state=1)
eval, test = train_test_split(test, test_size=0.5, random_state=1)

sub = pd.read_csv(path+"sample_submission.csv")

In [4]:
train.head()

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
18632,623e2aa50f13aece7fb1c60cf555d8c7d290692f679655...,Phân tích kết quả sản xuất (KQSX) về chất lượn...,Để phân tích kết quả sản xuất (KQSX) về chất l...,Phân tích kết quả sản xuất (KQSX) về chất lượn...,model_a,yi-lightning,internlm2_5-20b-chat,Vietnamese
34859,b87079eef09873032f653314cfddd05e7bf77e4e6d0449...,Write a sample of Old Uyghur,Providing a truly representative sample of Old...,"I apologize, but I cannot generate an authenti...",model_a,gemini-1.5-flash-002,claude-3-5-sonnet-20240620,English
43764,e721d5bfbff8e05e2ffa92e0137a570c1b1b7fa3f7bbf7...,Write a poem in the style of Ada Limon.,**Under the Bruise of the World**\n\nIn the ga...,Here's a poem inspired by the style of Ada Lim...,model_b,reka-core-20240722,grok-2-2024-08-13,English
22541,77350a005d0195fa951b9fb073b018f2f6424a58ee58b5...,"describe the way ""golden slumbers"" by the beat...","Describing the way ""Golden Slumbers"" by The Be...","For a deaf person who has never heard ""Golden ...",model_a,jamba-1.5-large,deepseek-coder-v2-0724,English
37157,c43d63ee99a8b776065f410e4a1e0791a4449230cfd648...,かけっこをしています。あなたは3位の人を抜かし ました。今あなたは何位？,3位です。あなたは3位の人を抜かしたので、今あなたは3位になります。,はい、かけっこの状況について考えてみましょう。\n\n問題文から、以下のことがわかります。\...,model_a,o1-preview,claude-3-opus-20240229,Japanese


In [5]:
test.head()

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
7419,271ba404fc25609b8ceb6f37c3375f278c42e2479514ea...,Улучшь сообщение: Добрый вечер! \nЭто Алексей ...,"Добрый вечер!\n\nМеня зовут Алексей Королев, я...","Добрый вечер!\n\nЯ, Алексей Королев, заместите...",model_b,qwen-max-0919,c4ai-aya-expanse-32b,Russian
39112,ce7be103ac8f040accc9b8ec421a6cbf78189c503671df...,i have a notification icon in my webpage. when...,Certainly! To implement a notification system ...,Here's a simple example of how you can achieve...,model_b,qwen2.5-72b-instruct,llama-3.1-405b-instruct-bf16,English
33946,b39725e0a7992eb0c5f34542f960727a4fd306741568fb...,I have a raspberry pi that is booted from USB....,**Hotplugging the SD Card on a Raspberry Pi**\...,"No, you generally **cannot** safely hotplug th...",model_a,llama-3.1-8b-instruct,gemini-1.5-pro-exp-0827,English
35392,bb33f84e8e85da9e41fa271f6210fdffba0400d725a04c...,"Imagine video on YouTube ""Asking people what i...",Here are some imagined YouTube comments relati...,Here are some comments that might appear under...,model_a,gemini-1.5-flash-002,gemma-2-27b-it,English
10254,364c8146a6a236fbdfa34492dba10405a0e448d3062316...,generate a wine description,Please provide some context! To generate a co...,**Wine Description:**\n\n**Name:** Sunset on t...,model_b,gemini-1.5-flash-8b-001,llama-3.1-70b-instruct,English


In [24]:
model = AutoModel.from_pretrained("./fine_tuned_nsp_e5")
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(250037, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [27]:
class QADataset(Dataset):
    def __init__(self, data, device, tokenizer):
        self.data = data
        self.device = device
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        prompt_encoding = self.tokenizer(
            row["prompt"],
            truncation=True, padding='max_length',
            # max_length=self.max_len,
            return_tensors='pt'
        )

        response_a_encoding = self.tokenizer(
            row["response_a"],
            truncation=True, padding='max_length',
            # max_length=self.max_len,
            return_tensors='pt'
        )

        response_b_encoding = self.tokenizer(
            row["response_b"],
            truncation=True, padding='max_length',
            # max_length=self.max_len,
            return_tensors='pt'
        )

        prompt = {k: v.squeeze(0).to(self.device) for k, v in prompt_encoding.items()}
        response_a = {k: v.squeeze(0).to(self.device) for k, v in response_a_encoding.items()}
        response_b = {k: v.squeeze(0).to(self.device) for k, v in response_b_encoding.items()}

        return (
            prompt, response_a, response_b
        )
    

In [29]:
device = 'cuda'

model = AutoModel.from_pretrained("./fine_tuned_nsp_e5")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_nsp_e5")

model = model.to(device)

batch_size = 128
train_data_loader = DataLoader(QADataset(train, device, tokenizer), batch_size=batch_size, shuffle=True, drop_last=True)
eval_data_loader = DataLoader(QADataset(eval, device, tokenizer), batch_size=batch_size, drop_last=True)
test_data_loader = DataLoader(QADataset(test, device, tokenizer), batch_size=batch_size, drop_last=True)



In [30]:
@torch.no_grad()
def get_dataloader_embeddings(dataloader: DataLoader, model):
    embeddings = {
        "prompt": [], 
        "response_a": [], 
        "response_b": [], 
    }

    model.eval()

    for batch in tqdm(dataloader):
        prompts = batch[0]
        responses_a = batch[1]
        responses_b = batch[2]

        prompts_embs = model(**prompts).pooler_output
        response_a_embs = model(**responses_a).pooler_output
        response_b_embs = model(**responses_b).pooler_output

        embeddings["prompt"].append(prompts_embs.detach().cpu().numpy())
        embeddings["response_a"].append(response_a_embs.detach().cpu().numpy())
        embeddings["response_b"].append(response_b_embs.detach().cpu().numpy())
    
    return embeddings

In [31]:
train_embeddings = get_dataloader_embeddings(train_data_loader, model)
eval_embeddings = get_dataloader_embeddings(eval_data_loader, model)
test_embeddings = get_dataloader_embeddings(test_data_loader, model)

100%|██████████| 544/544 [07:16<00:00,  1.25it/s]
100%|██████████| 37/37 [00:29<00:00,  1.26it/s]
100%|██████████| 37/37 [00:30<00:00,  1.23it/s]


In [32]:
train_embeddings['prompt'][0].shape

(128, 384)

In [33]:
def aggregate_dataloader_embeddings(embeddings: dict[str, list[np.ndarray]]):
    prompts_embs = np.concatenate(embeddings['prompt'])
    response_a_embs = np.concatenate(embeddings['response_a'])
    response_b_embs = np.concatenate(embeddings['response_b'])

    all_embs = np.concatenate([prompts_embs, response_a_embs, response_b_embs], axis=1)
    return all_embs

train_features = aggregate_dataloader_embeddings(train_embeddings)
eval_features = aggregate_dataloader_embeddings(eval_embeddings)
test_features = aggregate_dataloader_embeddings(test_embeddings)

In [34]:
X_train = train_features
y_train = train['winner'].iloc[:len(X_train)]

X_eval = eval_features
y_eval = eval['winner'].iloc[:len(X_eval)]

X_test = test_features
y_test = test['winner'].iloc[:len(X_test)]


print(len(X_train), len(y_train))
print(len(X_eval), len(y_eval))
print(len(X_test), len(y_test))

69632 69632
4736 4736
4736 4736


In [35]:
cbm = CatBoostClassifier(depth=6, random_state=42, eval_metric="Accuracy", verbose=100)
cbm.fit(X_train, y_train, eval_set=(X_eval, y_eval))

Learning rate set to 0.090382
0:	learn: 0.5088178	test: 0.5027449	best: 0.5027449 (0)	total: 28.6ms	remaining: 28.5s
100:	learn: 0.5907341	test: 0.4936655	best: 0.5219595 (9)	total: 3.62s	remaining: 32.2s
200:	learn: 0.6505055	test: 0.4888091	best: 0.5219595 (9)	total: 7.07s	remaining: 28.1s
300:	learn: 0.6914350	test: 0.4966216	best: 0.5219595 (9)	total: 10.4s	remaining: 24.1s
400:	learn: 0.7241355	test: 0.4921875	best: 0.5219595 (9)	total: 13.8s	remaining: 20.7s
500:	learn: 0.7519100	test: 0.4993666	best: 0.5219595 (9)	total: 17.4s	remaining: 17.4s
600:	learn: 0.7736242	test: 0.4987331	best: 0.5219595 (9)	total: 20.8s	remaining: 13.8s
700:	learn: 0.7903694	test: 0.4978885	best: 0.5219595 (9)	total: 24.1s	remaining: 10.3s
800:	learn: 0.8066981	test: 0.5025338	best: 0.5219595 (9)	total: 27.6s	remaining: 6.85s
900:	learn: 0.8234002	test: 0.4968328	best: 0.5219595 (9)	total: 31.1s	remaining: 3.42s
999:	learn: 0.8359231	test: 0.4991554	best: 0.5219595 (9)	total: 34.5s	remaining: 0us

best

<catboost.core.CatBoostClassifier at 0x7f3e02bc2fd0>