In [7]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv
/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet
/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet


In [8]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from lightgbm import early_stopping,log_evaluation,LGBMClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
tqdm.pandas()

In [9]:
path="/kaggle/input/wsdm-cup-multilingual-chatbot-arena/"
train = pd.read_parquet(path+"train.parquet")
test = pd.read_parquet(path+"test.parquet")
sub = pd.read_csv(path+"sample_submission.csv")

In [10]:
train.head()

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak
1,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,gemma-2-27b-it,gemini-1.5-flash-002,Russian
2,0003800d510e38803efba5ceaec122bc66408fe367b0be...,"Dört basamaklı, rakamları birbirinden ve sıfır...","Bu soruyu çözmek için, verilen koşulları adım ...",Bu problemi adım adım çözelim:\n\n1) ABCD - DC...,model_a,gpt-4-0125-preview,claude-3-5-sonnet-20240620,Turkish
3,00072026c68f5418ef2da238394e418ce72a534b9b22d5...,"현재 추천된 탑 3 종목인 Cabaletta Bio (CABA), Rocket Ph...","죄송하지만 저는 금융 조언을 제공할 수 없습니다. 저는 AI 모델이며, 투자 결정에...",현재 추천된 탑 3 종목에 순위를 매기기 위해서는 여러 가지 요소들을 고려해야 합니...,model_b,gemma-2-2b-it,llama-3.1-nemotron-70b-instruct,English
4,0007ce7cf6bc1b5a8f8a4669b854fb12030863c970d9dc...,Please be boring,"Alright, I'll be as boring as possible.\n\nTod...","Understood. Here is a straightforward, unadorn...",model_a,reka-flash-20240722,grok-2-2024-08-13,English


In [11]:
test.head()

Unnamed: 0,id,prompt,response_a,response_b,scored
0,327228,Caso Clínico: Un hombre de 70 años con anteced...,**Diagnóstico Diferencial de Anemia en Pacient...,"Basándonos en el caso clínico presentado, pode...",False
1,1139415,Peel Company received a cash dividend from a ...,The correct answer is **(a) No No**. Here's ...,The correct answer is **(a) No No**. Here's wh...,False
2,1235630,Há um grave problema com o relógio da torre da...,Dois problemas interessantes!\n\n**Problema 1:...,Vamos resolver os dois problemas em sequência....,False


In [12]:
# 10% as validation split, this percentage could be changed
train, valid=train_test_split(train, test_size=0.1, stratify=train["winner"], random_state=42)

# Train set can be inverted (and winner too) to get twice the data from the available training dataset
train_inv = train.copy()
train_inv["response_a"], train_inv["response_b"] = train_inv["response_b"], train_inv["response_a"]
train_inv["winner"] = train_inv["winner"].apply(lambda x: "model_a" if "b" in x else "model_b")

In [13]:
train_inv.head(1)

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
28076,940d1c4f9e10cf3acb0f4017ebf3e1e9a23efc886085f1...,Интересное про дом. Дюрер купил его в 1509 год...,Вопрос о переводе исторических цен в современн...,Перевести старинные денежные единицы в совреме...,model_b,qwen-max-0919,gemini-1.5-pro-002,Russian


In [14]:
train.head(1)

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
28076,940d1c4f9e10cf3acb0f4017ebf3e1e9a23efc886085f1...,Интересное про дом. Дюрер купил его в 1509 год...,Перевести старинные денежные единицы в совреме...,Вопрос о переводе исторических цен в современн...,model_a,qwen-max-0919,gemini-1.5-pro-002,Russian


In [15]:
# Here I compute some features
def compute_feats(df):
    for col in tqdm(["response_a","response_b","prompt"]):
        # response lenght is a key factor when choosing between two responses
        df[f"{col}_len"]=df[f"{col}"].str.len()

        # Some characters counting features 
        df[f"{col}_spaces"]=df[f"{col}"].str.count("\s")
        df[f"{col}_punct"]=df[f"{col}"].str.count(",|\.|!")
        df[f"{col}_question_mark"]=df[f"{col}"].str.count("\?")
        df[f"{col}_quot"]=df[f"{col}"].str.count("'|\"")
        df[f"{col}_formatting_chars"]=df[f"{col}"].str.count("\*|\_")
        df[f"{col}_math_chars"]=df[f"{col}"].str.count("\-|\+|\=")
        df[f"{col}_curly_open"]=df[f"{col}"].str.count("\{")
        df[f"{col}_curly_close"]=df[f"{col}"].str.count("}")
        df[f"{col}_round_open"]=df[f"{col}"].str.count("\(")
        df[f"{col}_round_close"]=df[f"{col}"].str.count("\)")
        df[f"{col}_accent_chars"]=df[f"{col}"].str.count("è|ò|à|ù|é|ì")
        df[f"{col}_special_chars"]=df[f"{col}"].str.count("\W")
        df[f"{col}_digits"]=df[f"{col}"].str.count("\d")/df[f"{col}_len"]
        df[f"{col}_lower"]=df[f"{col}"].str.count("[a-z]").astype("float32")/df[f"{col}_len"]
        df[f"{col}_upper"]=df[f"{col}"].str.count("[A-Z]").astype("float32")/df[f"{col}_len"]
        df[f"{col}_chinese"]=df[f"{col}"].str.count(r'[\u4e00-\u9fff]+').astype("float32")/df[f"{col}_len"]
        df[f"{col}_tild"]=df[f"{col}"].str.count("~")>0

        # Feature that show how balanced are curly and round brackets
        df[f"{col}_round_balance"]=df[f"{col}_round_open"]-df[f"{col}_round_close"]
        df[f"{col}_curly_balance"]=df[f"{col}_curly_open"]-df[f"{col}_curly_close"]

        # Feature that tells if the string json is present somewhere (e.g. asking a json response or similar)
        df[f"{col}_json"]=df[f"{col}"].str.lower().str.count("json")
        df[f"{col}_yaml"]=df[f"{col}"].str.lower().str.count("yaml")

    return df
train=compute_feats(train)
train_inv=compute_feats(train_inv)

train=pd.concat([train,train_inv])
valid=compute_feats(valid)
test=compute_feats(test)

100%|██████████| 3/3 [01:07<00:00, 22.43s/it]
100%|██████████| 3/3 [01:05<00:00, 22.00s/it]
100%|██████████| 3/3 [00:07<00:00,  2.43s/it]
100%|██████████| 3/3 [00:00<00:00, 64.54it/s]


In [16]:
train.head()

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,response_a_len,response_a_spaces,...,prompt_special_chars,prompt_digits,prompt_lower,prompt_upper,prompt_chinese,prompt_tild,prompt_round_balance,prompt_curly_balance,prompt_json,prompt_yaml
28076,940d1c4f9e10cf3acb0f4017ebf3e1e9a23efc886085f1...,Интересное про дом. Дюрер купил его в 1509 год...,Перевести старинные денежные единицы в совреме...,Вопрос о переводе исторических цен в современн...,model_a,qwen-max-0919,gemini-1.5-pro-002,Russian,1699,257,...,30,0.098039,0.0,0.0,0.0,False,0,0,0,0
20104,69d8fcef20fd0a51dcafccb0dd0ed08e1a07671d9298ab...,here are two groups of numbers. the number in ...,Let's analyze the given number groups to find ...,"To find the relationship between the numbers, ...",model_a,gemini-1.5-flash-002,llama-3.1-405b-instruct-bf16,English,1608,362,...,54,0.04955,0.689189,0.018018,0.0,False,0,0,0,0
9548,328ccffd540eb9d7774ca2a2e0c09dd3cdbf1a81d97030...,帮我写一篇文章：标题住房“财产化”是不想生、不敢生原因\n\n背景：中国人口与发展研究中心消...,### 住房“财产化”是不想生、不敢生的原因\n\n#### 引言\n\n随着中国人口与家庭...,### 住房“财产化”是不想生、不敢生的原因\n\n#### 引言\n\n中国人口与发展研究...,model_b,yi-lightning,mistral-large-2407,Chinese,1401,45,...,850,0.023235,0.004647,0.001949,0.097137,False,0,0,0,0
16922,598f765b7fa5f4f6c67e2fa63230fe89c487ac2cca6f2c...,"Merhaba, adım Zeki Sagay. Ben 27.06.1957 yılın...","Merhaba Zeki Bey,\n\nGüneş burcunuz, doğum tar...","İstanbul'da doğan, 27.06.1957 günü sabah 08:30...",model_a,llama-3.1-405b-instruct-bf16,llama-3.2-1b-instruct,Turkish,1098,145,...,27,0.090226,0.609023,0.037594,0.0,False,0,0,0,0
26214,8a24c32456b3b1338647389dfac6573a43d1ed2cf97ad1...,Uyku sorunumu nasıl çözebilirim kısaca,Uyku sorununu çözmek için birkaç basit adım iz...,Uyku sorunlarını çözmek için birkaç temel adım...,model_a,mistral-large-2407,qwen-max-0919,Turkish,724,98,...,4,0.0,0.763158,0.026316,0.0,False,0,0,0,0


In [17]:
vectorizer_char = TfidfVectorizer(sublinear_tf=True, analyzer='char', ngram_range=(1,2), max_features=100_000)
vectorizer_word = TfidfVectorizer(sublinear_tf=True, analyzer='word', min_df=3)
preprocessor = ColumnTransformer(
    transformers=[
        ('prompt_feats', FeatureUnion([
            ('prompt_char', vectorizer_char),
            ('prompt_word', vectorizer_word)
        ]), 'prompt'),
        ('response_a_feats', FeatureUnion([
            ('response_a_char', vectorizer_char),
            ('response_a_word', vectorizer_word)
        ]), 'response_a'),
        ('response_b_feats', FeatureUnion([
            ('response_b_char', vectorizer_char),
            ('response_b_word', vectorizer_word)
        ]), 'response_b')
    ]
)
train_feats = preprocessor.fit_transform(train[["response_a","response_b","prompt"]])
test_feats = preprocessor.transform(test[["response_a","response_b","prompt"]])
valid_feats = preprocessor.transform(valid[["response_a","response_b","prompt"]])

In [18]:
train_feats

<87190x859841 sparse matrix of type '<class 'numpy.float64'>'
	with 110519876 stored elements in Compressed Sparse Row format>

In [19]:
valid_feats, test_feats

(<4844x859841 sparse matrix of type '<class 'numpy.float64'>'
 	with 6082567 stored elements in Compressed Sparse Row format>,
 <3x859841 sparse matrix of type '<class 'numpy.float64'>'
 	with 3283 stored elements in Compressed Sparse Row format>)

In [31]:
model = LogisticRegression(C=0.1, solver='liblinear', dual=True, random_state=42)
model.fit(train_feats, train.winner)

In [32]:
model.predict_proba(test_feats)

array([[0.4611418 , 0.5388582 ],
       [0.53491638, 0.46508362],
       [0.54462179, 0.45537821]])

In [33]:
y_pred_val = model.predict(valid_feats)

In [34]:
y_pred_val

array(['model_a', 'model_b', 'model_a', ..., 'model_b', 'model_a',
       'model_a'], dtype=object)

In [35]:
from sklearn.metrics import classification_report

In [36]:
print(classification_report(valid.winner, y_pred_val, digits=6))

              precision    recall  f1-score   support

     model_a   0.611861  0.624374  0.618054      2396
     model_b   0.624844  0.612337  0.618527      2448

    accuracy                       0.618291      4844
   macro avg   0.618352  0.618355  0.618291      4844
weighted avg   0.618422  0.618291  0.618293      4844



In [None]:
test['winner'] = model.predict(test_feats)

In [None]:
test["id"] = test["id"]

In [None]:
test["winner"]=test["winner"].apply(lambda x: "model_a" if x==1 else "model_b")

sub = test[["id","winner"]]

In [None]:
sub.head()

In [None]:
sub.to_csv("submission.csv",index=False)