In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from scipy.sparse import hstack
from textblob import TextBlob
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import langid
from sklearn.metrics import roc_auc_score as ras
import string


/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv
/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet
/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet


In [None]:
# Load the competition's train and test tables from the Kaggle input directory
# (parquet files, read with the pyarrow engine).
df_train = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet', engine='pyarrow')
df_test = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet', engine='pyarrow')

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak
1,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,gemma-2-27b-it,gemini-1.5-flash-002,Russian


In [None]:
# Two TF-IDF views of every text column:
#   * character 1-2-grams, capped at 100k features
#   * word tokens that occur in at least 3 documents
# Both use sublinear (log-scaled) term frequency.
vectorizer_char = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(1, 2),
    max_features=100_000,
)
vectorizer_word = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='word',
    min_df=3,
)

# One (char + word) FeatureUnion per text column, in the order
# prompt, response_a, response_b.
# NOTE(review): the same unfitted vectorizer objects are handed to all three
# unions; this relies on ColumnTransformer cloning its transformers at fit
# time - confirm for the installed scikit-learn version.
preprocessor = ColumnTransformer(
    transformers=[
        (
            f'{text_column}_feats',
            FeatureUnion([
                (f'{text_column}_char', vectorizer_char),
                (f'{text_column}_word', vectorizer_word),
            ]),
            text_column,
        )
        for text_column in ('prompt', 'response_a', 'response_b')
    ]
)

In [None]:
# 80/20 hold-out split of the training frame's index labels, stratified on the
# 'winner_ind' column with a fixed seed.
# NOTE(review): 'winner_ind' is not created in any cell visible here - it must
# exist on df_train before this cell runs.
idx_train, idx_validation = train_test_split(
    df_train.index,
    test_size=0.20,
    stratify=df_train["winner_ind"],
    random_state=42,
)

# Raw text columns fed to the TF-IDF preprocessor, plus the matching targets.
_text_columns = ["response_a", "response_b", "prompt"]
X_train = df_train.loc[idx_train, _text_columns]
y_train = df_train.loc[idx_train, 'winner_ind']
X_val = df_train.loc[idx_validation, _text_columns]
y_val = df_train.loc[idx_validation, 'winner_ind']

In [None]:
# Sanity check: full training frame vs. the two split frames (one shape per line).
print(df_train.shape, X_train.shape, X_val.shape, sep="\n")

(48439, 18)
(38751, 3)
(9688, 3)


In [None]:
# Fit the TF-IDF preprocessor on the training split only, then reuse the fitted
# transformer for the validation split and for the test frame.
train_feats = preprocessor.fit_transform(X_train)
val_feats = preprocessor.transform(X_val)
test_feats = preprocessor.transform(df_test[["response_a","response_b","prompt"]])

In [None]:

def add_cosine_similarity_feature(df, feature1, feature2, new_feature_name):
    """Add a per-row TF-IDF cosine similarity between two text columns.

    For every row a fresh ``TfidfVectorizer(stop_words='english')`` is fitted on
    just that row's two texts, and the cosine similarity of the two resulting
    vectors is written to ``df[new_feature_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both text columns. It is modified in place (the new
        column is added to it). ``progress_apply`` is used, so
        ``tqdm.pandas()`` must have been called beforehand.
    feature1, feature2 : str
        Names of the two text columns to compare.
    new_feature_name : str
        Name of the column that receives the similarity.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If ``feature1`` or ``feature2`` is not a column of ``df``. A mistyped
        column name is reported instead of silently yielding an all-zero
        feature.

    Notes
    -----
    A row scores ``0.0`` when either value is missing, empty or not a string,
    or when the vectorizer cannot build a vocabulary from the pair (for
    example, both texts consist only of stop words).
    """
    def compute_cosine_similarity(row):
        text1, text2 = row[feature1], row[feature2]

        # Missing / non-text values (None, NaN, ...) carry no similarity signal.
        if not isinstance(text1, str) or not isinstance(text2, str):
            return 0.0
        if not text1 or not text2:
            return 0.0

        # NOTE(review): the English stop-word list is applied to every row,
        # whatever language the row is written in.
        vectorizer = TfidfVectorizer(stop_words='english')
        try:
            tfidf_matrix = vectorizer.fit_transform([text1, text2])
        except ValueError:
            # Raised when the vocabulary is empty after tokenisation and
            # stop-word removal; treat the pair as having no overlap.
            return 0.0

        return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

    df[new_feature_name] = df.progress_apply(compute_cosine_similarity, axis=1)
    return df

In [None]:
# Prompt-vs-response TF-IDF similarity for both responses of every training row.
# The helper returns the frame it was given, so `train` is the same object as
# `df_train`, which now carries the two new columns.
for _side in ('a', 'b'):
    train = add_cosine_similarity_feature(
        df_train, 'prompt', f'response_{_side}', f'response_{_side}_similarity'
    )

100%|██████████| 48439/48439 [02:10<00:00, 371.28it/s]
100%|██████████| 48439/48439 [02:10<00:00, 369.80it/s]


In [None]:
# Same prompt-vs-response similarity columns for the test frame.
# The helper returns the frame it was given, so `test` is the same object as
# `df_test`.
for _side in ('a', 'b'):
    test = add_cosine_similarity_feature(
        df_test, 'prompt', f'response_{_side}', f'response_{_side}_similarity'
    )

100%|██████████| 3/3 [00:00<00:00, 284.44it/s]
100%|██████████| 3/3 [00:00<00:00, 314.77it/s]


In [None]:
# Despite the "_ratio" suffix these columns are differences of the two
# prompt-similarity scores: A minus B, and B minus A.
for _frame in (df_train, df_test):
    _sim_a = _frame['response_a_similarity']
    _sim_b = _frame['response_b_similarity']
    _frame['response_a_similarity_ratio'] = _sim_a - _sim_b
    _frame['response_b_similarity_ratio'] = _sim_b - _sim_a

df_train.columns

Index(['id', 'prompt', 'response_a', 'response_b', 'winner', 'model_a',
       'model_b', 'language', 'space_count_a', 'space_count_b', 'len_winner',
       'len_loser', 'len_a', 'len_b', 'winner_len_ratio', 'response_a_ratio',
       'response_b_ratio', 'winner_ind', 'response_a_similarity',
       'response_b_similarity', 'response_a_similarity_ratio',
       'response_b_similarity_ratio'],
      dtype='object')

In [None]:
# Per-split views of the A-minus-B similarity difference, selected with the
# train / validation index labels produced by the split.
X_train_response_a_simrat = df_train.loc[idx_train,"response_a_similarity_ratio"]
X_val_response_a_simrat = df_train.loc[idx_validation,"response_a_similarity_ratio"]

In [None]:
# Summary statistics of the A-minus-B similarity difference over the whole training frame.
df_train['response_a_similarity_ratio'].describe()

count    48439.000000
mean         0.000055
std          0.145510
min         -1.000000
25%         -0.060539
50%          0.000000
75%          0.060223
max          1.000000
Name: response_a_similarity_ratio, dtype: float64

In [None]:
# Append the similarity-difference column, shaped (n_rows, 1), to the right of
# the training TF-IDF block.
_train_simrat_column = df_train.loc[idx_train, 'response_a_similarity_ratio'].values.reshape(-1, 1)
X_train_all_features = hstack([train_feats, _train_simrat_column])
X_train_all_features.shape

(38751, 620008)

In [None]:
# Append the similarity-difference column, shaped (n_rows, 1), to the right of
# the validation TF-IDF block.
_val_simrat_column = df_train.loc[idx_validation, 'response_a_similarity_ratio'].values.reshape(-1, 1)
X_val_all_features = hstack([val_feats, _val_simrat_column])
X_val_all_features.shape

(9688, 620008)

In [None]:
# Append the similarity-difference column, shaped (n_rows, 1), to the right of
# the test TF-IDF block.
_test_simrat_column = df_test['response_a_similarity_ratio'].values.reshape(-1, 1)
X_test_all_features = hstack([test_feats, _test_simrat_column])
X_test_all_features.shape

(3, 620008)

In [None]:
def sentiment_calc(text):
    """Return TextBlob's ``(polarity, subjectivity)`` pair for ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    tuple
        ``TextBlob(text).sentiment`` for string input. For anything that is
        not a string (None, NaN, ...) a neutral ``(0.0, 0.0)`` pair is
        returned, so the result can always be unpacked into two columns.
    """
    if not isinstance(text, str):
        return (0.0, 0.0)
    return TextBlob(text).sentiment


# (source text column, suffix used in the generated column names)
_sentiment_sources = (
    ('response_a', 'a'),
    ('response_b', 'b'),
    ('prompt', 'prompt'),
)

for _frame in (df_train, df_test):
    # polarity_* / subjectivity_* for both responses and for the prompt.
    for _text_column, _suffix in _sentiment_sources:
        _frame[[f'polarity_{_suffix}', f'subjectivity_{_suffix}']] = _frame[_text_column].apply(
            lambda text: pd.Series(sentiment_calc(text))
        )

    # Polarity gap between the prompt and each response.
    _frame['polarity_a_score'] = _frame['polarity_prompt'] - _frame['polarity_a']
    _frame['polarity_b_score'] = _frame['polarity_prompt'] - _frame['polarity_b']

In [None]:
for _frame in (df_train, df_test):
    # Subjectivity gap between the prompt and each response ...
    _frame['subjectivity_a_score'] = _frame['subjectivity_prompt'] - _frame['subjectivity_a']
    _frame['subjectivity_b_score'] = _frame['subjectivity_prompt'] - _frame['subjectivity_b']
    # ... and the difference between those two gaps (A minus B).
    _frame['subjectivity_score'] = _frame['subjectivity_a_score'] - _frame['subjectivity_b_score']

In [None]:
# VADER analyzer, kept in `sent` for reuse by later cells.
sent = SentimentIntensityAnalyzer()

# Compound score, rounded to 2 decimals, for each text column of the training frame.
for _text_column, _score_column in (
    ('response_a', 'sentiment_score_a'),
    ('response_b', 'sentiment_score_b'),
    ('prompt', 'sentiment_score_prompt'),
):
    polarity = [
        round(sent.polarity_scores(text)['compound'], 2)
        for text in df_train[_text_column]
    ]
    df_train[_score_column] = polarity

In [None]:
# Compound score from the `sent` analyzer, rounded to 2 decimals, for each text
# column of the test frame.
for _text_column, _score_column in (
    ('response_a', 'sentiment_score_a'),
    ('response_b', 'sentiment_score_b'),
    ('prompt', 'sentiment_score_prompt'),
):
    polarity = [
        round(sent.polarity_scores(text)['compound'], 2)
        for text in df_test[_text_column]
    ]
    df_test[_score_column] = polarity

In [None]:
# Gap between the prompt's compound sentiment score and each response's.
for _frame in (df_train, df_test):
    _prompt_score = _frame['sentiment_score_prompt']
    _frame['v_polarity_a_score'] = _prompt_score - _frame['sentiment_score_a']
    _frame['v_polarity_b_score'] = _prompt_score - _frame['sentiment_score_b']

In [None]:
def langdetector(text):
    """Return the first element of ``langid.classify(text)`` (the predicted language code)."""
    prediction = langid.classify(text)
    return prediction[0]

def isEnglish(s):
    """Return True when ``s`` contains only ASCII characters.

    This is a character-set check, not language detection: an empty string
    counts as ASCII, and any non-ASCII character makes the result False.
    """
    ascii_only = s.isascii()
    return ascii_only

In [None]:
# ASCII-only flag for the test frame; it is computed from response_a, not from the prompt.
df_test['isEnglish'] = df_test['response_a'].apply(isEnglish)

In [None]:
# Detected language code of each training prompt (via langdetector).
df_train['lang_'] = df_train['prompt'].apply(langdetector)
# ASCII-only flag; it is computed from response_a, not from the prompt.
df_train['isEnglish'] = df_train['response_a'].apply(isEnglish)
df_train.head()

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,space_count_a,space_count_b,...,subjectivity_a_score,subjectivity_b_score,subjectivity_score,sentiment_score_a,sentiment_score_b,sentiment_score_prompt,v_polarity_a_score,v_polarity_b_score,lang_,isEnglish
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po slovensky?,"áno, hovorím po slovensky. ako vám môžem pomôcť?","áno, veď som tu! môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak,8,36,...,0.0,0.0,0.0,-0.56,0.56,-0.56,0.0,-1.12,sl,False
1,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,you will be given a piece of news. analyze it ...,let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,gemma-2-27b-it,gemini-1.5-flash-002,Russian,195,29,...,0.046321,-0.111375,0.157696,0.66,0.42,-0.62,-1.28,-1.04,ru,False
2,0003800d510e38803efba5ceaec122bc66408fe367b0be...,"dört basamaklı, rakamları birbirinden ve sıfır...","bu soruyu çözmek için, verilen koşulları adım ...",bu problemi adım adım çözelim:\n\n1) abcd - dc...,model_a,gpt-4-0125-preview,claude-3-5-sonnet-20240620,Turkish,304,186,...,-1.0,-1.0,0.0,0.0,0.44,0.0,0.0,-0.44,tr,False
3,00072026c68f5418ef2da238394e418ce72a534b9b22d5...,"현재 추천된 탑 3 종목인 cabaletta bio (caba), rocket ph...","죄송하지만 저는 금융 조언을 제공할 수 없습니다. 저는 ai 모델이며, 투자 결정에...",현재 추천된 탑 3 종목에 순위를 매기기 위해서는 여러 가지 요소들을 고려해야 합니...,model_b,gemma-2-2b-it,llama-3.1-nemotron-70b-instruct,English,95,414,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ko,False
4,0007ce7cf6bc1b5a8f8a4669b854fb12030863c970d9dc...,please be boring,"alright, i'll be as boring as possible.\n\ntod...","understood. here is a straightforward, unadorn...",model_a,reka-flash-20240722,grok-2-2024-08-13,English,229,52,...,0.671334,0.406944,0.26439,0.59,0.36,0.0,-0.59,-0.36,en,True


In [None]:
# Ten most frequent detected prompt languages in the training frame.
print(df_train['lang_'].value_counts().head(10))

lang_
en    23467
ru     7220
zh     4782
vi     3156
de     1489
ja     1215
ko     1189
es      949
fr      791
pt      570
Name: count, dtype: int64


In [None]:
# One-hot encode the detected language into integer columns; get_dummies
# replaces the 'lang_' column, so this cell cannot be re-run without first
# recreating that column.
df_train = pd.get_dummies(df_train,columns = ['lang_'], dtype= int)

In [None]:
# Detect the prompt language for the test frame and one-hot encode it the same
# way as the training frame. Only languages present in df_test get a column here.
df_test['lang_'] = df_test['prompt'].apply(langdetector)
df_test = pd.get_dummies(df_test,columns = ['lang_'], dtype= int)

In [None]:
# Give df_test every column that appears in either frame; columns it did not
# already have are created and filled with 0.
# NOTE(review): this applies to any missing column, not only the language
# dummies, so a feature that was never computed for df_test would silently
# become all zeros - confirm that is intended. df_train itself is not reindexed.
cols = df_train.columns.union(df_test.columns)
df_test = df_test.reindex(cols, axis=1, fill_value=0)

In [None]:
# Dense columns to append to the sparse feature matrices, in this order.
features_to_add = [
    # TextBlob polarity / subjectivity values and the gaps derived from them
    'polarity_a', 'subjectivity_a', 'subjectivity_score',
    'polarity_a_score', 'polarity_b_score',
    'polarity_b', 'subjectivity_b',

    # VADER compound scores ('v_polarity_a_score' is not included)
    'sentiment_score_a', 'sentiment_score_b',
    'v_polarity_b_score',

    # ASCII-only flag and length columns
    # NOTE(review): 'len_a' / 'len_b' are not created in any cell visible here.
    'isEnglish', 'len_a', 'len_b'
]

def add_features(feature_matrix, df, features, indices=None):
    """Append DataFrame columns to the right of a feature matrix.

    Parameters
    ----------
    feature_matrix : scipy sparse matrix (or array accepted by ``hstack``)
        Existing features, one row per sample.
    df : pandas.DataFrame
        Frame that holds the columns to append.
    features : iterable of str
        Column names to append, in order.
    indices : index labels, optional
        Row labels passed to ``df.loc`` to select and order the rows. When
        ``None`` every row of ``df`` is used in its existing order.

    Returns
    -------
    The horizontally stacked matrix produced by ``hstack``. When ``features``
    is empty, ``feature_matrix`` itself is returned unchanged.
    """
    # Collect every requested column as an (n_rows, 1) block first ...
    column_blocks = [
        (df[feature] if indices is None else df.loc[indices, feature]).values.reshape(-1, 1)
        for feature in features
    ]
    if not column_blocks:
        return feature_matrix

    # ... then stack once, instead of rebuilding the whole (potentially very
    # wide) matrix after each individual column.
    return hstack([feature_matrix, *column_blocks])

# Append the columns listed in features_to_add to each feature matrix.
# The matrices are rebound to their widened versions, so running this cell a
# second time appends the same columns again.
X_train_all_features = add_features(X_train_all_features, df_train, features_to_add, idx_train)
X_val_all_features = add_features(X_val_all_features, df_train, features_to_add, idx_validation)
X_test_all_features = add_features(X_test_all_features, df_test, features_to_add)

In [None]:
def count_punct(text):
    """Return the percentage of non-space characters that are punctuation.

    Punctuation means membership in ``string.punctuation``; only the space
    character ``" "`` is excluded from the denominator. The result is rounded
    to 3 decimals. Non-string or empty input, and text made only of spaces,
    give ``0.0``.
    """
    if not (isinstance(text, str) and text):
        return 0.0

    punctuation_hits = 0
    for symbol in text:
        if symbol in string.punctuation:
            punctuation_hits += 1

    non_space_length = len(text) - text.count(" ")
    if non_space_length <= 0:
        return 0.0
    return round(punctuation_hits / non_space_length * 100, 3)


for _frame in (df_train, df_test):
    # Punctuation percentage of each response.
    _frame['punct%_a'] = _frame['response_a'].apply(count_punct)
    _frame['punct%_b'] = _frame['response_b'].apply(count_punct)
    # Difference of the two length columns (A minus B).
    _frame['len_diff'] = _frame['len_a'] - _frame['len_b']

In [None]:
def calculate_overlap(row, response_col):
    """Return the fraction of distinct prompt tokens that also occur in a response.

    Both ``row['prompt']`` and ``row[response_col]`` are converted with
    ``str()`` and split on whitespace; the comparison is exact and
    case-sensitive. A prompt with no tokens gives ``0.0``.
    """
    prompt_tokens = set(str(row['prompt']).split())
    response_tokens = set(str(row[response_col]).split())
    if not prompt_tokens:
        return 0.0
    shared_tokens = prompt_tokens.intersection(response_tokens)
    return len(shared_tokens) / len(prompt_tokens)

for _frame in (df_train, df_test):
    # Share of prompt tokens found in each response ...
    _frame['overlap_a'] = _frame.apply(calculate_overlap, axis=1, args=('response_a',))
    _frame['overlap_b'] = _frame.apply(calculate_overlap, axis=1, args=('response_b',))
    # ... and the A-minus-B difference of those shares.
    _frame['overlap_score'] = _frame['overlap_a'] - _frame['overlap_b']


In [None]:
# Second batch of dense columns: punctuation share, length difference and
# prompt/response token overlap.
features_to_add = [
    'punct%_a', 'punct%_b', 'len_diff',
    'overlap_a', 'overlap_b', 'overlap_score'
]

# Reuse the add_features helper instead of repeating its hstack loop inline.
# The matrices are rebound to their widened versions, so running this cell a
# second time appends the same columns again.
X_train_all_features = add_features(X_train_all_features, df_train, features_to_add, idx_train)
X_val_all_features = add_features(X_val_all_features, df_train, features_to_add, idx_validation)
X_test_all_features = add_features(X_test_all_features, df_test, features_to_add)

In [None]:
# Number of boosting rounds.
num = 30

# LightGBM datasets. `test_data` is built from the validation split
# (X_val_all_features / y_val) and is used as the evaluation set during training.
train_data = lgb.Dataset(X_train_all_features, label=y_train)
test_data = lgb.Dataset(X_val_all_features, label=y_val, reference=train_data)

# Binary GBDT trained on the GPU and evaluated with AUC.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    device_type='gpu',
    max_depth=7,
    metric='auc',
    num_leaves=35,
    learning_rate=0.05,
    feature_fraction=0.4,
    bagging_fraction=0.35,
    bagging_freq=2,
    verbose=1,
    lambda_l1=0.5,
    lambda_l2=0.9,
    min_data_in_leaf=20,
    max_bin=200,
    min_data_in_bin=20,
)

model = lgb.train(
    params,
    train_data,
    num_boost_round=num,
    valid_sets=[test_data],
)

[LightGBM] [Info] Number of positive: 19166, number of negative: 19585
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1712350
[LightGBM] [Info] Number of data points in the train set: 38751, number of used features: 138758
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 592 dense feature groups (21.88 MB) transferred to GPU in 0.013264 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494594 -> initscore=-0.021626
[LightGBM] [Info] Start training from score -0.021626


In [None]:
# ROC AUC of the model's predictions: training split first, then validation split.
for _labels, _matrix in (
    (y_train, X_train_all_features),
    (y_val, X_val_all_features),
):
    print(ras(_labels, model.predict(_matrix)))

0.7106100055223419
0.6582519493955067
