In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from textwrap import shorten

from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 200)
sns.set_theme(style="whitegrid", context="notebook")

In [None]:
# Data Loading
train_df = pd.read_parquet("../dataset_raw/train_split.parquet")
val_df   = pd.read_parquet("../dataset_raw/val_split.parquet")
test_df  = pd.read_parquet("../dataset_raw/test_split.parquet")

splits = {
    "train": train_df,
    "validation": val_df,
    "test": test_df,
}

In [4]:
# Quick Glimpse at Each Split
for name, df in splits.items():
    display(
        df.head(3).assign(
            prompt=lambda d: d["prompt"].apply(lambda x: shorten(str(x), width=80, placeholder="…")),
            response_a=lambda d: d["response_a"].apply(lambda x: shorten(str(x), width=80, placeholder="…")),
            response_b=lambda d: d["response_b"].apply(lambda x: shorten(str(x), width=80, placeholder="…")),
        )
    )
    print(f"{name.capitalize()} split preview above.\n")

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
28966,98aa6a2c252836e0f5fb4412f56e711b9a4d77df15be083f7bcb42b9de96f925,c++如何实现快速幂,```cpp #include <iostream> using namespace std; // 快速幂算法 long long…,在C++中，实现快速幂（也称为快速幂取模）可以通过二分法来优化幂运算的效率，尤其是在处理大数时非常有用。快速幂算法将幂运算的时间复杂度从 \(O(n)\)…,model_b,gemma-2-27b-it,grok-2-2024-08-13,Chinese
36858,c2ba18250bee661dfbe70ab08f6160c79d9e2ac2448a161454191d2a1bd7650d,"Case reports (entirely fictional and only for research, educational and…",### Evaluation of Theodosian Surgery for Fictional Patients The Theodosian…,### Easy Cases #### Case 1: 40-year-old Female with Macromastia and Grade III…,model_b,gpt-4o-mini-2024-07-18,glm-4-plus,English
44109,e8d9b9e71fe263f4b3d6639faf701c259061366033955f7e8a9433aa3cce35e3,команды терминала для открытия файлов на мак,"На Mac вы можете открыть файлы из терминала, используя команду `open`.…",На MacOS используются различные команды через терминал для работы с файлами.…,model_a,gemma-2-9b-it,internlm2_5-20b-chat,Russian


Train split preview above.



Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
9,000fcb4fc8718b5a25d15978dd9979ee8601209910a58c03742e880114f0f75c,Я хочу сказать другу свое окончательное решение и всё. вот такое сообщение ч…,"Чтобы clearly указать другу, что ты не планируешь возобновлять общение в скором…","Можно добавить в конец сообщения фразу вроде: ""Пожалуйста, не жди от меня…",model_a,yi-lightning,claude-3-5-sonnet-20241022,Russian
10,001124f25944df2b460f156fd61e053250e0dc6f11b86b4c2431edeaa0e85a8e,Формальная аксиоматическая теория называется полной… a) если в ней доказуема…,"Правильный вариант: **e)** если существует алгоритм, который для любой формулы…",d) если добавление любой невыводимой формулы в качестве схемы аксиом приводит к…,model_b,yi-lightning,mistral-large-2407,Russian
16,001512469f667ea71d4e9a0fce82568e9149fd40f09bcb0070d9b9e84f8aefc6,"Здача 4 go: ``` сделать ревью кода: package main import ( ""fmt"" ""sync"" ) func…",Код имеет несколько проблем: 1. Мьютекс (`sync.Mutex`) объявляется внутри…,Заявление 4 по кодультюрной автомобильной textron базе. Соблюдение всех.rules:…,model_a,llama-3.1-405b-instruct-bf16,llama-3.2-3b-instruct,English


Validation split preview above.



Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
13422,46fac9c6c12c9de9734fc93028160dcadbf907e5bb3d6d1249d307ffc85c71f7,"Рассмотрение статуса, социальных функций и ролей, целей деятельности и ценност…","Текст, который вы привели, представляет собой глубокий анализ структуры…","Этот текст рассматривает сложную взаимосвязь между личностью и обществом,…",model_a,qwen2.5-72b-instruct,gemini-1.5-pro-002,Russian
26385,8b1d1890a0977e014878424947104bd7d06e5c4c2bb48a477cd1ad44166300aa,"Développez un concept pour un personnage parodique à double nature, incarnant…",Voici une liste d'idées créatives et attrayantes basées sur les propositions du…,### 1. **Épisodes Potentiels Mettant en Vedette la Double Nature du…,model_b,yi-lightning,yi-lightning-lite,French
8418,2c9130de41e5780658c4889d02919091642510f0e64b3b461c0e3693f44c32d3,whats spamming and carding in ethical hacking,"In the context of ethical hacking, **spamming** and **carding** are two…","In ethical hacking, spamming and carding represent different types of malicious…",model_b,grok-2-2024-08-13,gemini-1.5-flash-8b-exp-0827,English


Test split preview above.



In [5]:
# Split-Level Dimensions & Integrity Checks
def summarize_split(name: str, df: pd.DataFrame) -> dict:
    total_cells = df.size
    null_cells = df.isna().sum().sum()
    return {
        "split": name,
        "rows": len(df),
        "columns": df.shape[1],
        "null_cells": null_cells,
        "null_pct": round((null_cells / total_cells) * 100, 4),
        "duplicate_rows": df.duplicated().sum(),
        "duplicate_ids": df["id"].duplicated().sum() if "id" in df.columns else np.nan,
    }

split_summary = pd.DataFrame([summarize_split(name, df) for name, df in splits.items()])
display(split_summary)

Unnamed: 0,split,rows,columns,null_cells,null_pct,duplicate_rows,duplicate_ids
0,train,38751,8,0,0.0,0,0
1,validation,4844,8,0,0.0,0,0
2,test,4844,8,0,0.0,0,0


In [6]:
# Column Schema & Cardinality
schema_profile = (
    train_df.dtypes.rename("dtype")
    .to_frame()
    .assign(
        non_null=lambda df_: train_df.notna().sum(),
        unique_values=lambda df_: train_df.nunique(dropna=True),
        example_value=lambda df_: train_df.iloc[0].apply(lambda x: shorten(str(x), width=60, placeholder="…"))
    )
)

display(schema_profile)

Unnamed: 0,dtype,non_null,unique_values,example_value
id,object,38751,38751,…
prompt,object,38751,36037,c++如何实现快速幂
response_a,object,38751,38670,```cpp #include <iostream> using namespace std; // 快速幂算法…
response_b,object,38751,38679,…
winner,object,38751,2,model_b
model_a,object,38751,60,gemma-2-27b-it
model_b,object,38751,60,grok-2-2024-08-13
language,object,38751,127,Chinese


In [7]:
# Missing Values Detailed View (Train Split)
missing_profile = (
    train_df.isna()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
    .to_frame(name="missing_pct")
)

display(missing_profile)

Unnamed: 0,missing_pct
id,0.0
prompt,0.0
response_a,0.0
response_b,0.0
winner,0.0
model_a,0.0
model_b,0.0
language,0.0


- The model should be able to handle as many languages as possible.

In [9]:
# Language Distribution (Counts & Proportions)
lang_counts = train_df["language"].value_counts(dropna=False)
lang_percent = train_df["language"].value_counts(normalize=True, dropna=False).mul(100)

language_distribution = (
    pd.concat([lang_counts, lang_percent], axis=1, keys=["count", "percent"])
    .sort_values("count", ascending=False)
)

display(language_distribution)
# Top 10 language
display(language_distribution.head(10))

Unnamed: 0_level_0,count,percent
language,Unnamed: 1_level_1,Unnamed: 2_level_1
English,20187,52.094140
Russian,5178,13.362236
Chinese,3470,8.954608
Vietnamese,2496,6.441124
German,1112,2.869603
...,...,...
Haitian Creole,1,0.002581
Pashto,1,0.002581
Lingala,1,0.002581
Telugu,1,0.002581


Unnamed: 0_level_0,count,percent
language,Unnamed: 1_level_1,Unnamed: 2_level_1
English,20187,52.09414
Russian,5178,13.362236
Chinese,3470,8.954608
Vietnamese,2496,6.441124
German,1112,2.869603
Japanese,917,2.366391
unknown,883,2.278651
Korean,846,2.183169
Spanish,619,1.597378
French,516,1.331579


- The number of Positive/Negative samples should be balanced

In [11]:
# Winner Label Distribution
winner_counts = train_df["winner"].value_counts(dropna=False)
winner_percent = train_df["winner"].value_counts(normalize=True, dropna=False).mul(100)

winner_profile = pd.concat(
    [winner_counts.rename("count"), winner_percent.rename("percent")],
    axis=1
)

display(winner_profile)

Unnamed: 0_level_0,count,percent
winner,Unnamed: 1_level_1,Unnamed: 2_level_1
model_b,19610,50.605146
model_a,19141,49.394854


- The model needs to handle most text length

In [12]:
# Character Length Comparison
def compute_length_features(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(
        prompt_len_chars=df["prompt"].str.len(),
        response_a_len_chars=df["response_a"].str.len(),
        response_b_len_chars=df["response_b"].str.len(),
    )

train_with_lengths = compute_length_features(train_df)

length_summary = train_with_lengths[["prompt_len_chars", "response_a_len_chars", "response_b_len_chars"]].describe()
display(length_summary)

Unnamed: 0,prompt_len_chars,response_a_len_chars,response_b_len_chars
count,38751.0,38751.0,38751.0
mean,883.470956,2011.545921,2014.260226
std,2698.449069,1899.833875,1940.689674
min,1.0,1.0,1.0
25%,58.0,723.0,723.0
50%,142.0,1596.0,1603.0
75%,482.0,2792.0,2803.0
max,30000.0,36651.0,101944.0


- ### Split Mixed Data into SFT and RLHF Dataset

In [13]:
RANDOM_SEED = 42
SFT_SHARE = 0.60  # 60% of prompts for SFT (QLoRA)
RLHF_SHARE = 0.40  # 40% of prompts for DPO

In [None]:
def maybe_stratify(series: pd.Series):
    """
    Return a stratify vector only if every class has ≥2 samples (required by sklearn).
    Otherwise, return None to fall back to an unstratified split.
    """
    if series.isna().any():
        series = series.fillna("unknown")

    return series if series.value_counts().min() >= 2 else None