In [1]:
import os
import pandas as pd
import polars as pl
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold

from catboost import CatBoostClassifier, Pool

from metrics import CompetitionMetric

import warnings
warnings.filterwarnings("ignore")

SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [2]:
DATA_DIR = '../data/raw/'
df = pl.read_csv(DATA_DIR + 'train.csv')
df_demog = pl.read_csv(DATA_DIR + 'train_demographics.csv')

df_test = pl.read_csv(DATA_DIR + 'test.csv')
df_test_demog = pl.read_csv(DATA_DIR + 'test_demographics.csv')

In [3]:
def encode_label(df):
    label_encoder = LabelEncoder()
    df = df.with_columns(
        pl.Series(label_encoder.fit_transform(df['gesture'])).alias('gesture'),
    )
    np.save('gesture_classes.npy', label_encoder.classes_)
    return df
    

def preprocess_seq(df):
    # TODO: scaled = StandardScaler().fit_transform(data) Нужно ли?
    # fillna
        # TODO: Удалить последовательности с NaN. Попробовать заполнить из других последовательностей с тем же жестом
        # rot_ - 4 шт. - полностью из NaN в 50 из 8151 sequences
    # padding/truncation
    seq_cols = [col for col in df.columns if col[:4] in ('acc_', 'rot_', 'thm_', 'tof_')]
    df = df.with_columns([pl.col(seq_cols).fill_null(0)])
    return df

def preprocess_data(df):
    # label encoding
    df = df.with_columns(
        pl.col('sequence_id').map_elements(lambda x: int(x[4:]))
    )
    
    cols_to_drop = {
        'row_id', 'sequence_type', 'sequence_counter', 
        'orientation', 'behavior', 'phase'
    }
    # del THM and TOF columns
    thm_tof_cols = [col for col in df.columns if col[:4] in ('thm_', 'tof_')]
    cols_to_drop.update(thm_tof_cols)
    df = df.drop(cols_to_drop, strict=False)
    
    # join subject features
    df = df.join(df_demog, on='subject', how='left')
    
    df = preprocess_seq(df)
    return df

In [4]:
def generate_features(df):
    seq_cols = [col for col in df.columns if col[:4] in ('acc_', 'rot_')]
    
    df_featured = (
        df
        .group_by('sequence_id')
        .tail(1)
        .drop(seq_cols)
        .join(get_statistics(df, seq_cols), on='sequence_id', how='left')
    )
    # len seq (other for all seq?)
    # subject features
    return df_featured



def get_magnitude():
    pass


def get_statistics(df, seq_cols):
    """
    min, max, mean, var
    """
    df_gr = (
        df
        .select(['sequence_id'] + seq_cols)
        .group_by('sequence_id')
        .agg([
            pl.col(seq_cols).min().name.suffix('_min'),
            pl.col(seq_cols).max().name.suffix('_max'),
            pl.col(seq_cols).median().name.suffix('_median'),
            pl.col(seq_cols).mean().name.suffix('_mean'),
            pl.col(seq_cols).var().name.suffix('_var'),
            pl.col(seq_cols).last().name.suffix('_last'),
        ])
    )
    # df_gr.columns = ["_".join(filter(None, c)) for c in df_gr.columns]
    return df_gr

In [5]:
df = encode_label(df)
df = preprocess_data(df)
df = generate_features(df)
df

sequence_id,subject,gesture,adult_child,age,sex,handedness,height_cm,shoulder_to_wrist_cm,elbow_to_wrist_cm,acc_x_min,acc_y_min,acc_z_min,rot_w_min,rot_x_min,rot_y_min,rot_z_min,acc_x_max,acc_y_max,acc_z_max,rot_w_max,rot_x_max,rot_y_max,rot_z_max,acc_x_median,acc_y_median,acc_z_median,rot_w_median,rot_x_median,rot_y_median,rot_z_median,acc_x_mean,acc_y_mean,acc_z_mean,rot_w_mean,rot_x_mean,rot_y_mean,rot_z_mean,acc_x_var,acc_y_var,acc_z_var,rot_w_var,rot_x_var,rot_y_var,rot_z_var,acc_x_last,acc_y_last,acc_z_last,rot_w_last,rot_x_last,rot_y_last,rot_z_last
i64,str,i64,i64,i64,i64,i64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
40658,"""SUBJ_041770""",13,1,25,1,1,162.0,48,29.0,0.363281,-8.4375,-8.480469,0.372009,-0.812866,-0.645874,-0.477051,11.347656,0.2421875,3.394531,0.679443,-0.339539,-0.353333,-0.01239,9.3125,-4.097656,-3.328125,0.471954,-0.722961,-0.480164,-0.102356,7.228002,-3.896382,-2.204873,0.542374,-0.586768,-0.485805,-0.203203,11.378591,7.284636,16.061364,0.013539,0.038357,0.004052,0.029414,5.421875,-5.027344,-8.480469,0.423889,-0.622925,-0.645874,-0.12323
55077,"""SUBJ_028998""",6,0,15,0,0,173.0,52,25.0,-10.609375,-5.644531,-12.398438,0.001648,-0.96228,-0.34729,-0.495483,-1.109375,3.472656,-3.816406,0.297852,0.923401,0.364624,0.553223,-7.009766,1.347656,-6.382812,0.043701,-0.790741,-0.002289,0.272644,-6.469412,-0.023627,-6.275895,0.095761,-0.337,0.017942,0.140616,3.507312,10.201814,1.969708,0.009202,0.65987,0.048897,0.146167,-9.464844,-1.507812,-4.007812,0.072754,-0.851807,-0.089844,0.510925
63336,"""SUBJ_063447""",6,1,36,0,1,175.0,54,25.0,2.019531,-4.460938,-4.65625,0.132019,-0.206238,-0.849182,-0.893677,7.8828125,9.136719,12.695312,0.421082,0.079285,0.103882,-0.473572,3.285156,8.140625,-3.507812,0.213562,-0.044678,-0.823608,-0.530823,3.479703,6.355469,-1.007353,0.238968,-0.051281,-0.678036,-0.592931,1.178399,17.276686,28.464223,0.008632,0.00422,0.09808,0.020612,2.941406,8.605469,-3.699219,0.166077,0.00885,-0.808289,-0.564758
64116,"""SUBJ_047636""",14,1,26,1,1,166.0,54,25.0,-1.039062,-6.902344,-9.832031,0.138916,-0.723694,-0.728699,-0.827454,9.875,1.332031,5.457031,0.805359,-0.421143,-0.147583,0.195862,8.7890625,-2.630859,2.103516,0.353149,-0.558533,-0.412903,-0.343658,7.356615,-2.152188,-0.876953,0.35563,-0.587148,-0.412285,-0.389029,6.432375,10.331992,21.196791,0.034847,0.004867,0.037287,0.131853,8.996094,-3.722656,1.738281,0.708313,-0.470215,-0.383545,-0.360596
810,"""SUBJ_054811""",15,0,12,0,1,163.0,51,25.0,-13.960938,-1.171875,-10.679688,0.059021,-0.542603,-0.395752,-0.871033,18.183594,14.21875,10.539062,0.534119,0.749695,0.573425,0.617554,1.4765625,9.46875,-2.40625,0.444153,0.615479,0.474121,0.361084,1.985147,6.647184,0.303627,0.415249,0.386611,0.312711,0.010949,15.875945,22.050899,42.901054,0.006032,0.153549,0.078844,0.349029,-7.296875,7.597656,-5.664062,0.092407,-0.542603,-0.395752,-0.735168
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
43117,"""SUBJ_047636""",4,1,26,1,1,166.0,54,25.0,2.753906,-2.34375,-9.40625,0.167603,-0.688904,-0.962097,-0.809875,9.875,0.988281,3.65625,0.232056,-0.197266,-0.168884,0.082397,9.013672,0.832031,2.7734375,0.172882,-0.52359,-0.692047,-0.261078,6.403088,-0.507022,-2.483259,0.180064,-0.382988,-0.569904,-0.363074,8.761852,2.121454,37.293329,0.000235,0.031107,0.152139,0.185189,3.058594,-2.152344,-8.988281,0.172424,-0.205688,-0.960205,0.076843
54115,"""SUBJ_042779""",4,1,24,1,1,164.0,51,25.0,-5.515625,-3.265625,-10.425781,0.036926,-0.584473,-0.81311,-0.533081,10.070312,4.7421875,7.46875,0.756714,0.609619,0.808655,0.066345,6.699219,-0.103516,-3.910156,0.499725,-0.314606,-0.243286,-0.275116,2.210714,0.476618,-1.552679,0.477699,0.052183,0.109372,-0.255644,32.074121,6.951878,54.213926,0.063262,0.22657,0.350035,0.061918,-2.835938,4.28125,-7.363281,0.593323,-0.445435,-0.406677,-0.533081
20957,"""SUBJ_026824""",5,1,24,0,1,181.5,49,27.0,3.296875,-6.949219,-8.617188,0.022156,-0.947998,-0.144531,-0.91925,12.417969,4.2734375,8.238281,0.327393,0.78418,0.281433,0.902649,5.863281,-3.214844,-6.570312,0.114532,-0.898712,0.170898,-0.367798,6.12972,-2.696723,-3.4949,0.136493,-0.677927,0.139629,-0.331523,2.547691,3.562341,33.74003,0.005831,0.195008,0.007993,0.18908,8.011719,-3.117188,-4.636719,0.097595,-0.871582,0.184692,-0.443542
45085,"""SUBJ_021670""",3,0,10,1,1,145.0,41,22.0,-11.160156,-0.605469,-4.976562,0.0578,-0.647522,-0.650635,-0.654175,7.769531,8.433594,12.109375,0.842285,0.571289,0.6171875,0.611938,-9.333984,1.5,-1.470703,0.335663,-0.557373,0.501923,0.512756,-5.808049,1.69563,0.373955,0.428001,-0.408037,0.280169,0.246583,39.587103,3.276384,21.284893,0.047093,0.09279,0.15508,0.222068,-9.160156,1.921875,-0.914062,0.23407,0.480286,-0.535278,-0.654175


In [6]:
def get_ci_f1_hierarchical(y_true, y_pred, n_bootstraps=1000, alpha=0.05):
    """
    Возвращает нижнюю границу 95% ДИ для f1_hierarchical через бутстрапинг.
    """
    bootstrapped_scores = []
    n_samples = len(y_pred)
    
    for _ in range(n_bootstraps):
        indices = np.random.choice(n_samples, n_samples, replace=True)
        y_true_bs = y_true.iloc[indices]
        y_pred_bs = y_pred.iloc[indices]

        if len(np.unique(y_true_bs)) < 2:
            continue
        
        # Расчет метрики (только f1_hierarchical)
        f1_hier, _, _ = CompetitionMetric().calculate_f1_scores(y_true_bs, y_pred_bs)
        bootstrapped_scores.append(f1_hier)
    
    # Расчет перцентилей
    lower_bound = np.percentile(bootstrapped_scores, 100 * alpha / 2)
    return lower_bound

In [None]:
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
models = []
validation_scores = []

groups = df['subject']
X = df.drop(['sequence_id', 'subject', 'gesture'])
y = df['gesture']

for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y, groups)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = CatBoostClassifier(
        iterations=1000
    )
    train_pool = Pool(data=X_train.to_pandas(), label=y_train.to_pandas())
    val_pool = Pool(data=X_val.to_pandas(), label=y_val.to_pandas())
    model.fit(train_pool, eval_set=val_pool, verbose=100, early_stopping_rounds=100)
    models.append(model)
    
    preds = model.predict(X_val.to_pandas())
    label_classes = np.load("gesture_classes.npy", allow_pickle=True)
    val_pred_df = pd.DataFrame({"gesture": [label_classes[i] for i in preds.squeeze()]})
    val_true_df = pd.DataFrame({"gesture": [label_classes[i] for i in y_val]})

    score = get_ci_f1_hierarchical(val_true_df, val_pred_df, n_bootstraps=1000)
    validation_scores.append(score)
    print(23 * '-')
    print(f"Fold {fold} - Score: {score:.5f}")
    print(23 * '-', end='\n\n')


final_score = np.median(validation_scores)
print(f"Final Validation Score: {final_score:.5f}")
print("Fold Scores:", *[f"{score:.5f}" for score in sorted(validation_scores)])

Learning rate set to 0.5
0:	learn: 2.4777217	test: 2.4948954	best: 2.4948954 (0)	total: 109ms	remaining: 979ms
9:	learn: 1.7204850	test: 1.7670731	best: 1.7670731 (9)	total: 1.23s	remaining: 0us

bestTest = 1.767073052
bestIteration = 9

-----------------------
Fold 0 - Score: 0.61660
-----------------------

Learning rate set to 0.5
0:	learn: 2.4992594	test: 2.5511838	best: 2.5511838 (0)	total: 125ms	remaining: 1.12s
9:	learn: 1.6739126	test: 1.9834488	best: 1.9834488 (9)	total: 1.4s	remaining: 0us

bestTest = 1.983448819
bestIteration = 9

-----------------------
Fold 1 - Score: 0.54841
-----------------------

Learning rate set to 0.5
0:	learn: 2.4826313	test: 2.4921100	best: 2.4921100 (0)	total: 125ms	remaining: 1.12s
9:	learn: 1.7306504	test: 1.8773036	best: 1.8773036 (9)	total: 1.37s	remaining: 0us

bestTest = 1.877303598
bestIteration = 9

-----------------------
Fold 2 - Score: 0.56208
-----------------------

Learning rate set to 0.5
0:	learn: 2.5356050	test: 2.6397883	best: 2

In [8]:
model.get_feature_importance(data=train_pool, prettified=True)

Unnamed: 0,Feature Id,Importances
0,acc_z_last,6.275045
1,acc_z_min,6.178174
2,acc_x_last,5.769382
3,acc_x_var,5.39575
4,rot_y_mean,5.279184
5,acc_y_last,5.146997
6,acc_y_var,5.097419
7,rot_z_last,4.746402
8,rot_x_median,4.735661
9,rot_x_min,4.367337


# Test submission

In [9]:
def predict(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:
    df = preprocess_data(sequence)
    df = generate_features(df)
    X = df.drop(['sequence_id', 'subject'])
    
    aggregated_proba = None
    for model in models:
        proba = model.predict_proba(X.to_pandas())
        if aggregated_proba is None:
            aggregated_proba = proba
        else:
            aggregated_proba += proba
    
    # Выбираем класс с наибольшей суммарной вероятностью
    final_prediction = np.argmax(aggregated_proba, axis=1)
    label_classes = np.load("gesture_classes.npy", allow_pickle=True)
    return str(label_classes[final_prediction[0]]) 

In [11]:
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print("\nRunning manual test...")
    test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
    sample_seq_id = test_df['sequence_id'].unique()[0]
    test_seq = test_df[test_df['sequence_id'] == sample_seq_id]
    prediction = predict(pl.DataFrame(test_seq), None)
    print(f"Manual prediction result for sequence_id {sample_seq_id}: {prediction}")


Running manual test...
Manual prediction result for sequence_id SEQ_000001: Text on phone
