In [1]:
import os
import pandas as pd
import polars as pl
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold

from catboost import CatBoostClassifier, Pool

from metrics import CompetitionMetric

import warnings
warnings.filterwarnings("ignore")

SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [None]:
DATA_DIR = '../data/raw/'
df = pl.read_csv(DATA_DIR + 'train.csv')
df_demog = pl.read_csv(DATA_DIR + 'train_demographics.csv')

df_test = pl.read_csv(DATA_DIR + 'test.csv')
df_test_demog = pl.read_csv(DATA_DIR + 'test_demographics.csv')

In [None]:
def encode_label(df):
    label_encoder = LabelEncoder()
    df = df.with_columns(
        pl.Series(label_encoder.fit_transform(df['gesture'])).alias('gesture'),
    )
    np.save('gesture_classes.npy', label_encoder.classes_)
    return df
    

def preprocess_seq(df):
    # TODO: scaled = StandardScaler().fit_transform(data) Нужно ли?
    # fillna
        # TODO: Удалить последовательности с NaN. Попробовать заполнить из других последовательностей с тем же жестом
        # rot_ - 4 шт. - полностью из NaN в 50 из 8151 sequences
    # padding/truncation
    seq_cols = [col for col in df.columns if col[:4] in ('acc_', 'rot_', 'thm_', 'tof_')]
    df = df.with_columns([pl.col(seq_cols).fill_null(0)])
    return df

def preprocess_data(df):
    # label encoding
    df = df.with_columns(
        pl.col('sequence_id').map_elements(lambda x: int(x[4:]))
    )
    
    cols_to_drop = {
        'row_id', 'sequence_type', 'sequence_counter', 
        'orientation', 'behavior', 'phase'
    }
    # del THM and TOF columns
    thm_tof_cols = [col for col in df.columns if col[:4] in ('thm_', 'tof_')]
    cols_to_drop.update(thm_tof_cols)
    df = df.drop(cols_to_drop, strict=False)
    
    # join subject features
    df = df.join(df_demog, on='subject', how='left')
    
    df = preprocess_seq(df)
    return df

In [5]:
def generate_features(df):
    seq_cols = [col for col in df.columns if col[:4] in ('acc_', 'rot_')]
    
    df_featured = (
        df
        .group_by('sequence_id')
        .tail(1)
        .drop(seq_cols)
        .join(get_statistics(df, seq_cols), on='sequence_id', how='left')
    )
    # len seq (other for all seq?)
    # subject features
    return df_featured



def get_magnitude():
    pass


def get_statistics(df, seq_cols):
    """
    min, max, mean, var
    """
    df_gr = (
        df
        .select(['sequence_id'] + seq_cols)
        .group_by('sequence_id')
        .agg([
            pl.col(seq_cols).min().name.suffix('_min'),
            pl.col(seq_cols).max().name.suffix('_max'),
            pl.col(seq_cols).median().name.suffix('_median'),
            pl.col(seq_cols).mean().name.suffix('_mean'),
            pl.col(seq_cols).var().name.suffix('_var'),
            pl.col(seq_cols).last().name.suffix('_last'),
        ])
    )
    # df_gr.columns = ["_".join(filter(None, c)) for c in df_gr.columns]
    return df_gr

In [None]:
df = encode_label(df)
df = preprocess_data(df)
df = generate_features(df)
df

sequence_id,subject,gesture,adult_child,age,sex,handedness,height_cm,shoulder_to_wrist_cm,elbow_to_wrist_cm,acc_x_min,acc_y_min,acc_z_min,rot_w_min,rot_x_min,rot_y_min,rot_z_min,acc_x_max,acc_y_max,acc_z_max,rot_w_max,rot_x_max,rot_y_max,rot_z_max,acc_x_median,acc_y_median,acc_z_median,rot_w_median,rot_x_median,rot_y_median,rot_z_median,acc_x_mean,acc_y_mean,acc_z_mean,rot_w_mean,rot_x_mean,rot_y_mean,rot_z_mean,acc_x_var,acc_y_var,acc_z_var,rot_w_var,rot_x_var,rot_y_var,rot_z_var,acc_x_last,acc_y_last,acc_z_last,rot_w_last,rot_x_last,rot_y_last,rot_z_last
i64,str,i64,i64,i64,i64,i64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
44570,"""SUBJ_063464""",6,0,15,0,1,162.0,43,23.0,4.277344,-3.550781,3.113281,0.401062,-0.298889,-0.517029,-0.791565,7.2578125,7.675781,10.582031,0.540588,0.031555,0.020508,-0.721313,4.886719,6.580078,5.71875,0.486511,-0.057526,-0.470642,-0.74765,5.143521,5.453125,5.96774,0.477672,-0.076289,-0.413392,-0.750101,0.405109,9.89183,1.870903,0.00133,0.008467,0.022766,0.000482,5.6171875,7.058594,4.566406,0.451477,-0.120911,-0.471191,-0.748047
55065,"""SUBJ_040106""",10,0,12,0,1,167.0,50,28.0,-0.667969,-4.539062,-3.953125,0.268005,-0.169434,-0.756653,-0.738953,5.191406,10.632812,12.636719,0.681946,0.435608,0.045471,-0.414124,1.6875,9.386719,0.4140625,0.485779,0.373932,-0.590088,-0.524323,1.910854,6.97998,1.901088,0.512557,0.272562,-0.492322,-0.552588,1.620299,22.160962,21.686835,0.009644,0.041185,0.057026,0.009498,0.328125,8.644531,-0.734375,0.514526,0.417908,-0.616821,-0.424377
31863,"""SUBJ_030676""",4,0,10,0,1,151.0,46,21.0,-13.277344,0.3125,-4.667969,0.049133,-0.708557,-0.363159,-0.634521,0.285156,7.515625,10.925781,0.930969,0.651001,0.586182,0.605286,-8.742188,0.964844,-1.892578,0.358185,-0.574158,0.391541,0.478638,-6.673549,1.113793,1.631045,0.496059,-0.375885,0.325763,0.200213,14.912124,1.266763,35.427943,0.087502,0.136525,0.052104,0.195916,-11.015625,1.269531,-3.519531,0.336365,-0.582703,0.586182,0.451355
38145,"""SUBJ_053906""",0,0,12,1,1,163.0,51,24.0,-7.253906,-4.46875,-13.851562,0.031799,-0.523254,-0.917664,-0.888916,11.832031,6.6015625,8.09375,0.444519,0.35376,0.964294,0.424988,-4.53125,2.425781,-7.648438,0.266052,0.24115,0.902222,0.060669,-1.899433,2.1214,-5.408701,0.240396,0.107004,0.67485,-0.098301,31.729629,6.889113,28.78423,0.015093,0.088963,0.233903,0.137022,-5.601562,0.816406,-6.765625,0.279968,0.313232,0.902039,0.098999
23869,"""SUBJ_019262""",0,0,12,0,1,165.0,48,25.0,-10.386719,-4.613281,-10.84375,0.056519,-0.944763,0.267517,0.055664,-1.226562,7.375,6.738281,0.361572,-0.175171,0.441528,0.818909,-3.21875,-2.046875,-8.990234,0.169373,-0.935272,0.286194,0.121399,-3.439535,-1.468262,-7.449463,0.177385,-0.87052,0.305116,0.182471,2.655926,6.609342,20.88666,0.003797,0.038984,0.002025,0.041327,-3.179688,-2.929688,-8.894531,0.168701,-0.936584,0.271606,0.143494
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
41679,"""SUBJ_059960""",15,1,31,1,1,166.0,49,23.0,-16.15625,-5.652344,-17.339844,0.048157,-0.532593,-0.780518,-0.36615,12.496094,7.675781,4.5,0.760193,0.712891,0.73938,0.223145,-4.359375,1.28125,-5.117188,0.418457,0.596069,0.639465,-0.184875,-2.09972,0.589254,-4.046801,0.461851,0.289224,0.339731,-0.158239,60.983489,21.949701,34.643969,0.036923,0.275075,0.240252,0.021156,-4.363281,-0.597656,-1.171875,0.373962,-0.509766,-0.6828,-0.36615
6678,"""SUBJ_053217""",3,0,15,1,1,173.0,55,27.0,4.113281,-1.574219,-0.394531,0.080505,-0.501526,-0.452515,-0.908325,11.011719,6.5078125,9.644531,0.36438,-0.220398,-0.000122,-0.74762,7.8984375,5.0546875,2.9765625,0.133179,-0.43927,-0.409607,-0.790161,7.490767,3.925639,4.172869,0.178024,-0.396018,-0.326511,-0.811158,2.871393,7.704015,8.629497,0.009823,0.008285,0.026685,0.002967,7.785156,5.589844,2.707031,0.123596,-0.42865,-0.408875,-0.796082
41944,"""SUBJ_042794""",7,1,35,1,1,177.0,55,25.0,-5.492188,-4.414062,-9.277344,0.030762,-0.820129,-0.546936,-0.511414,5.886719,7.9609375,12.019531,0.825745,0.773682,0.550537,0.23999,-3.769531,-2.382812,-8.089844,0.238281,-0.800964,0.534424,0.091492,-2.863361,-1.785156,-6.1875,0.278847,-0.657732,0.408321,0.025297,8.956574,6.231367,30.665971,0.030084,0.15192,0.104571,0.042404,-3.769531,-3.109375,-8.4375,0.047607,-0.820129,0.547729,0.15863
20037,"""SUBJ_016552""",17,1,43,1,1,161.0,51,23.0,2.472656,-9.089844,-5.972656,0.46991,-0.832825,-0.323547,-0.661987,8.21875,-5.714844,6.4765625,0.654297,-0.39563,0.107178,-0.0672,4.3125,-7.214844,-4.515625,0.498047,-0.817749,-0.263306,-0.092651,4.333147,-7.363919,-3.338329,0.519868,-0.765366,-0.228821,-0.167762,1.002709,0.568568,12.468527,0.002554,0.017217,0.012797,0.032203,5.8828125,-7.285156,-4.019531,0.488708,-0.816772,-0.276245,-0.133301


In [8]:
def get_ci_f1_hierarchical(y_true, y_pred, n_bootstraps=1000, alpha=0.05):
    """
    Возвращает нижнюю границу 95% ДИ для f1_hierarchical через бутстрапинг.
    """
    bootstrapped_scores = []
    n_samples = len(y_pred)
    
    for _ in range(n_bootstraps):
        indices = np.random.choice(n_samples, n_samples, replace=True)
        y_true_bs = y_true.iloc[indices]
        y_pred_bs = y_pred.iloc[indices]

        if len(np.unique(y_true_bs)) < 2:
            continue
        
        # Расчет метрики (только f1_hierarchical)
        f1_hier, _, _ = CompetitionMetric().calculate_f1_scores(y_true_bs, y_pred_bs)
        bootstrapped_scores.append(f1_hier)
    
    # Расчет перцентилей
    lower_bound = np.percentile(bootstrapped_scores, 100 * alpha / 2)
    return lower_bound

In [12]:
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)

groups = df['subject']
X = df.drop(['sequence_id', 'subject', 'gesture'])
y = df['gesture']

for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y, groups)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    model = CatBoostClassifier()
    train_pool = Pool(data=X_train.to_pandas(), label=y_train.to_pandas())
    val_pool = Pool(data=X_val.to_pandas(), label=y_val.to_pandas())
    model.fit(train_pool, eval_set=val_pool, verbose=100, early_stopping_rounds=100)
    preds = model.predict(X_val.to_pandas())

    label_classes = np.load("gesture_classes.npy", allow_pickle=True)
    val_pred_df = pd.DataFrame({"gesture": [label_classes[i] for i in preds.squeeze()]})
    val_true_df = pd.DataFrame({"gesture": [label_classes[i] for i in y_val]})

    score = get_ci_f1_hierarchical(val_true_df, val_pred_df)
    print(f"Fold {fold} - Score: {score:.5f}")
    
    break

Learning rate set to 0.112137
0:	learn: 2.7975997	test: 2.7896118	best: 2.7896118 (0)	total: 113ms	remaining: 1m 52s
100:	learn: 1.3642306	test: 1.5132301	best: 1.5132301 (100)	total: 14.1s	remaining: 2m 5s
200:	learn: 1.0849398	test: 1.4058848	best: 1.4058848 (200)	total: 28.8s	remaining: 1m 54s
300:	learn: 0.9038200	test: 1.3624457	best: 1.3624457 (300)	total: 44s	remaining: 1m 42s
400:	learn: 0.7592360	test: 1.3402188	best: 1.3396899 (398)	total: 59.6s	remaining: 1m 29s
500:	learn: 0.6499319	test: 1.3211142	best: 1.3210969 (499)	total: 1m 13s	remaining: 1m 13s
600:	learn: 0.5621561	test: 1.3139027	best: 1.3137589 (588)	total: 1m 28s	remaining: 59.1s
700:	learn: 0.4891500	test: 1.3081182	best: 1.3074998 (686)	total: 1m 46s	remaining: 45.6s
800:	learn: 0.4279971	test: 1.3019045	best: 1.3016263 (798)	total: 2m	remaining: 29.9s
900:	learn: 0.3764022	test: 1.3008887	best: 1.3001265 (842)	total: 2m 16s	remaining: 15s
999:	learn: 0.3348356	test: 1.2974744	best: 1.2971539 (998)	total: 2m 31

In [11]:
model.get_feature_importance(data=train_pool, prettified=True)

Unnamed: 0,Feature Id,Importances
0,acc_x_var,5.053665
1,acc_z_min,4.819306
2,acc_y_max,3.607899
3,acc_y_var,3.584101
4,acc_x_min,3.417204
5,acc_y_last,3.348117
6,acc_y_min,3.085295
7,rot_y_median,2.889351
8,acc_x_max,2.817354
9,rot_x_mean,2.78581
