In [10]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold

from catboost import CatBoostClassifier

from metrics import CompetitionMetric

import warnings
warnings.filterwarnings("ignore")

SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [11]:
DATA_DIR = '../data/raw/'
df = pd.read_csv(DATA_DIR + 'train.csv')
df_demog = pd.read_csv(DATA_DIR + 'train_demographics.csv')

df_test = pd.read_csv(DATA_DIR + 'test.csv')
df_test_demog = pd.read_csv(DATA_DIR + 'test_demographics.csv')

In [12]:
def preprocess_seq(df):
    # TODO: scaled = StandardScaler().fit_transform(data) Нужно ли?
    # fillna
        # TODO: Удалить последовательности с NaN. Попробовать заполнить из других последовательностей с тем же жестом
        # rot_ - 4 шт. - полностью из NaN в 50 из 8151 sequences
    # padding/truncation
    seq_cols = [col for col in df.columns if col[:4] in ('acc_', 'rot_', 'thm_', 'tof_')]
    df[seq_cols] = df[seq_cols].fillna(0)
    return df

def preprocess_data(df):
    # label encoding
    label_encoder = LabelEncoder()
    df['gesture'] = label_encoder.fit_transform(df['gesture'].astype(str))
    np.save('gesture_classes.npy', label_encoder.classes_)
    
    df['sequence_id'] = df['sequence_id'].apply(lambda x: int(x[4:]))
    
    cols_to_drop = {
        'row_id', 'sequence_type', 'sequence_counter', 
        'orientation', 'behavior', 'phase'
    }
    # del THM and TOF columns
    thm_tof_cols = [col for col in df.columns if col[:4] in ('thm_', 'tof_')]
    cols_to_drop.update(thm_tof_cols)
    df = df.drop(columns=cols_to_drop)
    
    # join subject features
    df = df.join(df_demog.set_index('subject'), on='subject', how='left')
    
    df = preprocess_seq(df)
    return df

In [13]:
df = preprocess_data(df)
df

Unnamed: 0,sequence_id,subject,gesture,acc_x,acc_y,acc_z,rot_w,rot_x,rot_y,rot_z,adult_child,age,sex,handedness,height_cm,shoulder_to_wrist_cm,elbow_to_wrist_cm
0,7,SUBJ_059520,1,6.683594,6.214844,3.355469,0.134399,-0.355164,-0.447327,-0.809753,0,12,1,1,163.0,52,24.0
1,7,SUBJ_059520,1,6.949219,6.214844,3.125000,0.143494,-0.340271,-0.428650,-0.824524,0,12,1,1,163.0,52,24.0
2,7,SUBJ_059520,1,5.722656,5.410156,5.421875,0.219055,-0.274231,-0.356934,-0.865662,0,12,1,1,163.0,52,24.0
3,7,SUBJ_059520,1,6.601562,3.531250,6.457031,0.297546,-0.264160,-0.238159,-0.885986,0,12,1,1,163.0,52,24.0
4,7,SUBJ_059520,1,5.566406,0.277344,9.632812,0.333557,-0.218628,-0.063538,-0.914856,0,12,1,1,163.0,52,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574940,65531,SUBJ_039498,17,3.503906,-0.433594,-8.441406,0.106628,-0.862488,-0.470825,-0.151733,1,30,0,1,186.0,55,30.0
574941,65531,SUBJ_039498,17,3.773438,-0.664062,-9.207031,0.110596,-0.865417,-0.460327,-0.164185,1,30,0,1,186.0,55,30.0
574942,65531,SUBJ_039498,17,3.082031,0.218750,-7.402344,0.113159,-0.864258,-0.461182,-0.166138,1,30,0,1,186.0,55,30.0
574943,65531,SUBJ_039498,17,3.964844,-0.359375,-9.085938,0.117493,-0.866760,-0.450623,-0.178467,1,30,0,1,186.0,55,30.0


In [14]:
def generate_features(df):
    seq_cols = [col for col in df.columns if col[:4] in ('acc_', 'rot_')]
    
    df_featured = df.groupby('sequence_id').tail(1).reset_index(drop=True)
    df_featured.drop(columns=seq_cols, inplace=True)
    
    df_featured = df_featured.join(get_statistics(df, seq_cols), on='sequence_id', how='left')
    # len seq
    return df_featured



def get_magnitude():
    pass


def get_statistics(df, seq_cols):
    """
    min, max, mean, var
    """
    df_gr = df.groupby('sequence_id', as_index=False)[seq_cols].agg(['min', 'max', 'mean', 'var', 'last'])
    df_gr.columns = ["_".join(c) for c in df_gr.columns]
    return df_gr
    
    
    
    
    


In [15]:
df = generate_features(df)
df

Unnamed: 0,sequence_id,subject,gesture,adult_child,age,sex,handedness,height_cm,shoulder_to_wrist_cm,elbow_to_wrist_cm,...,rot_y_min,rot_y_max,rot_y_mean,rot_y_var,rot_y_last,rot_z_min,rot_z_max,rot_z_mean,rot_z_var,rot_z_last
0,7,SUBJ_059520,1,0,12,1,1,163.0,52,24.0,...,-0.411804,0.406067,0.272682,0.014177,0.270935,-0.206238,0.364563,0.275940,0.010817,0.364563
1,8,SUBJ_020948,6,1,24,1,1,173.0,49,26.0,...,-0.240173,0.170471,-0.050159,0.026180,-0.158325,-0.864258,0.621033,-0.716918,0.044884,-0.655518
2,13,SUBJ_040282,1,0,12,1,1,157.0,44,26.0,...,-0.590637,0.175781,-0.419606,0.074193,-0.585144,-0.944458,0.165161,-0.160796,0.150562,-0.062805
3,16,SUBJ_052342,17,0,13,0,1,171.0,54,26.0,...,-0.522583,0.309998,-0.153520,0.044949,-0.388672,-0.855469,0.923401,0.085844,0.327535,-0.285217
4,18,SUBJ_032165,6,0,13,0,1,165.0,52,23.0,...,-0.727478,0.202026,-0.465187,0.140415,-0.705078,-0.715332,-0.296692,-0.455389,0.022436,-0.338440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8146,65508,SUBJ_027682,14,1,52,1,1,170.0,52,23.0,...,,,,,,,,,,
8147,65519,SUBJ_050642,10,1,31,0,1,184.0,56,28.0,...,,,,,,,,,,
8148,65522,SUBJ_040282,0,0,12,1,1,157.0,44,26.0,...,,,,,,,,,,
8149,65526,SUBJ_063447,1,1,36,0,1,175.0,54,25.0,...,,,,,,,,,,


In [16]:
def get_ci_f1_hierarchical(y_true, y_pred, n_bootstraps=1000, alpha=0.05):
    """
    Возвращает нижнюю границу 95% ДИ для f1_hierarchical через бутстрапинг.
    """
    bootstrapped_scores = []
    n_samples = len(y_pred)
    
    for _ in range(n_bootstraps):
        indices = np.random.choice(n_samples, n_samples, replace=True)
        y_true_bs = y_true.iloc[indices]
        y_pred_bs = y_pred.iloc[indices]

        if len(np.unique(y_true_bs)) < 2:
            continue
        
        # Расчет метрики (только f1_hierarchical)
        f1_hier, _, _ = CompetitionMetric().calculate_f1_scores(y_true_bs, y_pred_bs)
        bootstrapped_scores.append(f1_hier)
    
    # Расчет перцентилей
    lower_bound = np.percentile(bootstrapped_scores, 100 * alpha / 2)
    return lower_bound

In [17]:
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)

groups = df['subject']
X = df.drop(columns=['sequence_id', 'subject', 'gesture'])
y = df['gesture']

for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y, groups)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = CatBoostClassifier()
    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100, early_stopping_rounds=100)
    preds = model.predict(X_val)

    label_classes = np.load("gesture_classes.npy", allow_pickle=True)
    val_pred_df = pd.DataFrame({"gesture": [label_classes[i] for i in preds.squeeze()]})
    val_true_df = pd.DataFrame({"gesture": [label_classes[i] for i in y_val]})

    score = get_ci_f1_hierarchical(val_true_df, val_pred_df)
    print(f"Fold {fold} - Score: {score:.5f}")
    
    break

Learning rate set to 0.112137
0:	learn: 2.8660704	test: 2.8688561	best: 2.8688561 (0)	total: 160ms	remaining: 2m 39s
100:	learn: 2.6009555	test: 2.7691288	best: 2.7654991 (44)	total: 13.5s	remaining: 2m
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.765499149
bestIteration = 44

Shrink model to first 45 iterations.
Fold 0 - Score: 0.40969
