In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import tsfel
from sklearn.metrics import roc_auc_score
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [4]:
df = pd.read_parquet('train.parquet')

In [5]:
cfg = {
    "statistical": {
        "Root mean square": {
      "complexity": "constant",
      "description": "Computes root mean square of the signal.",
      "function": "tsfel.rms",
      "parameters": "",
      "n_features": 1,
      "use": "yes",
      "tag": [
        "emg",
        "inertial"
      ]
    },
        "Mean": {
      "complexity": "constant",
      "description": "Computes the mean value of the signal.",
      "function": "tsfel.calc_mean",
      "parameters": "",
      "n_features": 1,
      "use": "yes",
      "tag": "inertial"
    },
        "Max": {
      "complexity": "constant",
      "description": "Computes the maximum value of the signal.",
      "function": "tsfel.calc_max",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    },
        "Kurtosis": {
      "complexity": "constant",
      "description": "Computes kurtosis of the signal.",
      "function": "tsfel.kurtosis",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    },
        "Interquartile range": {
      "complexity": "constant",
      "description": "Computes interquartile range of the signal.",
      "function": "tsfel.interq_range",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    },
        "Spectral entropy": {
      "complexity": "log",
      "description": "Computes the spectral entropy of the signal based on Fourier transform.",
      "function": "tsfel.spectral_entropy",
      "parameters": {
        "fs": 100
      },
      "n_features": 1,
      "use": "yes",
      "tag": "eeg"
    },
        "Max power spectrum": {
      "complexity": "log",
      "description": "Computes the maximum power spectrum density.",
      "function": "tsfel.max_power_spectrum",
      "parameters": {
        "fs": 100
      },
      "n_features": 1,
      "use": "yes"
    },
        "Maximum frequency": {
      "complexity": "log",
      "description": "Computes the maximum frequency.",
      "function": "tsfel.max_frequency",
      "parameters": {
        "fs": 100
      },
      "n_features": 1,
      "use": "yes"
    },
        "Median frequency": {
      "complexity": "log",
      "description": "Computes the median frequency.",
      "function": "tsfel.median_frequency",
      "parameters": {
        "fs": 100
      },
      "n_features": 1,
      "use": "yes"
    },
        "Standard deviation": {
      "complexity": "constant",
      "description": "Computes standard deviation of the signal.",
      "function": "tsfel.calc_std",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    },
        "Variance": {
      "complexity": "constant",
      "description": "Computes variance of the signal.",
      "function": "tsfel.calc_var",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    },
        "Mean diff": {
      "complexity": "constant",
      "description": "Computes mean of differences of the signal.",
      "function": "tsfel.mean_diff",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    },
        "Neighbourhood peaks": {
      "complexity": "constant",
      "description": "Computes the number of peaks from a defined neighbourhood of the signal.",
      "function": "tsfel.neighbourhood_peaks",
      "parameters": {
        "n": 10
      },
      "n_features": 1,
      "use": "yes"
    },
        "Centroid": {
      "complexity": "constant",
      "description": "Computes the centroid along the time axis.",
      "function": "tsfel.calc_centroid",
      "parameters": {
        "fs": 100
      },
      "n_features": 1,
      "use": "yes"
    },
        "Mean absolute diff": {
      "complexity": "constant",
      "description": "Computes mean absolute differences of the signal.",
      "function": "tsfel.mean_abs_diff",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    },
        "Min": {
      "complexity": "constant",
      "description": "Computes the minimum value of the signal.",
      "function": "tsfel.calc_min",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    },
        "Peak to peak distance": {
      "complexity": "constant",
      "description": "Computes the peak to peak distance.",
      "function": "tsfel.pk_pk_distance",
      "parameters": "",
      "n_features": 1,
      "use": "yes"
    }
    }
    
}

data = df
features_list = []

for idx, row in tqdm(data.iterrows()):
    values = row["values"]

    if len(values) >= 2:
        features = tsfel.time_series_features_extractor(cfg, values, verbose=0)
        features_series = features.iloc[0] if not features.empty else pd.Series(dtype='float64')
        features_list.append(features_series)
    else:
        features_list.append(pd.Series(dtype='float64'))

features_df = pd.DataFrame(features_list).reset_index(drop=True)


result = pd.concat([data.drop(columns=["values", "dates"]).reset_index(drop=True), features_df], axis=1)

df = df.merge(result,on='id', how ='left')
df_exploded = df.explode(['dates', 'values'], ignore_index=True)
n_lags = 4
df_exploded['values'] = df_exploded['values'].fillna(0)
for lag in range(1, n_lags + 1):
    df_exploded[f'lag_{lag}'] = df_exploded.groupby('id')['values'].shift(lag)

df_exploded = df_exploded.fillna(0)
df_exploded['dates'] = df_exploded['dates'].astype(str)
df_exploded[['year', 'month', 'day']] = df_exploded['dates'].str.split('-', expand=True)
df_exploded['year'] = df_exploded['year'].astype(int)
df_exploded['month'] = df_exploded['month'].astype(int)
df_exploded['day'] = df_exploded['day'].astype(int)
df_exploded.drop('dates',axis=1 ,inplace = True)
df_exploded.drop(columns = ['label_y'],inplace = True)
df_exploded.rename(columns = {'label_x':'label'},inplace = True)

80000it [05:30, 242.18it/s]


In [6]:
def predict_and_aggregate(model, pred_probs, X_test_ids, threshold=0.5):
    predictions_df = pd.DataFrame({'id': X_test_ids, 'pred_prob': pred_probs})
    final_predictions = predictions_df.groupby('id', sort=False)['pred_prob'].mean().reset_index()
    final_predictions['final_prediction'] = (final_predictions['pred_prob'] >= threshold).astype(int)
    
    return final_predictions

def calculate_roc_auc(model, y_test,pred_probs, X_test_ids):
    predictions_df = pd.DataFrame({'id': X_test_ids, 'true_label': y_test, 'pred_prob': pred_probs})
    aggregated_preds = predictions_df.groupby('id').agg({'pred_prob': 'mean', 'true_label': 'first'})
    roc_auc = roc_auc_score(aggregated_preds['true_label'], aggregated_preds['pred_prob'])
    
    return roc_auc


In [7]:
def split_by_id(df, id_column='id', test_size=0.2, random_state=42):
    unique_ids = df[id_column].unique()
    train_ids, val_ids = train_test_split(unique_ids, test_size=test_size, random_state=random_state)
    train_df = df[df[id_column].isin(train_ids)]
    val_df = df[df[id_column].isin(test_ids)]
    X_train,y_train = train_df.drop(columns = ['id','label']),train_df['label']
    X_val,y_val = val_df.drop(columns = ['id','label']),val_df['label']
    
    return X_train,y_train, X_val,y_val,val_df['id'],val_ids



def objective(trial, X_train, y_train, X_val, y_val, X_val_ids,val_ids_set):
    params = {
    'iterations': trial.suggest_int('iterations', 100, 6000, step=100),
    'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.03, log=True),
    'depth': trial.suggest_int('depth', 4, 12),
    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-4, 1e-1, log=True),
    'random_strength': trial.suggest_float('random_strength', 1e-4, 1e-1, log=True),
    'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
    'border_count': trial.suggest_int('border_count', 32, 255),
    'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 10),
    'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 20),
    'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])
}

    model = CatBoostClassifier(**params, verbose=0,random_state =42)
    model.fit(X_train, y_train)

    pred_probs = model.predict_proba(X_val)[:, 1]
    preds_df = predict_and_aggregate(model, pred_probs, X_val_ids)

    score = calculate_roc_auc(model,y_val,pred_probs,X_val_ids)

    return score


In [None]:
X_train,y_train, X_val,y_val,X_val_ids,val_ids_set = split_by_id(df_exploded)

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, X_val_ids,val_ids_set), n_trials=100)

print("Лучшие параметры:", study.best_params)
print("Лучшее значение ROC-AUC:", study.best_value)

In [8]:
X_train,y_train = df_exploded.drop(columns = ['id','label']),df_exploded['label']

In [12]:
best_params = study.best_params
model = CatBoostClassifier(**best_params)
model.fit(X_train,y_train)

0:	learn: 0.6761882	total: 566ms	remaining: 30m 11s
1:	learn: 0.6595714	total: 960ms	remaining: 25m 34s
2:	learn: 0.6451136	total: 1.35s	remaining: 23m 59s
3:	learn: 0.6309570	total: 1.75s	remaining: 23m 18s
4:	learn: 0.6177446	total: 2.13s	remaining: 22m 43s
5:	learn: 0.6065664	total: 2.5s	remaining: 22m 11s
6:	learn: 0.5951939	total: 2.89s	remaining: 21m 58s
7:	learn: 0.5847769	total: 3.26s	remaining: 21m 42s
8:	learn: 0.5757497	total: 3.58s	remaining: 21m 9s
9:	learn: 0.5664398	total: 3.93s	remaining: 20m 55s
10:	learn: 0.5580314	total: 4.33s	remaining: 20m 56s
11:	learn: 0.5503833	total: 4.72s	remaining: 20m 53s
12:	learn: 0.5433254	total: 5.08s	remaining: 20m 45s
13:	learn: 0.5361535	total: 5.45s	remaining: 20m 39s
14:	learn: 0.5299378	total: 5.77s	remaining: 20m 25s
15:	learn: 0.5239886	total: 6.1s	remaining: 20m 14s
16:	learn: 0.5186190	total: 6.44s	remaining: 20m 5s
17:	learn: 0.5134089	total: 6.81s	remaining: 20m 4s
18:	learn: 0.5081518	total: 7.17s	remaining: 20m 1s
19:	learn

<catboost.core.CatBoostClassifier at 0x24f40cd8a60>

In [13]:
model.save_model('best_model_by_optuna.cbm')