In [1]:
import numpy as np
import pandas as pd
import tsfel
from tqdm import tqdm
from catboost import CatBoostClassifier
import json
import warnings
from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore")

In [2]:
# Load the test set from a local parquet file; later cells read its 'id', 'dates' and 'values' columns.
test_data = pd.read_parquet('test.parquet')

In [3]:
def _feature(function, description, complexity="constant", parameters="", tag=None):
    """Build one TSFEL feature entry.

    Every feature configured in this notebook is enabled ("use": "yes") and
    declares a single output value ("n_features": 1). The "tag" key is only
    added when a tag is supplied.
    """
    entry = {
        "complexity": complexity,
        "description": description,
        "function": function,
        "parameters": parameters,
        "n_features": 1,
        "use": "yes",
    }
    if tag is not None:
        entry["tag"] = tag
    return entry


# TSFEL feature configuration passed to the extractor below.
# All features are registered under the single "statistical" key, including the
# frequency-based ones; those carry their own "fs" value in "parameters".
cfg = {
    "statistical": {
        "Root mean square": _feature(
            "tsfel.rms",
            "Computes root mean square of the signal.",
            tag=["emg", "inertial"],
        ),
        "Mean": _feature(
            "tsfel.calc_mean",
            "Computes the mean value of the signal.",
            tag="inertial",
        ),
        "Max": _feature(
            "tsfel.calc_max",
            "Computes the maximum value of the signal.",
        ),
        "Kurtosis": _feature(
            "tsfel.kurtosis",
            "Computes kurtosis of the signal.",
        ),
        "Interquartile range": _feature(
            "tsfel.interq_range",
            "Computes interquartile range of the signal.",
        ),
        "Spectral entropy": _feature(
            "tsfel.spectral_entropy",
            "Computes the spectral entropy of the signal based on Fourier transform.",
            complexity="log",
            parameters={"fs": 100},
            tag="eeg",
        ),
        "Max power spectrum": _feature(
            "tsfel.max_power_spectrum",
            "Computes the maximum power spectrum density.",
            complexity="log",
            parameters={"fs": 100},
        ),
        "Maximum frequency": _feature(
            "tsfel.max_frequency",
            "Computes the maximum frequency.",
            complexity="log",
            parameters={"fs": 100},
        ),
        "Median frequency": _feature(
            "tsfel.median_frequency",
            "Computes the median frequency.",
            complexity="log",
            parameters={"fs": 100},
        ),
        "Standard deviation": _feature(
            "tsfel.calc_std",
            "Computes standard deviation of the signal.",
        ),
        "Variance": _feature(
            "tsfel.calc_var",
            "Computes variance of the signal.",
        ),
        "Mean diff": _feature(
            "tsfel.mean_diff",
            "Computes mean of differences of the signal.",
        ),
        "Neighbourhood peaks": _feature(
            "tsfel.neighbourhood_peaks",
            "Computes the number of peaks from a defined neighbourhood of the signal.",
            parameters={"n": 10},
        ),
        "Centroid": _feature(
            "tsfel.calc_centroid",
            "Computes the centroid along the time axis.",
            parameters={"fs": 100},
        ),
        "Mean absolute diff": _feature(
            "tsfel.mean_abs_diff",
            "Computes mean absolute differences of the signal.",
        ),
        "Min": _feature(
            "tsfel.calc_min",
            "Computes the minimum value of the signal.",
        ),
        "Peak to peak distance": _feature(
            "tsfel.pk_pk_distance",
            "Computes the peak to peak distance.",
        ),
    }
}

# Build one row of TSFEL features per row of the test set, then attach those
# features to the non-series columns of the input frame.
data = test_data

# Series with fewer observations than this are not sent to TSFEL; they get an
# empty Series, which becomes an all-NaN row in the feature table.
MIN_SERIES_LENGTH = 2

features_list = []

# Iterate the 'values' column directly: iterrows() would build a full row Series
# just to read this one field. `total` lets tqdm show a percentage and an ETA.
for values in tqdm(data["values"], total=len(data)):
    # A missing series (None / NaN scalar) has no len(); treat it like a
    # too-short series instead of aborting the whole extraction with a TypeError.
    if hasattr(values, "__len__") and len(values) >= MIN_SERIES_LENGTH:
        features = tsfel.time_series_features_extractor(cfg, values, verbose=0)
        features_series = features.iloc[0] if not features.empty else pd.Series(dtype='float64')
    else:
        features_series = pd.Series(dtype='float64')
    features_list.append(features_series)

# One row per input row, in input order, with a fresh 0..n-1 index.
features_df = pd.DataFrame(features_list).reset_index(drop=True)

# Align positionally: both sides carry a fresh 0..n-1 index before the column-wise concat.
result = pd.concat([data.drop(columns=["values", "dates"]).reset_index(drop=True), features_df], axis=1)

0it [00:00, ?it/s]

20000it [01:11, 281.13it/s]


In [4]:
# Attach the per-id feature columns to the raw rows (left join keeps every row of test_data).
# NOTE(review): re-running this cell merges the features a second time; run it once per kernel.
test_data = test_data.merge(right=result, how='left', on='id')

In [5]:
# Unpack the parallel 'dates' / 'values' lists into one row per element,
# renumbering the rows from 0.
df_exploded = test_data.explode(column=['dates', 'values'], ignore_index=True)

In [6]:
# Number of previous observations exposed as lag_1 .. lag_n columns.
n_lags = 4

# Missing observations are replaced by 0 before the lags are derived from them.
df_exploded['values'] = df_exploded['values'].fillna(0)

# Each lag is the 'values' column shifted down within its own id, so a series
# never borrows observations from the previous id.
# NOTE(review): assumes rows within an id are already in time order - confirm.
lag_columns = {
    f'lag_{step}': df_exploded.groupby('id')['values'].shift(step)
    for step in range(1, n_lags + 1)
}

# The first rows of every id have no history; those gaps (and any other
# remaining NaN in the frame) are filled with 0.
df_exploded = df_exploded.assign(**lag_columns).fillna(0)


In [7]:
# Split the textual form of each date on '-' into exactly three integer columns
# and drop the original 'dates' column.
# NOTE(review): assumes every date renders as 'YYYY-MM-DD' - confirm against the data.
date_parts = df_exploded['dates'].astype(str).str.split('-', expand=True)
df_exploded[['year', 'month', 'day']] = date_parts.astype(int)
df_exploded = df_exploded.drop(columns='dates')

In [8]:
def predict_and_aggregate(pred_probs, X_test_ids):
    """Average row-level probabilities into one probability per id.

    Parameters
    ----------
    pred_probs : array-like
        One predicted probability per row.
    X_test_ids : array-like
        The id each row belongs to, in the same order as ``pred_probs``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``pred_prob`` (the mean per id), with ids listed in
        order of first appearance.
    """
    row_level = pd.DataFrame({'id': X_test_ids, 'pred_prob': pred_probs})

    # sort=False keeps the ids in the order they first appear in the input.
    grouped = row_level.groupby('id', sort=False)
    mean_by_id = grouped['pred_prob'].mean()

    return mean_by_id.reset_index()
def calculate_roc_auc(model, y_test,pred_probs, X_test_ids):
    """Compute ROC AUC on id-level predictions.

    Row-level probabilities are averaged per id, and the label of an id is the
    first label seen for it.

    Parameters
    ----------
    model
        Not used by this function; kept in the signature for existing callers.
    y_test : array-like
        One true label per row.
    pred_probs : array-like
        One predicted probability per row.
    X_test_ids : array-like
        The id each row belongs to.

    Returns
    -------
    float
        ROC AUC of the per-id mean probability against the per-id label.
    """
    row_level = pd.DataFrame({'id': X_test_ids, 'true_label': y_test, 'pred_prob': pred_probs})

    per_id = row_level.groupby('id').agg(
        pred_prob=('pred_prob', 'mean'),
        true_label=('true_label', 'first'),
    )

    return roc_auc_score(per_id['true_label'], per_id['pred_prob'])

In [9]:
# Keep the ids aside for the per-id aggregation; the model sees every other column.
X_test_ids = df_exploded['id']
X_test = df_exploded.drop(columns=['id'])

In [10]:
# Restore the classifier from the saved model file 'best_model_by_optuna.cbm'
# (expected in the working directory).
model = CatBoostClassifier()
model.load_model('best_model_by_optuna.cbm')

<catboost.core.CatBoostClassifier at 0x1ee19d82aa0>

In [11]:
# Row-level class probabilities; column 1 is kept
# (presumably the positive class - confirm against model.classes_).
class_probs = model.predict_proba(X_test)
pred_probs = class_probs[:, 1]

# Collapse the row-level probabilities into a single mean probability per id.
final_prediction = predict_and_aggregate(pred_probs=pred_probs, X_test_ids=X_test_ids)

In [12]:
# The output file uses 'score' as the name of the prediction column.
final_prediction = final_prediction.rename({'pred_prob': 'score'}, axis='columns')

In [13]:
# Write the per-id scores to 'submission.csv' without the DataFrame index column.
final_prediction.to_csv('submission.csv', index=False)