# Klasyfikacja szeregów czasowych dla małych danych

In [None]:
#!pip install tsfresh
#!pip install scipy==1.8.0
#!pip install dask[dataframe]
#!pip install --upgrade xgboost==1.6.0
#!pip install --upgrade scikit-learn==1.0.2

In [None]:
from copy import deepcopy
import os
from typing import Dict, List, Tuple, TypeVar, Union

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
import tsfresh
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.transformers import RelevantFeatureAugmenter, FeatureAugmenter, FeatureSelector
from xgboost import XGBClassifier




In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class Dataset:
    def __init__(self, dirpath: str, condition_dir_name: str = "condition"):
        condition_dirpath = os.path.join(dirpath, condition_dir_name)
        control_dirpath = os.path.join(dirpath, "control")

        self.condition: List[pd.DataFrame] = [
            pd.read_csv(os.path.join(condition_dirpath, file))
            for file in os.listdir(condition_dirpath)
        ]

        self.control: List[pd.DataFrame] = [
            pd.read_csv(os.path.join(control_dirpath, file))
            for file in os.listdir(control_dirpath)
        ]

def variance_thresholding(X_train, X_test, threshold: float):

    scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
    X_train_scaled = scaler.fit_transform(X_train)

    thresholder = VarianceThreshold(threshold=threshold)
    thresholder.fit(X_train_scaled)

    if isinstance(X_train, np.ndarray):
        X_train = thresholder.transform(X_train)
    elif isinstance(X_train, pd.DataFrame):
        X_train = X_train.loc[:, thresholder.variances_ >= threshold]

    if isinstance(X_test, np.ndarray):
        X_test = thresholder.transform(X_test)
    elif isinstance(X_test, pd.DataFrame):
        X_test = X_test.loc[:, thresholder.variances_ >= threshold]

    return X_train, X_test


def standardize(X_train, X_test):

    scaler = StandardScaler()
    scaler.fit(X_train)

    if isinstance(X_train, np.ndarray):
        X_train = scaler.transform(X_train)
    elif isinstance(X_train, pd.DataFrame):
        X_train = pd.DataFrame(
            data=scaler.transform(X_train),
            index=X_train.index,
            columns=X_train.columns
        )

    if isinstance(X_test, np.ndarray):
        X_test = scaler.transform(X_test)
    elif isinstance(X_test, pd.DataFrame):
        X_test = pd.DataFrame(
            data=scaler.transform(X_test),
            index=X_test.index,
            columns=X_test.columns
        )

    return X_train, X_test


def mcc(y_true, y_pred):

    if len(y_true) == 1:
        return y_true == y_pred

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    numerator = tp * tn - fp * fn
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    if np.isclose(denominator, 0):
        denominator = 1

    return numerator / denominator


def calculate_metrics(clf, X_test, y_test):

    y_pred = clf.predict(X_test)

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred, zero_division=1),
        "precision": precision_score(y_test, y_pred, zero_division=1),
        "recall": recall_score(y_test, y_pred, pos_label=1, zero_division=1),
        "specificity": recall_score(y_test, y_pred, pos_label=0, zero_division=1),
        "ROC_AUC": roc_auc_score(y_test, y_pred),
        "MCC": mcc(y_test, y_pred)
    }

    return metrics


def calculate_metrics_statistics(metrics):

    results = {}
    metrics_names = metrics[0].keys()

    for metric in metrics_names:
        values = [fold_metrics[metric] for fold_metrics in metrics]
        mean = np.mean(values)
        stddev = np.std(values)
        results[metric] = mean, stddev

    return results


In [None]:
PROCESSED_DATA_DIR = "/content/drive/MyDrive/Projekt/processed_data"

## Ekstrakcja cech

In [None]:
def basic_data_cleaning(data):

    data = [df.copy() for df in data]

    for df in data:
        df.columns = df.columns.str.lower()

        df["timestamp"] = pd.to_datetime(df["timestamp"],
                                         format="%Y-%m-%d %H:%M:%S")
        if "date" in df.columns:
            df.drop("date", axis=1, inplace=True)
        df["activity"] = df["activity"].astype(np.float32)

    return data


def get_day_part(df, part):

    if part == "day":
        df = df.loc[(df["timestamp"].dt.hour >= 8) &
                    (df["timestamp"].dt.hour < 21)]
    elif part == "night":
        df = df.loc[(df["timestamp"].dt.hour >= 21) |
                    (df["timestamp"].dt.hour < 8)]

    return df


def fill_missing_activity(df):

    df = df.copy()
    df = df.resample("min", on="timestamp").mean()
    df = df.reset_index()
    df["activity"] = df["activity"].fillna(df["activity"].mean())

    return df


def resample(df, freq: str = "H"):

    df = df.copy()
    df = fill_missing_activity(df)
    df = df.resample(freq, on="timestamp").mean()
    df = df.reset_index()

    return df


def get_clean_dataframes(dfs, freq: str = "H"):

    full_dfs = basic_data_cleaning(dfs)
    full_dfs = [fill_missing_activity(df) for df in full_dfs]
    full_dfs = [resample(df, freq=freq) for df in full_dfs]

    night_dfs = [get_day_part(df, part="night") for df in full_dfs]
    day_dfs = [get_day_part(df, part="day") for df in full_dfs]

    datasets = {
        "full_24h": full_dfs,
        "night": night_dfs,
        "day": day_dfs
    }

    return datasets


def get_tsfresh_flat_format_df(dfs):

    dfs = deepcopy(dfs)

    flat_df_list = []

    for idx, df in enumerate(dfs):
        df["id"] = idx
        flat_df_list.append(df)

    flat_df = pd.concat(flat_df_list, ignore_index=True)

    return flat_df

In [None]:
def extract_tsfresh_features(dfs: List[pd.DataFrame], settings: Dict) \
        -> pd.DataFrame:

    ts = get_tsfresh_flat_format_df(dfs)
    ids = ts["id"].unique()
    X = pd.DataFrame(index=ids)

    augmenter = FeatureAugmenter(
        default_fc_parameters=settings,
        column_id="id",
        column_sort="timestamp",
        column_value="activity",
        chunksize=1,
        n_jobs=4
    )

    augmenter.set_timeseries_container(ts)
    X = augmenter.transform(X)

    return X


class TsfreshTopNFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Selects top N features using tsfresh feature selector.
    """
    def __init__(self, n: int = 10):
        self.n: int = n
        self.features: List[int] = None

    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if not isinstance(y, pd.Series):
            y = pd.Series(y)

        relevance_table = calculate_relevance_table(X, y)
        relevance_table.sort_values("p_value", inplace=True)
        features = relevance_table.head(self.n)["feature"]
        self.features = list(features.values)

    def transform(self, X, y=None):
        return X[:, self.features]

### Psykose

In [None]:
dataset_str = "psykose"

In [None]:
dataset = Dataset(dirpath=os.path.join("/content/drive/MyDrive/Projekt/data", dataset_str))
condition = dataset.condition
control = dataset.control

In [None]:
condition_parts_dfs = get_clean_dataframes(condition, freq="min")
control_parts_dfs = get_clean_dataframes(control, freq="min")

datasets = {}

for part in ["full_24h", "night", "day"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]

    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [None]:
settings_dict = {"minimal": MinimalFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 54/54 [00:00<00:00, 66.17it/s]
Feature Extraction: 100%|██████████| 54/54 [00:00<00:00, 105.48it/s]
Feature Extraction: 100%|██████████| 54/54 [00:00<00:00, 104.94it/s]


In [None]:
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",
        random_state=0,
        solver="saga",
        max_iter=5000
    ),
    "RF": RandomForestClassifier(
        n_estimators=500,
        criterion="entropy"
    ),
    "XGB": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=0
    )
}


param_grids = {
    "LR": {
        "C": [0.1, 1, 10, 100],
        "class_weight": ["balanced"],
        "l1_ratio": [0, 0.2, 0.4, 0.6, 0.8, 1]
    },
    "RF": {
        "n_estimators": [50, 100, 200],
        "max_depth": [10, 20, 50],
        "class_weight": ["balanced"],
        "criterion": ["gini", "entropy"]
    },
    "XGB": {
        "n_estimators": [50, 100],
        "max_depth": [3, 5],
        "learning_rate": [0.01, 0.1]
    }
}

In [None]:
# Minimalne ustawienia

for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["LR", "RF"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: full_24h
  LR
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8166666666666667, 'f1': 0.8000000000000002, 'precision': 0.8, 'recall': 0.8, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.8166666666666667, 'MCC': 0.6333333333333333}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7333333333333334, 'f1': 0.7272727272727272, 'precision': 0.6666666666666666, 'recall': 0.8, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.7333333333333334, 'MCC': 0.4666666666666667}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8035714285714286, 'f1': 0.75, 'precision': 0.75, 'recall': 0.75, 'specificity': 0.8571428571428571, 'ROC_AUC': 0.8035714285714286, 'MCC': 0.6071428571428571}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8571428571428572, 'f1': 0.8, 'precision': 0.6666666666666666, 'recall': 1.0, 'specificity': 0.7142857142857143, 'ROC_AUC': 0.8571428571428572, 'MCC': 0.6900655593423543}
{'accuracy': 1.0, 'balanced_accuracy': 1.0, 'f1': 1.0, 'precision': 1.0, 

In [None]:
# Minimalne ustawienia

for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: full_24h
  XGB
{'accuracy': 0.9090909090909091, 'balanced_accuracy': 0.9, 'f1': 0.888888888888889, 'precision': 1.0, 'recall': 0.8, 'specificity': 1.0, 'ROC_AUC': 0.9, 'MCC': 0.8280786712108251}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7166666666666667, 'f1': 0.6666666666666665, 'precision': 0.75, 'recall': 0.6, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.7166666666666668, 'MCC': 0.4485426135725303}
{'accuracy': 0.6363636363636364, 'balanced_accuracy': 0.6607142857142857, 'f1': 0.6, 'precision': 0.5, 'recall': 0.75, 'specificity': 0.5714285714285714, 'ROC_AUC': 0.6607142857142857, 'MCC': 0.3105295017040594}
{'accuracy': 0.9090909090909091, 'balanced_accuracy': 0.875, 'f1': 0.8571428571428571, 'precision': 1.0, 'recall': 0.75, 'specificity': 1.0, 'ROC_AUC': 0.875, 'MCC': 0.8100925873009825}
{'accuracy': 0.9, 'balanced_accuracy': 0.875, 'f1': 0.8571428571428571, 'precision': 1.0, 'recall': 0.75, 'specificity': 1.0, 'ROC_AUC': 0.875, 'MCC': 0.8017837257372732}
   

In [None]:
# Efektywne ustawienia - N = 10

top_n = 10

In [None]:
for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_efficient_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["LR", "RF"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)

            selector = TsfreshTopNFeatureSelector(n=top_n)
            selector.fit(X_train, y_train)
            X_train, X_test = selector.transform(X_train), selector.transform(X_test)

            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: full_24h
  LR
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8166666666666667, 'f1': 0.8000000000000002, 'precision': 0.8, 'recall': 0.8, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.8166666666666667, 'MCC': 0.6333333333333333}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8166666666666667, 'f1': 0.8000000000000002, 'precision': 0.8, 'recall': 0.8, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.8166666666666667, 'MCC': 0.6333333333333333}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8571428571428572, 'f1': 0.8, 'precision': 0.6666666666666666, 'recall': 1.0, 'specificity': 0.7142857142857143, 'ROC_AUC': 0.8571428571428572, 'MCC': 0.6900655593423543}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8571428571428572, 'f1': 0.8, 'precision': 0.6666666666666666, 'recall': 1.0, 'specificity': 0.7142857142857143, 'ROC_AUC': 0.8571428571428572, 'MCC': 0.6900655593423543}
{'accuracy': 1.0, 'balanced_accuracy': 1.0, 'f1': 1.0, 'precision': 1.0, 're

In [None]:
for part in ["day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_efficient_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)

            selector = TsfreshTopNFeatureSelector(n=top_n)
            selector.fit(X_train, y_train)
            X_train, X_test = selector.transform(X_train), selector.transform(X_test)

            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: day
  XGB
{'accuracy': 1.0, 'balanced_accuracy': 1.0, 'f1': 1.0, 'precision': 1.0, 'recall': 1.0, 'specificity': 1.0, 'ROC_AUC': 1.0, 'MCC': 1.0}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8166666666666667, 'f1': 0.8000000000000002, 'precision': 0.8, 'recall': 0.8, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.8166666666666667, 'MCC': 0.6333333333333333}
{'accuracy': 0.9090909090909091, 'balanced_accuracy': 0.9285714285714286, 'f1': 0.888888888888889, 'precision': 0.8, 'recall': 1.0, 'specificity': 0.8571428571428571, 'ROC_AUC': 0.9285714285714286, 'MCC': 0.8280786712108251}
{'accuracy': 0.9090909090909091, 'balanced_accuracy': 0.9285714285714286, 'f1': 0.888888888888889, 'precision': 0.8, 'recall': 1.0, 'specificity': 0.8571428571428571, 'ROC_AUC': 0.9285714285714286, 'MCC': 0.8280786712108251}
{'accuracy': 0.9, 'balanced_accuracy': 0.875, 'f1': 0.8571428571428571, 'precision': 1.0, 'recall': 0.75, 'specificity': 1.0, 'ROC_AUC': 0.875, 'MCC': 0.8017837257372732}
 

### Depresjon

In [None]:
dataset_str = "depresjon"

In [None]:
dataset = Dataset(dirpath=os.path.join("/content/drive/MyDrive/Projekt/data", dataset_str))
condition = dataset.condition
control = dataset.control

In [None]:
condition_parts_dfs = get_clean_dataframes(condition, freq="min")
control_parts_dfs = get_clean_dataframes(control, freq="min")

datasets = {}

for part in ["full_24h", "night", "day"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]

    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [None]:
settings_dict = {"minimal": MinimalFCParameters(),
                 "efficient": EfficientFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 55/55 [00:00<00:00, 102.24it/s]
Feature Extraction: 100%|██████████| 55/55 [12:06<00:00, 13.21s/it]
Feature Extraction: 100%|██████████| 55/55 [00:00<00:00, 221.09it/s]
Feature Extraction: 100%|██████████| 55/55 [03:37<00:00,  3.95s/it]
Feature Extraction: 100%|██████████| 55/55 [00:00<00:00, 190.43it/s]
Feature Extraction: 100%|██████████| 55/55 [04:44<00:00,  5.16s/it]


In [None]:
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",
        random_state=0,
        solver="saga",
        max_iter=5000
    ),
    "RF": RandomForestClassifier(
        n_estimators=500,
        criterion="entropy"
    ),
    "XGB": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=0
    )
}


param_grids = {
    "LR": {
        "C": [0.01, 0.1, 1, 10, 100, 1000],
        "class_weight": [None, "balanced"],
        "l1_ratio": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    },
    "RF": {
        "n_estimators": [50, 100, 200],
        "max_depth": [10, 20, 50],
        "class_weight": [None, "balanced"]
    },
    "XGB": {
        "n_estimators": [50, 100],
        "max_depth": [3, 5],
        "learning_rate": [0.01, 0.1]
    }
}

In [None]:
for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["LR", "RF"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: full_24h
  LR
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8166666666666667, 'f1': 0.8000000000000002, 'precision': 0.8, 'recall': 0.8, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.8166666666666667, 'MCC': 0.6333333333333333}
{'accuracy': 0.6363636363636364, 'balanced_accuracy': 0.6333333333333333, 'f1': 0.6, 'precision': 0.6, 'recall': 0.6, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.6333333333333334, 'MCC': 0.26666666666666666}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7333333333333334, 'f1': 0.7272727272727272, 'precision': 0.6666666666666666, 'recall': 0.8, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.7333333333333334, 'MCC': 0.4666666666666667}
{'accuracy': 0.6363636363636364, 'balanced_accuracy': 0.6071428571428572, 'f1': 0.5, 'precision': 0.5, 'recall': 0.5, 'specificity': 0.7142857142857143, 'ROC_AUC': 0.6071428571428571, 'MCC': 0.21428571428571427}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.75, 'f1': 0.6666666666666666, 'pr

In [None]:
for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: full_24h
  XGB
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7333333333333334, 'f1': 0.7272727272727272, 'precision': 0.6666666666666666, 'recall': 0.8, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.7333333333333334, 'MCC': 0.4666666666666667}
{'accuracy': 0.9090909090909091, 'balanced_accuracy': 0.9166666666666667, 'f1': 0.9090909090909091, 'precision': 0.8333333333333334, 'recall': 1.0, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.9166666666666667, 'MCC': 0.8333333333333334}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.75, 'f1': 0.7692307692307693, 'precision': 0.625, 'recall': 1.0, 'specificity': 0.5, 'ROC_AUC': 0.75, 'MCC': 0.5590169943749475}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.75, 'f1': 0.6666666666666666, 'precision': 1.0, 'recall': 0.5, 'specificity': 1.0, 'ROC_AUC': 0.75, 'MCC': 0.6236095644623235}
{'accuracy': 0.6363636363636364, 'balanced_accuracy': 0.6071428571428572, 'f1': 0.5, 'precision': 0.5, 'recall': 0.5, 'specificit

In [None]:
top_n = 10

In [None]:
for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_efficient_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["LR", "RF"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)

            selector = TsfreshTopNFeatureSelector(n=top_n)
            selector.fit(X_train, y_train)
            X_train, X_test = selector.transform(X_train), selector.transform(X_test)

            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: full_24h
  LR
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8, 'f1': 0.7499999999999999, 'precision': 1.0, 'recall': 0.6, 'specificity': 1.0, 'ROC_AUC': 0.8, 'MCC': 0.6708203932499369}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7166666666666667, 'f1': 0.6666666666666665, 'precision': 0.75, 'recall': 0.6, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.7166666666666668, 'MCC': 0.4485426135725303}
{'accuracy': 0.5454545454545454, 'balanced_accuracy': 0.5333333333333333, 'f1': 0.4444444444444445, 'precision': 0.5, 'recall': 0.4, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.5333333333333333, 'MCC': 0.06900655593423542}
{'accuracy': 0.6363636363636364, 'balanced_accuracy': 0.5535714285714286, 'f1': 0.3333333333333333, 'precision': 0.5, 'recall': 0.25, 'specificity': 0.8571428571428571, 'ROC_AUC': 0.5535714285714287, 'MCC': 0.1336306209562122}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.75, 'f1': 0.6666666666666666, 'precision': 1.0, 'recall': 0.5,

In [None]:
for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_efficient_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)

            selector = TsfreshTopNFeatureSelector(n=top_n)
            selector.fit(X_train, y_train)
            X_train, X_test = selector.transform(X_train), selector.transform(X_test)

            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: full_24h
  XGB
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8166666666666667, 'f1': 0.8000000000000002, 'precision': 0.8, 'recall': 0.8, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.8166666666666667, 'MCC': 0.6333333333333333}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7, 'f1': 0.5714285714285715, 'precision': 1.0, 'recall': 0.4, 'specificity': 1.0, 'ROC_AUC': 0.7, 'MCC': 0.5163977794943223}
{'accuracy': 0.6363636363636364, 'balanced_accuracy': 0.65, 'f1': 0.6666666666666666, 'precision': 0.5714285714285714, 'recall': 0.8, 'specificity': 0.5, 'ROC_AUC': 0.65, 'MCC': 0.3105295017040594}
{'accuracy': 0.5454545454545454, 'balanced_accuracy': 0.5357142857142857, 'f1': 0.4444444444444445, 'precision': 0.4, 'recall': 0.5, 'specificity': 0.5714285714285714, 'ROC_AUC': 0.5357142857142857, 'MCC': 0.06900655593423542}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.75, 'f1': 0.6666666666666666, 'precision': 1.0, 'recall': 0.5, 'specificity': 1.0, 'ROC_AUC

### Hyperaktiv

In [None]:
PROCESSED_DATA_DIR = "/kaggle/working/"

In [None]:
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",
        random_state=0,
        solver="saga",
        max_iter=5000
    ),
    "RF": RandomForestClassifier(
        n_estimators=500,
        criterion="entropy"
    ),
    "XGB": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=0
    )
}


param_grids = {
    "LR": {
        "C": [0.01, 0.1, 1, 10, 100, 1000],
        "class_weight": [None, "balanced"],
        "l1_ratio": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    },
    "RF": {
        "class_weight": ["balanced"]
    },
    "XGB": {
        "n_estimators": [50, 100],
        "max_depth": [3, 5],
        "learning_rate": [0.01, 0.1]
    }
}

In [None]:
class Dataset:
    def __init__(self, dirpath: str, condition_dir_name: str = "condition"):
        condition_dirpath = os.path.join(dirpath, condition_dir_name)
        control_dirpath = os.path.join(dirpath, "control")

        self.condition: List[pd.DataFrame] = [
            pd.read_csv(os.path.join(condition_dirpath, file))
            for file in os.listdir(condition_dirpath)
        ]

        self.control: List[pd.DataFrame] = [
            pd.read_csv(os.path.join(control_dirpath, file))
            for file in os.listdir(control_dirpath)
        ]

In [None]:
def basic_data_cleaning(data):

    data = [df.copy() for df in data]

    for i, df in enumerate(data):
        df.columns = df.columns.str.lower()

        if 'timestamp' in df.columns:
            df["timestamp"] = pd.to_datetime(df["timestamp"], errors='coerce', dayfirst=True)

        if 'date' in df.columns:
            df.drop("date", axis=1, inplace=True)

        if 'activity' in df.columns:
            df["activity"] = df["activity"].astype(np.float32)

    return data


def get_clean_dataframes(dfs, freq: str = "H"):

    for df in dfs:

        if 'TIMESTAMP' in df.columns and 'ACTIVITY' in df.columns:
            df.rename(columns={'TIMESTAMP': 'timestamp', 'ACTIVITY': 'activity'}, inplace=True)

    full_dfs = basic_data_cleaning(dfs)

    for df in full_dfs:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors='coerce', dayfirst=True)

    full_dfs = [fill_missing_activity(df) for df in full_dfs]
    full_dfs = [resample(df, freq=freq) for df in full_dfs]

    night_dfs = [get_day_part(df, part="night") for df in full_dfs]
    day_dfs = [get_day_part(df, part="day") for df in full_dfs]

    datasets = {
        "full_24h": full_dfs,
        "night": night_dfs,
        "day": day_dfs
    }

    return datasets

def split_and_clean_columns(data):
    data = [df.copy() for df in data]

    for i, df in enumerate(data):
        if 'TIMESTAMP;ACTIVITY' in df.columns:

            df[['timestamp', 'activity']] = df['TIMESTAMP;ACTIVITY'].str.split(';', expand=True)
            df.drop(columns=['TIMESTAMP;ACTIVITY'], inplace=True)

        df.columns = df.columns.str.lower()

        if 'activity' in df.columns:
            df['activity'] = df['activity'].astype(np.float32)

        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', dayfirst=True)

        data[i] = df

    return data

In [None]:
dataset_str = "hyperaktiv3"

In [None]:
dataset = Dataset(dirpath=os.path.join("/kaggle/input/dataset1/data", dataset_str))
condition = split_and_clean_columns(dataset.condition)
control = split_and_clean_columns(dataset.control)

In [None]:
condition_parts_dfs = get_clean_dataframes(condition, freq="min")
control_parts_dfs = get_clean_dataframes(control, freq="min")

datasets = {}

for part in ["full_24h", "night", "day"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]

    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join('/kaggle/input/dataset1', f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [None]:
settings_dict = {"minimal": MinimalFCParameters(),
                 "efficient": EfficientFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 85/85 [00:01<00:00, 80.38it/s] 
Feature Extraction: 100%|██████████| 85/85 [56:52<00:00, 40.14s/it]   
Feature Extraction: 100%|██████████| 85/85 [00:00<00:00, 155.80it/s]
Feature Extraction: 100%|██████████| 85/85 [16:23<00:00, 11.57s/it]
Feature Extraction: 100%|██████████| 85/85 [00:00<00:00, 116.94it/s]
Feature Extraction: 100%|██████████| 85/85 [21:07<00:00, 14.92s/it]


In [3]:
for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join('/kaggle/input/dataset2', filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join('/kaggle/input/dataset1', f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["LR", "RF", "XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: full_24h
  LR
{'accuracy': 0.7882352941176471, 'balanced_accuracy': 0.7833333333333333, 'f1': 0.831578947368421, 'precision': 0.8, 'recall': 0.8666666666666666, 'specificity': 0.7, 'ROC_AUC': 0.7833333333333333, 'MCC': 0.3690308509457033}
{'accuracy': 0.7294117647058824, 'balanced_accuracy': 0.7138888888888888, 'f1': 0.8363636363636364, 'precision': 0.7384615384615384, 'recall': 0.9777777777777778, 'specificity': 0.45, 'ROC_AUC': 0.7138888888888888, 'MCC': 0.43268602252303067}
{'accuracy': 0.7294117647058824, 'balanced_accuracy': 0.7, 'f1': 0.8923076923076924, 'precision': 0.7294117647058824, 'recall': 1.0, 'specificity': 0.2, 'ROC_AUC': 0.7, 'MCC': 0.4}
{'accuracy': 0.7294117647058824, 'balanced_accuracy': 0.7, 'f1': 0.8923076923076924, 'precision': 0.7294117647058824, 'recall': 1.0, 'specificity': 0.2, 'ROC_AUC': 0.7, 'MCC': 0.4}
{'accuracy': 0.7294117647058824, 'balanced_accuracy': 0.7, 'f1': 0.8923076923076924, 'precision': 0.7294117647058824, 'recall': 1.0, 'specificity': 0.

In [None]:
top_n = 10

In [None]:
for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_efficient_{part}.csv"
    filepath = os.path.join('/kaggle/working/', filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join('/kaggle/input/dataset1', f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["LR", "RF"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)

            selector = TsfreshTopNFeatureSelector(n=top_n)
            selector.fit(X_train, y_train)
            X_train, X_test = selector.transform(X_train), selector.transform(X_test)
            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()


PART: full_24h
  LR
{'accuracy': 0.7294117647058824, 'balanced_accuracy': 0.7277777777777778, 'f1': 0.7555555555555556, 'precision': 0.7555555555555556, 'recall': 0.7555555555555556, 'specificity': 0.7, 'ROC_AUC': 0.7277777777777778, 'MCC': 0.25555555555555555}
{'accuracy': 0.9058823529411765, 'balanced_accuracy': 0.9152777777777778, 'f1': 0.8666666666666667, 'precision': 0.9333333333333334, 'recall': 0.7555555555555556, 'specificity': 0.975, 'ROC_AUC': 0.9152777777777778, 'MCC': 0.6497006112188861}
{'accuracy': 0.7294117647058824, 'balanced_accuracy': 0.7208333333333333, 'f1': 0.8, 'precision': 0.7454545454545454, 'recall': 0.8666666666666666, 'specificity': 0.575, 'ROC_AUC': 0.7208333333333333, 'MCC': 0.44351941398892446}
{'accuracy': 0.7294117647058824, 'balanced_accuracy': 0.7416666666666666, 'f1': 0.62857142857142855, 'precision': 0.8, 'recall': 0.5333333333333333, 'specificity': 0.95, 'ROC_AUC': 0.7416666666666666, 'MCC': 0.49128709291752769}
{'accuracy': 0.67058823529411764, 'b

In [None]:
for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    filename = f"automatic_{dataset_str}_efficient_{part}.csv"
    filepath = os.path.join('/kaggle/working/', filename)
    X = pd.read_csv(filepath, header=0).fillna(0).values

    y = pd.read_csv(os.path.join('/kaggle/input/dataset1', f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()

    for clf_type in ["XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)

            selector = TsfreshTopNFeatureSelector(n=top_n)
            selector.fit(X_train, y_train)
            X_train, X_test = selector.transform(X_train), selector.transform(X_test)
            X_train, X_test = standardize(X_train, X_test)

            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()


PART: full_24h
  XGB
{'accuracy': 0.67058823529411764, 'balanced_accuracy': 0.6652777777777778, 'f1': 0.7263157894736842, 'precision': 0.7, 'recall': 0.7555555555555556, 'specificity': 0.575, 'ROC_AUC': 0.6652777777777778, 'MCC': 0.07042952122737638}
{'accuracy': 0.7294117647058824, 'balanced_accuracy': 0.7277777777777778, 'f1': 0.7555555555555556, 'precision': 0.7555555555555556, 'recall': 0.7555555555555556, 'specificity': 0.7, 'ROC_AUC': 0.7277777777777778, 'MCC': 0.05555555555555555}
{'accuracy': 0.8470588235294118, 'balanced_accuracy': 0.8319444444444444, 'f1': 0.9272727272727274, 'precision': 0.8153846153846154, 'recall': 0.9888888888888888, 'specificity': 0.675, 'ROC_AUC': 0.8319444444444444, 'MCC': 0.3105172139687914}
{'accuracy': 0.7882352941176471, 'balanced_accuracy': 0.7902777777777778, 'f1': 0.7882352941176471, 'precision': 0.825, 'recall': 0.7555555555555556, 'specificity': 0.825, 'ROC_AUC': 0.7902777777777778, 'MCC': 0.48055555555555555}
{'accuracy': 0.8470588235294118,

# Psykose

In [None]:
def extract_tsfresh_features(dfs: List[pd.DataFrame], settings: Dict) -> pd.DataFrame:
    flat_df_list = []
    for idx, df in enumerate(dfs):
        df = df.copy()
        df["id"] = idx
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce").astype(int) / 10**9  # Konwersja do Unix time
        flat_df_list.append(df)

    ts = pd.concat(flat_df_list, ignore_index=True)

    augmenter = FeatureAugmenter(
        default_fc_parameters=settings,
        column_id="id",
        column_sort="timestamp",
        column_value="activity",
        chunksize=1,
        n_jobs=4
    )

    ids = ts["id"].unique()
    X = pd.DataFrame(index=ids)

    augmenter.set_timeseries_container(ts)
    X = augmenter.transform(X)

    return X


In [None]:
PROCESSED_DATA_DIR = "/kaggle/working/"

def basic_data_cleaning(data):
    data = [df.copy() for df in data]
    for df in data:
        df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=False)
        df["timestamp"] = df["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S")
        if "date" in df.columns:
            df.drop("date", axis=1, inplace=True)
        df["activity"] = df["activity"].astype(np.float32)
    return data

def split_ts_to_24h_series(data, window_size: int = 24):
    window_size = pd.Timedelta(hours=window_size)
    data_windows = []
    for series in data:
        series["timestamp"] = pd.to_datetime(series["timestamp"], dayfirst=False)
        windows = []

        start_time = series["timestamp"][0]
        end_time = start_time + window_size

        while end_time < series["timestamp"].max():
            window_data = series[(series['timestamp'] >= start_time) & (series['timestamp'] < end_time)]
            if len(window_data) == 1440:
                windows.append(window_data)
            start_time = end_time
            end_time = start_time + window_size

        data_windows.append(windows)

    return data_windows

def save_windows(data_windows, dataset, patient_class='condition'):
    id = 0
    for windows in data_windows:
        id += 1
        path = f"{PROCESSED_DATA_DIR}/day_windows/{dataset}/{patient_class}/{id}"
        os.makedirs(path, exist_ok=True)
        window_id = 0
        for window in windows:
            window_id += 1
            filepath = os.path.join(PROCESSED_DATA_DIR, 'day_windows', dataset, patient_class, str(id), f"window_{window_id}.csv")
            window.to_csv(filepath, header=True, index=False)

def prepare_data_for_mil(dataset_name, patient_class, settings_name="minimal"):
    """
    Prepare data for MIL using extracted tsfresh features.
    """
    filepath = os.path.join(PROCESSED_DATA_DIR, f"{dataset_name}_{patient_class}_{settings_name}_features.csv")
    features = pd.read_csv(filepath)
    return features.to_numpy()


def split_day_windows():
    datasets = ['depresjon', 'psykose']
    settings_dict = {
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters()
    }

    for dataset_name in datasets:
        dataset = Dataset(dirpath=os.path.join("/kaggle/input/datasetss", dataset_name))
        condition = dataset.condition
        control = dataset.control

        condition = basic_data_cleaning(condition)
        control = basic_data_cleaning(control)

        data_windows_condition = split_ts_to_24h_series(condition, window_size=24)
        save_windows(data_windows=data_windows_condition, dataset=dataset_name, patient_class='condition')

        data_windows_control = split_ts_to_24h_series(control, window_size=24)
        save_windows(data_windows=data_windows_control, dataset=dataset_name, patient_class='control')

        for settings_name, settings in settings_dict.items():
            print(f"Extracting features for {dataset_name} with {settings_name} settings...")

            condition_features = extract_tsfresh_features(
                [pd.concat(window, ignore_index=True) for window in data_windows_condition],
                settings
            )
            control_features = extract_tsfresh_features(
                [pd.concat(window, ignore_index=True) for window in data_windows_control],
                settings
            )

            condition_filepath = os.path.join(PROCESSED_DATA_DIR, f"{dataset_name}_condition_{settings_name}_features.csv")
            control_filepath = os.path.join(PROCESSED_DATA_DIR, f"{dataset_name}_control_{settings_name}_features.csv")

            condition_features.to_csv(condition_filepath, index=False)
            control_features.to_csv(control_filepath, index=False)

split_day_windows()

Extracting features for depresjon with minimal settings...


Feature Extraction: 100%|██████████| 23/23 [00:00<00:00, 258.74it/s]
Feature Extraction: 100%|██████████| 32/32 [00:00<00:00, 212.90it/s]


Extracting features for depresjon with efficient settings...


Feature Extraction: 100%|██████████| 23/23 [01:54<00:00,  4.98s/it]
Feature Extraction: 100%|██████████| 32/32 [04:09<00:00,  7.80s/it]


Extracting features for psykose with minimal settings...


Feature Extraction: 100%|██████████| 22/22 [00:00<00:00, 186.28it/s]
Feature Extraction: 100%|██████████| 32/32 [00:00<00:00, 175.26it/s]


Extracting features for psykose with efficient settings...


Feature Extraction: 100%|██████████| 22/22 [01:50<00:00,  5.03s/it]
Feature Extraction: 100%|██████████| 32/32 [04:05<00:00,  7.66s/it]


In [None]:
def standardize(X_train, X_test):
    scaler = StandardScaler()
    if X_train.ndim == 1:
        X_train = X_train.reshape(-1, 1)
    if X_test.ndim == 1:
        X_test = X_test.reshape(-1, 1)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [None]:
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",
        random_state=0,
        solver="saga",
        max_iter=5000
    ),
    "RF": RandomForestClassifier(
        n_estimators=500,
        criterion="entropy",
        random_state=0
    ),
    "XGB": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=0,
        tree_method='gpu_hist'
    )
}

param_grids = {
    "LR": {
        "classifier__C": [0.1, 1, 10, 100],
        "classifier__class_weight": ["balanced"],
        "classifier__l1_ratio": [0, 0.2, 0.4, 0.6, 0.8, 1]
    },
    "RF": {
        "classifier__class_weight": ["balanced"]
    },
    "XGB": {
        "classifier__n_estimators": [50, 100],
        "classifier__max_depth": [3, 5],
        "classifier__learning_rate": [0.01, 0.1]
    }
}

In [None]:
top_n = 10

In [None]:
X_condition = prepare_data_for_mil('psykose', 'condition', settings_name="efficient")
X_control = prepare_data_for_mil('psykose', 'control', settings_name="efficient")

y_condition = np.ones(len(X_condition))
y_control = np.zeros(len(X_control))

X = np.concatenate([X_condition, X_control], axis=0)
y = np.concatenate([y_condition, y_control], axis=0)

for clf_type in ["LR", "RF", "XGB"]:
    print(f"{clf_type}")
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    test_scores = []
    for train_idx, test_idx in folds.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)

        selector = TsfreshTopNFeatureSelector(n=top_n)
        selector.fit(X_train, y_train)
        X_train, X_test = selector.transform(X_train), selector.transform(X_test)

        X_train, X_test = standardize(X_train, X_test)

        grid_search = GridSearchCV(
            estimator=Pipeline([('classifier', classifiers[clf_type])]),
            param_grid=param_grids[clf_type],
            scoring="accuracy",
            n_jobs=-1,
            refit=True,
            cv=LeaveOneOut()
        )
        grid_search.fit(X_train, y_train)

        clf = grid_search.best_estimator_

        metrics = calculate_metrics(clf, X_test, y_test)
        print(metrics)
        test_scores.append(metrics)

    final_scores = calculate_metrics_statistics(test_scores)

    for metric, (mean, stddev) in final_scores.items():
        print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

    print()


LR
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8333333333333333, 'f1': 0.8333333333333333, 'precision': 0.7142857142857143, 'recall': 1.0, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.8333333333333334, 'MCC': 0.6900655593423543}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7333333333333334, 'f1': 0.7272727272727272, 'precision': 0.6666666666666666, 'recall': 0.8, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.7333333333333334, 'MCC': 0.4666666666666667}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8035714285714286, 'f1': 0.75, 'precision': 0.75, 'recall': 0.75, 'specificity': 0.8571428571428571, 'ROC_AUC': 0.8035714285714286, 'MCC': 0.6071428571428571}
{'accuracy': 1.0, 'balanced_accuracy': 1.0, 'f1': 1.0, 'precision': 1.0, 'recall': 1.0, 'specificity': 1.0, 'ROC_AUC': 1.0, 'MCC': 1.0}
{'accuracy': 0.6, 'balanced_accuracy': 0.5833333333333333, 'f1': 0.5, 'precision': 0.5, 'recall': 0.5, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.58333333333333

# Depresjon

In [None]:
param_grids = {
    "LR": {
        "classifier__C": [0.1, 1, 10, 100],
        "classifier__class_weight": [None, "balanced"],
        "classifier__l1_ratio": [0, 0.2, 0.4, 0.6, 0.8, 1]
    },
    "RF": {
        "classifier__class_weight": [None, "balanced"]
    },
    "XGB": {
        "classifier__n_estimators": [50, 100],
        "classifier__max_depth": [3, 5],
        "classifier__learning_rate": [0.01, 0.1]
    }
}

In [None]:
top_n = 10

In [None]:
X_condition = prepare_data_for_mil('depresjon', 'condition', settings_name="efficient")
X_control = prepare_data_for_mil('depresjon', 'control', settings_name="efficient")

y_condition = np.ones(len(X_condition))
y_control = np.zeros(len(X_control))

X = np.concatenate([X_condition, X_control], axis=0)
y = np.concatenate([y_condition, y_control], axis=0)

for clf_type in ["LR", "RF", "XGB"]:
    print(f"{clf_type}")
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    test_scores = []
    for train_idx, test_idx in folds.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)

        selector = TsfreshTopNFeatureSelector(n=top_n)
        selector.fit(X_train, y_train)
        X_train, X_test = selector.transform(X_train), selector.transform(X_test)

        X_train, X_test = standardize(X_train, X_test)

        grid_search = GridSearchCV(
            estimator=Pipeline([('classifier', classifiers[clf_type])]),
            param_grid=param_grids[clf_type],
            scoring="accuracy",
            n_jobs=-1,
            refit=True,
            cv=LeaveOneOut()
        )
        grid_search.fit(X_train, y_train)

        clf = grid_search.best_estimator_

        metrics = calculate_metrics(clf, X_test, y_test)
        print(metrics)
        test_scores.append(metrics)

    final_scores = calculate_metrics_statistics(test_scores)

    for metric, (mean, stddev) in final_scores.items():
        print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

    print()


LR
{'accuracy': 0.6363636363636364, 'balanced_accuracy': 0.65, 'f1': 0.6666666666666666, 'precision': 0.5714285714285714, 'recall': 0.8, 'specificity': 0.5, 'ROC_AUC': 0.65, 'MCC': 0.3105295017040594}
{'accuracy': 0.6363636363636364, 'balanced_accuracy': 0.6333333333333333, 'f1': 0.6, 'precision': 0.6, 'recall': 0.6, 'specificity': 0.6666666666666666, 'ROC_AUC': 0.6333333333333334, 'MCC': 0.26666666666666666}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7166666666666667, 'f1': 0.6666666666666665, 'precision': 0.75, 'recall': 0.6, 'specificity': 0.8333333333333334, 'ROC_AUC': 0.7166666666666668, 'MCC': 0.4485426135725303}
{'accuracy': 0.7272727272727273, 'balanced_accuracy': 0.7321428571428572, 'f1': 0.6666666666666665, 'precision': 0.6, 'recall': 0.75, 'specificity': 0.7142857142857143, 'ROC_AUC': 0.7321428571428571, 'MCC': 0.4485426135725303}
{'accuracy': 0.8181818181818182, 'balanced_accuracy': 0.8035714285714286, 'f1': 0.75, 'precision': 0.75, 'recall': 0.75, 'specificity

# Hyperaktiv

In [None]:
PROCESSED_DATA_DIR = "/kaggle/working/"

def basic_data_cleaning(data):

    data = [df.copy() for df in data]

    for i, df in enumerate(data):
        df.columns = df.columns.str.lower()

        if 'timestamp' in df.columns:
            df["timestamp"] = pd.to_datetime(df["timestamp"], errors='coerce', dayfirst=True)

        if 'date' in df.columns:
            df.drop("date", axis=1, inplace=True)

        if 'activity' in df.columns:
            df["activity"] = df["activity"].astype(np.float32)

    return data

def split_and_clean_columns(data):
    data = [df.copy() for df in data]

    for i, df in enumerate(data):
        if 'TIMESTAMP;ACTIVITY' in df.columns:

            df[['timestamp', 'activity']] = df['TIMESTAMP;ACTIVITY'].str.split(';', expand=True)
            df.drop(columns=['TIMESTAMP;ACTIVITY'], inplace=True)

        df.columns = df.columns.str.lower()

        if 'activity' in df.columns:
            df['activity'] = df['activity'].astype(np.float32)

        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', dayfirst=True)

        data[i] = df

    return data

def split_ts_to_24h_series(data, window_size: int = 24):
    window_size = pd.Timedelta(hours=window_size)
    data_windows = []
    for series in data:
        series["timestamp"] = pd.to_datetime(series["timestamp"], dayfirst=False)
        windows = []

        start_time = series["timestamp"][0]
        end_time = start_time + window_size

        while end_time < series["timestamp"].max():
            window_data = series[(series['timestamp'] >= start_time) & (series['timestamp'] < end_time)]
            if len(window_data) == 1440:
                windows.append(window_data)
            start_time = end_time
            end_time = start_time + window_size

        data_windows.append(windows)

    return data_windows

def save_windows(data_windows, dataset, patient_class='condition'):
    id = 0
    for windows in data_windows:
        id += 1
        path = f"{PROCESSED_DATA_DIR}/day_windows/{dataset}/{patient_class}/{id}"
        os.makedirs(path, exist_ok=True)
        window_id = 0
        for window in windows:
            window_id += 1
            filepath = os.path.join(PROCESSED_DATA_DIR, 'day_windows', dataset, patient_class, str(id), f"window_{window_id}.csv")
            window.to_csv(filepath, header=True, index=False)

def prepare_data_for_mil(dataset_name, patient_class):
    dataset_path = os.path.join(PROCESSED_DATA_DIR, 'day_windows', dataset_name, patient_class)
    windows = []
    for subdir, dirs, files in os.walk(dataset_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            df = pd.read_csv(file_path)
            windows.append(df)

    centroid_data = np.array([window['activity'].mean() for window in windows])
    return centroid_data

def split_day_windows():
    datasets = ['hyperaktiv3']
    for dataset_name in datasets:
        dataset = Dataset(dirpath=os.path.join("/kaggle/input/dataset1/data", dataset_name))
        condition = dataset.condition
        control = dataset.control

        condition = split_and_clean_columns(condition)
        control = split_and_clean_columns(control)

        condition = basic_data_cleaning(condition)
        control = basic_data_cleaning(control)

        data_windows_condition = split_ts_to_24h_series(condition, window_size=24)
        save_windows(data_windows=data_windows_condition, dataset=dataset_name, patient_class='condition')

        data_windows_control = split_ts_to_24h_series(control, window_size=24)
        save_windows(data_windows=data_windows_control, dataset=dataset_name, patient_class='control')

split_day_windows()

In [None]:
def standardize(X_train, X_test):
    scaler = StandardScaler()
    if X_train.ndim == 1:
        X_train = X_train.reshape(-1, 1)
    if X_test.ndim == 1:
        X_test = X_test.reshape(-1, 1)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [None]:
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",
        random_state=0,
        solver="saga",
        max_iter=5000
    ),
    "RF": RandomForestClassifier(
        n_estimators=500,
        criterion="entropy",
        random_state=0
    ),
    "XGB": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=0
    )
}

param_grids = {
    "LR": {
        "classifier__C": [0.1, 1, 10, 100],
        "classifier__class_weight": ["balanced"],
        "classifier__l1_ratio": [0, 0.2, 0.4, 0.6, 0.8, 1]
    },
    "RF": {
        "classifier__n_estimators": [50, 100, 200],
        "classifier__max_depth": [10, 20, 50],
        "classifier__class_weight": ["balanced"],
        "classifier__criterion": ["gini", "entropy"]
    },
    "XGB": {
        "classifier__n_estimators": [50, 100],
        "classifier__max_depth": [3, 5],
        "classifier__learning_rate": [0.01, 0.1]
    }
}


In [None]:
X_condition = prepare_data_for_mil('hyperaktiv3', 'condition')
X_control = prepare_data_for_mil('hyperaktiv3', 'control')

y_condition = np.ones(len(X_condition))
y_control = np.zeros(len(X_control))

X = np.concatenate([X_condition, X_control], axis=0)
y = np.concatenate([y_condition, y_control], axis=0)

for clf_type in ["LR", "RF", "XGB"]:
    print(f"{clf_type}")
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    test_scores = []
    for train_idx, test_idx in folds.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        X_train, X_test = standardize(X_train, X_test)

        grid_search = GridSearchCV(
            estimator=Pipeline([('classifier', classifiers[clf_type])]),
            param_grid=param_grids[clf_type],
            scoring="accuracy",
            n_jobs=-1,
            refit=True,
            cv=LeaveOneOut()
        )
        grid_search.fit(X_train, y_train)

        clf = grid_search.best_estimator_

        metrics = calculate_metrics(clf, X_test, y_test)
        print(metrics)
        test_scores.append(metrics)

    final_scores = calculate_metrics_statistics(test_scores)

    for metric, (mean, stddev) in final_scores.items():
        print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

    print()


LR
{'accuracy': 0.7, 'balanced_accuracy': 0.7, 'f1': 0.7151515151515151, 'precision': 0.7, 'recall': 0.73125, 'specificity': 0.66875, 'ROC_AUC': 0.5, 'MCC': 4.0}
{'accuracy': 0.7, 'balanced_accuracy': 0.7, 'f1': 0.7294117647058824, 'precision': 0.7, 'recall': 0.7625, 'specificity': 0.6375, 'ROC_AUC': 0.5, 'MCC': 4.0}
{'accuracy': 0.825, 'balanced_accuracy': 0.725, 'f1': 0.8000000000000001, 'precision': 0.8428571428571429, 'recall': 0.7625, 'specificity': 0.8875, 'ROC_AUC': 0.625, 'MCC': 0.4519763153394848}
{'accuracy': 0.746875, 'balanced_accuracy': 0.743010752688172, 'f1': 0.6727272727272728, 'precision': 0.7416666666666666, 'recall': 0.61935483870967744, 'specificity': 0.8666666666666666, 'ROC_AUC': 0.5430107526881721, 'MCC': 0.48879923794435518}
{'accuracy': 0.871875, 'balanced_accuracy': 0.8730205278592376, 'f1': 0.8769230769230768, 'precision': 0.8470588235294118, 'recall': 0.9096774193548387, 'specificity': 0.8363636363636364, 'ROC_AUC': 0.6730205278592375, 'MCC': 0.546549565993