Ogni riga = 24h consecutive

Le time series sono ordinate cronologicamente

Il sonno è alla fine della TS

sleep_sleepTimeSeconds è affidabile

-1 / -2 / None = missing

In [1]:
import pandas as pd
import plotly
from pathlib import Path
import re
import ast
import pandas as pd

base = Path('./filtered_data/')

groups_dfs = {}

for csv_path in sorted(base.glob('group*/*.csv')):
    group = csv_path.parent.name
    m = re.search(r'dataset_user_(\d+)_train\.csv', csv_path.name)
    if not m:
        continue
    user_id = int(m.group(1))
    df = pd.read_csv(csv_path)
    groups_dfs.setdefault(group, {})[user_id] = df

base_test = Path('../data/')
test_df= pd.read_csv(base_test / 'test.csv', sep=';')

def convert_timeseries_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    def try_parse_list(x):
        if isinstance(x, str) and x.strip().startswith('[') and x.strip().endswith(']'):
            try:
                return ast.literal_eval(x)
            except (ValueError, SyntaxError):
                return x
        return x

    for col in df.columns:
        df[col] = df[col].apply(try_parse_list)

    return df
for group, users in groups_dfs.items():
    for user_id, df in users.items():
        groups_dfs[group][user_id] = convert_timeseries_columns(df)

# do the same for test_df
test_df = convert_timeseries_columns(test_df)


In [2]:
import numpy as np

def clean_ts(ts):
    if not isinstance(ts, list):
        return [np.nan]
    return [np.nan if (x is None or x < 0) else x for x in ts]

for group, users in groups_dfs.items():
    for user_id, df in users.items():
        for col in df.columns:
            if isinstance(df[col].iloc[0], list):
                df[col] = df[col].apply(clean_ts)
        groups_dfs[group][user_id] = df

# do the same for test_df
for col in test_df.columns:
    if isinstance(test_df[col].iloc[0], list):
        test_df[col] = test_df[col].apply(clean_ts)
        

In [3]:
def estimate_sampling(row, ts_col="hr_time_series"):
    ts = row[ts_col]

    if ts is None or len(ts) < 10:
        return np.nan, np.nan

    T = len(ts)
    dt_hours = 24.0 / T

    sleep_sec = row.get("sleep_sleepTimeSeconds", np.nan)
    if np.isnan(sleep_sec):
        return dt_hours, np.nan

    sleep_hours = sleep_sec / 3600.0
    N_sleep = int(sleep_hours / dt_hours)

    # clamp di sicurezza
    N_sleep = max(0, min(N_sleep, T))

    return dt_hours, N_sleep
for group, users in groups_dfs.items():
    for user_id, df in users.items():

        dt_list = []
        n_sleep_list = []

        for _, row in df.iterrows():
            dt, n_sleep = estimate_sampling(row)
            dt_list.append(dt)
            n_sleep_list.append(n_sleep)

        df["dt_hours"] = dt_list
        df["N_sleep"] = n_sleep_list

        groups_dfs[group][user_id] = df

# do the same for test_df
dt_list = []
n_sleep_list = []
for _, row in test_df.iterrows():
    dt, n_sleep = estimate_sampling(row)
    dt_list.append(dt)
    n_sleep_list.append(n_sleep)
test_df["dt_hours"] = dt_list
test_df["N_sleep"] = n_sleep_list



In [4]:
def ts_features(x):
    return {
        "mean": np.nanmean(x),
        "std": np.nanstd(x),
        "min": np.nanmin(x),
        "max": np.nanmax(x),
        "missing_pct": np.mean(np.isnan(x)),
    }

for group, users in groups_dfs.items():
    for user_id, df in users.items():

        feature_dicts = []

        for _, row in df.iterrows():
            features = {}

            for col in df.columns:
                if isinstance(row[col], list):
                    ts = np.array(row[col])
                    N = len(ts)

                    # --- TRY / EXCEPT QUI ---
                    try:
                        N_sleep_i = int(row["N_sleep"])
                    except (TypeError, ValueError):
                        N_sleep_i = None

                    if N_sleep_i is None or N_sleep_i <= 0 or N_sleep_i >= N:
                        feat = ts_features(ts)
                        features.update(
                            {f"{col}_all_{k}": v for k, v in feat.items()}
                        )
                    else:
                        day_indices = np.arange(0, N - N_sleep_i, dtype=int)
                        night_indices = np.arange(N - N_sleep_i, N, dtype=int)

                        ts_day = ts[day_indices]
                        ts_night = ts[night_indices]

                        feat_day = ts_features(ts_day)
                        feat_night = ts_features(ts_night)

                        features.update(
                            {f"{col}_day_{k}": v for k, v in feat_day.items()}
                        )
                        features.update(
                            {f"{col}_night_{k}": v for k, v in feat_night.items()}
                        )

            feature_dicts.append(features)

        df_features = pd.DataFrame(feature_dicts)

        df_final = pd.concat(
            [df.reset_index(drop=True), df_features.reset_index(drop=True)],
            axis=1,
        )

        groups_dfs[group][user_id] = df_final



# do the same for test_df
feature_dicts = []
for _, row in test_df.iterrows():
    features = {}

    for col in test_df.columns:
        if isinstance(row[col], list):
            ts = np.array(row[col])
            N = len(ts)

            # --- TRY / EXCEPT QUI ---
            try:
                N_sleep_i = int(row["N_sleep"])
            except (TypeError, ValueError):
                N_sleep_i = None

            if N_sleep_i is None or N_sleep_i <= 0 or N_sleep_i >= N:
                feat = ts_features(ts)
                features.update(
                    {f"{col}_all_{k}": v for k, v in feat.items()}
                )
            else:
                day_indices = np.arange(0, N - N_sleep_i, dtype=int)
                night_indices = np.arange(N - N_sleep_i, N, dtype=int)

                ts_day = ts[day_indices]
                ts_night = ts[night_indices]

                feat_day = ts_features(ts_day)
                feat_night = ts_features(ts_night)

                features.update(
                    {f"{col}_day_{k}": v for k, v in feat_day.items()}
                )
                features.update(
                    {f"{col}_night_{k}": v for k, v in feat_night.items()}
                )

    feature_dicts.append(features)
test_df_features = pd.DataFrame(feature_dicts)
test_df = pd.concat(
    [test_df.reset_index(drop=True), test_df_features.reset_index(drop=True)],
    axis=1,
)



  "mean": np.nanmean(x),
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  "min": np.nanmin(x),
  "max": np.nanmax(x),


In [5]:
for group, users in groups_dfs.items():
    for user_id, df in users.items():
        df['deep_sleep_pct'] = np.nan
        df['rem_sleep_pct'] = np.nan
        df['light_sleep_pct'] = np.nan
        df['awake_sleep_pct'] = np.nan
        df['sleep_efficiency'] = np.nan
        for row_idx, row in df.iterrows():
            sleep_deepSleepSeconds = row.get("sleep_deepSleepSeconds", np.nan)
            sleep_lightSleepSeconds = row.get("sleep_lightSleepSeconds", np.nan)
            sleep_remSleepSeconds = row.get("sleep_remSleepSeconds", np.nan)
            sleep_timeSeconds = row.get("sleep_sleepTimeSeconds", np.nan)
            sleep_awakeTimeSeconds = row.get("sleep_awakeSleepSeconds", np.nan)

            deep_pct = sleep_deepSleepSeconds / sleep_timeSeconds 
            rem_pct = sleep_remSleepSeconds / sleep_timeSeconds
            light_pct = sleep_lightSleepSeconds / sleep_timeSeconds
            awake_pct = sleep_awakeTimeSeconds / sleep_timeSeconds
            sleep_efficiency = sleep_timeSeconds / (sleep_timeSeconds + sleep_awakeTimeSeconds)
            df.at[row_idx, 'deep_sleep_pct'] = deep_pct
            df.at[row_idx, 'rem_sleep_pct'] = rem_pct
            df.at[row_idx, 'light_sleep_pct'] = light_pct
            df.at[row_idx, 'awake_sleep_pct'] = awake_pct
            df.at[row_idx, 'sleep_efficiency'] = sleep_efficiency
        groups_dfs[group][user_id] = df

# do the same for test_df
test_df['deep_sleep_pct'] = np.nan
test_df['rem_sleep_pct'] = np.nan
test_df['light_sleep_pct'] = np.nan
test_df['awake_sleep_pct'] = np.nan
test_df['sleep_efficiency'] = np.nan
for row_idx, row in test_df.iterrows():
    sleep_deepSleepSeconds = row.get("sleep_deepSleepSeconds", np.nan)
    sleep_lightSleepSeconds = row.get("sleep_lightSleepSeconds", np.nan)
    sleep_remSleepSeconds = row.get("sleep_remSleepSeconds", np.nan)
    sleep_timeSeconds = row.get("sleep_sleepTimeSeconds", np.nan)
    sleep_awakeTimeSeconds = row.get("sleep_awakeSleepSeconds", np.nan)

    deep_pct = sleep_deepSleepSeconds / sleep_timeSeconds 
    rem_pct = sleep_remSleepSeconds / sleep_timeSeconds
    light_pct = sleep_lightSleepSeconds / sleep_timeSeconds
    awake_pct = sleep_awakeTimeSeconds / sleep_timeSeconds
    sleep_efficiency = sleep_timeSeconds / (sleep_timeSeconds + sleep_awakeTimeSeconds)
    test_df.at[row_idx, 'deep_sleep_pct'] = deep_pct
    test_df.at[row_idx, 'rem_sleep_pct'] = rem_pct
    test_df.at[row_idx, 'light_sleep_pct'] = light_pct
    test_df.at[row_idx, 'awake_sleep_pct'] = awake_pct
    test_df.at[row_idx, 'sleep_efficiency'] = sleep_efficiency


In [6]:
FEATURES = [
    # sleep composition
    "deep_sleep_pct",
    "rem_sleep_pct",
    "light_sleep_pct",
    "awake_sleep_pct",
    "sleep_efficiency",

    # heart rate static
    "hr_restingHeartRate",
    "hr_lastSevenDaysAvgRestingHeartRate",
    "hr_maxHeartRate",
    "hr_minHeartRate",

    # stress static
    "str_avgStressLevel",
    "str_maxStressLevel",

    # activity
    "act_totalCalories",
    "act_activeKilocalories",
    "act_distance",

    # respiration static
    "resp_lowestRespirationValue",
    "resp_highestRespirationValue",
    "resp_avgSleepRespirationValue",

    # sampling info (utile!)
    "dt_hours",
]


In [7]:
import pandas as pd

X_list = []
y_list = []

for group, users in groups_dfs.items():
    for user_id, df in users.items():

        df_model = df.copy()

        # tieni solo righe valide
        df_model = df_model.dropna(subset=["label"])

        X = df_model[FEATURES]
        y = df_model["label"]

        X_list.append(X)
        y_list.append(y)

X_all = pd.concat(X_list, axis=0)
y_all = pd.concat(y_list, axis=0)


# x_list_test = test_df[FEATURES]
# y_list_test = test_df["label"]  


In [8]:
X_all = X_all.replace([np.inf, -np.inf], np.nan)


In [9]:
split_idx = int(len(X_all) * 0.7)



X_train = X_all.iloc[:split_idx]
X_val   = X_all.iloc[split_idx:]



y_train = y_all.iloc[:split_idx]
y_val   = y_all.iloc[split_idx:]

# X_train = X_all
# y_train = y_all



In [10]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

# Training semplice
model.train(
    X_train,
    y_train
)


AttributeError: 'XGBRegressor' object has no attribute 'train'

In [None]:

# X_val=x_list_test

y_pred = model.predict(X_val)
y_pred = np.clip(y_pred, 0, 100)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# y_val = y_list_test

mae = mean_absolute_error(y_val, y_pred)
r2  = r2_score(y_val, y_pred)

print("MAE:", mae)
print("R2:", r2)


MAE: 10.613499641418457
R2: -0.23205804824829102
