Testing the final model on a group


In [None]:
import pandas as pd
import plotly
from pathlib import Path
import re
import ast
import pandas as pd

# Root folder of the per-group training CSVs.
base = Path('../data/train')  # original data
# base = Path('./filtered_data')  # filtered data

# File names look like "dataset_user_<id>_train.csv"; the numeric id is captured.
user_file_pattern = re.compile(r'dataset_user_(\d+)_train\.csv')

# groups_dfs[group_name][user_id] -> raw DataFrame read from that user's CSV.
groups_dfs = {}

for csv_path in sorted(base.glob('group8/*.csv')):
    match = user_file_pattern.search(csv_path.name)
    if match is None:
        # Skip CSV files that do not follow the per-user naming scheme.
        continue

    # The ';' separator is the one to use when reading the original data.
    frame = pd.read_csv(csv_path, sep=';')
    # frame = pd.read_csv(csv_path)  # alternative read with the default separator

    # The group name is the CSV's parent folder; the user id comes from the file name.
    groups_dfs.setdefault(csv_path.parent.name, {})[int(match.group(1))] = frame


def convert_timeseries_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with stringified list cells parsed into objects.

    Every cell of every column is inspected. A cell is handed to
    ``ast.literal_eval`` only when it is a string whose stripped form starts
    with ``[`` and ends with ``]``. Cells that fail to parse, and all other
    cells, are kept unchanged. The input frame itself is not modified.
    """

    def _parse_cell(cell):
        # Only strings that look like a bracketed literal are candidates.
        if not isinstance(cell, str):
            return cell
        trimmed = cell.strip()
        if not (trimmed.startswith('[') and trimmed.endswith(']')):
            return cell
        try:
            return ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            # Not a valid Python literal: keep the raw string.
            return cell

    result = df.copy()
    for name in result.columns:
        result[name] = result[name].apply(_parse_cell)
    return result
# Replace every loaded per-user frame with its parsed counterpart.
for group, users in groups_dfs.items():
    groups_dfs[group] = {
        uid: convert_timeseries_columns(frame) for uid, frame in users.items()
    }

# print(groups_dfs['group8'][7].iloc[0]   )



In [None]:
import numpy as np

def clean_ts(ts):
    """Replace missing samples of a time series with NaN.

    A value that is not a list yields the single-element list ``[np.nan]``.
    Otherwise a new list is returned in which every ``None`` entry and every
    negative entry is replaced by ``np.nan``; other entries are kept as-is.
    """
    if not isinstance(ts, list):
        return [np.nan]

    cleaned = []
    for sample in ts:
        # ``None`` is tested first so it is never compared with 0.
        is_missing = sample is None or sample < 0
        cleaned.append(np.nan if is_missing else sample)
    return cleaned

# Clean every list-valued (time-series) column of every per-user frame.
for group, users in groups_dfs.items():
    for user_id, df in users.items():
        for col in df.columns:
            # Scan the whole column rather than only row 0: a column whose
            # first cell is missing would otherwise never be cleaned, and an
            # empty frame would raise IndexError on ``.iloc[0]``.
            if any(isinstance(cell, list) for cell in df[col]):
                # Non-list cells in such a column become ``[np.nan]`` (see clean_ts).
                df[col] = df[col].apply(clean_ts)
        groups_dfs[group][user_id] = df

In [None]:
def estimate_sampling(row, ts_col="hr_time_series"):
    """Estimate the sampling step of a row's time series and its sleep length.

    The series in ``row[ts_col]`` is treated as evenly covering 24 hours, so
    the step is ``24 / len(series)`` hours. The sleep duration read from
    ``row["sleep_sleepTimeSeconds"]`` is converted into a number of samples.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[ts_col]`` and ``row.get(...)`` (a dict or a
        pandas Series both work).
    ts_col : str
        Name of the time-series field used to derive the sampling step.

    Returns
    -------
    tuple
        ``(dt_hours, N_sleep)``. Both are NaN when the series is missing,
        is a string, or has fewer than 10 samples. ``N_sleep`` alone is NaN
        when the sleep duration is missing or not numeric. Otherwise
        ``N_sleep`` is an int clamped to ``[0, len(series)]``.
    """
    ts = row[ts_col]

    # A string is an unparsed series: its character count is not a sample count.
    if isinstance(ts, (str, bytes)):
        return np.nan, np.nan
    try:
        T = len(ts)
    except TypeError:
        # Missing cells (None, float NaN) have no length.
        return np.nan, np.nan
    if T < 10:
        return np.nan, np.nan

    dt_hours = 24.0 / T

    sleep_sec = row.get("sleep_sleepTimeSeconds", np.nan)
    try:
        sleep_sec = float(sleep_sec)
    except (TypeError, ValueError):
        # None or non-numeric content counts as a missing sleep duration.
        sleep_sec = np.nan
    if np.isnan(sleep_sec):
        return dt_hours, np.nan

    sleep_hours = sleep_sec / 3600.0
    N_sleep = int(sleep_hours / dt_hours)

    # Safety clamp: never fewer than 0 or more than the available samples.
    N_sleep = max(0, min(N_sleep, T))

    return dt_hours, N_sleep
# Attach the per-row sampling step and sleep sample count to every frame.
for group, users in groups_dfs.items():
    for user_id, df in users.items():
        # One (dt_hours, N_sleep) pair per row, in row order.
        estimates = [estimate_sampling(row) for _, row in df.iterrows()]

        df["dt_hours"] = [step for step, _ in estimates]
        df["N_sleep"] = [count for _, count in estimates]

        groups_dfs[group][user_id] = df



In [None]:
def ts_features(x):
    """Summarise a numeric series with NaN-aware statistics.

    Parameters
    ----------
    x : array-like
        Numeric samples; NaN marks a missing sample.

    Returns
    -------
    dict
        Keys ``mean``, ``std`` (population, ddof=0), ``min``, ``max`` computed
        over the non-NaN samples, and ``missing_pct``, the fraction of NaN
        samples. When there is no valid sample the four statistics are NaN;
        ``missing_pct`` is then 1.0 for an all-NaN series and NaN for an
        empty one.
    """
    values = np.asarray(x)
    missing = np.isnan(values)

    # ``all()`` is also True for an empty array. Returning early avoids the
    # RuntimeWarnings numpy emits for all-NaN input and the ValueError that
    # np.nanmin / np.nanmax raise on empty input.
    if missing.all():
        return {
            "mean": np.nan,
            "std": np.nan,
            "min": np.nan,
            "max": np.nan,
            "missing_pct": 1.0 if values.size else np.nan,
        }

    return {
        "mean": np.nanmean(values),
        "std": np.nanstd(values),
        "min": np.nanmin(values),
        "max": np.nanmax(values),
        "missing_pct": np.mean(missing),
    }

def _sleep_sample_count(value):
    """Return ``value`` as an int, or None when it cannot be converted (e.g. NaN)."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


# For every list-valued cell, compute ts_features either on the whole series
# ("all") or separately on its leading part ("day") and its last N_sleep
# samples ("night"), then append the resulting columns to the frame.
for group, users in groups_dfs.items():
    for user_id, df in users.items():

        feature_dicts = []

        for _, row in df.iterrows():
            row_features = {}

            for col in df.columns:
                cell = row[col]
                if not isinstance(cell, list):
                    continue

                series = np.array(cell)
                total = len(series)
                n_sleep = _sleep_sample_count(row["N_sleep"])

                if n_sleep is not None and 0 < n_sleep < total:
                    # The last n_sleep samples form the "night" segment.
                    split = total - n_sleep
                    segments = (("day", series[:split]), ("night", series[split:]))
                else:
                    # No usable sleep length for this series: one global summary.
                    segments = (("all", series),)

                for label, segment in segments:
                    for stat, value in ts_features(segment).items():
                        row_features[f"{col}_{label}_{stat}"] = value

            feature_dicts.append(row_features)

        df_features = pd.DataFrame(feature_dicts)

        # Both sides get a fresh 0..n-1 index so they line up row by row.
        df_final = pd.concat(
            [df.reset_index(drop=True), df_features.reset_index(drop=True)],
            axis=1,
        )

        groups_dfs[group][user_id] = df_final





  "mean": np.nanmean(x),
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  "min": np.nanmin(x),
  "max": np.nanmax(x),


In [None]:
def ts_features(ts):
    """Compute trend, quartile and variability features of a time series.

    NaN samples are dropped first; the remaining samples are treated as
    consecutive (positions 0..n-1) for the slope and the successive
    differences.

    Note: an earlier cell of this notebook defines a different ``ts_features``
    that returns a dict; running this cell replaces that definition.

    Parameters
    ----------
    ts : sequence or None
        Samples of the series.

    Returns
    -------
    tuple
        ``(slope, p25, p50, p75, iqr, rmssd, std)``, always a 7-tuple.
        ``slope`` is the degree-1 ``np.polyfit`` coefficient, ``rmssd`` the
        root mean square of successive differences, and ``std`` the pandas
        sample standard deviation. All seven values are NaN when ``ts`` is
        None or fewer than two non-NaN samples remain.
    """
    # One shared "no data" result so both early exits return the same type
    # (a tuple) as the normal path.
    no_data = (np.nan,) * 7

    if ts is None:
        return no_data

    series = pd.Series(ts).dropna()
    if len(series) < 2:
        return no_data

    positions = np.arange(len(series))
    slope = np.polyfit(positions, series, 1)[0]

    p25, p50, p75 = np.percentile(series, [25, 50, 75])
    iqr = p75 - p25

    rmssd = np.sqrt(np.mean(np.diff(series) ** 2))

    std = series.std()

    return slope, p25, p50, p75, iqr, rmssd, std


# Sleep-composition columns, in the order they are added to each frame.
_SLEEP_FEATURE_COLUMNS = (
    'deep_sleep_pct',
    'rem_sleep_pct',
    'light_sleep_pct',
    'awake_sleep_pct',
    'sleep_efficiency',
    'sleep_day_ptc',
)

# (source time-series column, target columns) pairs; each target tuple follows
# the order of the values returned by ts_features:
# slope, p25, p50, p75, iqr, rmssd, std.
_SERIES_FEATURE_COLUMNS = (
    (
        "hr_time_series",
        ('hr_slope', 'hr_p25', 'hr_p50', 'hr_p75', 'hr_iqr', 'hr_rmssd',
         'std_hearthrate'),
    ),
    (
        "stress_time_series",
        ('stress_slope', 'stress_p25', 'stress_p50', 'stress_p75', 'stress_iqr',
         'stress_rmssd', 'std_stress'),
    ),
    (
        "resp_time_series",
        ('resp_slope', 'resp_p25', 'resp_p50', 'resp_p75', 'resp_iqr',
         'resp_rmssd', 'std_resp'),
    ),
)

for group, users in groups_dfs.items():
    for user_id, df in users.items():
        # Create every output column up front, filled with NaN.
        for column in _SLEEP_FEATURE_COLUMNS:
            df[column] = np.nan
        for _, target_columns in _SERIES_FEATURE_COLUMNS:
            for column in target_columns:
                df[column] = np.nan

        for idx, row in df.iterrows():
            sleep_time = row.get("sleep_sleepTimeSeconds", np.nan)
            if pd.isna(sleep_time) or sleep_time == 0:
                # Rows without a usable sleep duration are skipped entirely,
                # so their HR / stress / respiration features stay NaN too.
                continue

            # ---- Sleep composition: each stage as a fraction of sleep time.
            sleep_values = {
                'deep_sleep_pct': row.get("sleep_deepSleepSeconds", 0) / sleep_time,
                'rem_sleep_pct': row.get("sleep_remSleepSeconds", 0) / sleep_time,
                'light_sleep_pct': row.get("sleep_lightSleepSeconds", 0) / sleep_time,
                'awake_sleep_pct': row.get("sleep_awakeSleepSeconds", 0) / sleep_time,
                'sleep_efficiency': sleep_time
                / (sleep_time + row.get("sleep_awakeSleepSeconds", 0)),
                # Sleep time as a fraction of a 86400-second day.
                'sleep_day_ptc': sleep_time / 86400.0,
            }
            for column, value in sleep_values.items():
                df.at[idx, column] = value

            # ---- HR, stress and respiration series features.
            for series_column, target_columns in _SERIES_FEATURE_COLUMNS:
                series_feats = ts_features(row.get(series_column))
                for column, value in zip(target_columns, series_feats):
                    df.at[idx, column] = value

        groups_dfs[group][user_id] = df




In [None]:
# FEATURES = [
#     # sleep composition
#     "deep_sleep_pct",
#     "rem_sleep_pct",
#     "light_sleep_pct",
#     "awake_sleep_pct",
#     "sleep_efficiency",

#     # heart rate static
#     "hr_restingHeartRate",
#     "hr_lastSevenDaysAvgRestingHeartRate",
#     "hr_maxHeartRate",
#     "hr_minHeartRate",

#     # stress static
#     "str_avgStressLevel",
#     "str_maxStressLevel",

#     # activity
#     "act_totalCalories",
#     "act_activeKilocalories",
#     "act_distance",

#     # respiration static
#     "resp_lowestRespirationValue",
#     "resp_highestRespirationValue",
#     "resp_avgSleepRespirationValue",

#     # sampling info (utile!)
#         'sleep_day_ptc'
# ]


In [None]:
# Concatenate every labelled per-user frame into a single frame for evaluation.
train_dfs = []
train_labels = []
for group, users in groups_dfs.items():
    for user_id, df in users.items():
        if 'label' in df.columns:
            train_dfs.append(df)
            train_labels.extend(df['label'].tolist())

# Fail with an explicit message instead of pandas' "No objects to concatenate".
if not train_dfs:
    raise ValueError("No data frame with a 'label' column found in groups_dfs")

train_df = pd.concat(train_dfs, ignore_index=True)

# NOTE(review): the model file name below suggests the model was trained on
# min-max scaled features, but the scaling step is disabled here. Confirm
# whether it has to be re-enabled (with the scaler fitted at training time).
# min max train_df[FEATURES]
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scaler = MinMaxScaler()
# train_df[FEATURES] = scaler.fit_transform(train_df[FEATURES])

# Print the first row in full; as a dict, pandas does not truncate it with "...".
print(train_df.iloc[0].to_dict())

# Derive the feature columns from the concatenated frame, not from the loop
# variable `df` left over above (the last user's frame only). Per-user frames
# can carry different column sets. Excluded: the target, the raw time-series
# columns, and any 'coverage' columns if present.
# NOTE(review): confirm this set and order match what the model was trained on.
feature_cols = [
    col
    for col in train_df.columns
    if col != "label" and 'time_series' not in col and 'coverage' not in col
]


import xgboost as xgb
booster = xgb.Booster()
booster.load_model("final_model_min_max.json")
dtest = xgb.DMatrix(train_df[feature_cols])
preds = booster.predict(dtest)

# #create a padatframe with column id and label 

# submission_df = pd.DataFrame({
#     'id': test_df['id'],
#     'label': preds
# })
# submission_df.to_csv("submission2.csv", index=False)


{'Unnamed: 0': 0, 'day': 0, 'label': 83, 'hr_maxHeartRate': 117, 'hr_minHeartRate': 43, 'hr_restingHeartRate': 47, 'hr_lastSevenDaysAvgRestingHeartRate': 46, 'hr_time_series': [46, 47, 48, 47, 47, 48, 46, 48, 48, 49, 51, 55, 50, 50, 49, 50, 51, 51, 53, 53, 53, 54, 55, 52, 54, 54, 54, 55, 56, 58, 59, 59, 61, 61, 60, 58, 58, 58, 58, 57, 57, 56, 56, 56, 57, 57, 55, 55, 55, 56, 58, 53, 52, 52, 49, 49, 49, 49, 51, 49, 49, 50, 51, 54, 60, 54, 57, 59, 59, 59, 66, 58, 56, 54, 55, 54, 51, 54, 50, 53, 56, 57, 54, 56, 59, 56, 56, 54, 59, 64, 59, 58, 54, 54, 55, 57, 54, 54, 52, 51, 53, 53, 53, 54, 53, 52, 53, 53, 52, 53, 48, 52, 51, 50, 55, 54, 52, 49, 49, 51, 52, 50, 52, 56, 55, 50, 52, 63, 53, 54, 50, 55, 55, 52, 51, 47, 48, 50, 50, 47, 48, 49, 49, 47, 48, 45, 45, 45, 47, 50, 49, 50, 51, 50, 50, 50, 51, 52, 51, 49, 51, 56, 44, 63, 73, 43, 58, 68, 58, 57, 65, 67, 65, 70, 85, 79, 68, 67, 76, 78, 80, 81, 84, 91, 90, 78, 79, 72, 82, 94, 94, 102, 105, 98, 97, 107, 100, 104, 113, 112, 111, 109, 100, 9

In [None]:


from sklearn.metrics import mean_absolute_error

# Compare the model predictions with the labels collected while building train_df.
mae = mean_absolute_error(y_true=train_labels, y_pred=preds)
print(f"Mean Absolute Error on training data: {mae}")

#  8.756616592407227

Mean Absolute Error on training data: 469910.21875
