In [1]:
import pandas as pd
import plotly
from pathlib import Path
import re
import ast
import pandas as pd
import numpy as np


In [2]:
base = Path('./filtered_data/')

# Nested mapping: group folder name -> {user id -> raw DataFrame read from CSV}.
groups_dfs = {}

# Only files named like "dataset_user_<digits>_train.csv" are loaded.
user_file_pattern = re.compile(r'dataset_user_(\d+)_train\.csv')

for csv_path in sorted(base.glob('group8/*.csv')):
    match = user_file_pattern.search(csv_path.name)
    if match is None:
        # Not a per-user training file; ignore it.
        continue
    frame = pd.read_csv(csv_path)
    # The parent directory name is used as the group key.
    groups_dfs.setdefault(csv_path.parent.name, {})[int(match.group(1))] = frame



In [3]:
def convert_timeseries_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` in which list-looking strings are parsed into Python objects.

    Every cell of every column is inspected. A cell is parsed with
    ``ast.literal_eval`` only when it is a string whose stripped form starts
    with ``[`` and ends with ``]``; if parsing fails with ``ValueError`` or
    ``SyntaxError`` the original string is kept. All other cells are returned
    unchanged. The input frame is not modified.
    """

    def _maybe_literal(cell):
        if not isinstance(cell, str):
            return cell
        trimmed = cell.strip()
        if not (trimmed.startswith('[') and trimmed.endswith(']')):
            return cell
        try:
            return ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            # Looks like a list but is not a valid literal: keep the raw text.
            return cell

    converted = df.copy()
    for name in converted.columns:
        converted[name] = converted[name].apply(_maybe_literal)
    return converted

def clean_ts(ts):
    """Replace ``None`` and negative entries of a list with ``np.nan``.

    Anything that is not a ``list`` (including tuples and scalars) yields the
    single-element list ``[np.nan]``.
    """
    if not isinstance(ts, list):
        return [np.nan]

    cleaned = []
    for value in ts:
        if value is None or value < 0:
            cleaned.append(np.nan)
        else:
            cleaned.append(value)
    return cleaned

# Pass 1: parse stringified lists in every user frame (each frame is replaced by a converted copy).
for group_name in groups_dfs:
    user_frames = groups_dfs[group_name]
    for uid in user_frames:
        user_frames[uid] = convert_timeseries_columns(user_frames[uid])

# Pass 2: sanitise every column whose name contains 'time_series'.
for group_name, user_frames in groups_dfs.items():
    for uid, frame in user_frames.items():
        ts_columns = [name for name in frame.columns if 'time_series' in name]
        for name in ts_columns:
            frame[name] = frame[name].apply(clean_ts)
        user_frames[uid] = frame



In [4]:
# Expand each source row into one output row per time-series sample.
# Result: group -> {user id -> DataFrame with 'hr', 'resp', 'stress' plus the
# non-time-series columns of the source row repeated on every sample}.
rebuilded_dfs = {}

for group, users in groups_dfs.items():
    rebuilded_dfs[group] = {}
    for user_id, df in users.items():
        # Columns copied verbatim onto every expanded row (loop-invariant per user).
        static_cols = [col for col in df.columns if 'time_series' not in col]

        # Collect the per-row frames and concatenate once at the end; growing a
        # DataFrame with pd.concat inside the loop copies all previous rows on
        # every iteration (quadratic time).
        expanded_rows = []

        for i in range(len(df)):
            row = df.iloc[i]

            new_df = pd.DataFrame()
            # 'hr' defines the index of new_df; 'resp' and 'stress' are aligned
            # to it on assignment, so a longer series is truncated and a shorter
            # one is padded with NaN.
            new_df['hr'] = pd.Series(row['hr_time_series'])
            new_df['resp'] = pd.Series(row['resp_time_series'])
            new_df['stress'] = pd.Series(row['stress_time_series'])

            # Broadcast the scalar (non-time-series) values to every sample.
            for col in static_cols:
                new_df[col] = row[col]

            # Replace missing and negative values with NaN in every column.
            # NOTE(review): this also applies to the static columns and raises
            # on values that cannot be compared with 0 (e.g. strings) — confirm
            # all columns are numeric.
            for col in new_df.columns:
                new_df[col] = new_df[col].apply(lambda x: np.nan if (x is None or x < 0) else x)

            expanded_rows.append(new_df)

        # A user with no rows keeps the original result: an empty DataFrame.
        rebuilded_dfs[group][user_id] = (
            pd.concat(expanded_rows, ignore_index=True) if expanded_rows else pd.DataFrame()
        )

rebuilded_dfs



{'group8': {17:         hr  resp  stress  Unnamed: 0  day  label  hr_maxHeartRate  \
  0     58.0  13.0    22.0           2    2     75              107   
  1     57.0  13.0    20.0           2    2     75              107   
  2     57.0  13.0    18.0           2    2     75              107   
  3     57.0  12.0    15.0           2    2     75              107   
  4     58.0  10.0    15.0           2    2     75              107   
  ...    ...   ...     ...         ...  ...    ...              ...   
  5027  64.0  17.0     NaN          22   25     29              106   
  5028  66.0  18.0     NaN          22   25     29              106   
  5029  66.0  13.0     NaN          22   25     29              106   
  5030  70.0  12.0     NaN          22   25     29              106   
  5031  69.0  16.0     NaN          22   25     29              106   
  
        hr_minHeartRate  hr_restingHeartRate  \
  0                  43                   45   
  1                  43            

In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Remove TensorFlow/Keras imports
# from tensorflow.keras import layers, models
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


def scale_data(df):
    """Impute missing values in ``df`` and min-max scale every column.

    Steps:
      1. drop rows in which every value is NaN;
      2. forward-fill, then back-fill the remaining NaNs column by column;
      3. fill whatever is still NaN with the column mean;
      4. fit a fresh ``MinMaxScaler`` on the result and transform it.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame. The caller's object is not modified.

    Returns
    -------
    tuple
        ``(scaled, scaler)``: the value returned by ``scaler.fit_transform``
        and the fitted scaler itself.
    """
    # drop rows with all NaN
    df = df.dropna(how='all')
    # Forward fill then back fill. ``fillna(method=...)`` is deprecated in
    # pandas and emits a FutureWarning; ``ffill()`` / ``bfill()`` are the
    # supported equivalents.
    df = df.ffill().bfill()
    # After ffill + bfill only columns that are entirely NaN can still contain
    # NaN; their mean is NaN as well, so such columns stay NaN here.
    df = df.apply(lambda col: col.fillna(col.mean()))

    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df)
    return scaled, scaler


def create_sequences(data, seq_length):
    """Build sliding windows and next-step targets from a row-ordered array.

    For every start index ``i`` in ``range(len(data) - seq_length)`` the window
    is ``data[i:i + seq_length]`` and the target is the row immediately after
    it, ``data[i + seq_length]``. Pairs in which the window or the target
    contains NaN are skipped.

    Parameters
    ----------
    data : array-like
        Numeric data indexed by row along the first axis.
    seq_length : int
        Number of consecutive rows per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` as ``float32``. ``X`` has shape
        ``(n_windows, seq_length, *row_shape)`` and ``y`` has shape
        ``(n_windows, *row_shape)``. When no window survives (input too short
        or every window contains NaN) the arrays are empty but keep these
        trailing dimensions instead of collapsing to shape ``(0,)``.
    """
    values = np.asarray(data)
    windows, targets = [], []
    for start in range(len(values) - seq_length):
        stop = start + seq_length
        window = values[start:stop]
        target = values[stop]
        # drop sequences containing NaNs
        if np.isnan(window).any() or np.isnan(target).any():
            continue
        windows.append(window)
        targets.append(target)

    if not windows:
        # Keep the sequence/feature axes so downstream code can rely on ndim.
        row_shape = values.shape[1:]
        return (
            np.empty((0, seq_length) + row_shape, dtype=np.float32),
            np.empty((0,) + row_shape, dtype=np.float32),
        )
    return np.array(windows, dtype=np.float32), np.array(targets, dtype=np.float32)


# NOTE(review): n_features is not referenced by any other visible cell; it
# presumably mirrors the default input_size=35 of LSTMRegressor — confirm and
# keep the two in sync.
n_features = 35
# Number of consecutive rows per input window; passed to predict_stress and
# used to skip the first rows of 'label' when computing the MAE.
seq_length = 10


In [6]:
# Stack the frames of all users of a group into one frame per group.
merged_group = {
    group: pd.concat(users.values(), ignore_index=True)
    for group, users in rebuilded_dfs.items()
}

# One entry per group: scaled feature matrix and float32 label vector.
x_all, y_all = [], []

for group, df in merged_group.items():
    # Every column except the target is treated as a feature.
    feature_cols = [name for name in df.columns if name != 'label']
    X_vals, scaler = scale_data(df[feature_cols])
    # Labels are kept unscaled.
    y_vals = df['label'].values.astype(np.float32)

    x_all.append(X_vals)
    y_all.append(y_vals)



  df = df.fillna(method='ffill').fillna(method='bfill')


In [None]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import mean_absolute_error
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# ----- Model definition -----
class LSTMRegressor(nn.Module):
    """Two stacked LSTM layers, each followed by dropout, with a linear head.

    The input is batch-first (both LSTMs use ``batch_first=True``). Only the
    output at the last time step of the second LSTM is fed to the linear
    layer, which produces a single value per sequence.
    """

    def __init__(self, input_size=35, hidden_size1=64, hidden_size2=32, dropout=0.2):
        super().__init__()
        # Attribute names determine the state_dict keys, so they must stay
        # stable for saved checkpoints to load.
        self.lstm1 = nn.LSTM(input_size, hidden_size1, batch_first=True)
        self.dropout1 = nn.Dropout(p=dropout)
        self.lstm2 = nn.LSTM(hidden_size1, hidden_size2, batch_first=True)
        self.dropout2 = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_size2, out_features=1)

    def forward(self, x):
        """Return the regression output, one row with one value per input sequence."""
        # nn.LSTM returns (output, (h_n, c_n)); only the per-step output is used.
        first_pass = self.dropout1(self.lstm1(x)[0])
        second_pass = self.dropout2(self.lstm2(first_pass)[0])
        final_step = second_pass[:, -1]
        return self.fc(final_step)

# ----- Load model -----
# Build the network with its default hyperparameters, then restore the saved weights.
model = LSTMRegressor()
# NOTE(review): torch.load is pickle-based and, depending on the torch version
# and the weights_only setting, can execute arbitrary code from the file —
# only load checkpoints from a trusted source.
state = torch.load('final_model.pt', map_location='cpu')  # use 'cuda' if available
model.load_state_dict(state)
# Switch to evaluation mode (dropout layers become no-ops).
model.eval()

# ----- Prediction function -----
def predict_stress(model, data, seq_length):
    """Run ``model`` on sliding windows of ``data`` and return its output as a NumPy array.

    The model is put in evaluation mode, ``data`` is cut into windows of
    ``seq_length`` rows by ``create_sequences`` (the targets it also returns
    are ignored), and the windows are fed to the model in a single batch with
    gradient tracking disabled.
    """
    model.eval()
    # build the input sequences for the LSTM
    windows, _targets = create_sequences(data, seq_length)
    batch = torch.tensor(windows, dtype=torch.float32)
    with torch.no_grad():
        return model(batch).numpy()

# ----- Prediction loop and MAE -----
# results:     group -> {user id -> array of (inverse-scaled) predictions}
# mae_results: group -> {user id -> mean absolute error against 'label'}
results = {}
mae_results = {}

for group, users in rebuilded_dfs.items():
    results[group] = {}
    mae_results[group] = {}

    for user_id, df in users.items():
        # NaN handling: drop the target column and fill gaps in the features.
        # ``ffill()`` / ``bfill()`` replace the deprecated ``fillna(method=...)``.
        df_clean = df.drop(columns=['label'], errors='ignore')
        df_clean = df_clean.ffill().bfill()

        # Scale the data.
        # NOTE(review): scale_data fits a new MinMaxScaler on this user's data;
        # confirm this matches the scaling the model was trained with.
        scaled_data, scaler = scale_data(df_clean)

        # Prediction
        predictions = predict_stress(model, scaled_data, seq_length)

        # Inverse scaling: place the predictions in the last column of a
        # zero-filled matrix so the scaler can be applied to it.
        # NOTE(review): the scaler was fitted on the feature columns only
        # ('label' is dropped above), so this maps the model output through the
        # min/max of the last *feature* column, not of the label. The saved
        # output (predictions below 1 versus MAE values of 66-84) suggests the
        # units do not match — confirm how the target was scaled at training
        # time and invert with that transform instead.
        dummy = np.zeros((predictions.shape[0], scaled_data.shape[1]))
        dummy[:, -1] = predictions[:, 0]
        inv_predictions = scaler.inverse_transform(dummy)[:, -1]

        # Store predictions
        results[group][user_id] = inv_predictions

        # MAE: the first seq_length labels have no prediction, so skip them.
        # NOTE(review): this alignment assumes scale_data dropped no rows and
        # create_sequences skipped no window; otherwise the lengths differ and
        # mean_absolute_error raises.
        true_values = df['label'].values[seq_length:]
        pred_values = inv_predictions
        mae = mean_absolute_error(true_values, pred_values)
        mae_results[group][user_id] = mae

# ----- Results -----
print(mae_results)



[[0.37283522]
 [0.37283522]
 [0.37283522]
 ...
 [0.37283522]
 [0.37283522]
 [0.37283522]]
[0.77072509 0.77072509 0.77072509 ... 0.77072509 0.77072509 0.77072509]
[[0.37283522]
 [0.37283522]
 [0.37283522]
 ...
 [0.37283522]
 [0.37283522]
 [0.37283522]]
[0.80126484 0.80126484 0.80126484 ... 0.80126484 0.80126484 0.80126484]
[[0.37283522]
 [0.37283522]
 [0.37283522]
 ...
 [0.37283522]
 [0.37283522]
 [0.37283522]]
[0.78262308 0.78262308 0.78262308 ... 0.78262308 0.78262308 0.78262308]
[[0.37283522]
 [0.37283522]
 [0.37283522]
 ...
 [0.37283522]
 [0.37283522]
 [0.37283522]]
[0.82643957 0.82643957 0.82643957 ... 0.82643957 0.82643957 0.82643957]
[[0.37283522]
 [0.37283522]
 [0.37283522]
 ...
 [0.37283522]
 [0.37283522]
 [0.37283522]]
[0.83692101 0.83692101 0.83692101 ... 0.83692101 0.83692101 0.83692101]
{'group8': {17: 66.61557518449474, 26: 84.47605946558274, 35: 67.31072455783068, 44: 78.42200450063292, 8: 75.46786262775993}}


In [8]:
# Recompute the mean absolute error per group and user from the stored predictions.
from sklearn.metrics import mean_absolute_error

mae_results = {}
for group_name, user_frames in rebuilded_dfs.items():
    per_user_mae = mae_results.setdefault(group_name, {})
    for uid, frame in user_frames.items():
        # Labels from index seq_length onward are compared with the predictions.
        actual = frame['label'].values[seq_length:]
        predicted = results[group_name][uid]
        print(predicted)
        per_user_mae[uid] = mean_absolute_error(actual, predicted)
mae_results


[0.77072509 0.77072509 0.77072509 ... 0.77072509 0.77072509 0.77072509]
[0.80126484 0.80126484 0.80126484 ... 0.80126484 0.80126484 0.80126484]
[0.78262308 0.78262308 0.78262308 ... 0.78262308 0.78262308 0.78262308]
[0.82643957 0.82643957 0.82643957 ... 0.82643957 0.82643957 0.82643957]
[0.83692101 0.83692101 0.83692101 ... 0.83692101 0.83692101 0.83692101]


{'group8': {17: 66.61557518449474,
  26: 84.47605946558274,
  35: 67.31072455783068,
  44: 78.42200450063292,
  8: 75.46786262775993}}