In [1]:
import os
from src.utils import data_load
import pandas as pd
from src.s3_utils import pandas_from_csv_s3
import numpy as np
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
from utils import get_survey_question, na_rate
import torch
import matplotlib.dates as mdates
import pingouin as pg
from torch.utils.data import Dataset

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-vyjw2rt8 because the default path (/home/ubuntu/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
df_survey = data_load(data_keys={"surveys"}, wave=7)['surveys']
# df_birth = data_load(data_keys={"birth"}, wave=5)['birth']


In [3]:
question_148_level = {"Yes": 0, "No": 0}
question_149_to_157_level = {"Never": 1, 'Sometimes': 2, 'Regularly': 3, 'Often': 4, 'Always': 5}
question_158_level = {"Never": 5, 'Sometimes': 4, 'Regularly': 3, 'Often': 2, 'Always': 1}


In [4]:
question_id_lst = list(range(148, 159))
num_questions = len(list(range(148, 159)))
question_levels_lst = [question_149_to_157_level] * num_questions
question_levels_lst[0] = question_148_level
question_levels_lst[-1] = question_158_level
question_level_mapping_dict = dict(zip(question_id_lst, question_levels_lst))

In [5]:
df_fas = df_survey.loc[df_survey['title'] =='Fatigue survey']

In [6]:
def map_answer_to_int(row):
    curr_level_mapping = question_level_mapping_dict[row['question_id']]
    return curr_level_mapping[row['answer_text']]
df_fas['fas_score'] = df_fas.apply(map_answer_to_int, axis=1)

In [7]:
grouped_df_fas = df_fas.groupby(['user_id', 'date'])['fas_score'].sum().reset_index()
grouped_df_fas['date'] = pd.to_datetime(grouped_df_fas['date'])
grouped_df_fas['week_year'] = grouped_df_fas['date'].dt.strftime('%Y-%U')

In [9]:
grouped_df_fas

Unnamed: 0,user_id,date,fas_score,week_year
0,28,2021-02-24,28,2021-08
1,28,2021-03-02,34,2021-09
2,28,2021-03-09,28,2021-10
3,28,2021-03-16,31,2021-11
4,28,2021-03-23,26,2021-12
...,...,...,...,...
8752,2664,2023-01-30,17,2023-05
8753,2664,2023-02-06,17,2023-06
8754,2664,2023-02-13,17,2023-07
8755,2664,2023-02-20,17,2023-08


In [10]:
curr_user_fas[curr_user_fas['date'] == '2021-09-21']

Unnamed: 0,user_id,date,fas_score,week_year
25,28,2021-09-21,36,2021-38


In [11]:

def get_dates_within_one_week(date):
    week_dates = [(pd.to_datetime(date) - timedelta(days=i)) for i in range(7)]
    return week_dates

class SameDayMappingDataset(Dataset):
    def __init__(self, dataset_x, dataset_y, user_id, feature_name_lst_x, feature_name_lst_y, train_time_steps, target_time_steps, one_week_back=True, normalize=True):
        self.user_id = user_id
        self.feature_name_lst_x = feature_name_lst_x
        self.feature_name_lst_y = feature_name_lst_y
        self.one_week_back = one_week_back
        self.train_time_steps = train_time_steps
        self.target_time_steps = target_time_steps
        if normalize:
            self.dataset_x = self._normalize(dataset_x, feature_name_lst_x, x_true=True)
            self.dataset_y = self._normalize(dataset_y, feature_name_lst_y, x_true=False)
        else:
            self.dataset_x = dataset_x
            self.dataset_y = dataset_y
        # 0 impute

        mean_value = 0
        self.dataset_x = self.dataset_x.fillna(value=mean_value)

    def _normalize(self, df, feature_names, x_true):
        if self.one_week_back:

            if x_true:
                dates_with_one_week = []
                for date in self.train_time_steps:
                    week_dates = get_dates_within_one_week(date)
                    dates_with_one_week.extend(week_dates)
                train_dates = [date.strftime('%Y-%m-%d') for date in dates_with_one_week]
            else:
                train_dates = self.train_time_steps

            train_indices = df.index[df['date'].isin(train_dates)]
            assert len(train_indices) != 0
            def min_max_normalize_subset(feature):
                if isinstance(feature.iloc[0], str):
                    return feature 
                min_val = feature.iloc[train_indices].min()
                max_val = feature.iloc[train_indices].max()
                if min_val == max_val:
                    max_val += 1
                return (feature - min_val) / (max_val - min_val)
            df[feature_names] = df[feature_names].apply(min_max_normalize_subset)
            return df
    
    def __len__(self):
        return len(self.target_time_steps)
    def __getitem__(self, idx):
        time_step = self.target_time_steps[idx]
        print(time_step)
        # print(self.dataset_y)
        y = self.dataset_y[self.dataset_y['date'] == time_step][self.feature_name_lst_y].values

        print(y.shape)
        if self.one_week_back:
            prev_week_dates = get_dates_within_one_week(time_step)
            x = self.dataset_x[self.dataset_x['date'].isin(prev_week_dates)][self.feature_name_lst_x].values
        else:
            x = self.dataset_x[self.dataset_x['date'].isin(time_step)][self.feature_name_lst_x].values
        print(x.shape)
        return {"X": x, "Y": y, "user_id": self.user_id}

2021-11-23
(1, 1)
(7, 1)


{'X': array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]]),
 'Y': array([[0.52631579]]),
 'user_id': 28}

In [None]:

for curr_user in grouped_df_fas['user_id'].unique():
    # curr_user = 28
    curr_user_fas = grouped_df_fas[grouped_df_fas['user_id'] == curr_user]
    fas_time = list(curr_user_fas['date'].unique())
    daily_csv_path = f'/mnt/dataset/fatigue/user_{curr_user}_fatigue_label.csv'
    if not os.path.exists(daily_csv_path):
        #print("Empty Edema")
        continue
    curr_user_daily_df = pd.read_csv(daily_csv_path).groupby("date", as_index = False).first()[['date', 'fatigue']]
    curr_user_daily_df['date'] = pd.to_datetime(curr_user_daily_df['date'])
    daily_time = list(curr_user_daily_df['date'].unique())
    available_time = list(set(daily_time).intersection(fas_time))
    available_time = pd.to_datetime(available_time)
    available_time = [date.strftime('%Y-%m-%d') for date in available_time]
    train_times = available_time[:int(len(available_time) * 0.8)]
    test_times = available_time[int(len(available_time) * 0.8):]

    train_set = SameDayMappingDataset(dataset_y = curr_user_fas, dataset_x = curr_user_daily_df, 
    user_id=28, feature_name_lst_y=['fas_score'], feature_name_lst_x=['fatigue'], train_time_steps=train_times, target_time_steps=train_times)
    test_set = SameDayMappingDataset(dataset_y = curr_user_fas, dataset_x = curr_user_daily_df, 
    user_id=28, feature_name_lst_y=['fas_score'], feature_name_lst_x=['fatigue'], train_time_steps=train_times, target_time_steps=test_times)
    