In [24]:
import os
from src.utils import data_load
import pandas as pd
from src.s3_utils import pandas_from_csv_s3
import numpy as np
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
from utils import get_survey_question, na_rate
import torch
import matplotlib.dates as mdates
import pingouin as pg
from torch.utils.data import Dataset

In [2]:
# Load the wave-7 data and keep only its 'surveys' table.
df_survey = data_load(
    data_keys={"surveys"},
    wave=7,
)['surveys']
# df_birth = data_load(data_keys={"birth"}, wave=5)['birth']


In [3]:
# Question 148: both possible answers score 0.
question_148_level = dict.fromkeys(("Yes", "No"), 0)
# Questions 149-157: frequency answers scored from 1 ("Never") up to 5 ("Always").
question_149_to_157_level = dict(zip(('Never', 'Sometimes', 'Regularly', 'Often', 'Always'), range(1, 6)))
# Question 158: same answers with the scale reversed (5 for "Never" down to 1 for "Always").
question_158_level = {answer: 6 - score for answer, score in question_149_to_157_level.items()}


In [6]:
# Map every question id (148..158) to the answer->score table that applies to it:
# the first id uses the yes/no table, the last id uses the reversed scale, and
# every id in between shares the regular 1-5 frequency scale.
# NOTE(review): only the last question is reverse-scored here. If these ids follow
# the item order of the published Fatigue Assessment Scale, its item 4 is normally
# reverse-scored as well - confirm against the questionnaire.
question_id_lst = list(range(148, 159))
num_questions = len(question_id_lst)
question_levels_lst = (
    [question_148_level]
    + [question_149_to_157_level] * (num_questions - 2)
    + [question_158_level]
)
question_level_mapping_dict = dict(zip(question_id_lst, question_levels_lst))

In [7]:
# Keep only the rows belonging to the fatigue survey. The explicit copy makes
# df_fas an independent frame, so assigning new columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df_fas = df_survey.loc[df_survey['title'] == 'Fatigue survey'].copy()

In [8]:
def map_answer_to_int(row):
    """Return the numeric score for one survey answer.

    Looks up the answer->score table registered for ``row['question_id']`` in
    ``question_level_mapping_dict`` and returns the entry for
    ``row['answer_text']``. Raises ``KeyError`` if either key is unknown.
    """
    levels_for_question = question_level_mapping_dict[row['question_id']]
    answer = row['answer_text']
    return levels_for_question[answer]
# Score every answer row by row.
df_fas['fas_score'] = df_fas.apply(map_answer_to_int, axis=1)

In [21]:
# Total score per user and date, with the date parsed to datetime and a
# 'YYYY-WW' label derived from it ('%U': week of the year, Sunday as first day).
grouped_df_fas = (
    df_fas.groupby(['user_id', 'date'])['fas_score']
    .sum()
    .reset_index()
    .assign(date=lambda totals: pd.to_datetime(totals['date']))
    .assign(week_year=lambda totals: totals['date'].dt.strftime('%Y-%U'))
)

In [22]:
# Display the per-user, per-date score totals (the notebook renders a truncated preview).
grouped_df_fas

Unnamed: 0,user_id,date,fas_score,week_year
0,28,2021-02-24,28,2021-08
1,28,2021-03-02,34,2021-09
2,28,2021-03-09,28,2021-10
3,28,2021-03-16,31,2021-11
4,28,2021-03-23,26,2021-12
...,...,...,...,...
8752,2664,2023-01-30,17,2023-05
8753,2664,2023-02-06,17,2023-06
8754,2664,2023-02-13,17,2023-07
8755,2664,2023-02-20,17,2023-08


In [23]:
# Pick one participant and keep only their rows. The filter uses curr_user
# (rather than a repeated literal) so changing the id above is enough.
curr_user = 28
curr_user_fas = grouped_df_fas[grouped_df_fas['user_id'] == curr_user]


In [25]:
class SameDayMappingDataset(Dataset):
    """Dataset pairing feature rows (X) with target rows (Y) that share a time step.

    Both frames are keyed by the ``week_year`` column. Item ``idx`` contains the
    rows of ``dataset_x`` and ``dataset_y`` whose ``week_year`` equals
    ``target_time_steps[idx]``.

    Args:
        dataset_x: DataFrame holding ``week_year`` and the ``feature_name_lst_x`` columns.
        dataset_y: DataFrame holding ``week_year`` and the ``feature_name_lst_y`` columns.
        device: Stored on the instance; this class does not use it itself.
        user_id: Returned unchanged with every item.
        feature_name_lst_x: Column names returned under ``"X"``.
        feature_name_lst_y: Column names returned under ``"Y"``.
        train_time_steps: ``week_year`` values whose rows define the min/max used
            for scaling.
        target_time_steps: ``week_year`` values served by the dataset, in item order.
        week_true: Whether time steps are week labels. Only the weekly case is
            implemented; normalising with ``week_true=False`` raises
            ``NotImplementedError``.
        normalize: If true, min-max scale the feature columns of both frames
            using statistics from the training time steps only.
    """

    # Column that identifies the time step of a row in both frames.
    TIME_COLUMN = 'week_year'

    def __init__(self, dataset_x, dataset_y, device, user_id, feature_name_lst_x, feature_name_lst_y, train_time_steps, target_time_steps, week_true=True, normalize=True):
        # Everything _normalize reads has to exist before it is called.
        self.device = device
        self.user_id = user_id
        self.feature_name_lst_x = list(feature_name_lst_x)
        self.feature_name_lst_y = list(feature_name_lst_y)
        self.week_true = week_true
        # Plain lists, so __getitem__ indexes by position even if a Series was passed.
        self.train_time_steps = list(train_time_steps)
        self.target_time_steps = list(target_time_steps)
        if normalize:
            self.dataset_x = self._normalize(dataset_x, self.feature_name_lst_x)
            self.dataset_y = self._normalize(dataset_y, self.feature_name_lst_y)
        else:
            self.dataset_x = dataset_x
            self.dataset_y = dataset_y

    def _normalize(self, df, feature_names=None):
        """Min-max scale ``feature_names`` using only the training time steps.

        The minimum and maximum of each feature are taken from the rows whose
        ``week_year`` is in ``train_time_steps``; the scaling is then applied to
        the rows whose ``week_year`` is in ``target_time_steps``. Target values
        outside the training range therefore fall outside ``[0, 1]``. A feature
        that is constant over the training rows is divided by 1 instead of 0.

        Args:
            df: Frame to scale; it is not modified.
            feature_names: Columns to scale. Defaults to every numeric column
                other than ``week_year``.

        Returns:
            A new DataFrame with the target rows only; non-feature columns are
            left untouched.

        Raises:
            NotImplementedError: If ``week_true`` is false.
            ValueError: If no row of ``df`` belongs to ``train_time_steps``.
        """
        if not self.week_true:
            raise NotImplementedError(
                "normalisation is only implemented for weekly time steps (week_true=True)"
            )
        time_col = self.TIME_COLUMN
        if feature_names is None:
            feature_names = [
                col for col in df.select_dtypes(include='number').columns if col != time_col
            ]
        feature_names = list(feature_names)

        # Time steps are compared as-is: __getitem__ matches week_year against the
        # raw values too, so they must not be converted to datetimes here.
        train_mask = df[time_col].isin(self.train_time_steps)
        if not train_mask.any():
            raise ValueError("none of train_time_steps occur in the 'week_year' column")

        train_features = df.loc[train_mask, feature_names]
        min_vals = train_features.min()
        span = train_features.max() - min_vals
        # Constant feature: avoid 0/0 by using a unit range.
        span = span.where(span != 0, 1)

        target_mask = df[time_col].isin(self.target_time_steps)
        normalized_df = df.loc[target_mask].copy()
        normalized_df[feature_names] = (normalized_df[feature_names] - min_vals) / span
        return normalized_df

    def __len__(self):
        """Number of target time steps, i.e. number of items."""
        return len(self.target_time_steps)

    def __getitem__(self, idx):
        """Return the X and Y rows of the ``idx``-th target time step.

        Returns:
            dict with ``"X"`` and ``"Y"`` as 2-D NumPy arrays (rows x features)
            and ``"user_id"`` as passed to the constructor.
        """
        time_step = self.target_time_steps[idx]
        time_col = self.TIME_COLUMN
        x = self.dataset_x.loc[self.dataset_x[time_col] == time_step, self.feature_name_lst_x].to_numpy()
        y = self.dataset_y.loc[self.dataset_y[time_col] == time_step, self.feature_name_lst_y].to_numpy()
        return {"X": x, "Y": y, "user_id": self.user_id}