# Desafios posteriores:

- OK - Agrupar os valores de bpm de 5 em 5 minutos
- OK - Selecionar os bpms que fazem parte do sleep time
- OK - Combinar os dados awake no sleep time
- OK - Preencher os gaps
- Remover os dias em que há uma soneca durante o dia
- OK - Otimizar a função que preenche os gaps
- Agrupar dados da Letônia e do Brasil
- Entender como passar dados nulos para o modelo
- Separar dia-a-dia
- Fazer um grande subplot de cada dia
- Mudar o padding do sleep

# Imports

In [1]:
import requests
import datetime
import collections

import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

pio.renderers.default = 'iframe'

# Requests

In [2]:
def data_request(start, end):   
    """
    Description
    -----------
    This function gets requests from the sleep and heart routes, in a specific date range

    Parameters
    ----------
    start : <string>
        Beginning of the interval. The day is in the format YYYY-MM-DD

    end : <string>
        End of the interval. The day is in the format YYYY-MM-DD

    Returns
    -------
    sleep_response : <class 'requests.models.Response'>
        Response from the sleep route

    heart_response : <class 'requests.models.Response'>
        Response from the heart route
    """
    
    # Personal Access Token used to access the user data via the Oura Cloud API
    headers = {'Authorization': 'Bearer Q3E2ETZRM4AKZULORX6LJNQOKSIOWOYG'}
    
    # Parent route
    url = f'https://api.ouraring.com/v2/usercollection/'
    
    # Define the parameters of each request
    sleep_params = {'start_date': start, 
                    'end_date': end}    
    heart_params={ 'start_datetime': f'{start}T00:00:01+03:00', 
                   'end_datetime': f'{end}T23:59:59+03:00'}
    
    # Request Sleep data and Heart data
    sleep_response = requests.request('GET', url+'sleep', headers = headers, params = sleep_params)
    heart_response = requests.request('GET', url+'heartrate', headers = headers, params = heart_params)
    
    return sleep_response, heart_response

In [3]:
START = '2023-07-25'
END = '2023-08-23'

sleep_response, heart_response = data_request(START, END)

# Heart Preprocessing

In [4]:
def heart_route_preprocessing(response):
    """
    Description
    -----------
    This function extracts the bpm and time from the json 

    Parameters
    ----------
    response : <class 'requests.models.Response'>
        Response from the heart route

    Returns
    -------
    heart_data : <class 'pandas.core.frame.DataFrame'>
        DataFrame with integer index and ['time', 'bpm'] columns 
    """
    
    # List that will be filled with the response data
    time, bpm = [], []

    # Extract the information from the response
    for data in response.json()['data']:
        
        bpm.append(data['bpm'])
        time.append(data['timestamp'])

    # Create a Empty DataFrame
    heart_data = pd.DataFrame(columns = ["time", "bpm"])
    
    # Store the response data in the DataFrame
    heart_data["time"], heart_data["bpm"] = time, bpm

    return heart_data

In [5]:
heart_data = heart_route_preprocessing(heart_response)

In [6]:
def time_preprocessing(time):
    """
    Description
    -----------
    This function changes the format of a date and adjusts the time according to the Latvian or Brazilian timezone. OBS: These timezones were chosen according to the countries in which Igor lived.

    Parameters
    ----------
    time : <str>
        Time in the format YYYY-MM-DDThh:mm:ss+00:00 (Example: 2023-07-24T21:04:37+00:00)

    Returns
    -------
    new_time : <class 'datetime.datetime'>
        Time in the format YYYY-MM-DD hh:mm:ss±03:00 (Example: 2023-07-25 00:04:37+03:00)
    """

    # Set the Latvian and Brazilian timezones (UTC +3 and UTC -3, respectively) 
    LV_TIMEZONE = datetime.timezone(offset = datetime.timedelta(hours=3))
    BR_TIMEZONE = datetime.timezone(offset = datetime.timedelta(hours=-3))

    # Covert string to Datetime
    new_time = datetime.datetime.strptime(time, "%Y-%m-%dT%H:%M:%S%z")
    
    # Choose the correct timezone, according with the date that Igor comeback to Brazil
    if (new_time <= datetime.datetime(2023, 8, 24, tzinfo=datetime.timezone.utc)):
        new_time = new_time.astimezone(LV_TIMEZONE)
        
    else:
        new_time = new_time.astimezone(BR_TIMEZONE)
    
    return new_time

In [7]:
heart_data['time'] = heart_data['time'].apply(time_preprocessing)

In [8]:
def groups_5min(data):
    """
    Description
    -----------
    Every 5 minutes, the Oura Ring measures the heart rate for 60 consecutive seconds. However, only the reliable measures are stored in the API. 
    Therefore, this function groups all the heart rates that were collected in the same 60s batch.

    Parameters
    ----------
    data : <class 'pandas.core.frame.DataFrame'>
        Heart data. DataFrame with integer index and ['time', 'bpm'] columns

    Returns
    -------
    new_data : <class 'pandas.core.frame.DataFrame'>
        DataFrame with integer index and ['time', 'bpm','state'] columns
    """
    
    # Set the time difference between the current row and the previous one
    mask = heart_data['time'].diff().dt.seconds

    # Every time that a time diff is greater than 60s, add +1 to the label 
    mask = mask.gt(60).cumsum()

    # Group the batches according to the mask, maintaining the time of the first measure, and the mean of the bpm    
    new_data = data.groupby(mask, as_index=True)[['time','bpm']].agg({'time':'first', 'bpm':'mean'}).round(1)
    
    # Set the awake state (will be important during the 4-stages classification)
    new_data['state'] = 'awake' 
    
    return new_data

In [9]:
heart_data = groups_5min(heart_data)

# Sleep Preprocessing

In [10]:
def heart_rate_extractor(day_data):
    """
    Description
    -----------
    This function extracts the heart rate from the sleep route and gives back the DataFrame in the same format as the heart route Dataframe. 
   
    Parameters
    ----------
    day_data : <dict>
        JSON with data of one-night sleep time

    Returns
    -------
    new_data : <class 'pandas.core.frame.DataFrame'>
        Heart data during the sleep time. DataFrame with integer index and ['time', 'bpm','state'] columns
    """
    
    # Create the columns 
    time = []
    bpm = day_data['heart_rate']['items']
    state = ['sleep']*len(bpm)
    
    # Extract the start and end of the sleep time
    start = datetime.datetime.strptime(day_data['bedtime_start'], "%Y-%m-%dT%H:%M:%S%z")
    end = datetime.datetime.strptime(day_data['bedtime_end'], "%Y-%m-%dT%H:%M:%S%z")
    
    # Create a spaced timelist within the sleep time interval
    aux = start
    while aux < end:
        time.append(aux)
        aux += datetime.timedelta(minutes=5)
        
    # Padding of the bpm length according to time length
    while len(bpm) != len(time):
        bpm.append(None)
        state.append('sleep')
        
    # Create a DataFrame with the sleep data of a unique day
    heart_data = pd.DataFrame(columns = ["time", "bpm", "state"])
    heart_data["time"], heart_data["bpm"], heart_data["state"] = time, bpm, state

    return heart_data

In [11]:
def sleep_route_preprocessing(response):
    """
    Description
    -----------
    This function iterate over the days to extracts the sleep data with the heart_rate_extractor

    Parameters
    ----------
    response : <class 'requests.models.Response'>
        Response from the sleep route

    Returns
    -------
    sleep_data : <class 'pandas.core.frame.DataFrame'>
        DataFrame with integer index and ['time', 'bpm', 'state'] columns 
        
    start_bedtime : <list>
        List of with the start bedtime of each day 
    
    end_bedtime : <list>
        List of with the end bedtime of each day
    """
    
    df_list = []
    start_bedtime = []
    end_bedtime = []

    for day_data in response.json()['data']:

        if day_data['heart_rate'] != None:
            if len(day_data['heart_rate']['items']) > 40:

                df_list.append(heart_rate_extractor(day_data))
                start_bedtime.append(day_data['bedtime_start'])
                end_bedtime.append(day_data['bedtime_end'])
    
    sleep_data = pd.concat(df_list)
    
    return sleep_data, start_bedtime, end_bedtime

In [12]:
sleep_data, start_bedtime, end_bedtime = sleep_route_preprocessing(sleep_response)

# Filling the gaps

In [73]:
full_data = pd.concat([sleep_data, heart_data]).dropna(ignore_index=True).sort_values(['time']).reset_index(drop=True)

In [74]:
def time_rounder(full_data):
    """
    Description
    -----------
    This function round the time in multiple hours of 5 minutes (Ex: 00h00, 00h05, 00h10, 00h15, and so on).

    Parameters
    ----------
    full_data : <class 'pandas.core.frame.DataFrame'>
        Sleep and Heart route concatenated. DataFrame with integer index and ['time', 'bpm', 'state'] columns

    Returns
    -------
    full_data : <class 'pandas.core.frame.DataFrame'>
        Full data with the time rounded to 5 minutes interval. DataFrame with integer index and ['time', 'bpm', 'state'] columns
    """
    
    # Round the time in multiple hours of 5 minutes
    full_data['new_time'] = full_data['time'].round('5min')
    
    # Take all duplicate time after the round
    duplicate = [item for item, count in collections.Counter(full_data["new_time"]).items() if count > 1]
    
    for dupl in duplicate:
        
        idx = full_data[full_data['new_time'] == dupl].index
        
        # Round down the first duplicated row
        floor_round = full_data.iloc[idx[0]]['time'].floor('5min')
        
        # Round up the second duplicated row
        ceil_round = full_data.iloc[idx[1]]['time'].ceil('5min')

        # Replace the first duplicated if the new round doesn't conflict with the previous value
        if floor_round != full_data.iloc[idx[0]-1]['new_time']:
            full_data.at[idx[0],'new_time'] = floor_round

        # Replace the second duplicated if the new round doesn't conflict with the next value
        elif ceil_round != full_data.iloc[idx[1]+1]['new_time']:
            full_data.at[idx[1],'new_time'] = ceil_round

        # Exclude the first duplicated if the two replace methods didn't work
        else:
            full_data = full_data.drop(axis=0,index=idx[0]).reset_index(drop=True)

    # Maintain the same format as the DataFrame from the input
    full_data = full_data.drop('time',axis=1)
    full_data = full_data[['new_time','bpm','state']].rename(columns={"new_time": "time"})
    
    return full_data

In [75]:
round_data = time_rounder(full_data)

In [76]:
round_data

Unnamed: 0,time,bpm,state
0,2023-07-25 00:05:00+03:00,73.0,awake
1,2023-07-25 00:10:00+03:00,67.7,awake
2,2023-07-25 00:15:00+03:00,68.0,awake
3,2023-07-25 00:20:00+03:00,64.7,awake
4,2023-07-25 00:25:00+03:00,72.0,awake
...,...,...,...
5255,2023-08-23 23:40:00+03:00,68.3,awake
5256,2023-08-23 23:45:00+03:00,74.3,awake
5257,2023-08-23 23:50:00+03:00,55.3,awake
5258,2023-08-23 23:55:00+03:00,58.7,awake


In [77]:
def gap_filler(full_data, start, end):
    """
    Description
    -----------
    This function round the time in multiple hours of 5 minutes (Ex: 00h00, 00h05, 00h10, 00h15, and so on).

    Parameters
    ----------
    full_data : <class 'pandas.core.frame.DataFrame'>
        Sleep and Heart route concatenated. DataFrame with integer index and ['time', 'bpm', 'state'] columns

    start : <string>
        Beginning of the interval. The day is in the format YYYY-MM-DD

    end : <string>
        End of the interval. The day is in the format YYYY-MM-DD

    Returns
    -------
    full_data : <class 'pandas.core.frame.DataFrame'>
        Full data with iserted gaps filled with NaN values. DataFrame with timestamp index and ['bpm', 'state'] columns
    """

    #Add the hour in the start and end time
    start_time = datetime.datetime.strptime(start + "T00:00:00+03:00", "%Y-%m-%dT%H:%M:%S%z")
    end_time = datetime.datetime.strptime(end + "T23:55:00+03:00", "%Y-%m-%dT%H:%M:%S%z")
    
    # Create a spaced timelist within the sleep time interval
    time_list = set()
    aux = start_time
    
    while aux <= end_time:
        time_list.add(aux)
        aux += datetime.timedelta(minutes=5)
    
    # Take just the times that are not in the data 
    gaps = time_list - time_list.intersection(full_data['time'])
    
    # Concatenate the data with the new rolls
    new_rows = {'time': list(gaps), 'bpm':[None]*len(gaps), 'state':[None]*len(gaps)}
    full_data = pd.concat([full_data, pd.DataFrame(new_rows)], ignore_index=True)
    
    # Set the index as a timestamp
    full_data.index = full_data['time']
    full_data = full_data.drop('time', axis=1).sort_index()
    
    # Fill the None state with the state of the previous roll
    full_data['state'] = full_data['state'].ffill()
    
    return full_data

In [109]:
filler_data = gap_filler(round_data, START, END)

# Data Labeling

In [136]:
lay_down = [datetime.datetime.strptime(start, "%Y-%m-%dT%H:%M:%S%z") for start in start_bedtime]
get_up = [datetime.datetime.strptime(end, "%Y-%m-%dT%H:%M:%S%z") + datetime.timedelta(minutes=5) for end in end_bedtime]

In [137]:
df_get_up = pd.DataFrame(get_up, columns = ["time"])
df_get_up['time'] = df_get_up['time'].round('5min')

In [138]:
filler_data['label'] = ''
filler_data[filler_data.index.isin(df_get_up['time'])]['label'] = 'get_up'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [139]:
filler_data.loc[filler_data.index.isin(df_get_up['time']), 'label'] = 'get_up'

In [140]:
df_get_up

Unnamed: 0,time
0,2023-07-26 08:40:00+03:00
1,2023-07-27 07:00:00+03:00
2,2023-07-28 09:35:00+03:00
3,2023-07-29 06:45:00+03:00
4,2023-07-31 09:20:00+03:00
5,2023-08-01 09:50:00+03:00
6,2023-08-02 09:00:00+03:00
7,2023-08-03 05:30:00+03:00
8,2023-08-04 11:35:00+03:00
9,2023-08-05 11:45:00+03:00


In [141]:
filler_data[950:1000]

Unnamed: 0_level_0,bpm,state,label
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-07-28 07:10:00+03:00,,sleep,
2023-07-28 07:15:00+03:00,48.0,sleep,
2023-07-28 07:20:00+03:00,51.0,sleep,
2023-07-28 07:25:00+03:00,53.0,sleep,
2023-07-28 07:30:00+03:00,50.0,sleep,
2023-07-28 07:35:00+03:00,50.0,sleep,
2023-07-28 07:40:00+03:00,50.0,sleep,
2023-07-28 07:45:00+03:00,50.0,sleep,
2023-07-28 07:50:00+03:00,49.0,sleep,
2023-07-28 07:55:00+03:00,52.0,sleep,


In [95]:
get_up[1]

datetime.datetime(2023, 7, 26, 23, 19, 4, tzinfo=datetime.timezone(datetime.timedelta(seconds=10800)))

In [93]:
filler_data[filler_data['time'] > get_up[1]]

Unnamed: 0,time,bpm,state,label
568,2023-07-26 23:20:00+03:00,,awake,
569,2023-07-26 23:25:00+03:00,52.0,sleep,
570,2023-07-26 23:30:00+03:00,52.0,sleep,
571,2023-07-26 23:35:00+03:00,52.0,sleep,
572,2023-07-26 23:40:00+03:00,52.0,sleep,
...,...,...,...,...
8636,2023-08-23 23:40:00+03:00,68.3,awake,
8637,2023-08-23 23:45:00+03:00,74.3,awake,
8638,2023-08-23 23:50:00+03:00,55.3,awake,
8639,2023-08-23 23:55:00+03:00,58.7,awake,


# Day Separator

Rule: The day finish after the last sleep label, and start 5 minutes after the last sleep label of the previous day

In [43]:
def day_batcher(df):
    
    sleep_rows = df[df['state'] == 'sleep']
    sleep_rows['time_diff'] = sleep_rows['time'].shift(-1).diff()
    
    day_end = sleep_rows[sleep_rows['time_diff'] > datetime.timedelta(hours=3)].index  
    day_batch = []

    for idx in range(len(day_end) - 1):

        if idx == 0:
            day_batch.append(df.iloc[0:day_end[idx]])
        else:
            day_batch.append(df.iloc[day_end[idx]+1: day_end[idx+1]])
            
    return day_batch

In [44]:
day_batcher(full_data)

KeyError: 'time'

# Main Function

In [14]:
full_data = pd.concat([sleep_data, heart_data]).dropna(ignore_index=True)
full_data = time_rounder(full_data)
full_data = gap_filler(full_data,START,END)
full_data = full_data.reset_index()

In [15]:
# generate color list
colors=['red' if val == 'awake' else 'blue' for val in full_data['state']]

fig = go.Figure(go.Scatter(
    x = full_data.index,
    y = full_data['bpm'],
    mode='lines',  
    line={'color': 'gray'},
    name="Combined"
))

fig.add_trace(go.Scatter(
    x = full_data[full_data['state'] == 'awake'].index,
    y = full_data[full_data['state'] == 'awake']['bpm'],
    mode='markers',  
    line={'color': 'green'},
    name="Awake"
))

fig.add_trace(go.Scatter(
    x = full_data[full_data['state'] == 'sleep'].index,
    y = full_data[full_data['state'] == 'sleep']['bpm'],
    mode='markers',  
    line={'color': 'blue'},
    name="Seep"
))

fig.update_layout(title = 'BPM Time Series')
fig.show()