In [None]:
# default_exp analyze_by_person

# Analysis By Person

> Module processes actiwatch data for a single individual, with the ability to add additional astral (sunset/sunrise) and sleep timing data.

In [None]:
#hide
from nbdev.showdoc import *

%run load_actiwatch_data.py
%run firsttime.py

import numpy as np
import pandas as pd

from joblib import *
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

## Loading Actiwatch Data

To begin processing raw data for analysis, all folders containing Actiwatch data should be listed in a dictionary. Each key should be an identifying name for a folder of data, and each value should be the path to that folder.

For example:

The directory below uses

1. **key** = v1
        v1 indicates visit number 1, or baseline
        
2. **value** = data/v1
        data/v1 indicates that the csv files to be loaded are stored in the folder "data/v1"

In [None]:
# Maps each visit key to the folder holding that visit's Actiwatch csv files.
# get_raw_data uses the key both as the UID prefix and as the group label.
directory = {
    'v1': 'data/v1',
    'v3': 'data/v3'
}

In [None]:
#export
def get_raw_data(key:str, directory: dict, grouping:str = 'Group'):
    """Loads raw actiwatch data for a particular season.

    #### Parameters

    key: str

        The key to load actiwatch data from (for example, "v1")
    directory: dict

        Dictionary of valid folders to load actiwatch data from.
        Folders should have .csv files in them.
    grouping: str

        Name of the generated column for specifying groupings, where
        the values will be the name of the key given. Default = 'Group'

    #### Returns

        raw_data: pd.DataFrame

            The raw data returned by load_actiwatch_data, with the
            `grouping` column added and set to `key` on every row
    """
    # load_actiwatch_data also returns a summary frame; it is not used here
    raw_data, _summary_data = load_actiwatch_data(directory[key], uidprefix = key)
    # write to the caller-supplied column name rather than a fixed 'Group'
    raw_data[grouping] = key
    return raw_data

In [None]:
show_doc(get_raw_data, title_level = 3)

<h3 id="get_raw_data" class="doc_header"><code>get_raw_data</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h3>

> <code>get_raw_data</code>(**`key`**:`str`, **`directory`**:`dict`, **`grouping`**:`str`=*`'Group'`*)

Loads raw actiwatch data for a particular season.

#### Parameters
    
key: str
    
    The key to load actiwatch data from (for example, "v1")          
directory: dict

    Dictionary of valid folders to load actiwatch data from.
    Folders should have .csv files in them. 
grouping: str
    
    Name of the generated column for specifying groupings, where
    the values will be the name of the key given. Default = 'Group'
    

#### Example

An example of output for this function would be:

In [None]:
raw_data = get_raw_data('v1', directory)
raw_data.head() 

Found 1 csv files in data/v1/. Pass #1, raw data
.
.
Pass #2, data summary
.
.EOF without retrieving summary data: data/v1\v1_sample.csv


Unnamed: 0_level_0,Off-Wrist Status,Activity,Marker,White Light,Red Light,Green Light,Blue Light,Sleep/Wake,Interval Status,UID,Group
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-06-06 14:58:00,0,,,,,,,,EXCLUDED,v1v1\v1,v1
2018-06-06 14:58:30,0,,,,,,,,EXCLUDED,v1v1\v1,v1
2018-06-06 14:59:00,0,,,,,,,,EXCLUDED,v1v1\v1,v1
2018-06-06 14:59:30,0,,,,,,,,EXCLUDED,v1v1\v1,v1
2018-06-06 15:00:00,0,,,,,,,,EXCLUDED,v1v1\v1,v1


## Exporting Timing Data

In [None]:
#export
def export_timing_data(timing_data, outfile: str = None):
    """ Exports timing data to parquet.

    #### Parameters

    timing_data: pd.DataFrame

        Timing data. Must contain a `Date` column. The frame passed in
        is left unchanged; the conversion happens on a copy.
    outfile: str

        Path prefix that "timing.parquet" is appended to. Default = None,
        which falls back to a module-level variable named `outfile`
        (the only behaviour available before this parameter existed).

    #### Raises

        NameError if `outfile` is not given and no module-level
        `outfile` variable exists
    """
    if outfile is None:
        # backward compatibility: the prefix used to be read from a global
        try:
            outfile = globals()["outfile"]
        except KeyError:
            raise NameError("name 'outfile' is not defined") from None

    # convert on a copy so the caller's Date column keeps its original dtype
    export_frame = timing_data.copy()
    export_frame["Date"] = timing_data.Date.values.astype("datetime64[s]")
    export_frame.to_parquet(
        outfile + "timing.parquet", engine = "fastparquet", compression = "gzip"
       )

In [None]:
show_doc(export_timing_data, title_level = 3)

<h3 id="export_timing_data" class="doc_header"><code>export_timing_data</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h3>

> <code>export_timing_data</code>(**`timing_data`**)

Exports timing data to parquet.

#### Parameters

timing_data: pd.DataFrame

    Timing data

## Recalculating Timing Data (if Necessary)

Sometimes it may be valuable to calculate (or recalculate) the timing data. The ability to do so is provided using the below function, which sets up the timing and raw dataframes. 

In this example, we'll use Seattle as our location for determining sunlight (sunset and sunrise timings). The two flags below control whether the raw and timing data are recalculated or loaded from disk. Example thresholds for lux are also specified below. The outfile specified is where the data will be written to.

In [None]:
# Recalculation flags for the raw and timing data.
# NOTE(review): the example call to process_timing_data further down passes
# literal True values instead of these two flags — confirm which is intended.
recalculate_raw = False 
recalculate_timing = False
# Text that process_timing_data splits off the Group label when it builds
# the day-of-week group names.
location = "seattle"
# One lux threshold per inner list; each inner list is handed to
# firstAndLastLight as a single threshold argument.
thresholds = [ [5], [10], [50], [100], [500], [1000] ] 
# Path prefix that "raw.parquet" and "timing.parquet" are appended to.
outfile = "../SALA/example_output/"

In [None]:
#export
def process_timing_data(location: str,
                     outfile: str,
                     thresholds: list,
                     key: str,
                     directory: dict,
                     recalc_raw: bool = False,
                     recalc_timing: bool = False,
                     export_hook = None
                     ):
    """Setup timing and raw dataframes or recalculate their
    values if specified and necessary. Both operations take
    a long time and lots of memory. Ill-advised to recalculate
    timing specifically if the data has already been created.

    #### Parameters

    location: str

        Location for calculating light, for example 'Seattle'. Within
        this function it is only used to cut the Group label: the text
        before the first occurrence of `location` becomes the prefix of
        the GroupDayofWeek / GroupDayType values.
    outfile: str

        Path prefix for the parquet files. "raw.parquet" and
        "timing.parquet" are appended to it, both when writing
        recalculated data and when loading existing data.
    thresholds: list

        List of light thresholds for the watch data. Each element is
        passed to firstAndLastLight as one threshold.
    key: str

        The key to load actiwatch data from. Currently unused: when
        recalc_raw is true, every key in `directory` is loaded.
    directory: dict

        Dictionary of valid seasons to retrieve actiwatch data from
    recalc_raw: bool

        Forces recalculation process if true, loads processed data from disk otherwise.
        Default value is 'False'
    recalc_timing: bool

        Forces recalculation of light timing data, loads it from disk otherwise.
        Default value is 'False'
    export_hook: function

        Placeholder for user to use their own function during data processing.
        This function should take in the timing data as a parameter and
        return the (possibly filtered) timing data. See documentation
        for example.
    #### Returns

        (as a tuple of pd.DataFrames) all the data, the timing data for a particular location
    """
    if recalc_raw:
        print("Loading raw data from disk...")
        # one worker per folder; the loop variable is deliberately not named
        # `key` so it no longer shadows the parameter
        raw_results = (
            Parallel(n_jobs=len(directory))(
                delayed(get_raw_data)(folder_key, directory)
                for folder_key in directory.keys()
            )
        )
        all_data = pd.concat(raw_results)
        # save data to parquet file
        all_data.to_parquet(outfile + "raw.parquet", engine = 'fastparquet',
                           compression = "gzip")
    else:
        # read data from parquet file
        all_data = pd.read_parquet(outfile + "raw.parquet")

    if recalc_timing:
        print("Calculating light timing data...")

        # one worker per threshold
        timing_results = (Parallel(n_jobs=len(thresholds))
            (delayed(firstAndLastLight)(all_data, threshold) for threshold in thresholds)
                      )
        timing_data = pd.concat(timing_results)
        print("Adding holiday markers to timing data...")
        cal = calendar()

        holidays = (
            cal.holidays(start = timing_data.Date.min(), end = timing_data.Date.max())
        )

        nn = pd.DatetimeIndex( timing_data.Date )
        timing_data["DayofWeek"] = nn.dayofweek
        days = ["Mon", "Tues", "Wed", "Thu", "Fri", "Sat", "Sun"]
        day_type = ["Weekday","Weekday","Weekday",
                    "Weekday","Weekday","Weekend/Holiday","Weekend/Holiday"]

        day_group = []
        dtp_group = []
        wknd_holiday = []

        # add days of week to data
        for index, row in timing_data.iterrows():
            # text of the Group label that precedes `location`
            group_prefix = row["Group"].split(location)[0]
            day_number = row["DayofWeek"]

            day_group.append(group_prefix + days[day_number])
            if holidays.isin([row["Date"]]).any():
                # a holiday counts as weekend regardless of the weekday
                dtp_group.append(group_prefix + "Weekend/Holiday")
                wknd_holiday.append(True)
            else:
                dtp_group.append(group_prefix + day_type[day_number])
                # dayofweek 5 and 6 are Saturday and Sunday
                wknd_holiday.append(day_number > 4)

        timing_data["GroupDayofWeek"] = day_group
        timing_data["GroupDayType"] = dtp_group
        timing_data["Weekend/Holiday"] = wknd_holiday

        # function hook for extra processing before exporting to parquet
        if export_hook:
            timing_data = export_hook(timing_data)

        # the dtype conversions needed for parquet are applied to a copy so the
        # returned timing_data keeps its original Date / Watch period values
        timing_copy = timing_data.copy()
        timing_copy["Watch period"] = pd.to_timedelta(timing_copy["Watch period"])
        timing_copy.Date = timing_copy.Date.values.astype("datetime64[s]")
        # written with this function's own `outfile` argument, so the file lands
        # next to raw.parquet and does not depend on a module-level `outfile`
        timing_copy.to_parquet(
            outfile + "timing.parquet", engine = "fastparquet", compression = "gzip"
        )
    else:
        timing_copy = pd.read_parquet(outfile + "timing.parquet", engine = "fastparquet")
        # return date to original format
        timing_copy.Date = timing_copy.Date.apply(lambda x: x.date())
        timing_data = timing_copy.copy()

    return all_data, timing_data

In [None]:
show_doc(process_timing_data, title_level = 3)

<h3 id="process_timing_data" class="doc_header"><code>process_timing_data</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h3>

> <code>process_timing_data</code>(**`location`**:`str`, **`outfile`**:`str`, **`thresholds`**:`list`, **`key`**:`str`, **`directory`**:`dict`, **`recalc_raw`**:`bool`=*`False`*, **`recalc_timing`**:`bool`=*`False`*, **`export_hook`**=*`None`*)

Setup timing and raw dataframes or recalculate their
values if specified and necessary. Both operations take
a long time and lots of memory. Ill-advised to recalculate
timing specifically if the data has already been created.

#### Parameters

location: str

    Location for calculating light, for example 'Seattle'  
outfile: str

    File for re-written data to be placed in, or for data to be loaded from    
thresholds: list

    List of light thresholds for the watch data
key: str

    The key to load actiwatch data from     
directory: dict

    Dictionary of valid seasons to retrieve actiwatch data from
recalc_raw: bool

    Forces recalculation process if true, loads processed data from disk otherwise.
    Default value is 'False'
recalc_timing: bool

    Forces recalculation of light timing data, loads it from disk otherwise.
    Default value is 'False'
export_hook: function

    Placeholder for user to use their own function during data processing.
    This function should take in the timing data as a parameter. See 
    documentation for example.
#### Returns

    (as a tuple of pd.DataFrames) all the data, the timing data for a particular location

### Function Hook Example

Before exporting timing data within the processing cycle, users can run their own custom analysis function to subset the data as desired. A very minimal example of such a function is provided below.

In [None]:
#exports
def remove_first_day(timing_data):
    """Example function hook that drops rows without usable light data.

    A row is kept only when its "Last Light" value is not NaT and its
    "Date" differs from the earliest "Date" in the frame.

    #### Parameters

    timing_data: pd.DataFrame

        Timing data
    #### Returns

        The rows of timing_data that pass both conditions
    """
    last_light_missing = timing_data["Last Light"].apply(np.isnat)
    earliest_date = timing_data["Date"].min()
    keep_row = last_light_missing.eq(False) & timing_data["Date"].ne(earliest_date)
    return timing_data[keep_row]

#### Example

An example of output for this function would be:

**Note**: timing_data is displayed in three column slices for easier viewing

In [None]:
all_data, timing_data = process_timing_data(location, outfile, thresholds, 'v1', directory, True, True, remove_first_day)

Loading raw data from disk...
Calculating light timing data...
Adding holiday markers to timing data...


In [None]:
all_data.head()

Unnamed: 0_level_0,Off-Wrist Status,Activity,Marker,White Light,Red Light,Green Light,Blue Light,Sleep/Wake,Interval Status,UID,Group
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-06-06 14:58:00,0,,,,,,,,EXCLUDED,v1v1\v1,v1
2018-06-06 14:58:30,0,,,,,,,,EXCLUDED,v1v1\v1,v1
2018-06-06 14:59:00,0,,,,,,,,EXCLUDED,v1v1\v1,v1
2018-06-06 14:59:30,0,,,,,,,,EXCLUDED,v1v1\v1,v1
2018-06-06 15:00:00,0,,,,,,,,EXCLUDED,v1v1\v1,v1


In [None]:
timing_data.iloc[:,:7].head()

Unnamed: 0,UID,Date,Threshold,Last Light,Mins to LL from 4AM,First Light,Mins to FL from 4AM
0,v1v1\v1,2018-07-03,5,2018-07-03 23:31:00,1171.0,2018-07-03 07:00:30,180.0
1,v1v1\v1,2018-07-02,5,2018-07-02 23:25:00,1165.0,2018-07-02 07:57:30,237.0
2,v1v1\v1,2018-07-10,5,2018-07-10 08:08:00,248.0,2018-07-10 06:59:00,179.0
3,v1v1\v1,2018-06-27,5,2018-06-27 22:57:00,1137.0,2018-06-27 09:12:30,312.0
4,v1v1\v1,2018-06-29,5,2018-06-29 22:18:30,1098.0,2018-06-29 06:47:00,167.0


In [None]:
timing_data.iloc[:,7:14].head()

Unnamed: 0,Time above threshold,Time above threshold AM,Minutes above threshold,Minutes above threshold AM,Lux minutes,Lux minutes AM,Group
0,0 days 12:47:30,0 days 03:37:30,767.5,217.5,377968.04,143815.5,v1
1,0 days 09:06:30,0 days 03:49:00,546.5,229.0,221486.77,69199.365,v1
2,0 days 01:00:30,0 days 01:00:30,60.5,60.5,47044.555,47044.555,v1
3,0 days 10:25:00,0 days 02:23:00,625.0,143.0,640912.35,247699.58,v1
4,0 days 05:00:30,0 days 03:26:30,300.5,206.5,65985.73,64188.25,v1


In [None]:
timing_data.iloc[:,14:].head()

Unnamed: 0,Watch period,DayofWeek,GroupDayofWeek,GroupDayType,Weekend/Holiday
0,0 days 00:00:30,1,v1Tues,v1Weekday,False
1,0 days 00:00:30,0,v1Mon,v1Weekday,False
2,0 days 00:00:30,1,v1Tues,v1Weekday,False
3,0 days 00:00:30,2,v1Wed,v1Weekday,False
4,0 days 00:00:30,4,v1Fri,v1Weekday,False


## Setting Sunset and Sunrise Timings

Sunset and sunrise timings can also be calculated for actiwatch data. To do so, the specific location to be calculated for is required. Of most importance for the calculation is the longitude and latitude. 

In [None]:
#hide
from astral import LocationInfo, sun

In [None]:
#export
def set_sun_timings(timing_data,
                    loc:str,
                    region: str,
                    timezone: str,
                    latitude: float,
                    longitude: float
                   ):
    """Add "Sunrise" and "Sunset" columns to the timing data for one location.

    #### Parameters

    timing_data: pd.DataFrame

        Timing data. Its `Date` column supplies the days to calculate for.
        The two new columns are written onto this same frame.
    loc: str (any string)

        Name of location to lookup for sunrise/sunset calculations
    region: str (any string)

        Name of the region the location belongs to
    timezone: str

        the location's timezone (a list of timezones can be obtained from pytz.all_timezones)
    latitude: float

        Latitude position of the location for sunrise/sunset calculations
    longitude: float

        Longitude position of the location for sunrise/sunset calculations
    #### Returns

        The same timing data frame, now carrying the sunrise and sunset columns

    """
    # location record whose observer and tzinfo feed the astral calculations
    place = LocationInfo(loc, region, timezone, latitude, longitude)

    def event_times(solar_event):
        # evaluate one astral event (sunrise or sunset) for every Date entry
        return timing_data.Date.apply(
            lambda day: solar_event(place.observer, day, tzinfo = place.tzinfo)
        )

    timing_data["Sunrise"] = event_times(sun.sunrise)
    timing_data["Sunset"] = event_times(sun.sunset)

    return timing_data

In [None]:
show_doc(set_sun_timings, title_level = 3)

<h3 id="set_sun_timings" class="doc_header"><code>set_sun_timings</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h3>

> <code>set_sun_timings</code>(**`timing_data`**, **`loc`**:`str`, **`region`**:`str`, **`timezone`**:`str`, **`latitude`**:`float`, **`longitude`**:`float`)

Given a location (city), calculate sunset and sunrise timings for the data

#### Parameters

timing_data: pd.DataFrame

    Timing data
loc: str (any string)

    Name of location to lookup for sunrise/sunset calculations
region: str (any string)

    Name of the region the location belongs to
timezone: str  

    the location's timezone (a list of timezones can be obtained from pytz.all_timezones)
latitude: float

    Latitude position of the location for sunrise/sunset calculations
longitude: float

    Longitude position of the location for sunrise/sunset calculations
#### Returns

    Modified timing data with sunrise and sunset calculations
    

#### Example

An example of the added portion to timing data from this function would be:

In [None]:
timing_data = (
    set_sun_timings(timing_data, "Seattle", "United States", "America/Los_Angeles", 47.65, -122.30)
)

timing_data[["Sunrise", "Sunset"]].head()

Unnamed: 0,Sunrise,Sunset
0,2018-07-03 05:17:02.116743-07:00,2018-07-03 21:09:39.386344-07:00
1,2018-07-02 05:16:22.672261-07:00,2018-07-02 21:09:58.273719-07:00
2,2018-07-10 05:22:31.640667-07:00,2018-07-10 21:06:14.940775-07:00
3,2018-06-27 05:13:37.265731-07:00,2018-06-27 21:10:53.349244-07:00
4,2018-06-29 05:14:36.884762-07:00,2018-06-29 21:10:39.206298-07:00


## Adding Sleep Information

Adding sleep information to the timing data is also possible. The below function adds sleep data, splitting a sleep day at 6pm (under the assumption that people generally do not sleep/end their day at 6pm).

In [None]:
#export
def process_sleep_data(timing_data, num_sleeps: int = 2, all_data = None):
    """Processes sleep data for existing timing data.

    For every row of timing_data, the raw data of that row's UID is cut
    from 18:00 on the row's Date to 18:00 on the following day. Runs of
    "REST-S" entries separated by more than one hour are treated as
    separate sleep periods, and the longest period supplies the sleep
    columns for the row.

    #### Parameters

    timing_data: pd.DataFrame

        Timing data. Needs `UID` and `Date` columns. The sleep columns
        are added to this frame in place.
    num_sleeps: int

        Cutoff for number of sleeps to display in first resulting frame.
        Default = 2, frame will store days with 3+ sleep instances
    all_data: pd.DataFrame

        Raw actiwatch data indexed by DateTime, with `UID` and
        `Interval Status` columns. Default = None, which falls back to a
        module-level variable named `all_data` (the only behaviour
        available before this parameter existed).

    #### Returns

        short_frame: pd.DataFrame

            Onset, offset, and duration for sleep periods on days with
            more than num_sleeps number of sleep periods. Empty when no
            day qualifies.
        timing_data: pd.DataFrame

            Modified timing data with included sleep information

    #### Raises

        NameError if `all_data` is not given and no module-level
        `all_data` variable exists
    """
    if all_data is None:
        # backward compatibility: the raw data used to be read from a global
        try:
            all_data = globals()["all_data"]
        except KeyError:
            raise NameError("name 'all_data' is not defined") from None

    sleepers = []
    sleep_onsets = []
    sleep_offsets = []
    sleep_durations = []
    sleep_onsetMSLMs = []
    sleep_offsetMSLMs = []
    for arow in timing_data.itertuples():
        # the name UID is referenced as @UID inside the query string below
        UID = arow.UID
        DT = pd.to_datetime(arow.Date)
        TM = pd.to_datetime(DT + pd.Timedelta("1 day"))
        today = DT.strftime("%Y-%m-%d")

        nextday = TM.strftime("%Y-%m-%d")

        # taking raw timing data entry and splitting a "sleep day" at 6pm
        # under the assumption that people do not end their days that early
        day_split = all_data.query("UID == @UID").loc[today +" 18:00":nextday + " 18:00"]

        # REST-S = watch thinks user is asleep
        asleep = day_split[ day_split["Interval Status"] == "REST-S"].copy()

        # there may be more than one sleep period in a given day's data
        # new sleep period = when there is more than 1 hour between successive REST-S entries
        sleep_periods = []
        per = 0

        try:
            lt = asleep.index[0]
            for time in asleep.index:
                # allow up to 1 hour of being awake in the middle of the night
                if (time - lt > pd.Timedelta("1 hour")):
                    per += 1
                lt = time
                sleep_periods.append(per)
            asleep["Sleep period"] = sleep_periods
        except IndexError:
            # no REST-S rows for this person-day
            # NOTE(review): the fallback below relies on how pandas handles a
            # length-1 assignment to an empty frame — confirm on pandas upgrades
            asleep["Sleep period"] = [pd.to_datetime(0)]


        try:
        # calc sleep onsets/offsets/duration for each period of sleep in a person-day of data
            sleeps = asleep.reset_index().groupby("Sleep period").apply( lambda x: pd.DataFrame({
                     "Sleep onset": [x.DateTime.min()],
                     "Sleep offset": [x.DateTime.max()],
                     "Sleep duration": [x.DateTime.max() - x.DateTime.min()]
                     }, index = x.DateTime.dt.normalize() ))
        # if the value is = 0 -> np.int64 (not a DateTime)
        except AttributeError:
            sleeps = asleep.reset_index().groupby("Sleep period").apply( lambda x: pd.DataFrame({
             "Sleep onset": [pd.to_datetime(DT)],
             "Sleep offset": [pd.to_datetime(DT)],
             "Sleep duration": [pd.to_timedelta(x.DateTime.max() - x.DateTime.min())]
             }))
        # longest sleep period first; it supplies the values for this row
        sleeps = sleeps.drop_duplicates().sort_values(by="Sleep duration", ascending = False)
        onset = sleeps.iloc[0]['Sleep onset']
        offset = sleeps.iloc[0]['Sleep offset']
        dur =  sleeps.iloc[0]['Sleep duration']

        # start from NaN so a non-datetime onset/offset can neither reuse the
        # previous row's value nor hit an unbound name on the first row
        onMSLM = np.nan
        offMSLM = np.nan

        # if onset is actually a datetime
        if not isinstance(onset, np.int64):
            # minutes from midnight of the row's Date to sleep onset
            onMSLM = (onset - DT).total_seconds() / 60.0

        # if offset is actually a datetime
        if not isinstance(offset, np.int64):
            # minutes from the following midnight to sleep offset, floored at 0
            offMSLM = np.maximum((offset - TM).total_seconds() / 60.0, 0.0)

        sleep_onsets.append(onset)
        sleep_offsets.append(offset)
        sleep_durations.append(dur)
        sleep_onsetMSLMs.append(onMSLM)
        sleep_offsetMSLMs.append(offMSLM)
        sleep_count = sleeps.shape[0]

        # adding to short_frame
        if sleep_count > num_sleeps:
            sleeps['UID'] = UID
            sleeps['DT'] = DT
            sleepers.append(sleeps)

    if sleepers:
        short_frame = (
                       pd.concat(sleepers).reset_index().drop('DateTime',axis=1)
                       .set_index(['UID','DT']).drop_duplicates()
                       )
    else:
        # pd.concat raises on an empty list, so build the empty result directly
        short_frame = pd.DataFrame(
            columns = ["Sleep period", "Sleep onset", "Sleep offset", "Sleep duration"],
            index = pd.MultiIndex.from_arrays([[], []], names = ['UID', 'DT'])
        )

    timing_data["Sleep onset"] = sleep_onsets
    timing_data["Sleep offset"] = sleep_offsets
    timing_data["Sleep duration"] = sleep_durations
    timing_data["Sleep onset MSLM"] = sleep_onsetMSLMs
    timing_data["Sleep offset MSLM"] = sleep_offsetMSLMs

    return short_frame, timing_data

In [None]:
show_doc(process_sleep_data, title_level = 3)

<h3 id="process_sleep_data" class="doc_header"><code>process_sleep_data</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h3>

> <code>process_sleep_data</code>(**`timing_data`**, **`num_sleeps`**:`int`=*`2`*)

Processes sleep data for existing timing data.

#### Parameters

timing_data: pd.DataFrame

    Timing data
num_sleeps: int

    Cutoff for number of sleeps to display in first resulting frame.
    Default = 2, frame will store days with 3+ sleep instances

#### Returns

    short_frame: pd.DataFrame
    
        Onset, offset, and duration for sleep periods on days with
        more than num_sleeps number of sleep periods     
    timing_data: pd.DataFrame
    
        Modified timing data with included sleep information

#### Example


Examples of the short frame and of the columns added to the timing data are shown below:

In [None]:
short_frame = process_sleep_data(timing_data)[0]
short_frame.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sleep period,Sleep onset,Sleep offset,Sleep duration
UID,DT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
v1v1\v1,2018-06-28,0,2018-06-29 00:40:30,2018-06-29 06:41:00,0 days 06:00:30
v1v1\v1,2018-06-28,2,2018-06-29 13:42:00,2018-06-29 15:23:30,0 days 01:41:30
v1v1\v1,2018-06-28,1,2018-06-29 08:50:30,2018-06-29 09:04:00,0 days 00:13:30
v3v3\v3,2018-09-24,0,2018-09-24 21:31:00,2018-09-25 04:44:30,0 days 07:13:30
v3v3\v3,2018-09-24,1,2018-09-25 09:16:00,2018-09-25 10:30:00,0 days 01:14:00


In [None]:
timing_data = process_sleep_data(timing_data)[1]
timing_data[
    ["Sleep onset", "Sleep offset",
     "Sleep duration", "Sleep onset MSLM",
     "Sleep offset MSLM"]
    ].head()

Unnamed: 0,Sleep onset,Sleep offset,Sleep duration,Sleep onset MSLM,Sleep offset MSLM
0,2018-07-03 23:57:30,2018-07-04 06:14:00,0 days 06:16:30,1437.5,374.0
1,2018-07-02 23:35:30,2018-07-03 06:35:30,0 days 07:00:00,1415.5,395.5
2,2018-07-10 00:00:00,2018-07-10 00:00:00,0 days 00:00:00,0.0,0.0
3,2018-06-27 23:07:30,2018-06-28 06:57:00,0 days 07:49:30,1387.5,417.0
4,2018-06-29 22:20:00,2018-06-30 07:44:00,0 days 09:24:00,1340.0,464.0


Don't forget to export the data once finished.

In [None]:
export_timing_data(timing_data)