In [None]:
# default_exp SALA

# SALA

> Core module of SALA, built to process actiwatch data for a single individual. Prepares actiwatch style data exported in a CSV from Philips Actiware watches and produces additional analyses.

In [None]:
#hide
from nbdev.showdoc import *

%run load_actiwatch_data.py
%run firsttime.py

import numpy as np
import pandas as pd

from joblib import *
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from astral import LocationInfo, sun

In [None]:
#export
class SALA:
    """
    DataFrame-like storage for actiwatch data loaded either from a directory of csv files
    or from an existing SALA-format dataframe.


        Attributes
        ----------
        data: pd.DataFrame or None
            Initialized as None, but can be set as a dataframe, which is expected to contain
            light and sleep information consistent with SALA formatting. It should only be
            pre-set to an existing dataframe when trying to migrate existing data to a SALA
            object.

        directory: dict or None
            Dictionary mapping a group key (e.g. "base_") to the folder that holds the
            csv files for that group.

        timezone: str
            Single timezone specified for all data within the object. A list of
            valid timezones can be obtained from pytz.all_timezones. Note that it is impossible
            for different timezones to be present (all data must be converted to a single timezone)

        latitude: float
            Latitude position for sunrise/sunset calculations.

        longitude: float
            Longitude position for sunrise/sunset calculations.

        Methods
        -------
        __init__(latitude, longitude, timezone, data=None, directory=None)
            Initialization with location details and either a pre-processed SALA-esque
            dataframe or a directory dictionary for loading and processing raw data.

        get_raw_data_from_key(key, directory=None, grouping='Group')
            Loads and combines all raw data from the csv files found in the folder
            registered under a given key. A key indicates a grouping of multiple csvs.

        get_raw_data(outfile, directory=None, grouping='Group', export=True)
            Loads and combines all raw data from multiple csv files for all keys within
            the directory dictionary, and can export the combined data to a parquet file.

        export(outfile, data=None)
            Exports the data within a SALA object (or a given dataframe) to a parquet file.

        process_data(raw_data, thresholds)
            Handles unprocessed combined raw data outputting first and last light times,
            and group identifiers for all specified light thresholds.

        sun_timings()
            Calculates sunset and sunrise timing information for currently stored SALA
            data, based on the timezone, latitude and longitude stored in the object.

        do_everything(outfile, thresholds, directory=None, grouping='Group', export=True)
            Runs get_raw_data, process_data and sun_timings in sequence and optionally
            exports the processed timing data.

        process_sleep(raw_data, sleep_split='18:00', num_sleeps=3)
            Processes sleep data for existing timing data, generating a summary dataframe
            based on the number of sleep periods within the data.
    """
    
    def __init__(self, latitude, longitude, timezone, data=None, directory = None):
        """
        Initializes a SALA object either from existing parsed timing data, or from a directory
        of csvs. Timezone information can be optionally included to allow for sunset, sunrise 
        data to be added.

        #### Parameters

            timezone: str 
                A valid timezone (a list of timezones can be obtained from pytz.all_timezones).
            
            latitude: float 
                Latitude position for sunrise/sunset calculations. Northern latitudes
                should be positive values.
                
            longitude: float 
                Longitude position for sunrise/sunset calculations. Eastern longitudes
                should be positive values.
                
            data: pd.DataFrame (optional)
                If not None, data should be a pre-processed SALA-format dataframe, expected to contain 
                details on light and sleep information. 
            
            directory: dictionary (optional)
                Dictionary of valid folder names to load actiwatch data from.
                Folders should have .csv files in them.
        """
        self._data = data
        self._directory = directory
        self._timezone = timezone
        self._latitude = latitude
        self._longitude = longitude
    
    @property
    def data(self):
        """Getter method for data."""
        return self._data
    
    @data.setter
    def data(self, value):
        """Setter method for data.

        Accepts any pd.DataFrame (including subclasses); anything else raises TypeError.
        """
        # isinstance (rather than an exact type comparison) lets DataFrame subclasses through.
        if not isinstance(value, pd.DataFrame):
            raise TypeError("Error: Data must be of type pd.DataFrame")
        self._data = value
    
    @property
    def directory(self):
        """Getter method for directory."""
        return self._directory
    
    @directory.setter
    def directory(self, value):
        """Setter method for directory.

        The directory is a dictionary mapping group keys to folders of csv files
        (the loaders index it by key). A plain string is still accepted because
        earlier versions of this setter only allowed strings.
        """
        if not isinstance(value, (dict, str)):
            raise TypeError("Error: directory must be a dictionary of folder paths")
        self._directory = value
        
    @property
    def timezone(self):
        """Getter method for timezone."""
        return self._timezone
    
    # The setter must be derived from the timezone property itself; deriving it from
    # another property would pair this setter with that property's getter.
    @timezone.setter
    def timezone(self, value):
        """Setter method for timezone."""
        if not isinstance(value, str):
            raise TypeError("Error: timezone must be a valid string")
        self._timezone = value
        
    @property
    def latitude(self):
        """Getter method for latitude."""
        return self._latitude
    
    # The setter must be derived from the latitude property itself; deriving it from
    # another property would pair this setter with that property's getter.
    @latitude.setter
    def latitude(self, value):
        """Setter method for latitude."""
        if not isinstance(value, (int, float, complex)):
            raise TypeError("Error: latitude must be a numeric")
        self._latitude = value
    
    @property
    def longitude(self):
        """Return the stored longitude."""
        return self._longitude
    
    @longitude.setter
    def longitude(self, value):
        """Store a new longitude; values that are not int, float or complex raise TypeError."""
        accepted_types = (int, float, complex)
        if isinstance(value, accepted_types):
            self._longitude = value
            return
        raise TypeError("Error: longitude must be a numeric")
        
    def get_raw_data_from_key(self, key, directory = None, grouping = 'Group'):
        """Loads and combines raw actiwatch data from any csv files found in
           the specified directory matching a particular key within the directory.

            #### Parameters

            key: str

                The key to load actiwatch data from (for example, "v1").
                
            directory: dict

                Dictionary of valid folders to load actiwatch data from.
                Folders should have .csv files in them. If no dictionary
                is provided, it uses the one initialized as part of the SALA
                object.
                
            grouping: str

                Name of the generated column for specifying groupings, where
                the values will be the name of the key given. Default = 'Group'.
                
            #### Returns

            All of the raw unprocessed data within the directory matching a specified key.

    """
        if directory is None and self._directory is None:
            raise ValueError("Error: a valid source of data must be provided.")
        if directory is not None:
            self._directory = directory
        raw_data = load_actiwatch_data(self.directory[key], uidprefix = key)[0]
        raw_data[grouping] = key
        return raw_data
    
    def get_raw_data(self, outfile, directory = None, grouping = 'Group', export = True):
        """Loads and combines raw actiwatch data from any csv files found in
           the specified directory for all keys within the directory.

            #### Parameters
          
            outfile: str
            
                Directory to save to. (e.g. ../SALA/example_output/)
          
            directory: dict

                Dictionary of valid folders to load actiwatch data from.
                Folders should have .csv files in them. If no dictionary
                is provided, it uses the one initialized as part of the SALA
                object.
                
            grouping: str

                Name of the generated column for specifying groupings, where
                the values will be the name of the key given. Default = 'Group'.
                
            export: bool
            
                Whether or not to export combined raw data to a parquet file saved in the designated
                outfile location. 
                
            #### Returns

            All of the raw unprocessed data within the directory for all keys as a single
            dataframe.

    """
        if directory is None and self._directory is None:
            raise ValueError("Error: a valid source of data must be provided.")
        if directory is not None:
            self._directory = directory
        raw_results = (
            Parallel(n_jobs=len(self._directory))(delayed(self.get_raw_data_from_key)(key, self._directory) for key in self._directory.keys())
                   )
        # save data to parquet file
        all_data = pd.concat(raw_results)
        
        if export: 
            all_data.to_parquet(outfile + "raw.parquet", engine = 'fastparquet',
                                   compression = "gzip")
        
        return pd.read_parquet(outfile + "raw.parquet")
    
    def export(self, outfile, data=None):
        """
        Exports existing timing data to a parquet format.
        
        #### Parameters
            outfile: str

                Directory to save to. (e.g. ../SALA/example_output/)
            data: pd.DataFrame

            Desired dataframe for exporting. 
        """
        
        if self.data is None and data is None:
            raise Exception("Error: no timing data available to export.")
        if data is None:
            data = self.data
        # putting date information in a parquet valid format
        data["Date"] = data["Date"].values.astype("datetime64[s]")
        data.to_parquet(f"{outfile}timing.parquet", 
                               engine = "fastparquet", compression="gzip")
    
    
    def process_data(self,
                     raw_data, 
                     thresholds):
        """Turns combined raw data into per-day timing data for every light threshold.

        #### Parameters
        
        raw_data: pd.DataFrame
            
            Combined dataframe of all raw data from desired directory. This can be
            accomplished by using the get_raw_data function within the SALA class. 

        thresholds: list

            List of light thresholds for the watch data.

        #### Returns
            
            Processed timing data in a dataframe format, with specific identifier columns based
            on weekday and weekend/holiday groupings. The same dataframe is also stored
            on the object.
        """
        # one job per threshold, each producing a timing dataframe
        per_threshold = Parallel(n_jobs=len(thresholds))(
            delayed(firstAndLastLight)(raw_data, level) for level in thresholds
        )
        timing_data = pd.concat(per_threshold)

        # federal holidays within the recorded date range
        holiday_dates = calendar().holidays(start = timing_data.Date.min(),
                                            end = timing_data.Date.max())

        # day number (Monday = 0) for every row
        timing_data["DayofWeek"] = pd.DatetimeIndex(timing_data["Date"]).dayofweek
        day_names = np.array(["Mon", "Tues", "Wed", "Thu", "Fri", "Sat", "Sun"])
        day_kinds = np.array(["Weekday"] * 5 + ["Weekend/Holiday"] * 2)

        # Group identifier followed by the day name (e.g. Mon)
        timing_data["GroupDayofWeek"] = timing_data["Group"] + day_names[timing_data["DayofWeek"]]

        on_holiday = pd.to_datetime(timing_data["Date"]).isin(holiday_dates)
        holiday_labels = timing_data["Group"] + "Weekend/Holiday"

        # Group identifier followed by the day type (e.g. Weekday); holidays take
        # the weekend/holiday label regardless of the day they fall on
        regular_labels = timing_data["Group"] + day_kinds[timing_data["DayofWeek"]]
        timing_data["GroupDayType"] = (
            regular_labels.where(~on_holiday)
            .combine_first(holiday_labels.where(on_holiday))
        )
        timing_data["Weekend/Holiday"] = (timing_data["DayofWeek"] > 4) | on_holiday

        self._data = timing_data
        timing_data["Watch period"] = pd.to_timedelta(timing_data["Watch period"])

        return timing_data
    
    def sun_timings(self):
        """Calculates sunrise and sunset timing information for data present in the
        SALA object.

        #### Returns

            Modified timing data with sunrise and sunset calculations
        """
        
        if self._timezone is None or self._latitude is None or self._longitude is None:
            raise ValueError("Error: Missing timezone, latitude, or longitude info.")
        
        # add location info for calculating astral data
        city = LocationInfo("location", "region", self._timezone, self._latitude, self._longitude)
        self._data["Sunrise"] = self._data["Date"].apply( lambda x: sun.sunrise(city.observer,
                                                                           x,
                                                                           tzinfo = city.tzinfo))
        self._data["Sunset"] = self._data["Date"].apply( lambda x: sun.sunset(city.observer,
                                                                         x,
                                                                         tzinfo = city.tzinfo))
        return self._data
    
    
    def do_everything(self, outfile, thresholds, directory = None, grouping = "Group", export = True):
        """Handles the full SALA pipeline (excluding sleep period analysis), from processing and combining raw data
        to parsing and calculating processed data with sunrise and sunset information. First loads and compiles 
        all existing raw data for every key within the given directory. Then processes all raw data, calculating
        additional information for all specified light thresholds. Also adds sunrise and sunset information. 
        
        #### Parameters
        
        outfile: str
            
                Directory to save to. (e.g. ../SALA/example_output/)
                
        thresholds: list

            List of light thresholds for the watch data.
            
        directory: dict

            Dictionary of valid folders to load actiwatch data from.
            Folders should have .csv files in them. If no dictionary
            is provided, it uses the one initialized as part of the SALA
            object.
                
        grouping: str

            Name of the generated column for specifying groupings, where
            the values will be the name of the key given. Default = 'Group'.
            
        export: bool
            
            Whether or not to export processed timing data to a parquet file saved in the designated
            outfile location. 
            
        #### Returns
            
            Processed timing data in a dataframe format, with specific identifier columns based
            on weekday and weekend/holiday groupings, and included sunrise and sunset calculations. 
        """
        if directory == None:
            directory = self.directory
            
        raw_data = self.get_raw_data(outfile, directory, grouping)
        data = self.process_data(raw_data, thresholds)
        self.sun_timings()
        
        if export:
            self.export(data = self.data, outfile = outfile)
            
        return self._data
        
    
    def process_sleep(self, raw_data, sleep_split = "18:00", num_sleeps = 3):
        """Processes sleep data for existing timing data.

        #### Parameters
        
        raw_data: pd.DataFrame

            Combined dataframe of all raw data from desired directory. This can be
            accomplished by using the get_raw_data function within the SALA class. 
            
        sleep_split: str

            Time to split the sleep day. Default is "18:00", which is 6:00PM.
            
        num_sleeps: int

            Cutoff for number of sleeps to display in first resulting frame.
            Default = 3, frame will store days with 3+ sleep instances

        #### Returns

            short_frame: pd.DataFrame

                Onset, offset, and duration for sleep periods on days with
                more than num_sleeps number of sleep periods
                
            timing_data: pd.DataFrame

                Modified timing data with included sleep information

        """
        sleepers = []
        sleep_onsets = []
        sleep_offsets = []
        sleep_durations = []
        sleep_onsetMSLMs = []
        sleep_offsetMSLMs = []
        
        timing_data = self._data
        for arow in timing_data.itertuples():
            UID = arow.UID
            DT = pd.to_datetime(arow.Date)
            TM = pd.to_datetime(DT + pd.Timedelta("1 day"))
            today = DT.strftime("%Y-%m-%d")

            nextday = TM.strftime("%Y-%m-%d")

            # taking raw timing data entry and splitting a "sleep day" at 6pm
            # under the assumption that people do not end their days that early
            day_split = raw_data.query("UID == @UID").loc[today +" " + sleep_split:nextday + " 18:00"]

            # REST-S = watch thinks user is asleep
            asleep = day_split[ day_split["Interval Status"] == "REST-S"].copy()

            # there may be more than one sleep period in a given day's data
            # new sleep period = when there is more than 1 hour between successive REST-S entries
            sleep_periods = []
            per = 0
            count = 0

            try:
                lt = asleep.index[0]
                for time in asleep.index:
                    # allow up to 1 hour of being awake in the middle of the night
                    if (time - lt > pd.Timedelta("1 hour")):
                        per += 1
                    lt = time
                    sleep_periods.append(per)
                asleep["Sleep period"] = sleep_periods
            except IndexError:
                asleep["Sleep period"] = [pd.to_datetime(0)]

            try:
            # calc sleep onsets/offsets/duration for each period of sleep in a person-day of data
                sleeps = asleep.reset_index().groupby("Sleep period").apply( lambda x: pd.DataFrame({
                         "Sleep onset": [x.DateTime.min()],
                         "Sleep offset": [x.DateTime.max()],
                         "Sleep duration": [x.DateTime.max() - x.DateTime.min()]
                         }, index = x.DateTime.dt.normalize() ))
            # if the value is = 0 -> np.int64 (not a DateTime)
            except AttributeError:
                sleeps = asleep.reset_index().groupby("Sleep period").apply( lambda x: pd.DataFrame({
                 "Sleep onset": [pd.to_datetime(DT)],
                 "Sleep offset": [pd.to_datetime(DT)],
                 "Sleep duration": [pd.to_timedelta(x.DateTime.max() - x.DateTime.min())]
                 }))
            sleeps = sleeps.drop_duplicates().sort_values(by="Sleep duration", ascending = False)
            onset = sleeps.iloc[0]['Sleep onset']
            offset = sleeps.iloc[0]['Sleep offset']
            dur =  sleeps.iloc[0]['Sleep duration']

            # if onset is actually a datetime
            if not isinstance(onset, np.int64):
                onMSLM = (onset - DT).total_seconds() / 60.0

            # if offset is actually a datetime
            if not isinstance(offset, np.int64):
                offMSLM = np.maximum((offset - TM).total_seconds() / 60.0, 0.0)

            sleep_onsets.append(onset)
            sleep_offsets.append(offset)
            sleep_durations.append(dur)
            sleep_onsetMSLMs.append(onMSLM)
            sleep_offsetMSLMs.append(offMSLM)
            sleep_count = sleeps.shape[0]

            # adding to short_frame
            if sleep_count >= num_sleeps:
                sleeps['UID'] = UID
                sleeps['DT'] = DT
                sleeps.reset_index(drop = True).set_index(['UID','DT'])
                sleepers.append(sleeps)
        short_frame = (
                       pd.concat(sleepers).reset_index().drop('DateTime',axis=1)
                       .set_index(['UID','DT']).drop_duplicates()
                       )
        timing_data["Sleep onset"] = sleep_onsets
        timing_data["Sleep offset"] = sleep_offsets
        timing_data["Sleep duration"] = sleep_durations
        timing_data["Sleep onset MSLM"] = sleep_onsetMSLMs
        timing_data["Sleep offset MSLM"] = sleep_offsetMSLMs
        
        self._data = timing_data
        
        return short_frame, timing_data

## Creating SALA Objects

SALA objects can be created by using the initialization method provided. This method requires location specific information and either existing data or a data source directory:

#### Latitude, Longitude, Timezone

This location-specific information is required to generate accurate sunrise and sunset timings.


#### Data

This should be a processed SALA-style dataframe, with the corresponding columns such as 
"First Light", "Date", "Lux Minutes", etc. Pass existing data when you already have processed timing data, for example data that is still missing sunrise and sunset information, or data you want to use immediately with SALA-style plots. 

#### Directory

In most cases, it is preferred to have a directory entered. A directory should be in dictionary style, where keys denote file groupings (e.g. baseline versus intervention) and the values denote relative file paths/folders to find csv stored data in. 

In [None]:
show_doc(SALA.__init__, title_level = 3)

<h3 id="SALA.__init__" class="doc_header"><code>SALA.__init__</code><a href="__main__.py#L60" class="source_link" style="float:right">[source]</a></h3>

> <code>SALA.__init__</code>(**`latitude`**, **`longitude`**, **`timezone`**, **`data`**=*`None`*, **`directory`**=*`None`*)

Initializes a SALA object either from existing parsed timing data, or from a directory
of csvs. Timezone information can be optionally included to allow for sunset, sunrise 
data to be added.

#### Parameters

    timezone: str 
        A valid timezone (a list of timezones can be obtained from pytz.all_timezones).
    
    latitude: float 
        Latitude position for sunrise/sunset calculations. Northern latitudes
        should be positive values.
        
    longitude: float 
        Longitude position for sunrise/sunset calculations. Eastern longitudes
        should be positive values.
        
    data: pd.DataFrame (optional)
        If not None, data should be a pre-processed SALA-format dataframe, expected to contain 
        details on light and sleep information. 
    
    directory: dictionary (optional)
        Dictionary of valid folder names to load actiwatch data from.
        Folders should have .csv files in them.

SALA objects can be initialized in one of two ways: from a directory of raw csv files, or from existing processed data.

In [None]:
# Each key names a group (it is also used as the UID prefix when loading);
# each value is the folder that holds that group's csv files.
directory = {
    'base_': 'data/v1',
    'follow_up_': 'data/v3'
}
# Location details used for the sunrise/sunset calculations.
timezone = "America/Los_Angeles"
latitude = 47.65
longitude = -122.30

# Option 1: start from a directory of raw csv files.
sala_from_directory = SALA(latitude, longitude, timezone, directory = directory)

# Option 2: start from previously exported timing data.
data = pd.read_parquet("example_output/timing.parquet")
sala_from_data = SALA(latitude, longitude, timezone, data = data)

In [None]:
#hide
sala = sala_from_directory

## Loading Actiwatch Data and Raw Data Manipulation

Actiwatch data should be loaded in a directory-style setup with key value pairings. This is intended to provide a generally flexible method for group labeling within the data for easier grouped searching and analysis.

#### Keys

Keys should be indicative of the group name and are used in generating UIDs. The in-documentation example below uses base_ and follow_up_ as its keys. 

#### Values

Corresponding values should be relative file paths to the csv data to be loaded for the respective group. The example data uses csvs within two folders, "data/v1" and "data/v3", which correspond to the keys "base_" and "follow_up_" respectively. Note that the trailing part of the folder path (after the final /) is appended to the UID. The remaining part of the UID is built using the filename within the subfolder. 


Following this structure, an example file titled "user1234" in "data/v1" would generate a UID of "base_v1\user1234".

In [None]:
show_doc(SALA.get_raw_data_from_key, title_level = 3)

<h3 id="SALA.get_raw_data_from_key" class="doc_header"><code>SALA.get_raw_data_from_key</code><a href="__main__.py#L153" class="source_link" style="float:right">[source]</a></h3>

> <code>SALA.get_raw_data_from_key</code>(**`key`**, **`directory`**=*`None`*, **`grouping`**=*`'Group'`*)

Loads and combines raw actiwatch data from any csv files found in
the specified directory matching a particular key within the directory.

 #### Parameters

 key: str

     The key to load actiwatch data from (for example, "v1").
     
 directory: dict

     Dictionary of valid folders to load actiwatch data from.
     Folders should have .csv files in them. If no dictionary
     is provided, it uses the one initialized as part of the SALA
     object.
     
 grouping: str

     Name of the generated column for specifying groupings, where
     the values will be the name of the key given. Default = 'Group'.
     
 #### Returns

 All of the raw unprocessed data within the directory matching a specified key.

### Loading Data for a Single Key 

Raw data for a particular key within the directory can be gathered by giving this function a key and a directory to load data from. If no directory is given, the function will automatically use the directory specified when creating a SALA object. 

In [None]:
raw_data = sala.get_raw_data_from_key("base_")
raw_data.dropna().head()

Found 1 csv files in data/v1/. Pass #1, raw data
.
.
Pass #2, data summary
.
.EOF without retrieving summary data: data/v1\user1234_v1sample.csv


Unnamed: 0_level_0,Off-Wrist Status,Activity,Marker,White Light,Red Light,Green Light,Blue Light,Sleep/Wake,Interval Status,UID,Group
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-06-25 12:31:00,0,0.0,0.0,37.09,28.4,15.5,14.5,1.0,ACTIVE,base_v1\user1234,base_
2018-06-25 12:31:30,0,170.0,0.0,156.15,159.0,59.9,65.3,1.0,ACTIVE,base_v1\user1234,base_
2018-06-25 12:32:00,0,194.0,0.0,149.03,113.0,49.8,50.6,1.0,ACTIVE,base_v1\user1234,base_
2018-06-25 12:32:30,0,0.0,0.0,473.95,365.0,161.0,161.0,1.0,ACTIVE,base_v1\user1234,base_
2018-06-25 12:33:00,0,62.0,0.0,317.82,264.0,112.0,115.0,1.0,ACTIVE,base_v1\user1234,base_


In [None]:
show_doc(SALA.get_raw_data, title_level = 3)

<h3 id="SALA.get_raw_data" class="doc_header"><code>SALA.get_raw_data</code><a href="__main__.py#L188" class="source_link" style="float:right">[source]</a></h3>

> <code>SALA.get_raw_data</code>(**`outfile`**, **`directory`**=*`None`*, **`grouping`**=*`'Group'`*, **`export`**=*`True`*)

Loads and combines raw actiwatch data from any csv files found in
the specified directory for all keys within the directory.

 #### Parameters

 outfile: str
 
     Directory to save to. (e.g. ../SALA/example_output/)

 directory: dict

     Dictionary of valid folders to load actiwatch data from.
     Folders should have .csv files in them. If no dictionary
     is provided, it uses the one initialized as part of the SALA
     object.
     
 grouping: str

     Name of the generated column for specifying groupings, where
     the values will be the name of the key given. Default = 'Group'.
     
 export: bool
 
     Whether or not to export combined raw data to a parquet file saved in the designated
     outfile location. 
     
 #### Returns

 All of the raw unprocessed data within the directory for all keys as a single
 dataframe.

### Loading Data for All Keys

Raw data for all keys can similarly be loaded by providing an outfile location to save the generated file to. Saving is controlled by the boolean `export` argument. A directory to load data from is also necessary. If no directory is given, the function will automatically use the directory specified when creating a SALA object.

In [None]:
# Folder prefix for exported parquet files; file names are appended directly to it.
outfile = "../SALA/example_output/"
# export=False skips writing the combined raw data to a parquet file.
all_raw_data = sala.get_raw_data(outfile, export=False)
all_raw_data.dropna().head()

Unnamed: 0_level_0,Off-Wrist Status,Activity,Marker,White Light,Red Light,Green Light,Blue Light,Sleep/Wake,Interval Status,UID,Group
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-06-25 12:31:00,0,0.0,0.0,37.09,28.4,15.5,14.5,1.0,ACTIVE,base_v1\user1234,base_
2018-06-25 12:31:30,0,170.0,0.0,156.15,159.0,59.9,65.3,1.0,ACTIVE,base_v1\user1234,base_
2018-06-25 12:32:00,0,194.0,0.0,149.03,113.0,49.8,50.6,1.0,ACTIVE,base_v1\user1234,base_
2018-06-25 12:32:30,0,0.0,0.0,473.95,365.0,161.0,161.0,1.0,ACTIVE,base_v1\user1234,base_
2018-06-25 12:33:00,0,62.0,0.0,317.82,264.0,112.0,115.0,1.0,ACTIVE,base_v1\user1234,base_


## Exporting Data

SALA provides its own export function for taking existing SALA data and saving it to a parquet file. To save existing SALA data an outfile location must be provided. By default SALA exports the data stored within the object. 

In [None]:
show_doc(SALA.export, title_level = 3)

<h3 id="SALA.export" class="doc_header"><code>SALA.export</code><a href="__main__.py#L237" class="source_link" style="float:right">[source]</a></h3>

> <code>SALA.export</code>(**`outfile`**, **`data`**=*`None`*)

Exports existing timing data to a parquet format.

#### Parameters
    outfile: str

        Directory to save to. (e.g. ../SALA/example_output/)
    data: pd.DataFrame

    Desired dataframe for exporting. 

## Processing Data

The main functionality of SALA is data processing. SALA's processing functions take unprocessed combined raw data and output first and last light times, along with group identifiers, for the specified light thresholds. SALA also supports adding sunrise and sunset data based on latitude and longitude.

In [None]:
show_doc(SALA.process_data, title_level = 3)

<h3 id="SALA.process_data" class="doc_header"><code>SALA.process_data</code><a href="__main__.py#L260" class="source_link" style="float:right">[source]</a></h3>

> <code>SALA.process_data</code>(**`raw_data`**, **`thresholds`**)

Handles unprocessed combined raw data outputting first and last light times, 
    and group identifiers for all specified light thresholds.

#### Parameters

raw_data: pd.DataFrame
    
    Combined dataframe of all raw data from desired directory. This can be
    accomplished by using the get_raw_data function within the SALA class. 

thresholds: list

    List of light thresholds for the watch data.

#### Returns
    
    Processed timing data in a dataframe format, with specific identifier columns based
    on weekday and weekend/holiday groupings. 

In [None]:
thresholds = [[5], [10], [50], [100], [500], [1000]] 
outfile = "../SALA/example_output/"

At this stage, any extra processing functions can be directly applied to the data.

In [None]:
#exports
def remove_first_day(data):
    """An example post-processing function that removes data from the
    first day of recording, along with any rows that have no last-light
    timestamp. Typically the first day has no light data for these
    watches (represented as 'NaT').

    #### Parameters

    data: pd.DataFrame

        Processed timing data containing at least the "Last Light" and
        "Date" columns.

    #### Returns

        A filtered view of `data` keeping only rows whose "Last Light" is
        not missing and whose "Date" differs from the earliest "Date" in
        the frame. Note that the earliest date is taken over the whole
        frame, not separately per UID or group.
    """
    # notna() treats NaT as missing and, unlike np.isnat, does not require
    # the column to be a datetime64 dtype.
    has_last_light = data["Last Light"].notna()
    after_first_day = data["Date"] != data["Date"].min()
    return data[has_last_light & after_first_day]


In [None]:
# Turn the combined raw data into timing data for every light threshold ...
sala.data = sala.process_data(all_raw_data, thresholds)
# ... then apply the example filter defined above to the stored result.
sala.data = remove_first_day(sala.data)

In [None]:
sala.data.iloc[:,:7].head()

Unnamed: 0,UID,Date,Threshold,Last Light,Mins to LL from 4AM,First Light,Mins to FL from 4AM
0,base_v1\user1234,2018-06-30,5,2018-06-30 22:22:00,1102.0,2018-06-30 07:56:30,236.0
1,base_v1\user1234,2018-07-04,5,2018-07-04 22:22:30,1102.0,2018-07-04 06:26:00,146.0
2,base_v1\user1234,2018-07-07,5,2018-07-08 00:01:30,1201.0,2018-07-07 06:54:30,174.0
3,base_v1\user1234,2018-06-26,5,2018-06-26 23:46:00,1186.0,2018-06-26 07:01:30,181.0
4,base_v1\user1234,2018-07-08,5,2018-07-08 20:45:30,1005.0,2018-07-08 06:45:00,165.0


In [None]:
sala.data.iloc[:,7:14].head()

Unnamed: 0,Time above threshold,Time above threshold AM,Minutes above threshold,Minutes above threshold AM,Lux minutes,Lux minutes AM,Group
0,0 days 10:04:00,0 days 03:26:30,604.0,206.5,779254.835,473764.04,base_
1,0 days 10:01:00,0 days 05:24:00,601.0,324.0,607621.15,448941.815,base_
2,0 days 11:45:30,0 days 04:37:30,705.5,277.5,814059.62,355668.225,base_
3,0 days 10:22:30,0 days 04:15:00,622.5,255.0,1221873.31,646422.485,base_
4,0 days 11:46:30,0 days 04:39:30,706.5,279.5,478718.375,154556.71,base_


In [None]:
sala.data.iloc[:,14:].head()

Unnamed: 0,Watch period,DayofWeek,GroupDayofWeek,GroupDayType,Weekend/Holiday
0,0 days 00:00:30,5,base_Sat,base_Weekend/Holiday,True
1,0 days 00:00:30,2,base_Wed,base_Weekend/Holiday,True
2,0 days 00:00:30,5,base_Sat,base_Weekend/Holiday,True
3,0 days 00:00:30,1,base_Tues,base_Weekday,False
4,0 days 00:00:30,6,base_Sun,base_Weekend/Holiday,True


## Setting Sunset and Sunrise

SALA provides the ability to add sunrise and sunset information to processed data. To do so, the specific location (longitude and latitude) is required. 

In [None]:
show_doc(SALA.sun_timings, title_level = 3)

<h3 id="SALA.sun_timings" class="doc_header"><code>SALA.sun_timings</code><a href="__main__.py#L315" class="source_link" style="float:right">[source]</a></h3>

> <code>SALA.sun_timings</code>()

Calculates sunrise and sunset timing information for data present in the
SALA object.

#### Returns

    Modified timing data with sunrise and sunset calculations

In [None]:
sala.data = sala.sun_timings()
sala.data[["Sunrise", "Sunset"]].head()

Unnamed: 0,Sunrise,Sunset
0,2018-06-30 05:15:10.009843-07:00,2018-06-30 21:10:28.189308-07:00
1,2018-07-04 05:17:43.572176-07:00,2018-07-04 21:09:17.890736-07:00
2,2018-07-07 05:19:59.495724-07:00,2018-07-07 21:07:57.880755-07:00
3,2018-06-26 05:13:10.853540-07:00,2018-06-26 21:10:56.479092-07:00
4,2018-07-08 05:20:48.480661-07:00,2018-07-08 21:07:26.089296-07:00


## Complete Processing

SALA additionally has a do-it-all function that handles the entire process from loading data from a directory up to adding sunrise and sunset information. It additionally defaults to exporting the information. It requires an outfile for potential saving, and light thresholds to work with. If a directory is not provided, it will use the directory present within the SALA object that was provided upon object creation.

In [None]:
show_doc(SALA.do_everything, title_level = 3)

<h3 id="SALA.do_everything" class="doc_header"><code>SALA.do_everything</code><a href="__main__.py#L338" class="source_link" style="float:right">[source]</a></h3>

> <code>SALA.do_everything</code>(**`outfile`**, **`thresholds`**, **`directory`**=*`None`*, **`grouping`**=*`'Group'`*, **`export`**=*`True`*)

Handles the full SALA pipeline (excluding sleep period analysis), from processing and combining raw data
to parsing and calculating processed data with sunrise and sunset information. First loads and compiles 
all existing raw data for every key within the given directory. Then processes all raw data, calculating
additional information for all specified light thresholds. Also adds sunrise and sunset information. 

#### Parameters

outfile: str
    
        Directory to save to. (e.g. ../SALA/example_output/)
        
thresholds: list

    List of light thresholds for the watch data.
    
directory: dict

    Dictionary of valid folders to load actiwatch data from.
    Folders should have .csv files in them. If no dictionary
    is provided, it uses the one initialized as part of the SALA
    object.
        
grouping: str

    Name of the generated column for specifying groupings, where
    the values will be the name of the key given. Default = 'Group'.
    
export: bool
    
    Whether or not to export processed timing data to a parquet file saved in the designated
    outfile location. 
    
#### Returns
    
    Processed timing data in a dataframe format, with specific identifier columns based
    on weekday and weekend/holiday groupings, and included sunrise and sunset calculations. 

In [None]:
# Light thresholds, each wrapped in its own single-element list.
thresholds = [[level] for level in (5, 10, 50, 100, 500, 1000)]
# Directory the export would be saved to.
outfile = "../SALA/example_output/"
# export=False: run the pipeline without saving the parquet export.
results = sala.do_everything(outfile=outfile, thresholds=thresholds, export=False)

## Additional Sleep Information

Adding sleep information to the processed data is also possible. The below function adds sleep data, allowing a "sleep day" to be split at a customizable time. The outputs of the function are:

1. short_frame: 
    a separate dataframe meant to be a quick way of visually subsetting and viewing bi/polyphasic instances. 
    This frame defaults to storing occurrences of at least 3 sleep periods within a "sleep day", but can be modified.

2. timing_data:
    modifies stored data to have sleep period information

In [None]:
show_doc(SALA.process_sleep, title_level = 3)

<h3 id="SALA.process_sleep" class="doc_header"><code>SALA.process_sleep</code><a href="__main__.py#L389" class="source_link" style="float:right">[source]</a></h3>

> <code>SALA.process_sleep</code>(**`raw_data`**, **`sleep_split`**=*`'18:00'`*, **`num_sleeps`**=*`3`*)

Processes sleep data for existing timing data.

#### Parameters

raw_data: pd.DataFrame

    Combined dataframe of all raw data from desired directory. This can be
    accomplished by using the get_raw_data function within the SALA class. 
    
sleep_split: str

    Time to split the sleep day. Default is "18:00", which is 6:00PM.
    
num_sleeps: int

    Cutoff for number of sleeps to display in first resulting frame.
    Default = 3, frame will store days with 3+ sleep instances

#### Returns

    short_frame: pd.DataFrame

        Onset, offset, and duration for sleep periods on days with
        more than num_sleeps number of sleep periods
        
    timing_data: pd.DataFrame

        Modified timing data with included sleep information

In [None]:
short_frame, timing_data = sala.process_sleep(all_raw_data)

In [None]:
short_frame.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sleep period,Sleep onset,Sleep offset,Sleep duration
UID,DT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
base_v1\user1234,2018-06-28,0,2018-06-29 00:40:30,2018-06-29 06:41:00,0 days 06:00:30
base_v1\user1234,2018-06-28,2,2018-06-29 13:42:00,2018-06-29 15:23:30,0 days 01:41:30
base_v1\user1234,2018-06-28,1,2018-06-29 08:50:30,2018-06-29 09:04:00,0 days 00:13:30
follow_up_v3\user1234,2018-09-17,0,2018-09-17 23:11:00,2018-09-18 06:29:30,0 days 07:18:30
follow_up_v3\user1234,2018-09-17,2,2018-09-18 16:11:00,2018-09-18 16:40:30,0 days 00:29:30


In [None]:
timing_data[
    ["Sleep onset", "Sleep offset",
     "Sleep duration", "Sleep onset MSLM",
     "Sleep offset MSLM"]
    ].head()

Unnamed: 0,Sleep onset,Sleep offset,Sleep duration,Sleep onset MSLM,Sleep offset MSLM
0,2018-07-04 22:23:00,2018-07-05 06:36:00,0 days 08:13:00,1343.0,396.0
1,2018-07-10 00:00:00,2018-07-10 00:00:00,0 days 00:00:00,0.0,0.0
2,2018-06-29 00:40:30,2018-06-29 06:41:00,0 days 06:00:30,1480.5,401.0
3,2018-07-05 23:27:00,2018-07-06 06:29:00,0 days 07:02:00,1407.0,389.0
4,2018-07-01 20:56:30,2018-07-02 07:48:30,0 days 10:52:00,1256.5,468.5
