### Temporary Notebook for EDA

---

#### Imports

In [1]:
import os
import numpy as np
import pandas as pd

#### Reading in the Data

In [20]:
# Directory information
# The data folder is expected on the current user's Desktop; resolving the
# home directory at run time avoids baking one machine's username into the path
fname = 'transformed_data'
fp = os.path.join(os.path.expanduser('~'), 'Desktop', fname)

In [18]:
def readData(fp):
    '''
    Loads a CSV file into a DataFrame, using its first column as the index.

    :param fp: File path of the CSV file to read
    :returns: The loaded DataFrame. When the index has a name it is moved
    back into the frame as a regular column; an unnamed index is left as is.
    '''

    frame = pd.read_csv(fp, index_col=[0])

    # Only a named index is restored as a column
    has_named_index = bool(frame.index.name)
    return frame.reset_index() if has_named_index else frame

In [None]:
# List of files
# os.listdir returns entries in arbitrary order and includes hidden files
# (e.g. macOS '.DS_Store'), so keep only visible regular files and sort them
# to make positions such as dfs[0] reproducible between runs
files = sorted(
    i for i in os.listdir(fp)
    if not i.startswith('.') and os.path.isfile(os.path.join(fp, i))
)
dfs   = [readData(os.path.join(fp, i)) for i in files]

---
#### Working with `dfs[0]` (raw Oura data)

In [47]:



def cleanOura(df, thresh):

    '''
    First-pass cleanup of a raw Oura export.

    Rows holding more than ``thresh`` missing values are discarded, the
    'date' column is parsed into datetimes, and two calendar features
    ('day_name' and 'is_weekend') are appended.

    :param df: DataFrame containing raw Oura Data (must have a 'date' column)
    :param thresh: Maximum number of NaN values tolerated in a row; rows with
    more NaN values than this are removed
    :returns: A new, cleaned DataFrame
    '''

    # Count the gaps per row and keep only the rows within the tolerance
    nan_per_row = df.isna().sum(axis=1)
    kept = df.loc[~(nan_per_row > thresh)]

    # Later keyword arguments of assign() can see the columns produced by
    # earlier ones, so the calendar features are derived in a single call
    return kept.assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        day_name=lambda frame: frame['date'].dt.day_name(),
        is_weekend=lambda frame: frame['day_name'].isin(['Saturday', 'Sunday']),
    )


In [35]:
# Boolean mask of the rows in the first frame with more than 10 missing values
dfs[0].isna().sum(axis=1) > 10

0      False
1      False
2      False
3      False
4      False
       ...  
784     True
785     True
786     True
787     True
788     True
Length: 789, dtype: bool

In [50]:
# Clean the first loaded frame, dropping rows with more than 10 missing values
cleaned = cleanOura(dfs[0], 10)

In [56]:
# Flag the columns whose dtype name contains "datetime"
cleaned.dtypes.astype(str).str.contains("datetime")

date                            True
Sleep Score                    False
Total Sleep Score              False
REM Sleep Score                False
Deep Sleep Score               False
Sleep Efficiency Score         False
Restfulness Score              False
Sleep Latency Score            False
Sleep Timin Score              False
Total Sleep Duration           False
Total Bedtime                  False
Awake Time                     False
REM Sleep Duration             False
Light Sleep Duration           False
Deep Sleep Duration            False
Restless Sleep                 False
Sleep Efficiency               False
Sleep Latency                  False
Sleep Timing                   False
Bedtime Start                  False
Bedtime End                    False
Average Resting Heart Rate     False
Lowest Resting Heart Rate      False
Average HRV                    False
Temperature Deviation (°C)     False
Temperature Trend Deviation    False
Respiratory Rate               False
A

In [91]:
# Prototype of the column-name -> file-name normalisation (lower-case, spaces to '_')
"Sleep Score".lower().replace(" ", "_")

'sleep_score'

In [87]:
def linePrep(df, cols, start_date=None, end_date='Present', output_dir=None):

    '''
    Preps the desired columns for line plotting. Each requested column is
    paired with the date column, filtered to the requested date range
    (inclusive on both ends) and written to '<column>_line.csv', where
    <column> is the lower-cased column name with spaces replaced by '_'.

    :param df: Cleaned DataFrame containing at least one datetime column
    (the first datetime column found is used as the date axis)
    :param cols: A list of columns to prepare for line plotting
    :param start_date: A string in the format YYYY-MM-DD. If left as the default
    (None), it will be set to the earliest date available.
    :param end_date: A string in the format YYYY-MM-DD. If left as the default
    ('Present'), set to None, or not parseable as a date, it will be set to
    the latest date available.
    :param output_dir: Directory the CSV files are written to. If left as the
    default (None), the notebook-level ``output_dir`` variable is used when
    it is defined, otherwise the current working directory.
    :returns: None
    :raises IndexError: If df has no datetime column
    '''

    if output_dir is None:
        # This function used to read a notebook-level `output_dir`; keep
        # honouring it when defined instead of failing with a NameError
        output_dir = globals().get('output_dir') or os.getcwd()

    # Use the detected datetime column everywhere rather than assuming it is
    # literally named 'date'
    date_col = df.columns[df.dtypes.astype(str).str.contains('datetime')][0]
    dates = df[date_col]

    # Setting the start and end Timestamps
    sdate = pd.Timestamp(start_date)
    if pd.isnull(sdate):
        sdate = dates.min()

    try:
        edate = pd.Timestamp(end_date)
    except (ValueError, TypeError):
        # Non-date placeholders such as the default 'Present'
        edate = pd.NaT
    if pd.isnull(edate):
        # Also covers end_date=None, which parses to NaT without raising and
        # would otherwise filter out every row
        edate = dates.max()

    # The date window is the same for every column, so build the mask once
    in_range = (dates >= sdate) & (dates <= edate)

    # Prepping the data
    for x in cols:

        fname = x.lower().replace(' ', '_') + '_line.csv'

        sub = df.loc[in_range, [date_col, x]]

        sub.to_csv(os.path.join(output_dir, fname))
        
        

In [97]:
def corrPrep(df, cols, start_date=None, end_date='Present', output_dir=None):

    '''
    Preps pairs of columns for correlation plotting. Each pair is combined
    with the date column, filtered to the requested date range (inclusive on
    both ends) and written to '<first>_<second>_corr.csv', where each name is
    the lower-cased column name with spaces removed.

    :param df: Cleaned DataFrame containing at least one datetime column
    (the first datetime column found is used as the date axis)
    :param cols: A list of two-element sequences, each naming the pair of
    columns to correlate
    :param start_date: A string in the format YYYY-MM-DD. If left as the default
    (None), it will be set to the earliest date available.
    :param end_date: A string in the format YYYY-MM-DD. If left as the default
    ('Present'), set to None, or not parseable as a date, it will be set to
    the latest date available.
    :param output_dir: Directory the CSV files are written to. If left as the
    default (None), the notebook-level ``output_dir`` variable is used when
    it is defined, otherwise the current working directory.
    :returns: A dict mapping each (first, second) column pair to its Pearson
    correlation coefficient over the selected date range
    :raises IndexError: If df has no datetime column
    '''

    if output_dir is None:
        # This function used to read a notebook-level `output_dir`; keep
        # honouring it when defined instead of failing with a NameError
        output_dir = globals().get('output_dir') or os.getcwd()

    # Use the detected datetime column everywhere rather than assuming it is
    # literally named 'date'
    date_col = df.columns[df.dtypes.astype(str).str.contains('datetime')][0]
    dates = df[date_col]

    # Setting the start and end Timestamps
    sdate = pd.Timestamp(start_date)
    if pd.isnull(sdate):
        sdate = dates.min()

    try:
        edate = pd.Timestamp(end_date)
    except (ValueError, TypeError):
        # Non-date placeholders such as the default 'Present'
        edate = pd.NaT
    if pd.isnull(edate):
        # Also covers end_date=None, which parses to NaT without raising and
        # would otherwise filter out every row
        edate = dates.max()

    # The date window is the same for every pair, so build the mask once
    in_range = (dates >= sdate) & (dates <= edate)

    # Prepping the data
    correlations = {}
    for x in cols:

        fname = x[0].lower().replace(' ', '') + '_' + \
                x[1].lower().replace(' ', '') + '_corr.csv'

        sub = df.loc[in_range, [date_col, x[0], x[1]]]

        # Calculate the Pearson correlation coefficient. Series.corr skips
        # rows where either value is missing, whereas np.corrcoef would
        # return NaN as soon as a single value is missing.
        correlations[(x[0], x[1])] = sub[x[0]].corr(sub[x[1]])

        sub.to_csv(os.path.join(output_dir, fname))

    return correlations

In [98]:
# Export the Sleep Score / Deep Sleep Score pair from 2022-06-07 onwards
corrPrep(cleaned, [["Sleep Score", "Deep Sleep Score"]], start_date = '2022-06-07')

0.78896018836758


In [89]:
# Export the Sleep Score column from 2022-06-07 onwards
linePrep(cleaned, ["Sleep Score"], start_date = '2022-06-07')

          date  Sleep Score
774 2022-06-07         87.0
775 2022-06-08         84.0
776 2022-06-09         81.0
777 2022-06-10         89.0
778 2022-06-11         95.0


In [99]:
import sys

In [108]:
# Put the relative 'src' directory first on the module search path
# NOTE(review): 'src' is resolved against the current working directory — confirm it is run from the repo root
sys.path.insert(0, 'src')

In [109]:
# Show the current working directory
os.getcwd()

'/Users/nicolebrye/Desktop/DSC180A/medical-dashboarding-b01.github.io/src'

In [101]:
import json


In [113]:
# Load the data parameters; the context manager closes the file handle that
# a bare open() passed straight to json.load() would leave open
with open(os.path.join('/Users/nicolebrye/Desktop/DSC180A/medical-dashboarding-b01.github.io',
                       'config/data-params.json')) as config_file:
    config = json.load(config_file)
config

{'data_fp': 'data/raw', 'testdata_fp': 'test/testdata/raw'}

In [114]:
# Look up the 'testdata_fp' entry of the loaded config
config['testdata_fp']

'test/testdata/raw'

In [121]:
# Write the last 100 rows of the first frame to a CSV on the Desktop
# (presumably a sample for the test data directory — TODO confirm)
dfs[0][-100:].to_csv('/Users/nicolebrye/Desktop/testdata.csv')