In [15]:
import xml.etree.ElementTree as ET
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np

# Fetch My Health Data From Apple
> I've exported all my Apple Health data from the app in XML format, and I'm going to read all of it into a pandas DataFrame.

In [16]:
# Parse the Apple Health XML export into an element tree
apple_export_path = '/Users/emretuygan/Documents/Sabanci/Sabanci_2.2/CS210/myCaffeineStudy/appleData1.xml'
tree = ET.parse(apple_export_path)
root = tree.getroot()
# Collect the attribute dict of every <Record> element found under the root
record_list = [record.attrib for record in root.iter('Record')]

In [47]:
# One row per health record, built from the extracted attribute dicts
record_data = pd.DataFrame(record_list)

# Parse the three timestamp columns into datetimes
date_columns = ['creationDate', 'startDate', 'endDate']
record_data = record_data.assign(
    **{name: pd.to_datetime(record_data[name]) for name in date_columns}
)

# The four sleep-stage strings that can appear in the 'value' column
sleep_stage_values = [
    'HKCategoryValueSleepAnalysisAsleepREM',
    'HKCategoryValueSleepAnalysisAsleepCore',
    'HKCategoryValueSleepAnalysisAsleepDeep',
    'HKCategoryValueSleepAnalysisAwake',
]

# One boolean mask per sleep stage, each paired with the value that replaces it
conditions = [record_data['value'] == stage for stage in sleep_stage_values]
new_values = [1] * len(conditions)

# Rows matching any stage get their stage string copied into 'type';
# every other row keeps its original 'type'
is_sleep_stage = np.any(conditions, axis=0)
record_data['type'] = np.where(is_sleep_stage, record_data['value'], record_data['type'])

# With the stage now stored in 'type', overwrite the stage string in 'value' by its replacement
for stage_mask, replacement in zip(conditions, new_values):
    record_data['value'] = np.where(stage_mask, replacement, record_data['value'])


Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device
0,HKQuantityTypeIdentifierDietaryWater,YAZIO,1667,mL,2023-10-22 10:51:40+03:00,2023-10-22 10:51:40+03:00,2023-10-22 10:51:40+03:00,500.0,
1,HKQuantityTypeIdentifierDietaryWater,YAZIO,1667,mL,2023-10-22 19:00:05+03:00,2023-10-22 19:00:05+03:00,2023-10-22 19:00:05+03:00,72.468,
2,HKQuantityTypeIdentifierDietaryWater,YAZIO,1667,mL,2023-10-22 19:00:54+03:00,2023-10-22 19:00:53+03:00,2023-10-22 19:00:53+03:00,0.63,
3,HKQuantityTypeIdentifierDietaryWater,YAZIO,1667,mL,2023-10-23 13:23:06+03:00,2023-10-23 13:23:06+03:00,2023-10-23 13:23:06+03:00,9.19975,
4,HKQuantityTypeIdentifierDietaryWater,YAZIO,1667,mL,2023-10-24 14:45:41+03:00,2023-10-24 14:45:40+03:00,2023-10-24 14:45:40+03:00,145.989,


### Data that I'm interested in:
- General info about my sleep (total sleep duration, start and end of my sleep session)
- Total time I was in Rem Sleep
- Total time I was in Core Sleep
- Total time I was in Deep Sleep
- Total time that I spent awake



# Sleep Analysis Dataframe

- the index holds the related dates

- bed_time is the start time of my sleep sequence

- awake_time is the time when I wake up (year-month-day hour:minute:second)

- time_in_bed represents the time that I stay in bed (hours:minutes:seconds)

- REM_total is the total minutes that I spent in the REM phase

- Deep_Sleep_total is the total minutes that I spent in the deep sleep phase

- Core_Sleep_total is the total minutes that I spent in the core sleep phase

- Awake_total is the total minutes that I spent awake during the night

In [160]:
# Rows still typed as SleepAnalysis that were recorded by the named watch source
is_sleep_analysis = record_data['type'] == 'HKCategoryTypeIdentifierSleepAnalysis'
is_from_watch = record_data['sourceName'] == "Emre Apple\xa0Watch’u"
watch_sleep_rows = record_data.copy()[is_sleep_analysis & is_from_watch]

# One row per creation date: earliest start and latest end among that date's records
sleep_data = watch_sleep_rows.groupby(watch_sleep_rows['creationDate'].dt.date).agg(
    bed_time=('startDate', 'min'),
    awake_time=('endDate', 'max'),
)

# Take the difference first, while both columns are still datetimes
sleep_data['time_in_bed'] = sleep_data['awake_time'] - sleep_data['bed_time']

# Then render both timestamps as 'year-month-day hour:minute:second' strings
timestamp_format = '%Y-%m-%d %H:%M:%S'
for timestamp_column in ('bed_time', 'awake_time'):
    sleep_data[timestamp_column] = pd.to_datetime(sleep_data[timestamp_column]).dt.strftime(timestamp_format)

# Function to format timedelta to hours:minutes:seconds
def format_timedelta(td):
    """Format a duration as an ``HH:MM:SS`` string.

    The hours field is not capped at 23: whole days are folded into it, so a
    duration of 1 day 2 hours is rendered as ``26:00:00``. Negative durations
    are rendered with a leading ``-``. Fractions of a second are truncated.

    Parameters
    ----------
    td : datetime.timedelta or pandas.Timedelta
        Duration to format; only its ``total_seconds()`` method is used.

    Returns
    -------
    str
        The duration as zero-padded ``hours:minutes:seconds``.
    """
    total_seconds = td.total_seconds()
    # NaN is the only value unequal to itself; it signals a missing duration
    # (e.g. pandas NaT). Return a placeholder rather than failing in int().
    if total_seconds != total_seconds:
        return 'nan:nan:nan'

    sign = '-' if total_seconds < 0 else ''
    # total_seconds() includes the days component, which the .seconds attribute leaves out
    hours, remainder = divmod(int(abs(total_seconds)), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{sign}{hours:02}:{minutes:02}:{seconds:02}'

# Render every time_in_bed duration through format_timedelta
sleep_data['time_in_bed'] = sleep_data['time_in_bed'].map(format_timedelta)

sleep_data.head()

Unnamed: 0_level_0,bed_time,awake_time,sleep_counts,time_in_bed
creationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-10-24,2023-10-23 23:31:05,2023-10-24 08:10:05,12,08:39:00
2023-10-25,2023-10-25 00:05:58,2023-10-25 08:44:28,6,08:38:30
2023-10-26,2023-10-25 23:45:44,2023-10-26 08:09:14,10,08:23:30
2023-10-27,2023-10-27 00:15:04,2023-10-27 07:54:34,9,07:39:30
2023-10-28,2023-10-28 00:55:30,2023-10-28 08:55:00,12,07:59:30


In [161]:
# Total minutes per creation date for one record type (REM, deep, core, awake, ...)
def calculate_total_duration(record_data, type_value, colName):
    """Sum the duration of all records of one type, per creation date.

    Parameters
    ----------
    record_data : pandas.DataFrame
        Frame with a 'type' column and datetime columns 'creationDate',
        'startDate' and 'endDate'. It is only read, never modified.
    type_value : str
        Value of the 'type' column selecting the records to sum.
    colName : str
        Name given to the column of per-date totals.

    Returns
    -------
    pandas.DataFrame
        Single column ``colName`` holding the summed ``endDate - startDate``
        in minutes, indexed by the date part of 'creationDate' (index name
        'creationDate').
    """
    # Boolean filtering already yields a new frame, so no explicit copy is needed
    matching = record_data.loc[record_data['type'] == type_value]

    # Duration of each record in minutes, kept as a standalone Series so that
    # nothing is written back into the filtered frame
    duration_minutes = (matching['endDate'] - matching['startDate']).dt.total_seconds() / 60

    # Sum the durations per calendar date of creation
    totals = duration_minutes.groupby(matching['creationDate'].dt.date).sum()

    return totals.to_frame(name=colName).rename_axis('creationDate')


In [162]:
# Total REM minutes per date, left-joined onto the nightly summary (both are indexed by date).
# record_data is passed as-is: calculate_total_duration does not modify its input,
# so copying the whole export first only costs time and memory.
grouped_data = calculate_total_duration(record_data, 'HKCategoryValueSleepAnalysisAsleepREM', 'REM_total')
merged_df = sleep_data.merge(grouped_data['REM_total'], how='left', left_index=True, right_index=True)
merged_df.head()

Unnamed: 0_level_0,bed_time,awake_time,sleep_counts,time_in_bed,REM_total
creationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-10-24,2023-10-23 23:31:05,2023-10-24 08:10:05,12,08:39:00,102.5
2023-10-25,2023-10-25 00:05:58,2023-10-25 08:44:28,6,08:38:30,87.5
2023-10-26,2023-10-25 23:45:44,2023-10-26 08:09:14,10,08:23:30,81.0
2023-10-27,2023-10-27 00:15:04,2023-10-27 07:54:34,9,07:39:30,86.0
2023-10-28,2023-10-28 00:55:30,2023-10-28 08:55:00,12,07:59:30,91.0


## ADD total time in deep sleep

In [163]:
# Total deep-sleep minutes per date, left-joined onto the running summary by its date index.
# record_data is passed as-is: calculate_total_duration does not modify its input,
# so copying the whole export first only costs time and memory.
grouped_data = calculate_total_duration(record_data, 'HKCategoryValueSleepAnalysisAsleepDeep', 'Deep_Sleep_total')
merged_df = merged_df.merge(grouped_data['Deep_Sleep_total'], how='left', left_index=True, right_index=True)
merged_df.head()

Unnamed: 0_level_0,bed_time,awake_time,sleep_counts,time_in_bed,REM_total,Deep_Sleep_total
creationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-24,2023-10-23 23:31:05,2023-10-24 08:10:05,12,08:39:00,102.5,51.0
2023-10-25,2023-10-25 00:05:58,2023-10-25 08:44:28,6,08:38:30,87.5,49.0
2023-10-26,2023-10-25 23:45:44,2023-10-26 08:09:14,10,08:23:30,81.0,45.5
2023-10-27,2023-10-27 00:15:04,2023-10-27 07:54:34,9,07:39:30,86.0,67.5
2023-10-28,2023-10-28 00:55:30,2023-10-28 08:55:00,12,07:59:30,91.0,77.0


## ADD total time in core sleep

In [164]:
# Total core-sleep minutes per date, left-joined onto the running summary by its date index.
# record_data is passed as-is: calculate_total_duration does not modify its input,
# so copying the whole export first only costs time and memory.
grouped_data = calculate_total_duration(record_data, 'HKCategoryValueSleepAnalysisAsleepCore', 'Core_Sleep_total')
merged_df = merged_df.merge(grouped_data['Core_Sleep_total'], how='left', left_index=True, right_index=True)
merged_df.head()

Unnamed: 0_level_0,bed_time,awake_time,sleep_counts,time_in_bed,REM_total,Deep_Sleep_total,Core_Sleep_total
creationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-10-24,2023-10-23 23:31:05,2023-10-24 08:10:05,12,08:39:00,102.5,51.0,339.0
2023-10-25,2023-10-25 00:05:58,2023-10-25 08:44:28,6,08:38:30,87.5,49.0,348.0
2023-10-26,2023-10-25 23:45:44,2023-10-26 08:09:14,10,08:23:30,81.0,45.5,350.5
2023-10-27,2023-10-27 00:15:04,2023-10-27 07:54:34,9,07:39:30,86.0,67.5,275.0
2023-10-28,2023-10-28 00:55:30,2023-10-28 08:55:00,12,07:59:30,91.0,77.0,291.5


## ADD total time awake

In [165]:
# Total awake minutes per date, left-joined onto the running summary by its date index.
# record_data is passed as-is: calculate_total_duration does not modify its input,
# so copying the whole export first only costs time and memory.
grouped_data = calculate_total_duration(record_data, 'HKCategoryValueSleepAnalysisAwake', 'Awake_total')
merged_df = merged_df.merge(grouped_data['Awake_total'], how='left', left_index=True, right_index=True)
merged_df.head()

Unnamed: 0_level_0,bed_time,awake_time,sleep_counts,time_in_bed,REM_total,Deep_Sleep_total,Core_Sleep_total,Awake_total
creationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-10-24,2023-10-23 23:31:05,2023-10-24 08:10:05,12,08:39:00,102.5,51.0,339.0,26.5
2023-10-25,2023-10-25 00:05:58,2023-10-25 08:44:28,6,08:38:30,87.5,49.0,348.0,34.0
2023-10-26,2023-10-25 23:45:44,2023-10-26 08:09:14,10,08:23:30,81.0,45.5,350.5,26.5
2023-10-27,2023-10-27 00:15:04,2023-10-27 07:54:34,9,07:39:30,86.0,67.5,275.0,31.0
2023-10-28,2023-10-28 00:55:30,2023-10-28 08:55:00,12,07:59:30,91.0,77.0,291.5,20.0


# My Caffeine Consumption Data: - caffein_df
> I've been logging my caffeine consumption in an app called HiCoffee. It also logs the caffeine consumption to Apple Health, but I'm going to get my data directly from the app, as it provides my caffeine consumption on an hourly basis (Apple only provides daily values).

In [8]:
# Location of the CSV file exported from the caffeine-logging app
file_path = '/Users/emretuygan/Documents/Sabanci/Sabanci_2.2/CS210/myCaffeineStudy/HiCoffee_Exported-Data_UTF8_2023-12-11_11-44-15.csv'

# Load the export into a DataFrame
HiCoffe_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows to confirm the file was read as expected
HiCoffe_df.head()

Unnamed: 0,Date,Beverage,Brand,Caffeine (mg)
0,"Oct 21, 2023 at 16:30:44",Espresso,-,75
1,"Oct 22, 2023 at 13:55:15",Coffee Capsule,-,60
2,"Oct 22, 2023 at 16:55:27",Diet Coke (250 mL),-,32
3,"Oct 22, 2023 at 18:54:52",Espresso,-,75
4,"Oct 23, 2023 at 9:56:00",Brewed Coffee - Decaf Pike Place Roast (Short),Starbucks,15


## Edit my consumption data:
- I want to cluster my consumption on a daily basis.
- I'm going to divide my day into four parts and structure my consumption into those four time intervals, which are ['08.00-12.00', '12.00-16.00', '16.00-20.00', '20.00-24.00'].
This way I can also investigate the effect of consuming coffee in specific intervals, as well as my total consumption.

In [166]:
caffein_df = HiCoffe_df.copy()

# Convert 'Date' column to datetime
caffein_df['Date'] = pd.to_datetime(caffein_df['Date'], format='%b %d, %Y at %H:%M:%S')

# Time-of-day intervals: (first hour, hour after the last one) -> output column.
# Entries logged before 08:00 fall into no interval but still count towards 'Caffeine (mg)'.
caffeine_intervals = {
    (8, 12): 'Caffeine 08.00-12.00',
    (12, 16): 'Caffeine 12.00-16.00',
    (16, 20): 'Caffeine 16.00-20.00',
    (20, 24): 'Caffeine 20.00-24.00',
}

# Each interval column holds the row's caffeine amount when the entry's hour lies
# inside the interval, and 0 otherwise
hour_of_day = caffein_df['Date'].dt.hour
for (first_hour, end_hour), interval_column in caffeine_intervals.items():
    in_interval = (hour_of_day >= first_hour) & (hour_of_day < end_hour)
    caffein_df[interval_column] = caffein_df['Caffeine (mg)'].where(in_interval, 0)

# Sum per calendar day, restricted to the caffeine columns. Selecting them explicitly
# keeps the datetime 'Date' column and the text columns out of the sum; summing them
# is not meaningful and recent pandas versions no longer drop such columns silently.
# The grouping key is named 'Date', so the result is already indexed by 'Date'.
caffeine_columns = ['Caffeine (mg)', *caffeine_intervals.values()]
caffein_df = caffein_df.groupby(caffein_df['Date'].dt.date)[caffeine_columns].sum()

caffein_df.head()

Unnamed: 0_level_0,Caffeine (mg),Caffeine 08.00-12.00,Caffeine 12.00-16.00,Caffeine 16.00-20.00,Caffeine 20.00-24.00
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-10-21,75,0,0,75,0
2023-10-22,167,0,60,107,0
2023-10-23,81,15,66,0,0
2023-10-24,88,46,42,0,0
2023-10-25,34,2,32,0,0


- TODO: Find out how many mg of caffeine keeps me from sleeping. Since there is no clear-cut "could not sleep" outcome, derive a sleep score from the available sleep data and assign a threshold to it.
- TODO: Find the half-life of caffeine (since it depends on metabolism, look for a correlation with my heart rate).


In [None]:
from sklearn.linear_model import LinearRegression

# NOTE(review): `data` is not defined in any cell visible in this notebook; presumably it is a
# frame holding the per-interval caffeine columns plus an 'Overall_Sleep_Quality' column — confirm.
feature_columns = [
    'Caffeine 08.00-12.00',
    'Caffeine 12.00-16.00',
    'Caffeine 16.00-20.00',
    'Caffeine 20.00-24.00',
]
# Predictors: caffeine intake per time interval; target: sleep quality
X = data[feature_columns]
y = data['Overall_Sleep_Quality']

# Create the linear regression model and fit it (fit returns the fitted estimator itself)
model = LinearRegression().fit(X, y)

# One fitted coefficient per interval column
coefficients = model.coef_
print("Coefficients:", coefficients)

In [None]:

''' 
#handle duplicate rows
sleep_data = sleep_data.drop_duplicates(subset=['startDate', 'endDate'], keep='first')
#handle overlaps
# Identify overlapping records with the same start point
overlap_mask = sleep_data.duplicated(subset=['startDate'], keep=False)
# Calculate duration for each row
sleep_data['duration'] = sleep_data['endDate'] - sleep_data['startDate']
# Select rows with the minimum duration for each unique start point
min_duration_rows = sleep_data[overlap_mask].groupby('startDate')['duration'].idxmin()
# Keep the rows with the minimum duration and remove others
sleep_data = sleep_data.drop(index=sleep_data[overlap_mask].index.difference(min_duration_rows))
# calulate time between date(s)
sleep_data['time_asleep'] = sleep_data['endDate'] - sleep_data['startDate']
'''

# Find total REM in minutes
# NOTE(review): this cell repeats, inline, the same steps as calculate_total_duration for the
# REM stage, and it reassigns grouped_data and merged_df — confirm it is still needed.
remDf = record_data.copy()

# Filter for SleepAnalysis records of specific type(s)
remDf = remDf[remDf['type'] == 'HKCategoryValueSleepAnalysisAsleepREM']

# Calculate the differences between 'endDate' and 'startDate' in minutes
remDf['duration'] = (remDf['endDate'] - remDf['startDate']).dt.total_seconds() / 60

# Group the data by date and calculate the total sum of durations for each date
grouped_data = remDf.groupby(remDf['creationDate'].dt.date)['duration'].sum().reset_index()
# Only 'duration' is actually renamed here: the grouped frame has no 'startDate' column
grouped_data.rename(columns={'startDate': 'Date', 'duration': 'REM_total'}, inplace=True)
grouped_data.set_index('creationDate', inplace=True)
grouped_data.head()
# Left-join the per-date REM totals onto sleep_data by index (both frames are indexed by date)
merged_df = sleep_data.merge(grouped_data['REM_total'], how='left', left_index=True, right_index=True)
merged_df.head()

# Resting Heart Rate - RHRdf
- startDate and endDate define the interval of the measurement
- value is my resting heart rate
- provided unit is count/min
- the index is the creationDate of each RHR record

In [4]:
# Filter Resting Heart Rate records.
# The other 'type' values in this export carry the full HealthKit identifier
# (e.g. 'HKQuantityTypeIdentifierDietaryWater'), so the prefixed name is matched;
# the bare 'RestingHeartRate' spelling is still accepted in case the prefix was stripped.
resting_heart_rate_types = ['HKQuantityTypeIdentifierRestingHeartRate', 'RestingHeartRate']
resting_heart_rate_data = record_data[record_data['type'].isin(resting_heart_rate_types)]

# Columns that are not needed for the resting-heart-rate analysis;
# errors='ignore' below skips any of them that are absent from the frame
columns_to_drop = ['type', 'unit', 'index', 'sourceName', 'sourceVersion', 'device']

# Index by 'creationDate' and drop the unneeded columns. The chained calls return a new
# frame, so neither record_data nor the filtered selection is modified in place.
RHRdf = (
    resting_heart_rate_data
    .set_index('creationDate')
    .drop(columns=columns_to_drop, errors='ignore')
)
RHRdf.head()

Unnamed: 0_level_0,startDate,endDate,value
creationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-10-24 17:50:59+03:00,2023-10-24 00:04:39+03:00,2023-10-24 17:45:23+03:00,67.0
2023-10-25 22:23:11+03:00,2023-10-25 00:02:47+03:00,2023-10-25 22:19:09+03:00,61.0
2023-10-26 15:38:17+03:00,2023-10-26 00:06:30+03:00,2023-10-26 15:33:49+03:00,73.0
2023-10-29 22:34:46+03:00,2023-10-29 00:02:27+03:00,2023-10-29 16:48:04+03:00,66.0
2023-10-29 22:34:46+03:00,2023-10-28 00:47:17+03:00,2023-10-28 16:54:04+03:00,79.0
