In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import seaborn as sns
import unittest
from unittest import TestCase
import json

# For testing in an ipynb:
# import importlib
# importlib.reload(FitBit)
# Run second line in a cell everytime you make changes to FitBit.py

In [11]:
class FitBit:
    def __init__(self, file_path):
        """
        Initialize FitBit object and load all data.
        
        Parameters
        ----------
        file_path : str
            The path to the folder containing all data collected from FitBit.
            
        Returns
        -------
        FitBit
            An instance of the FitBit class with various DataFrames loaded.
        """

        # Initialize file_path attribute
        self.file_path = file_path

        # Load various types of data into respective attributes
        # Using load_and_concat function to fetch and concatenate data files matching the pattern
        # Each attribute will hold a DataFrame containing the respective type of data

        #self.sleep = self.load_and_concat('/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/sleep-*.json')
        #self.energy = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/calories-*.json")
        #self.steps = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/steps-*.json")
        #self.distance = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/distance-*.json")
        self.oxygen = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/estimated_oxygen_variation-*.json")
        self.resting_heart_rate = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/resting_heart_rate-*.json")
        #self.heart_rate = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/heart_rate-*.json")
        self.respiration_rate = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/distance-*.json")
        #self.sleep_stage = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/sleep-*.json")
        self.floors_climbed = self.load_and_concat("/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/altitude-*.json")
    
   
    def load(self, file_path):
        """
        Load data from a specified file path.

        Parameters
        ----------
        file_path : str
            The path to the data file.
        
        Returns
        -------
        pd.DataFrame
            The loaded data as a pandas DataFrame.
        """

        # Update the file path
        self.file_path = file_path

        # Check if the file exists
        if os.path.exists(self.file_path):

            # Get file format (csv or json)
            file = self.file_path.split('/')[-1]
            file_name, file_format = file.split('.')

            # Load data depending on file format
            if file_format == 'csv':
                print(f"CSV data loaded from {self.file_path}")
                return pd.read_csv(self.file_path)
            elif file_format == 'json':
                print(f"JSON data loaded from {self.file_path}")
                return pd.read_json(self.file_path)
            else:
                print(f"Unsupported file format: {file_format}")
        else:
            print(f"The path {self.file_path} does not exist.")
    
    def load_and_concat(self, pattern):
        """
        Load and concatenate multiple files that match the given file name pattern.
        
        Parameters
        ----------
        pattern : str
            The file name pattern to search for.
        
        Returns
        -------
        pd.DataFrame
            A DataFrame consisting of concatenated data from all matched files.
            Returns an empty DataFrame if no data was found.
        """

        # Get list of all file paths that match the pattern
        file_paths = glob.glob(os.path.join(self.file_path, pattern))
        data_frames = []

        # Loop through all matched file paths
        for file_path in file_paths:
            # Load each file into a DataFrame using the load method
            df = self.load(file_path)
            # Append DataFrame to the list if it's not None
            if df is not None:
                data_frames.append(df)

        # Concatenate all DataFrames if the list is not empty
        if data_frames:
            return pd.concat(data_frames, ignore_index=True)
        else:
            # Return an empty DataFrame if no data was found
            return pd.DataFrame()
    
    def heart_rate(self,confidence=True):
        '''Get heart_rate dataframe 
        
        Returns
        -------
        heart_rate: pandas.DataFrame
            Dataframe with columns for timestamp (datetime64) and heart_rate in bpm (int)
        '''
        temp_df = self.load_and_concat("/workspaces/fitbit_dbdp/data/Global Export Data/heart_rate-*.json")
        df_unpacked = pd.concat([temp_df.drop('value', axis=1), pd.json_normalize(temp_df['value'])], axis=1)
        df_unpacked = df_unpacked.sort_values(by='dateTime')    
        df_unpacked = df_unpacked.reset_index(drop=True)
        if confidence: 
            df = df_unpacked[['dateTime','bpm','confidence']]
        else:
            df = df_unpacked[['dateTime','bpm']]
        return df

    ### Still need to figure out the unit and also why the number decreases as time goes on?
    def active_calories(self):
        '''Get active_calories dataframe 
        
        Returns
        -------
        active_calories: pandas.DataFrame
            Dataframe with columns for start_time (datetime64), end_time (datetime64) and active calories in kilocalories (int)
        '''
        temp_df = self.load_and_concat("/workspaces/fitbit_dbdp/data/Global Export Data/calories-*.json")
        # Add datetime of the next row
        temp_df['end_time'] = temp_df['dateTime'].shift(-1)
        temp_df['next_value'] = temp_df['value'].shift(-1)
        temp_df['active calories'] = temp_df['next_value'] - temp_df['value']
        df = temp_df[(temp_df['active calories']>0.00)&(~temp_df['active calories'].isna())]
        df=df[['dateTime','end_time','active calories']]
        return df


    def steps(self):
        '''Get steps dataframe 
        
        Returns
        -------
        steps: pandas.DataFrame
            Dataframe with columns for start_time (datetime64), end_time (datetime64) and step count (int)
        '''
        temp_df = self.load_and_concat("/workspaces/fitbit_dbdp/data/Global Export Data/steps-*.json")
        temp_df['end_time'] = temp_df['dateTime'].shift(-1)
        temp_df['next_value'] = temp_df['value'].shift(-1)
        temp_df['steps'] = temp_df['next_value'] + temp_df['value']
        df=temp_df[(temp_df['steps']>0.00)&(~temp_df['steps'].isna())]
        df=df[['dateTime','end_time','steps']]
        return df
    

    def sleep_stage_summary(self):
        '''Get summary of sleep stages for the night of a day
        
        Returns
        -------
        sleep_stage_summary: pandas.DataFrame
            Dataframe with columns for date (datetime64), wake (seconds, int), light (seconds, int), deep (seconds, itn), REM (seconds, int) '''
        
        temp_df = self.load_and_concat("/workspaces/fitbit_dbdp/data/Global Export Data/sleep-*.json")[['dateOfSleep','levels']]
        temp_df['light'] = temp_df['levels'].apply(lambda x: x.get('summary', {}).get('light', {}).get('minutes'))
        temp_df['deep'] = temp_df['levels'].apply(lambda x: x.get('summary', {}).get('deep', {}).get('minutes'))
        temp_df['REM'] = temp_df['levels'].apply(lambda x: x.get('summary', {}).get('rem', {}).get('minutes'))
        temp_df['wake'] = temp_df['levels'].apply(lambda x: x.get('summary', {}).get('wake', {}).get('minutes'))
        df = temp_df[['dateOfSleep','light','deep','REM','wake']]
        return df
    
    def distance(self):
        '''Get distance dataframe 
        
        Returns
        -------
        distance: pandas.DataFrame
            Dataframe with columns for start_time (datetime64), end_time (datetime64) and distance in meters (int)
        '''
        temp_df = temp_df = self.load_and_concat("/workspaces/fitbit_dbdp/data/Global Export Data/distance-*.json")
        return temp_df
    
    def spo2(self):
        '''Get SpO2 dataframe 
        
        Returns
        -------
        spo2: pandas.DataFrame
            Dataframe with columns for timestamp (datetime64) and SpO2 % (int)
        '''
        return self._spo2()


In [12]:
file_path = '/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data'
fitbit= FitBit(file_path)
heart_rate = fitbit.heart_rate()
print(heart_rate)

JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-22.json
JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-26.json
JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-20.json
JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-23.json
JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-25.json
JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-24.json
JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-21.json
                 dateTime  bpm  confidence
0     2023-09-20 19:33:32   70           0
1     2023-09-20 19:33:42   75           0
2     2023-09-20 19:33:47  108           1
3     2023-09-20 19:33:52  101           1
4     2023-09-20 19:33:57   92           1
...                   ...  ...         ...
72032 2023-09-27 03:

In [13]:
file_path = '/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data'
fitbit= FitBit(file_path)
energy = fitbit.active_calories()
print(energy)

JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/calories-2023-09-19.json
                 dateTime            end_time  active calories
2371  2023-09-20 15:31:00 2023-09-20 15:32:00             0.59
2373  2023-09-20 15:33:00 2023-09-20 15:34:00             1.99
2375  2023-09-20 15:35:00 2023-09-20 15:36:00             2.34
2376  2023-09-20 15:36:00 2023-09-20 15:37:00             2.35
2380  2023-09-20 15:40:00 2023-09-20 15:41:00             0.23
...                   ...                 ...              ...
12414 2023-09-27 14:54:00 2023-09-27 14:55:00             0.70
12415 2023-09-27 14:55:00 2023-09-27 14:56:00             0.71
12416 2023-09-27 14:56:00 2023-09-27 14:57:00             3.74
12418 2023-09-27 14:58:00 2023-09-27 14:59:00             0.47
12419 2023-09-27 14:59:00 2023-09-27 15:00:00             0.47

[2555 rows x 3 columns]


In [14]:
file_path = '/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data'
fitbit= FitBit(file_path)
step = fitbit.steps()
print(step)

JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/steps-2023-09-19.json
                dateTime            end_time  steps
1    2023-09-20 19:33:00 2023-09-20 19:34:00   11.0
2    2023-09-20 19:34:00 2023-09-20 19:35:00   17.0
3    2023-09-20 19:35:00 2023-09-20 19:36:00   71.0
4    2023-09-20 19:36:00 2023-09-20 19:37:00  146.0
5    2023-09-20 19:37:00 2023-09-20 19:38:00  107.0
...                  ...                 ...    ...
5588 2023-09-27 19:00:00 2023-09-27 19:01:00  216.0
5589 2023-09-27 19:01:00 2023-09-27 19:02:00  216.0
5590 2023-09-27 19:02:00 2023-09-27 19:03:00  108.0
5591 2023-09-27 19:03:00 2023-09-27 19:04:00   39.0
5592 2023-09-27 19:04:00 2023-09-27 19:05:00   37.0

[2493 rows x 3 columns]


In [15]:
file_path = '/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data'
fitbit= FitBit(file_path)
sleep = fitbit.sleep_stage_summary()
print(sleep)

JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/sleep-2023-09-19.json
  dateOfSleep  light   deep   REM  wake
0  2023-09-27  277.0   55.0  29.0  69.0
1  2023-09-26  197.0  106.0  71.0  69.0
2  2023-09-25    NaN    NaN   NaN   NaN
3  2023-09-24  208.0  120.0  94.0  46.0
4  2023-09-23  266.0   58.0  98.0  42.0
5  2023-09-22  240.0   59.0  53.0  44.0
6  2023-09-21  247.0   42.0  67.0  36.0
7  2023-09-20    NaN    NaN   NaN   NaN


In [17]:
file_path = '/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data'
fitbit= FitBit(file_path)
distance = fitbit.distance()
print(distance)

JSON data loaded from /workspaces/fitbit_dbdp/data/Global Export Data/distance-2023-09-19.json
                dateTime  value
0    2023-09-20 19:32:00      0
1    2023-09-20 19:33:00      0
2    2023-09-20 19:34:00    820
3    2023-09-20 19:35:00    440
4    2023-09-20 19:36:00   4850
...                  ...    ...
5591 2023-09-27 19:03:00    140
5592 2023-09-27 19:04:00   2760
5593 2023-09-27 19:05:00      0
5594 2023-09-27 19:06:00      0
5595 2023-09-27 19:07:00      0

[5596 rows x 2 columns]


In [16]:
#####
# Unit Test
#####

file_path = '/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data'
fitbit= FitBit(file_path)

print(fitbit.heart_rate)

<bound method FitBit.heart_rate of <__main__.FitBit object at 0x7fb642218fa0>>


In [None]:
class Preprocess:
    def __init__(self,data,column):
        self.data = data
        self.column = column

    @classmethod
    def imputeData(self, method='mean'):
        """
        Impute missing values in the data.
        Args:
            data (pd.DataFrame): Input data containing missing values.
        Returns:
            pd.DataFrame: Data with missing values imputed.
        """

        if method == 'mean':
            self.data[self.column] = self.data[self.column].fillna(self.data[self.column].mean())
        elif method == 'median':
            self.data[self.column] = self.data[self.column].fillna(self.data[self.column].median())
        elif method == 'zero':
            self.data[self.column] = self.data[self.column].fillna(0)
        return self.data

    @classmethod
    def sampleData(self, downsample=True, sample_rate=0):
        """
        Sample the data, optionally downsampling it.
        Args:
            data (pd.DataFrame): Input data for sampling.
            downsample (bool): Whether to downsample the data.
        Returns:
            pd.DataFrame: Sampled data.
        """
        
        if downsample == True:
            sampled_data = self.data.sample(frac=sample_rate, random_state=1)
        else: 
            if not isinstance(self.data.index, pd.DatetimeIndex):
                self.data.index = pd.to_datetime(self.data.index)
            data_index = self.data.index
            sampled_data = self.data.resample(rule=str(sample_rate)+'D').asfreq()
            sampled_data.interpolate(method='linear', inplace=True)
            sampled_data = sampled_data.reindex(data_index, method='nearest')
        return sampled_data

    @classmethod
    def covertTime(self, time_col, timezone='UTC'):
        """
        Convert a specific column in the data to a time format.
        Args:
            data (pd.DataFrame): Input data containing time data.
            time_col (str): Name of the time column to convert.
        Returns:
            pd.DataFrame: Data with the time column converted.
        """
        # FOR FUTURE: with standard format, shouldn't need to ask for time_col
        # As all DataFrames will have same name for time column
        # Also won't need to convert to datetime because will already be converted

        # Ensure the specified column exists in the DataFrame
        if time_col not in self.data.columns:
            raise ValueError(f"Column '{time_col}' not found in the DataFrame.")

        # Convert column to DateTime format (although it already should be in DateTime format)
        self.data[time_col] = pd.to_datetime(self.data[time_col], errors='coerce')

        # Apply the specified timezone (default is 'UTC')
        self.data[time_col] = self.data[time_col].dt.tz_localize(timezone)

        return self.data


In [None]:
#####
# Unit Test
#####

data = pd.DataFrame({'A': [1, 2, None, 4, 5], 'B': [5, 4, 3, None, 1]})
preprocessor = Preprocess(data, 'A')

assert preprocessor.imputeData('mean').isnull().sum().sum(), 0
assert preprocessor.imputeData('median').isnull().sum().sum(), 0
assert preprocessor.imputeData('zero').isnull().sum().sum(), 0

# Test sampleData method for downsampling
downsampled_data = preprocessor.sampleData(downsample=True, sample_rate=0.5)
assert downsampled_data.shape[0] < data.shape[0]

# # Test sampleData method for upsampling
upsampled_data = preprocessor.sampleData(downsample=False, sample_rate=2)
upsampled_data
# assert upsampled_data.shape[0] > data.shape[0]


In [None]:
class EDA:
    def __init__(self,data):
        ## Assume it is already processed
        self.data = data
        self.interval = None
        self.mean = None
        self.median = None
        self.std = None
    
    @classmethod
    def describeData(self):
        self.mean = np.mean(self.data)
        self.median = np.median(self.data)
        self.interval = np.max(self.data) - np.min(self.data)
        self.std = np.std(self.data)
        print(f"Mean: {self.mean:.5f}")
        print(f"Median: {self.median:.5f}")
        print(f"Interval: {self.interval:.5f}")
        print(f"Standard Deviation: {self.std:.5f}")

    @classmethod
    def dataDistribution(self):
        plt.figure(figsize=(8, 6))
        sns.distplot(self.data, bins=30, kde=False, color='blue')
        plt.title('Data Distribution Plot')
        plt.xlabel('Values')
        plt.ylabel('Frequency')
        plt.show()
    
    @classmethod
    def detectOutliers(self):
        iqr = np.percentile(self.data, 75) - np.percentile(self.data, 25)
        lower_bound = np.percentile(self.data, 25) - 1.5 * iqr
        upper_bound = np.percentile(self.data, 75) + 1.5 * iqr
        outliers = [x for x in self.data if x < lower_bound or x > upper_bound]
        print("Outliers:", outliers)
    
    @classmethod
    def correlationAnalysis(self,other_data):
    # Add code to perform correlation analysis
        pass
    

In [None]:
#####
# Unit Test
#####

data = np.random.normal(0, 1, 1000)
eda = EDA(data)
eda.describeData()
eda.dataDistribution()
eda.detectOutliers()