In [1]:
from __future__ import division, print_function

import collections
import csv
import datetime
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

from datetime import datetime, timedelta
from scipy.interpolate import CubicSpline
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

import pickle
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
history_len = 7
ph = 6

In [3]:
def round_up_to_nearest_five_minutes(ts):
    # Parse the timestamp
    dt = datetime.strptime(ts, "%d-%m-%Y %H:%M:%S")
    
    # Calculate minutes to add to round up to the nearest 5 minutes
    minutes_to_add = (5 - dt.minute % 5) % 5
    if minutes_to_add == 0 and dt.second == 0:
        # If exactly on a 5 minute mark and second is 0, no need to add time
        minutes_to_add = 0
    
    # Add the necessary minutes
    new_dt = dt + timedelta(minutes=minutes_to_add)
    
    # Return the new timestamp in the same format
    return new_dt.strftime( "%d-%m-%Y %H:%M:%S")

In [4]:
def preprocess_t1dexi_cgm(path, round):

    subject = pd.read_csv(path)
    # Group by 'Category' column
    grouped = subject.groupby('LBCAT')
    # Create a dictionary to store the split DataFrames
    split_dfs = {category: group for category, group in grouped}
    selected_cgm = split_dfs["CGM"][["LBORRES", "LBDTC"]]
    new_df_cgm = pd.DataFrame(selected_cgm)

    new_df_cgm['LBDTC'] = pd.to_datetime(new_df_cgm['LBDTC'], errors='coerce')  # Convert 'date' column to datetime if not already
    new_df_cgm.sort_values('LBDTC', inplace=True)  # Sort the DataFrame by the 'date' column

    if round == True:
        rounded_timestamp = []
        for ts in new_df_cgm["LBDTC"]:
            rounded_timestamp.append(round_up_to_nearest_five_minutes(ts))
        new_df_cgm["rounded_LBDTC"] = rounded_timestamp
        formatted_data = [[{'ts': row['rounded_LBDTC'], 'value': row['LBORRES']}] for _, row in new_df_cgm.iterrows()]

    else:
        # Convert each row to the desired format
        formatted_data = [[{'ts': row['LBDTC'].to_pydatetime(), 'value': row['LBORRES']}] for _, row in new_df_cgm.iterrows()]
    
    return formatted_data

In [5]:
def segement_data_as_15min(data):
    df = pd.DataFrame(data)

    # Calculate time differences
    df['time_diff'] = df['timestamp'].diff()

    # Identify large gaps
    df['new_segment'] = df['time_diff'] > pd.Timedelta(hours=0.25)

    # Find indices where new segments start
    segment_starts = df[df['new_segment']].index

    # Initialize an empty dictionary to store segments
    segments = {}
    prev_index = 0

    # Loop through each segment start and slice the DataFrame accordingly
    for i, start in enumerate(segment_starts, 1):
        segments[f'segment_{i}'] = df.iloc[prev_index:start].reset_index(drop=True)
        prev_index = start

    # Add the last segment from the last gap to the end of the DataFrame
    segments[f'segment_{len(segment_starts) + 1}'] = df.iloc[prev_index:].reset_index(drop=True)

    # Optionally remove helper columns from each segment
    for segment in segments.values():
        segment.drop(columns=['time_diff', 'new_segment'], inplace=True)
    
    return segments

In [6]:
# Function to align and update segments with meal data
def update_segments_with_meals(segments, meal_df):
    for segment_name, segment_df in segments.items():
        # Initialize the 'carbs' column to zeros
        segment_df['carb_effect'] = 0

        # Iterate through each timestamp in the segment
        for i, row in segment_df.iterrows():
            # Find the closest meal timestamp and its carb information
            meal_df['time_difference'] = abs(meal_df['ts'] - row['timestamp'])
            closest_meal = meal_df.loc[meal_df['time_difference'].idxmin()]
            
            # Check if the closest meal is within 5 minutes
            if closest_meal['time_difference'] <= pd.Timedelta(minutes=5):
                # Ensure that the meal is assigned to only one segment and is the closest
                if not meal_df.at[closest_meal.name, 'assigned']:
                    segment_df.at[i, 'carb_effect'] = closest_meal['carb_effect']
                    meal_df.at[closest_meal.name, 'assigned'] = True  # Mark as assigned
                else:
                    # Check if the current timestamp is closer than the one it was assigned to
                    assigned_index = segment_df[segment_df['carb_effect'] == closest_meal['carb_effect']].index[0]
                    if row['timestamp'] - closest_meal['ts'] < segment_df.at[assigned_index, 'timestamp'] - closest_meal['ts']:
                        # Reassign the meal to the new closer timestamp
                        segment_df.at[assigned_index, 'carb_effect'] = 0  # Remove carbs from previously assigned timestamp
                        segment_df.at[i, 'carb_effect'] = closest_meal['carb_effect']  # Assign carbs to the new closer timestamp
            # else:
            #     print(f"Meal type {meal['type']} on {meal['ts']} is too far from closest timestamp in {closest_segment} with a difference of {closest_diff}.")

    return segments

In [7]:
# Function to align and update segments with meal data
def update_segments_with_basal(segments, basal_df):
    for segment_name, segment_df in segments.items():
        # Initialize the 'carbs' column to zeros
        segment_df['basal_rate'] = None

        # Iterate through each timestamp in the segment
        for i, row in segment_df.iterrows():
            # Find the closest meal timestamp and its carb information
            for _, basal_row in basal_df.iterrows():
                if basal_row['ts'] <= row['timestamp'] < (basal_row['end_ts'] if pd.notna(basal_row['end_ts']) else pd.Timestamp('2099-12-31')):
                    segment_df.at[i, 'basal_rate'] = basal_row['value']
                    break

    return segments

In [8]:
def expand_meal_entry(meal_row):
    meal_time = meal_row['ts']
    end_effect_time = meal_time + timedelta(hours=3)
    carb = float(meal_row['carbs'])

    c_eff_list = [0, 0, 0, ]

    for i in range(1, 10):
        c_eff = (i * 0.111) * carb
        if c_eff > carb:
            print("C_eff > carb")
            c_eff = carb
        c_eff_list.append(c_eff)

    for j in range(1, 25):
        c_eff = (1 - (j * 0.028)) * carb
        if c_eff < 0:
            print("C_eff < 0")
            c_eff = 0
        c_eff_list.append(c_eff)

    timestamp_list = pd.date_range(start=meal_time, end=end_effect_time, freq='5min')
    d = {"ts": timestamp_list[:-1], "carb_effect": c_eff_list}
    meal_effect_df = pd.DataFrame(data = d)

    return meal_effect_df

    

In [9]:
# Read in bolus and temp basal information
# Need to set the 
def preprocess_t1dexi_bolus_tempbasal(filepath, round):
    subject_facm = pd.read_csv(filepath)
    # Group by 'Category' column
    grouped = subject_facm.groupby('FACAT')

    split_dfs = {category: group for category, group in grouped}
    # Step 1: Extract the desired columns
    new_df_bolus = split_dfs["BOLUS"][["FAORRES", "FADTC"]]
    new_df_bolus['FADTC'] = pd.to_datetime(new_df_bolus['FADTC'], format="%Y-%m-%d %H:%M:%S")
    new_df_bolus.reset_index(drop=True, inplace=True)
    new_df_bolus = new_df_bolus.rename(columns={'FAORRES': 'dose', 'FADTC': 'ts_begin'})
    new_df_bolus['assigned'] = False
    # new_df_bolus['end_ts'] = new_df_bolus['ts_begin'].shift(-1)
    return new_df_bolus

In [10]:
def update_segments_with_bolus(segments, bolus_df):
    for segment_name, segment_df in segments.items():
        # Initialize the 'dose' column to zeros
        segment_df['bolus_dose'] = 0

        # Iterate through each timestamp in the segment
        for i, row in segment_df.iterrows():
            # Find the closest bolus timestamp and its carb information
            bolus_df['time_difference'] = abs(bolus_df['ts'] - row['timestamp'])
            closest_bolus = bolus_df.loc[bolus_df['time_difference'].idxmin()]
            
            # Check if the closest bolus is within 5 minutes
            if closest_bolus['time_difference'] <= pd.Timedelta(minutes=5):
                # Ensure that the bolus is assigned to only one segment and is the closest
                if not bolus_df.at[closest_bolus.name, 'assigned']:
                    segment_df.at[i, 'bolus_dose'] = closest_bolus['bolus_effect']
                    bolus_df.at[closest_bolus.name, 'assigned'] = True  # Mark as assigned
                else:
                    # Check if the current timestamp is closer than the one it was assigned to
                    assigned_index = segment_df[segment_df['bolus_dose'] == closest_bolus['bolus_effect']].index[0]
                    if row['timestamp'] - closest_bolus['ts'] < closest_bolus['ts'] - segment_df.at[assigned_index, 'timestamp']:
                        # Reassign the bolus to the new closer timestamp
                        segment_df.at[assigned_index, 'bolus_dose'] = 0  # Remove dose from previously assigned timestamp
                        segment_df.at[i, 'bolus_dose'] = closest_bolus['bolus_effect']  # Assign dose to the new closer timestamp
            # else:
            #     print(f"bolus type {bolus['type']} on {bolus['ts']} is too far from closest timestamp in {closest_segment} with a difference of {closest_diff}.")

    return segments

In [11]:
def expand_bolus_entry(bolus_row):
    bolus_time = bolus_row['ts_begin']
    timestamp_list = [bolus_time, ]
    # end_effect_time = bolus_time + timedelta(hours=3)
    dose = float(bolus_row['dose'])

    b_eff_list = [dose, ]
    b_eff = dose

    i = 1
    while b_eff > 0:
        b_eff = dose - (i * 0.07)
        b_eff_list.append(b_eff)
        timestamp_list.append(bolus_time + timedelta(minutes=5 * i))
        i += 1
    # print(len(timestamp_list[:-1]))
    # print(len(b_eff_list[:-1]))


    d = {"ts": timestamp_list[:-1], "bolus_effect": b_eff_list[:-1]}
    bolus_effect_df = pd.DataFrame(data = d)

    return bolus_effect_df

In [12]:
def compute_accumulated_step(window_list, step_df):
    start_time = window_list[0]
    end_time = window_list[-1]

    step_list = []
    counter = 1
    for idx, step_row in step_df.iterrows():
        
        if step_row['ts'] >= start_time and step_row['ts'] < end_time:
            step_list.append(counter * float(step_row['value']))
            counter += 1

        if step_row['ts'] >= end_time:
            break
    # print("length of step_list ", len(step_list))
    if len(step_list) == 0:
        return None
    accumulate_step = sum(step_list)/len(step_list)
    return accumulate_step
    

In [13]:
def label_delta_transform(labels_list):
    # label_lower_percentile = -12.75
    # label_upper_percentile = 12.85
    label_lower_percentile = np.percentile(labels_list, 10)
    label_upper_percentile = np.percentile(labels_list, 90)
    transformed_labels = []
    for label in labels_list:
        if label <= label_lower_percentile:
            transformed_labels.append(1)
        elif label_lower_percentile < label < label_upper_percentile:
            trans_label = round((256/(label_upper_percentile - label_lower_percentile))*(label + abs(label_lower_percentile) + 0.05))
            transformed_labels.append(trans_label)
        elif label >= label_upper_percentile:
            transformed_labels.append(256)
    return transformed_labels


def prepare_dataset(segments, ph):
    '''
    ph = 6, 30 minutes ahead
    ph = 12, 60 minutes ahead
    '''
    features_list = []
    labels_list = []
    raw_glu_list = []
    
    
    # Iterate over each segment
    for segment_name, segment_df in segments.items():
        # Ensure all columns are of numeric type
        segment_df['carb_effect'] = pd.to_numeric(segment_df['carb_effect'], errors='coerce')
        segment_df['basal_rate'] = pd.to_numeric(segment_df['basal_rate'], errors='coerce')
        segment_df['bolus_dose'] = pd.to_numeric(segment_df['bolus_dose'], errors='coerce')
        segment_df['steps'] = pd.to_numeric(segment_df['steps'], errors='coerce')

        # Fill NaNs that might have been introduced by conversion errors
        segment_df.fillna(0, inplace=True)

        # Maximum index for creating a complete feature set
        print("len of segment_df is ", len(segment_df))
        max_index = len(segment_df) - (history_len + ph)  # Subtracting only 15+ph to ensure i + 15 + ph is within bounds
        
        # Iterate through the data to create feature-label pairs
        for i in range(max_index):
            # Extracting features from index i to i+15
            segment_df = segment_df.reset_index(drop = True)
            features = segment_df.loc[i:i+history_len, ['glucose_value', 'carb_effect', 'bolus_dose', 'steps']].values
            # Extracting label for index i+15+ph
            # label = segment_df.loc[i+15+ph, 'glucose_value'] - segment_df.loc[i+15, 'glucose_value']
            
            raw_glu_list.append(segment_df.loc[i+history_len+ph, 'glucose_value'])
            features_list.append(features)
            # labels_list.append(label)
            
    print("len of features_list " + str(len(features_list)))
    # print("len of labels_list " + str(len(labels_list)))
    
    # new_labels_list = label_delta_transform(labels_list)    
    # print("after label transform, the len of label list "+str(len(new_labels_list)))    
    
    return features_list, raw_glu_list


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim


class StackedLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob):
        super(StackedLSTM, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # First LSTM layer
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers=1, batch_first=True).to(device)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_prob).to(device)
        
        # Second LSTM layer
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, num_layers=1, batch_first=True).to(device)
        
        # Fully connected layers
        self.fc1 = nn.Linear(hidden_size, 512).to(device)
        self.fc2 = nn.Linear(512, 128).to(device)
        self.fc3 = nn.Linear(128, output_size).to(device)
        
        # Activation functions
        self.relu = nn.ReLU()
        
    
    def forward(self, x):
        batch_size = x.size(0)  # Get the batch size from the input tensor

        # Initialize hidden and cell state for the first LSTM layer
        h0 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
        
        # First LSTM layer
        out, (hn, cn) = self.lstm1(x, (h0, c0))
        
        # Dropout layer
        out = self.dropout(out)
        
        # Initialize hidden and cell state for the second LSTM layer
        h1 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
        c1 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
        
        # Second LSTM layer
        out, (hn, cn) = self.lstm2(out, (h1, c1))
        
        # Fully connected layers
        out = out[:, -1, :]  # Get the last time step output
        out = self.relu(self.fc1(out))
        out = self.relu(self.fc2(out))
        out = self.fc3(out)
        
        return out



In [15]:
input_size = 4# Number of input features
hidden_size = 128  # Hidden vector size
num_layers = 2  # Number of LSTM layers
output_size = 1  # Single output
dropout_prob = 0.2  # Dropout probability


model = StackedLSTM(input_size, hidden_size, num_layers, output_size, dropout_prob) # input_size, hidden_size, num_layers, output_size, dropout_prob
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00005)

In [16]:
# input_channels = 4  # Number of features
# output_channels = 1  # Predicting a single value (glucose level)
# num_blocks = 4  # Number of WaveNet blocks
# dilations = [2**i for i in range(num_blocks)]  # Dilation rates: 1, 2, 4, 8

# model = StackedLSTM(input_channels, output_channels, num_blocks, dilations)
# print(model)

# # Example of how to define the loss and optimizer
# criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0008)

# Implementation

In [17]:
overlap = ['854.csv',
 '979.csv',
 '816.csv',
 '953.csv',
 '981.csv',
 '1617.csv',
 '1343.csv',
 '987.csv',
 '255.csv',
 '907.csv',
 '856.csv',
 '354.csv',
 '894.csv',
 '862.csv',
 '900.csv',
 '695.csv'] 
# '85.csv',
# '911.csv',

In [18]:
subject = pd.read_csv(f"/Users/baiyinglu/Desktop/AugmentedHealthLab/T1DEXI_Apr52024/try/LB_split/854.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/baiyinglu/Desktop/AugmentedHealthLab/T1DEXI_Apr52024/try/LB_split/854.csv'

In [None]:
glucose = preprocess_t1dexi_cgm(f"/Users/baiyinglu/Desktop/AugmentedHealthLab/T1DEXI_Apr52024/try/LB_split/854.csv", False)

In [None]:
glucose_dict = {entry[0]['ts']: entry[0]['value'] for entry in glucose}

# Create the multi-channel database
g_data = []
for timestamp in glucose_dict:
    record = {
        'timestamp': timestamp,
        'glucose_value': glucose_dict[timestamp],
        # 'meal_type': None,
        # 'meal_carbs': 0
    }
    
    g_data.append(record)

# Create DataFrame
glucose_df = pd.DataFrame(g_data)

# Convert glucose values to numeric type for analysis
glucose_df['glucose_value'] = pd.to_numeric(glucose_df['glucose_value'])

# Calculate percentiles
lower_percentile = np.percentile(glucose_df['glucose_value'], 2)
upper_percentile = np.percentile(glucose_df['glucose_value'], 98)

# Print thresholds
print(f"2% lower threshold: {lower_percentile}")
print(f"98% upper threshold: {upper_percentile}")

glucose_df

2% lower threshold: 69.0
98% upper threshold: 226.0


Unnamed: 0,timestamp,glucose_value
0,2020-09-29 00:04:38,84.0
1,2020-09-29 00:09:38,83.0
2,2020-09-29 00:14:38,83.0
3,2020-09-29 00:19:38,83.0
4,2020-09-29 00:24:38,83.0
...,...,...
7864,2020-10-26 23:38:59,211.0
7865,2020-10-26 23:43:59,206.0
7866,2020-10-26 23:48:59,201.0
7867,2020-10-26 23:53:59,197.0


In [None]:
# Example: print each segment
segments = segement_data_as_15min(glucose_df)
# interpolated_segements = detect_missing_and_spline_interpolate(segments)

In [None]:
meal = pd.read_csv("/Users/baiyinglu/Desktop/AugmentedHealthLab/T1DEXI_Apr52024/try/ML_split/854.csv")
selected_meal_column = meal[["MLDOSE", "MLDTC"]]

meal_df = selected_meal_column.rename(columns={'MLDOSE': 'carbs', 'MLDTC': 'ts'})
# Fix: Use format='mixed' to handle different date formats
meal_df['ts'] = pd.to_datetime(meal_df['ts'], format='mixed')

meal_df['assigned'] = False

# Extract unique dates
unique_dates = meal_df['ts'].dt.date.unique()

# Convert to list
meal_avaiable_dates_list = unique_dates.tolist()

cleaned_segments = {}

# Iterate through each segment and filter by unique dates
for segment_name, df in segments.items():
    # Convert timestamp column to datetime and then extract the date part
    df['date'] = pd.to_datetime(df['timestamp']).dt.date
    
    # Filter the DataFrame to only include rows where the date is in unique_dates_list
    filtered_df = df[df['date'].isin(meal_avaiable_dates_list)]
    
    # Drop the 'date' column as it's no longer needed
    filtered_df = filtered_df.drop(columns=['date'])
    
    # Store the filtered DataFrame in the cleaned_segments dictionary
    cleaned_segments[segment_name] = filtered_df
empty_d = {"ts": [], "carb_effect": []}
whole_meal_effect_df = pd.DataFrame(data = empty_d)
# Expand meal entries
for index, meal_row in meal_df.iterrows():
    meal_effect_df = expand_meal_entry(meal_row)

    # Merge the DataFrames on the 'ts' column with an outer join
    merged_df = pd.merge(whole_meal_effect_df, meal_effect_df, on='ts', how='outer', suffixes=('_df1', '_df2'))

    # Fill NaN values with 0 for the carb_effect columns
    merged_df['carb_effect_df1'] = merged_df['carb_effect_df1'].fillna(0)
    merged_df['carb_effect_df2'] = merged_df['carb_effect_df2'].fillna(0)

    # Sum the carb_effect values
    merged_df['carb_effect'] = merged_df['carb_effect_df1'] + merged_df['carb_effect_df2']

    # Keep only the required columns
    whole_meal_effect_df = merged_df[['ts', 'carb_effect']]

whole_meal_effect_df['assigned'] = False

    # Update the segments with meal data
meal_updated_segments = update_segments_with_meals(cleaned_segments, whole_meal_effect_df)
# Update the segments with meal data
# meal_updated_segments = update_segments_with_meals(cleaned_segments, meal_df)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  whole_meal_effect_df['assigned'] = False
  segment_df.at[i, 'carb_effect'] = closest_meal['carb_effect']
  segment_df.at[i, 'carb_effect'] = closest_meal['carb_effect']


In [None]:
meal_df['ts']

0   2020-06-20 00:00:00
1   2020-06-20 00:00:00
2   2020-09-30 16:21:08
3   2020-10-11 15:33:05
Name: ts, dtype: datetime64[ns]

In [None]:
# subject_facm = pd.read_csv(f"../FACM_split/854.csv")
# # Group by 'Category' column
# grouped = subject_facm.groupby('FACAT')

# split_dfs = {category: group for category, group in grouped}
# # Step 1: Extract the desired columns
# new_df_basal = split_dfs["BASAL"][["FAORRES", "FADTC"]]
# new_df_basal['FADTC'] = pd.to_datetime(new_df_basal['FADTC'], format="%Y-%m-%d %H:%M:%S")
# new_df_basal.reset_index(drop=True, inplace=True)
# new_df_basal = new_df_basal.rename(columns={'FAORRES': 'value', 'FADTC': 'ts'})
# new_df_basal['assigned'] = False
# new_df_basal['end_ts'] = new_df_basal['ts'].shift(-1)
# new_df_basal[:10]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_basal['FADTC'] = pd.to_datetime(new_df_basal['FADTC'], format="%Y-%m-%d %H:%M:%S")


Unnamed: 0,value,ts,assigned,end_ts
0,0.0,2020-09-29 00:01:12,False,2020-09-29 00:46:03
1,0.101,2020-09-29 00:46:03,False,2020-09-29 00:51:02
2,0.0,2020-09-29 00:51:02,False,2020-09-29 01:10:58
3,0.121,2020-09-29 01:10:58,False,2020-09-29 01:15:58
4,0.178,2020-09-29 01:15:58,False,2020-09-29 01:20:57
5,0.3,2020-09-29 01:20:57,False,2020-09-29 01:25:56
6,0.367,2020-09-29 01:25:56,False,2020-09-29 01:30:56
7,0.351,2020-09-29 01:30:56,False,2020-09-29 01:35:55
8,0.375,2020-09-29 01:35:55,False,2020-09-29 01:40:54
9,0.409,2020-09-29 01:40:54,False,2020-09-29 01:45:54


In [None]:
# basal_updated_segments = update_segments_with_basal(meal_updated_segments, new_df_basal)

In [None]:
new_df_bolus = preprocess_t1dexi_bolus_tempbasal(f"/Users/baiyinglu/Desktop/AugmentedHealthLab/T1DEXI_Apr52024/try/FACM_split/854.csv", False)

empty_b = {"ts": [], "bolus_effect": []}
whole_bolus_effect_df = pd.DataFrame(data = empty_b)

for index, bolus_row in new_df_bolus.iterrows():
    bolus_effect_df = expand_bolus_entry(bolus_row)

    # Merge the DataFrames on the 'ts' column with an outer join
    merged_df = pd.merge(whole_bolus_effect_df, bolus_effect_df, on='ts', how='outer', suffixes=('_df1', '_df2'))

    # Fill NaN values with 0 for the carb_effect columns
    merged_df['bolus_effect_df1'] = merged_df['bolus_effect_df1'].fillna(0)
    merged_df['bolus_effect_df2'] = merged_df['bolus_effect_df2'].fillna(0)
    

    # Sum the carb_effect values
    merged_df['bolus_effect'] = merged_df['bolus_effect_df1'] + merged_df['bolus_effect_df2']

    # Keep only the required columns
    whole_bolus_effect_df = merged_df[['ts', 'bolus_effect']]

whole_bolus_effect_df["assigned"] = False

bolus_updated_segments = update_segments_with_bolus(meal_updated_segments, whole_bolus_effect_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_bolus['FADTC'] = pd.to_datetime(new_df_bolus['FADTC'], format="%Y-%m-%d %H:%M:%S")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  whole_bolus_effect_df["assigned"] = False
  segment_df.at[i, 'bolus_dose'] = closest_bolus['bolus_effect']
  segment_df.at[i, 'bolus_dose'] = closest_bolus['bolus_effect']


In [None]:
bolus_updated_segments

{'segment_1': Empty DataFrame
 Columns: [timestamp, glucose_value, carb_effect, basal_rate, bolus_dose]
 Index: [],
 'segment_2':               timestamp  glucose_value  carb_effect basal_rate  bolus_dose
 142 2020-09-30 00:04:40          186.0          0.0      0.425       0.000
 143 2020-09-30 00:09:39          184.0          0.0      0.425       0.000
 144 2020-09-30 00:14:39          187.0          0.0      0.425       0.000
 145 2020-09-30 00:19:39          189.0          0.0      0.573       0.517
 146 2020-09-30 00:24:39          190.0          0.0      0.425       0.447
 ..                  ...            ...          ...        ...         ...
 425 2020-09-30 23:39:41          172.0          0.0      0.727       0.000
 426 2020-09-30 23:44:42          168.0          0.0      0.661       0.000
 427 2020-09-30 23:49:42          161.0          0.0      0.625       0.000
 428 2020-09-30 23:54:41          164.0          0.0      0.625       0.000
 429 2020-09-30 23:59:41          1

In [None]:
def optimize_step_processing(bolus_updated_segments, step_df):
    # Convert step_df timestamps to datetime if they aren't already
    step_df['ts'] = pd.to_datetime(step_df['ts'])
    step_df['stepvalue'] = pd.to_numeric(step_df['stepvalue'])
    
    # Pre-calculate weights for step accumulation (1 to 10 for 50 minutes window)
    weights = np.arange(1, 11)
    
    for segment_name, segment_df in bolus_updated_segments.items():
        print(segment_name)  # Keep the progress print if needed
        # Convert timestamps if needed
        segment_df['timestamp'] = pd.to_datetime(segment_df['timestamp'])
        
        # Create array to store accumulated steps
        accumulate_step_list = []
        
        # Get all unique window starts for this segment
        window_starts = segment_df['timestamp'].apply(lambda x: x - timedelta(minutes=50))
        window_ends = segment_df['timestamp']
        
        # Process each window
        for start, end in zip(window_starts, window_ends):
            # Filter steps within the window
            mask = (step_df['ts'] >= start) & (step_df['ts'] < end)
            window_steps = step_df.loc[mask, 'stepvalue']
            
            if len(window_steps) == 0:
                accumulate_step_list.append(None)
            else:
                # Take last 10 steps (or pad with zeros if less than 10)
                last_steps = window_steps.iloc[-10:] if len(window_steps) > 10 else window_steps
                weighted_sum = (last_steps.values * weights[:len(last_steps)]).sum()
                accumulate_step_list.append(weighted_sum / len(last_steps))
        
        # Assign accumulated steps to segment
        segment_df['steps'] = accumulate_step_list
    
    return bolus_updated_segments


In [None]:
# Read in the Step count data
subject_fa = pd.read_csv(f"/Users/baiyinglu/Desktop/AugmentedHealthLab/T1DEXI_Apr52024/try/FA_split/854.csv")
grouped = subject_fa.groupby('FAOBJ')
split_dfs = {category: group for category, group in grouped}
new_df_step = split_dfs["10-SECOND INTERVAL STEP COUNT"][["FAORRES", "FADTC"]]
step_df = new_df_step.rename(columns={'FAORRES': 'stepvalue', 'FADTC': 'ts'})
step_df.reset_index(inplace=True)
step_df['ts'] = pd.to_datetime(step_df['ts'])

# Process steps with optimized function
step_updated_segments = optimize_step_processing(bolus_updated_segments, step_df)



  subject_fa = pd.read_csv(f"/Users/baiyinglu/Desktop/AugmentedHealthLab/T1DEXI_Apr52024/try/FA_split/854.csv")


segment_1
segment_2
segment_3
segment_4
segment_5


In [None]:
# accumulate_step_list = []
# # test_segment = segments["segment_1"]
# for segment_name, segment_df in bolus_updated_segments.items():
#     print(segment_name)
#     accumulate_step_list = []
#     for index, cgm_row in segment_df.iterrows():
#         current = cgm_row['timestamp']
#         first_timestamp = current - timedelta(minutes=50)
#         window_list = pd.date_range(start=first_timestamp, end=current, freq='5min')

#         accumulated_step = compute_accumulated_step(window_list, step_df)
#         accumulate_step_list.append(accumulated_step)
#     segment_df['steps'] = accumulate_step_list

segment_1
segment_2
segment_3
segment_4
segment_5


In [None]:
features_list, raw_glu_list = prepare_dataset(bolus_updated_segments, ph)
# Assuming features_list and raw_glu_list are already defined
features_array = np.array(features_list)
labels_array = np.array(raw_glu_list)

# Step 1: Split into 80% train+val and 20% test
X_temp, X_test, y_temp, y_test = train_test_split(features_array, labels_array, test_size=0.2, shuffle=False)

# Step 2: Split the 80% into 70% train and 10% val (0.7/0.8 = 0.875)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.125, shuffle=False)

# Convert the splits to torch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoaders
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

len of segment_df is  0
len of segment_df is  288
len of segment_df is  287
len of segment_df is  0
len of segment_df is  0
len of features_list 549


In [None]:
num_epochs =500
for epoch in range(num_epochs):
    model.train()
    
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}')


    model.eval()
    with torch.no_grad():
        total_loss = 0
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets.float())
            total_loss += loss.item()
        
        avg_loss = total_loss / len(val_loader)
        print(f'Test Loss: {avg_loss:.4f}')

model.eval()
predictions = []
actuals = []
with torch.no_grad():
    for inputs, targets in val_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        predictions.append(outputs)
        actuals.append(targets)

predictions = torch.cat(predictions).cpu().numpy()
actuals = torch.cat(actuals).cpu().numpy()


rmse = root_mean_squared_error(actuals,predictions)
print(f'RMSE on validation set: {rmse}')

In [None]:
model.eval()
predictions = []
actuals = []
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        predictions.append(outputs)
        actuals.append(targets)

predictions = torch.cat(predictions).cpu().numpy()
actuals = torch.cat(actuals).cpu().numpy()


rmse = root_mean_squared_error(actuals,predictions)
print(f'RMSE on validation set: {rmse}')

RMSE on validation set: 25.823055267333984


# Implement on the group

In [None]:
overlap = ['854.csv',
 '979.csv',
 '816.csv',
 '953.csv',
 '981.csv',
 '1617.csv',
 '1343.csv',
 '987.csv',
 '255.csv',
 '907.csv',
 '856.csv',
 '354.csv',
 '894.csv',
 '862.csv',
 '900.csv',
 '695.csv']

In [None]:
new_test_rmse_list = []

In [None]:
for ffile in overlap[:-1]:
    print(ffile)
    subject = pd.read_csv(f"../LB_split/{ffile}")
    glucose = preprocess_t1dexi_cgm(f"../LB_split/{ffile}", False)
    glucose_dict = {entry[0]['ts']: entry[0]['value'] for entry in glucose}

    # Create the multi-channel database
    g_data = []
    for timestamp in glucose_dict:
        record = {
            'timestamp': timestamp,
            'glucose_value': glucose_dict[timestamp],
            # 'meal_type': None,
            # 'meal_carbs': 0
        }
        
        g_data.append(record)

    # Create DataFrame
    glucose_df = pd.DataFrame(g_data)

    segments = segement_data_as_15min(glucose_df)

    meal = pd.read_csv(f"../ML_split/{ffile}")
    selected_meal_column = meal[["MLDOSE", "MLDTC"]]

    meal_df = selected_meal_column.rename(columns={'MLDOSE': 'carbs', 'MLDTC': 'ts'})
    meal_df['ts'] = pd.to_datetime(meal_df['ts'], format="%Y-%m-%d %H:%M:%S")

    meal_df['assigned'] = False

    # Extract unique dates
    unique_dates = meal_df['ts'].dt.date.unique()

    # Convert to list
    meal_avaiable_dates_list = unique_dates.tolist()

    cleaned_segments = {}

    # Iterate through each segment and filter by unique dates
    for segment_name, df in segments.items():
        # Convert timestamp column to datetime and then extract the date part
        df['date'] = pd.to_datetime(df['timestamp']).dt.date
        
        # Filter the DataFrame to only include rows where the date is in unique_dates_list
        filtered_df = df[df['date'].isin(meal_avaiable_dates_list)]
        
        # Drop the 'date' column as it's no longer needed
        filtered_df = filtered_df.drop(columns=['date'])
        
        # Store the filtered DataFrame in the cleaned_segments dictionary
        cleaned_segments[segment_name] = filtered_df

    # Expand meal entries
    for index, meal_row in meal_df.iterrows():
        meal_effect_df = expand_meal_entry(meal_row)

        # Merge the DataFrames on the 'ts' column with an outer join
        merged_df = pd.merge(whole_meal_effect_df, meal_effect_df, on='ts', how='outer', suffixes=('_df1', '_df2'))

        # Fill NaN values with 0 for the carb_effect columns
        merged_df['carb_effect_df1'] = merged_df['carb_effect_df1'].fillna(0)
        merged_df['carb_effect_df2'] = merged_df['carb_effect_df2'].fillna(0)

        # Sum the carb_effect values
        merged_df['carb_effect'] = merged_df['carb_effect_df1'] + merged_df['carb_effect_df2']

        # Keep only the required columns
        whole_meal_effect_df = merged_df[['ts', 'carb_effect']]

    whole_meal_effect_df['assigned'] = False

    # Update the segments with meal data
    meal_updated_segments = update_segments_with_meals(cleaned_segments, whole_meal_effect_df)

    subject_facm = pd.read_csv(f"../FACM_split/{ffile}")
    # Group by 'Category' column
    grouped = subject_facm.groupby('FACAT')

    split_dfs = {category: group for category, group in grouped}
    # Step 1: Extract the desired columns
    new_df_basal = split_dfs["BASAL"][["FAORRES", "FADTC"]]
    new_df_basal['FADTC'] = pd.to_datetime(new_df_basal['FADTC'], format="%Y-%m-%d %H:%M:%S")
    new_df_basal.reset_index(drop=True, inplace=True)
    new_df_basal = new_df_basal.rename(columns={'FAORRES': 'value', 'FADTC': 'ts'})
    new_df_basal['assigned'] = False
    new_df_basal['end_ts'] = new_df_basal['ts'].shift(-1)
    
    basal_updated_segments = update_segments_with_basal(meal_updated_segments, new_df_basal)

    new_df_bolus = preprocess_t1dexi_bolus_tempbasal(f"../FACM_split/{ffile}", False)


    empty_b = {"ts": [], "bolus_effect": []}
    whole_bolus_effect_df = pd.DataFrame(data = empty_b)

    for index, bolus_row in new_df_bolus.iterrows():
        bolus_effect_df = expand_bolus_entry(bolus_row)

        # Merge the DataFrames on the 'ts' column with an outer join
        merged_df = pd.merge(whole_bolus_effect_df, bolus_effect_df, on='ts', how='outer', suffixes=('_df1', '_df2'))

        # Fill NaN values with 0 for the carb_effect columns
        merged_df['bolus_effect_df1'] = merged_df['bolus_effect_df1'].fillna(0)
        merged_df['bolus_effect_df2'] = merged_df['bolus_effect_df2'].fillna(0)
        

        # Sum the carb_effect values
        merged_df['bolus_effect'] = merged_df['bolus_effect_df1'] + merged_df['bolus_effect_df2']

        # Keep only the required columns
        whole_bolus_effect_df = merged_df[['ts', 'bolus_effect']]

    whole_bolus_effect_df["assigned"] = False

    bolus_updated_segments = update_segments_with_bolus(basal_updated_segments, whole_bolus_effect_df)

    # bolus_updated_segments = update_segments_with_bolus(basal_updated_segments, new_df_bolus)
    # Steps
    # Read in the Step count data
    subject_fa = pd.read_csv(f"../FA_split/{ffile}")
    # Group by 'Category' column
    grouped = subject_fa.groupby('FAOBJ')

    split_dfs = {category: group for category, group in grouped}
    # Step 1: Extract the desired columns
    new_df_step = split_dfs["10-SECOND INTERVAL STEP COUNT"][["FAORRES", "FADTC"]]
    step_df = new_df_step.rename(columns={'FAORRES': 'stepvalue', 'FADTC': 'ts'})
    step_df.reset_index(inplace=True)
    # for i in range(len(step_df)):
    #     step_ts = datetime.strptime(step_df['ts'][i], "%Y-%m-%d %H:%M:%S")
    #     step_df['ts'][i] = step_ts
    step_df['ts'] = pd.to_datetime(step_df['ts'])
    accumulate_step_list = []
    # test_segment = segments["segment_1"]
    for segment_name, segment_df in bolus_updated_segments.items():
        print(segment_name)
        accumulate_step_list = []
        for index, cgm_row in segment_df.iterrows():
            current = cgm_row['timestamp']
            first_timestamp = current - timedelta(minutes=50)
            window_list = pd.date_range(start=first_timestamp, end=current, freq='5min')

            accumulated_step = compute_accumulated_step(window_list, step_df)
            accumulate_step_list.append(accumulated_step)
        segment_df['steps'] = accumulate_step_list
    

    features_list, raw_glu_list = prepare_dataset(bolus_updated_segments, ph)
    # Assuming features_list and raw_glu_list are already defined
    features_array = np.array(features_list)
    labels_array = np.array(raw_glu_list)

    # Step 1: Split into 80% train+val and 20% test
    X_temp, X_test, y_temp, y_test = train_test_split(features_array, labels_array, test_size=0.2, shuffle=False)

    # Step 2: Split the 80% into 70% train and 10% val (0.7/0.8 = 0.875)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.125, shuffle=False)

    # Convert the splits to torch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    X_val = torch.tensor(X_val, dtype=torch.float32)
    y_val = torch.tensor(y_val, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)

    # Create DataLoaders
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

    val_dataset = TensorDataset(X_val, y_val)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    test_dataset = TensorDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    
    model = StackedLSTM(input_size, hidden_size, num_layers, output_size, dropout_prob) # input_size, hidden_size, num_layers, output_size, dropout_prob
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.00005)

    num_epochs =500
    for epoch in range(num_epochs):
        model.train()
        
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}')


        model.eval()
        with torch.no_grad():
            total_loss = 0
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets.float())
                total_loss += loss.item()
            
            avg_loss = total_loss / len(val_loader)
            print(f'Test Loss: {avg_loss:.4f}')

    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            predictions.append(outputs)
            actuals.append(targets)

    predictions = torch.cat(predictions).cpu().numpy()
    actuals = torch.cat(actuals).cpu().numpy()


    rmse = root_mean_squared_error(actuals,predictions)
    print(f'RMSE on validation set: {rmse}')

    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            predictions.append(outputs)
            actuals.append(targets)

    predictions = torch.cat(predictions).cpu().numpy()
    actuals = torch.cat(actuals).cpu().numpy()


    rmse = root_mean_squared_error(actuals,predictions)
    print(f'RMSE on validation set: {rmse}')
    new_test_rmse_list.append(rmse)