In [1]:
import argparse
import json
import os
import pandas as pd
import os

import datetime
import numpy as np
from cgms_data_seg_diatrend import CGMSDataSeg
from cnn_ohio import regressor, regressor_transfer, test_ckpt
from data_reader_DiaTrend import DataReader

In [2]:
import tensorflow as tf

# New method in TensorFlow 2.x:
# This will list the devices TensorFlow recognizes
print("TensorFlow version:", tf.__version__)
print("List of devices available to TensorFlow:")
print(tf.config.list_physical_devices())



TensorFlow version: 2.10.1
List of devices available to TensorFlow:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Example subjects
file_names = [
    'Subject11',
    'Subject26', 
    'Subject3', 
    'Subject30', 
    'Subject31', 
    'Subject36', 
    'Subject15', 
    'Subject37', 
    'Subject38', 
    'Subject39', 
    'Subject41', 
    'Subject42', 
    'Subject43',
    'Subject5', 
    'Subject6', 
    'Subject8', 
]

In [2]:
len(file_names)

16

In [5]:
for subj in file_names:
    subject = pd.read_excel(f"C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/dataset/{subj}.xlsx","CGM")
    split_index = int(len(subject) * 0.8)
    # Split the DataFrame
    train_df = subject[:split_index]
    test_df = subject[split_index:]

    # Save the DataFrames to CSV files
    train_df.to_csv(f'C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/train/{subj}_training_data.csv', index=False)
    test_df.to_csv(f'C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/test/{subj}_testing_data.csv', index=False)

    # Optionally, confirm the operation
    print(f"Training data saved with {len(train_df)} records.")
    print(f"Testing data saved with {len(test_df)} records.")

Training data saved with 237796 records.
Testing data saved with 59450 records.
Training data saved with 58584 records.
Testing data saved with 14646 records.
Training data saved with 318856 records.
Testing data saved with 79714 records.
Training data saved with 39179 records.
Testing data saved with 9795 records.
Training data saved with 40054 records.
Testing data saved with 10014 records.
Training data saved with 21895 records.
Testing data saved with 5474 records.
Training data saved with 170774 records.
Testing data saved with 42694 records.
Training data saved with 22718 records.
Testing data saved with 5680 records.
Training data saved with 22183 records.
Testing data saved with 5546 records.
Training data saved with 20896 records.
Testing data saved with 5224 records.
Training data saved with 19178 records.
Testing data saved with 4795 records.
Training data saved with 17768 records.
Testing data saved with 4442 records.
Training data saved with 14820 records.
Testing data sav

In [None]:
# Need to do the train_test split
# First 80% in training
# Last 20% in test
# split_index = int(len(subject) * 0.8)
# # Split the DataFrame
# train_df = subject[:split_index]
# test_df = subject[split_index:]

# # Save the DataFrames to CSV files
# train_df.to_csv(f'C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/train/{subj}_training_data.csv', index=False)
# test_df.to_csv(f'C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/test/{subj}_testing_data.csv', index=False)

# # Optionally, confirm the operation
# print(f"Training data saved with {len(train_df)} records.")
# print(f"Testing data saved with {len(test_df)} records.")



In [4]:
# Assuming your DataFrame is named df
subject['date'] = pd.to_datetime(subject['date'])  # Convert 'date' column to datetime if not already
subject.sort_values('date', inplace=True)  # Sort the DataFrame by the 'date' column


In [11]:
# Assuming self.interval_timedelta is set, for example:
interval_timedelta = datetime.timedelta(minutes=6)  # Example timedelta of 6 minutes, providing a range for latency

# Create a list to store the results
res = []

# Initialize the first group
if not subject.empty:
    current_group = [subject.iloc[0]['mg/dl']]
    last_time = subject.iloc[0]['date']

# Iterate over rows in DataFrame starting from the second row
for index, row in subject.iloc[1:].iterrows():
    current_time = row['date']
    if (current_time - last_time) <= interval_timedelta:
        # If the time difference is within the limit, add to the current group
        current_group.append(row['mg/dl'])
    else:
        # Otherwise, start a new group
        res.append(current_group)
        current_group = [row['mg/dl']]
    last_time = current_time

# Add the last group if it's not empty
if current_group:
    res.append(current_group)


# Functions

In [4]:
def preprocess_DiaTrend(path):

    subject = pd.read_csv(path)
    subject['date'] = pd.to_datetime(subject['date'], errors='coerce')  # Convert 'date' column to datetime if not already
    print(subject['date'][0])
    subject.sort_values('date', inplace=True)  # Sort the DataFrame by the 'date' column

    # Assuming self.interval_timedelta is set, for example:
    interval_timedelta = datetime.timedelta(minutes=6)  # Example timedelta of 6 minutes, providing a range for latency

    # Create a list to store the results
    res = []

    # Initialize the first group
    if not subject.empty:
        current_group = [subject.iloc[0]['mg/dl']]
        last_time = subject.iloc[0]['date']

    # Iterate over rows in DataFrame starting from the second row
    for index, row in subject.iloc[1:].iterrows():
        current_time = row['date']
        if (current_time - last_time) <= interval_timedelta:
            # If the time difference is within the limit, add to the current group
            current_group.append(row['mg/dl'])
        else:
            # Otherwise, start a new group
            res.append(current_group)
            current_group = [row['mg/dl']]
        last_time = current_time

    # Add the last group if it's not empty
    if current_group:
        res.append(current_group)
    
    # Filter out groups with fewer than 10 glucose readings
    # res = [group for group in res if len(group) >= 10]

    return res

# For loop to generate res for train and test

In [5]:
# Fomulate a loop to create a list to include all the files in train and test datset and generate the res for each of them seperately

# Define the directory path
train_directory_path = r'C:\Users\baiyi\OneDrive\Desktop\BGprediction\DiaTrend\train'  # Use a raw string for paths on Windows

# List files without their extensions
train_file_names = [os.path.splitext(file)[0] for file in os.listdir(train_directory_path)
              if os.path.isfile(os.path.join(train_directory_path, file))]

# Print the list of file names
print(train_file_names)


['Subject11_training_data', 'Subject15_training_data', 'Subject26_training_data', 'Subject30_training_data', 'Subject31_training_data', 'Subject36_training_data', 'Subject37_training_data', 'Subject38_training_data', 'Subject39_training_data', 'Subject3_training_data', 'Subject41_training_data', 'Subject42_training_data', 'Subject43_training_data', 'Subject5_training_data', 'Subject6_training_data', 'Subject8_training_data']


In [6]:
cleaned_subjects = [s.replace("_training_data", "") for s in train_file_names]

print(cleaned_subjects)

['Subject11', 'Subject15', 'Subject26', 'Subject30', 'Subject31', 'Subject36', 'Subject37', 'Subject38', 'Subject39', 'Subject3', 'Subject41', 'Subject42', 'Subject43', 'Subject5', 'Subject6', 'Subject8']


In [7]:
# Define the directory path
test_directory_path = r'C:\Users\baiyi\OneDrive\Desktop\BGprediction\DiaTrend\test'  # Use a raw string for paths on Windows

# List files without their extensions
test_file_names = [os.path.splitext(file)[0] for file in os.listdir(test_directory_path)
              if os.path.isfile(os.path.join(test_directory_path, file))]

# Print the list of file names
print(test_file_names)

['Subject11_testing_data', 'Subject15_testing_data', 'Subject26_testing_data', 'Subject30_testing_data', 'Subject31_testing_data', 'Subject36_testing_data', 'Subject37_testing_data', 'Subject38_testing_data', 'Subject39_testing_data', 'Subject3_testing_data', 'Subject41_testing_data', 'Subject42_testing_data', 'Subject43_testing_data', 'Subject5_testing_data', 'Subject6_testing_data', 'Subject8_testing_data']


In [8]:
train_data = dict()
for subj in train_file_names:
    print(subj)
    subj_path = f'C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/train/{subj}.csv'
    reader = preprocess_DiaTrend(subj_path)
    train_data[subj] = reader

Subject11_training_data
2018-06-19 16:19:32
Subject15_training_data
2019-10-01 12:51:22
Subject26_training_data
2021-02-25 17:28:53
Subject30_training_data
2019-11-20 18:49:52
Subject31_training_data
2019-06-25 19:28:12
Subject36_training_data
2019-06-24 13:01:19
Subject37_training_data
2019-11-20 19:40:51
Subject38_training_data
2019-11-22 20:13:43
Subject39_training_data
2019-06-17 14:59:59
Subject3_training_data
2016-08-25 00:39:41.582000
Subject41_training_data
2022-02-15 22:27:15
Subject42_training_data
2019-06-25 19:12:35
Subject43_training_data
2022-02-24 22:27:55
Subject5_training_data
2017-03-16 15:43:40
Subject6_training_data
2017-11-10 10:33:16
Subject8_training_data
2018-08-13 11:36:19


In [9]:
# Have not been run
test_data = dict()
for subj in test_file_names:
    print(subj)
    subj_path = f'C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/test/{subj}.csv'
    reader = preprocess_DiaTrend(subj_path)
    test_data[subj] = reader

Subject11_testing_data
2021-09-18 12:06:10
Subject15_testing_data
2021-06-11 06:24:07
Subject26_testing_data
2021-10-03 03:56:22
Subject30_testing_data
2019-06-30 12:11:34
Subject31_testing_data
2019-02-04 17:15:31
Subject36_testing_data
2019-04-02 00:05:54
Subject37_testing_data
2019-08-27 02:25:05
Subject38_testing_data
2019-09-03 11:39:11
Subject39_testing_data
2019-03-30 03:40:14
Subject3_testing_data
2020-12-15 09:38:16
Subject41_testing_data
2022-04-25 04:11:13
Subject42_testing_data
2019-04-20 18:37:49
Subject43_testing_data
2022-04-20 09:10:25
Subject5_testing_data
2021-03-30 17:20:32
Subject6_testing_data
2021-08-15 19:33:17
Subject8_testing_data
2021-09-07 06:48:39


In [10]:
epoch = 10
ph = 6
path = "../diatrend_results"

In [11]:
# a dumb dataset instance
train_dataset = CGMSDataSeg(
    "ohio", "C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/train/Subject11_training_data.csv", 5
)
sampling_horizon = 12
prediction_horizon = ph
scale = 0.01
outtype = "Same"
# train on training dataset
# k_size, nblock, nn_size, nn_layer, learning_rate, batch_size, epoch, beta
with open(f'../diatrend_results/config.json') as json_file:
    config = json.load(json_file)
argv = (
    config["k_size"],
    config["nblock"],
    config["nn_size"],
    config["nn_layer"],
    config["learning_rate"],
    config["batch_size"],
    epoch,
    config["beta"],
)
l_type = config["loss"]
# test on patients data
outdir = os.path.join(path, f"ph_{prediction_horizon}_{l_type}")
if not os.path.exists(outdir):
    os.makedirs(outdir)
all_errs = []

2018-06-19 16:19:32
Reading 571 segments


In [33]:
cleaned_subjects

['Subject11',
 'Subject15',
 'Subject26',
 'Subject3',
 'Subject30',
 'Subject31',
 'Subject36',
 'Subject37',
 'Subject38',
 'Subject39',
 'Subject41',
 'Subject42',
 'Subject43',
 'Subject5',
 'Subject6',
 'Subject8']

In [34]:
train_data.keys()

dict_keys(['Subject11_training_data', 'Subject15_training_data', 'Subject26_training_data', 'Subject30_training_data', 'Subject31_training_data', 'Subject36_training_data', 'Subject37_training_data', 'Subject38_training_data', 'Subject39_training_data', 'Subject3_training_data', 'Subject41_training_data', 'Subject42_training_data', 'Subject43_training_data', 'Subject5_training_data', 'Subject6_training_data', 'Subject8_training_data'])

In [35]:
train_pids = set(cleaned_subjects) - set([pid])
train_pids

{'Subject15',
 'Subject26',
 'Subject3',
 'Subject30',
 'Subject31',
 'Subject36',
 'Subject37',
 'Subject38',
 'Subject39',
 'Subject41',
 'Subject42',
 'Subject43',
 'Subject5',
 'Subject6',
 'Subject8'}

In [12]:
# Loop
cleaned_subjects.sort()
standard = False  # do not use standard
all_errs = []
for pid in cleaned_subjects: # First 9 as subset, can be an example
    train_pids = set(cleaned_subjects) - set([pid])
    local_train_data = []
    for k in train_pids:
        local_train_data += train_data[k + "_training_data"]
    print(f"Pretrain data: {sum([sum(x) for x in local_train_data])}")
    
    train_dataset.data = local_train_data
    train_dataset.set_cutpoint = -1
    train_dataset.reset(
        sampling_horizon,
        prediction_horizon,
        scale,
        100,
        False,
        outtype,
        1,
        standard,
    )
    regressor(train_dataset, *argv, l_type, outdir)
    # Fine-tune and test
    # target_test_dataset = CGMSDataSeg(
    #     "ohio", f"C:/Users/baiyi/OneDrive/Desktop/BGprediction/OhioT1DM/2018/test/{pid}-ws-testing.xml", 6
    # )
    target_test_dataset = CGMSDataSeg(
    "ohio", f"C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/test/{pid}_testing_data.csv", 5
    )
    target_test_dataset.set_cutpoint = 1
    target_test_dataset.reset(
        sampling_horizon,
        prediction_horizon,
        scale,
        0.01,
        False,
        outtype,
        1,
        standard,
    )
    # target_train_dataset = CGMSDataSeg(
    #     "ohio", f"C:/Users/baiyi/OneDrive/Desktop/BGprediction/OhioT1DM/2018/test/{pid}-ws-testing.xml", 5
    # )
    target_train_dataset = CGMSDataSeg(
    "ohio", f"C:/Users/baiyi/OneDrive/Desktop/BGprediction/DiaTrend/train/{pid}_training_data.csv", 5
    )
    target_train_dataset.set_cutpoint = -1
    target_train_dataset.reset(
        sampling_horizon,
        prediction_horizon,
        scale,
        100,
        False,
        outtype,
        1,
        standard,
    )
    err, labels = test_ckpt(target_test_dataset, outdir)
    errs = [err]
    transfer_res = [labels]
    for i in range(1, 2):
        err, labels = regressor_transfer(
            target_train_dataset,
            target_test_dataset,
            config["batch_size"],
            epoch,
            outdir,
            i,
        )
        errs.append(err)
        transfer_res.append(labels)
    transfer_res = np.concatenate(transfer_res, axis=1)
    np.savetxt(
        f"{outdir}/{pid}.txt",
        transfer_res,
        fmt="%.4f %.4f %.4f %.4f",
    )
    all_errs.append([pid] + errs)
all_errs = np.array(all_errs)
np.savetxt(f"{outdir}/errors.txt", all_errs, fmt="%s %.4f %.4f")
# label pair:(groundTruth, y_pred)

Pretrain data: 320622559.02958834
Building dataset, requesting data from 0 to 273724
Train data requested beyond limit, using all but last one
############################ Data structure summary ############################
Hypo/no_hypo: 43430/1327730
Found 273724 continuous time series
Data shape: (1371162, 12), Train/test: 1371160/2
Train test ratio: 685580.00
################################################################################
Feature size is: 
0
In regressor, x =
Tensor("x:0", shape=(None, 12), dtype=float32)
In regressor, y =
Tensor("add:0", shape=(None, 12), dtype=float32)
line73: Shape of y: (None, 12)
Before L2 regularization
Before training for loop
int(low_fid_data.train_n / batch_size) =  21424
Epoch 0, train loss: 0.283173
Epoch 1, train loss: 0.217323
Epoch 2, train loss: 0.268464
Epoch 3, train loss: 0.247767
Epoch 4, train loss: 0.236048
Epoch 5, train loss: 0.279901
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Epoch 6, 

TypeError: Mismatch between array dtype ('<U32') and format specifier ('%s %.4f %.4f')

In [13]:
all_errs

array([['Subject11', '0.32737887', '0.31751657'],
       ['Subject15', '0.21702431', '0.21618843'],
       ['Subject26', '0.31711185', '0.27349338'],
       ['Subject3', '0.2685392', '0.2699521'],
       ['Subject30', '0.21345535', '0.22502957'],
       ['Subject31', '0.1765919', '0.17653388'],
       ['Subject36', '0.23704633', '0.23471707'],
       ['Subject37', '0.24713682', '0.24656081'],
       ['Subject38', '0.23195074', '0.2247002'],
       ['Subject39', '0.1977754', '0.18702544'],
       ['Subject41', '0.19897726', '0.19630045'],
       ['Subject42', '0.16103005', '0.15816924'],
       ['Subject43', '0.26570606', '0.25967422'],
       ['Subject5', '0.20555699', '0.2006869'],
       ['Subject6', '0.2507571', '0.24363846'],
       ['Subject8', '0.31099993', '0.30476284']], dtype='<U32')

In [None]:
['Subject11', '0.32737887', '0.31751657'],
['Subject15', '0.21702431', '0.21618843'],
['Subject26', '0.31711185', '0.27349338'],
['Subject3', '0.2685392', '0.2699521'],
['Subject30', '0.21345535', '0.22502957'],
['Subject31', '0.1765919', '0.17653388'],
['Subject36', '0.23704633', '0.23471707'],
['Subject37', '0.24713682', '0.24656081'],
['Subject38', '0.23195074', '0.2247002'],
['Subject39', '0.1977754', '0.18702544'],
['Subject41', '0.19897726', '0.19630045'],
['Subject42', '0.16103005', '0.15816924'],
['Subject43', '0.26570606', '0.25967422'],
['Subject5', '0.20555699', '0.2006869'],
['Subject6', '0.2507571', '0.24363846'],
['Subject8', '0.31099993', '0.30476284']

In [None]:
['Subject11', '0.3257632', '0.31918257'],
['Subject15', '0.21759439', '0.21790388'],
['Subject26', '0.28789735', '0.27718654'],
['Subject3', '0.26801455', '0.26734337'],
['Subject30', '0.21501394', '0.20972419'],
['Subject31', '0.1775641', '0.17692266'],
['Subject36', '0.23820741', '0.23973823'],
['Subject37', '0.24739642', '0.24186915'],
['Subject38', '0.2375784', '0.22447872'],
['Subject39', '0.2006417', '0.18739489'],
['Subject41', '0.19930626', '0.1959955'],
['Subject42', '0.16283263', '0.15715173'],
['Subject43', '0.26754174', '0.26674312'],
['Subject5', '0.20940122', '0.20045878'],
['Subject6', '0.25433964', '0.24411151'],
['Subject8', '0.32000047', '0.30507126']，## 30 sampling horizon

In [14]:
# Convert the second and third columns to floats
second_column = all_errs[:, 1].astype(float)
third_column = all_errs[:, 2].astype(float)

# Calculate the average
average_second_column = np.mean(second_column)
average_third_column = np.mean(third_column)

print("Average of the second column:", average_second_column)
print("Average of the third column:", average_third_column)

Average of the second column: 0.23918988500000002
Average of the third column: 0.23343434749999997


In [38]:
# Convert the second and third columns to floats
second_column = all_errs[:, 1].astype(float)
third_column = all_errs[:, 2].astype(float)

# Calculate the average
average_second_column = np.mean(second_column)
average_third_column = np.mean(third_column)

print("Average of the second column:", average_second_column)
print("Average of the third column:", average_third_column)

Average of the second column: 0.23931833875
Average of the third column: 0.23320475625


In [None]:
# Also, if the previous cell run into an issue but all result txt files are ready
# You can run this to evaluate:
import argparse
import json
import os
import pandas as pd
import os
import datetime
import numpy as np

# List all files and directories in the current directory
files_and_directories = os.listdir('.')

# Filter for files that end with .txt
txt_files = [file for file in files_and_directories if file.endswith('.txt')]


# Read the data from the text file
def calcuate_rmse(file):
    data = np.loadtxt(file)  # Make sure to replace 'data.txt' with your actual file path
    print(file)
    # Splitting the data into groundtruth and predictions
    groundtruth = data[:, 0]  # First column as ground truth (also same as third column)
    predictions_1 = data[:, 1]  # Second column as predictions from method 1
    predictions_2 = data[:, 3]  # Fourth column as predictions from method 2

    # Function to calculate RMSE
    def calculate_rmse(true_values, predictions):
        mse = np.mean((true_values - predictions) ** 2)
        rmse = np.sqrt(mse)
        return rmse

    # Calculate RMSE for each method
    rmse_method_1 = calculate_rmse(groundtruth, predictions_1)
    rmse_method_2 = calculate_rmse(groundtruth, predictions_2)

    print("RMSE for Method 1:", rmse_method_1)
    print("RMSE for Method 2:", rmse_method_2)
    return rmse_method_1


rmse_list = []
for f in txt_files[1:]:
    rmse1 = calcuate_rmse(f)
    print(rmse1)
    rmse_list.append(rmse1)

print(np.average(rmse_list))

print(rmse_list)