In [1]:
import argparse
import json
import os
import pandas as pd
import os

import datetime
import numpy as np
from cgms_data_seg_diatrend import CGMSDataSeg
from cnn_ohio import regressor, regressor_transfer, test_ckpt
from data_reader_DiaTrend import DataReader

import tensorflow as tf

# New method in TensorFlow 2.x:
# This will list the devices TensorFlow recognizes
print("TensorFlow version:", tf.__version__)
print("List of devices available to TensorFlow:")
print(tf.config.list_physical_devices())

TensorFlow version: 2.10.1
List of devices available to TensorFlow:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
def preprocess_DiaTrend(path):

    subject = pd.read_csv(path)
    subject['date'] = pd.to_datetime(subject['date'], errors='coerce')  # Convert 'date' column to datetime if not already
    print(subject['date'][0])
    subject.sort_values('date', inplace=True)  # Sort the DataFrame by the 'date' column

    # Assuming self.interval_timedelta is set, for example:
    interval_timedelta = datetime.timedelta(minutes=6)  # Example timedelta of 6 minutes, providing a range for latency

    # Create a list to store the results
    res = []

    # Initialize the first group
    if not subject.empty:
        current_group = [subject.iloc[0]['mg/dl']]
        last_time = subject.iloc[0]['date']

    # Iterate over rows in DataFrame starting from the second row
    for index, row in subject.iloc[1:].iterrows():
        current_time = row['date']
        if (current_time - last_time) <= interval_timedelta:
            # If the time difference is within the limit, add to the current group
            current_group.append(row['mg/dl'])
        else:
            # Otherwise, start a new group
            res.append(current_group)
            current_group = [row['mg/dl']]
        last_time = current_time

    # Add the last group if it's not empty
    if current_group:
        res.append(current_group)
    
    # Filter out groups with fewer than 10 glucose readings
    # res = [group for group in res if len(group) >= 10]

    return res

# The entire loop

In [18]:
epoch = 80
ph = 6
path = "../diatrend_results"
# Correct path with raw string
for sh in [6, 12, 18, 24]:
    for fold_num in range(1, 6):
        train_directory_path = f'C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold{fold_num}_training'
        # List files without their extensions
        train_file_names = [os.path.splitext(file)[0] for file in os.listdir(train_directory_path)
                            if os.path.isfile(os.path.join(train_directory_path, file))]
        cleaned_subjects = [s.replace("processed_cgm_data_", "") for s in train_file_names]


        # Define the directory path
        test_directory_path = f'C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold{fold_num}_test'  # Use a raw string for paths on Windows
        # List files without their extensions
        test_file_names = [os.path.splitext(file)[0] for file in os.listdir(test_directory_path)
                    if os.path.isfile(os.path.join(test_directory_path, file))]
        cleaned_test_subjects = [s.replace("processed_cgm_data_", "") for s in test_file_names]

        train_data = dict()
        for subj in train_file_names:

            subj_path = f'C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold{fold_num}_training/{subj}.csv'
            reader = preprocess_DiaTrend(subj_path)
            train_data[subj] = reader

        test_data = dict()
        for subj in test_file_names:

            subj_path = f'C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold{fold_num}_test/{subj}.csv'
            reader = preprocess_DiaTrend(subj_path)
            test_data[subj] = reader

        # a dumb dataset instance 
        train_dataset = CGMSDataSeg(
            "diatrend", "C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold1_training/processed_cgm_data_Subject12.csv", 5
        )
        sampling_horizon = sh
        prediction_horizon = ph
        scale = 0.01
        outtype = "Same"
        # train on training dataset
        # k_size, nblock, nn_size, nn_layer, learning_rate, batch_size, epoch, beta
        with open(f'../diatrend_results/config.json') as json_file:
            config = json.load(json_file)
        argv = (
            config["k_size"],
            config["nblock"],
            config["nn_size"],
            config["nn_layer"],
            config["learning_rate"],
            config["batch_size"],
            epoch,
            config["beta"],
        )
        l_type = config["loss"]
        # test on patients data
        outdir = os.path.join(path, f"ph_{prediction_horizon}_sh{sampling_horizon}_fold{fold_num}_{l_type}")
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        all_errs = []

        # Train on the training fold
        cleaned_subjects.sort()
        standard = False  

        train_pids = set(cleaned_subjects)
        local_train_data = []
        for k in train_pids:
            local_train_data += train_data["processed_cgm_data_" + k]


        train_dataset.data = local_train_data
        train_dataset.set_cutpoint = -1
        train_dataset.reset(
            sampling_horizon,
            prediction_horizon,
            scale,
            100,
            False,
            outtype,
            1,
            standard,
        )
        regressor(train_dataset, *argv, l_type, outdir)

        # Evaluate on the test patients
        all_errs = []
        for pid in cleaned_test_subjects: # First 9 as subset, can be an example
            # Fine-tune and test
            target_test_dataset = CGMSDataSeg(
            "diatrend", f"C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold{fold_num}_test/processed_cgm_data_{pid}.csv", 5
            )
            target_test_dataset.set_cutpoint = 1
            target_test_dataset.reset(
                sampling_horizon,
                prediction_horizon,
                scale,
                0.01,
                False,
                outtype,
                1,
                standard,
            )

            err, labels = test_ckpt(target_test_dataset, outdir)
            np.savetxt(
                f"{outdir}/{pid}.txt",
                [err],
                fmt="%.4f",
            )
            all_errs.append([str(pid), err]) 
        all_errs = np.array(all_errs, dtype=object)  # Use dtype=object to handle mixed types
        np.savetxt(f"{outdir}/errors.txt", all_errs, fmt="%s %.4f")
        # label pair:(groundTruth, y_pred)

2019-06-09 00:04:51
2019-06-04 02:21:39
2019-09-05 00:01:34
2019-10-02 00:01:19
2019-12-25 00:02:52
2020-01-01 00:03:08
2019-11-24 00:04:24
2020-04-09 00:00:29
2020-05-14 00:00:34
2021-02-14 00:04:12
2020-09-16 00:03:53
2020-06-26 00:03:03
2021-03-14 00:02:30
2021-06-30 00:00:12
2021-02-26 00:03:54
2021-06-12 00:04:25
2021-11-06 00:03:22
2018-12-05 00:02:57
2019-05-25 00:04:25
2019-01-01 00:02:49
2021-02-01 00:00:24
2021-07-14 00:00:17
2021-05-23 00:01:16
2021-12-09 06:22:05
2019-03-11 00:00:21
2019-08-06 00:01:42
2019-08-14 00:54:21
2019-03-11 00:04:15
2021-12-11 00:02:00
2022-02-16 00:02:15
2019-04-05 00:04:13
2022-02-25 00:02:54
2021-09-19 00:00:16
2019-08-17 00:00:56
2019-08-16 00:00:35
2019-06-27 00:00:03
2021-11-27 11:48:00
2019-06-27 00:00:53
2019-08-17 11:32:32
2019-06-27 00:02:26
2019-08-16 00:01:20
2019-06-09 00:00:34
2015-12-05 00:00:51.037000
2018-09-30 00:03:02
2018-06-20 00:04:30
2015-12-05 00:01:23.363000
2016-08-25 00:39:41.582000
2017-06-16 00:04:15
2017-03-17 00:03:34

In [None]:
for subj in file_names:
    subject = pd.read_excel(f"C:/Users/username/OneDrive/Desktop/BGprediction/DiaTrend/dataset/{subj}.xlsx","CGM")
    split_index = int(len(subject) * 0.8)
    # Split the DataFrame
    train_df = subject[:split_index]
    test_df = subject[split_index:]

    # Save the DataFrames to CSV files
    train_df.to_csv(f'C:/Users/username/OneDrive/Desktop/BGprediction/DiaTrend/train/{subj}_training_data.csv', index=False)
    test_df.to_csv(f'C:/Users/username/OneDrive/Desktop/BGprediction/DiaTrend/test/{subj}_testing_data.csv', index=False)

    # Optionally, confirm the operation
    print(f"Training data saved with {len(train_df)} records.")
    print(f"Testing data saved with {len(test_df)} records.")

In [None]:
# Need to do the train_test split
# First 80% in training
# Last 20% in test
# split_index = int(len(subject) * 0.8)
# # Split the DataFrame
# train_df = subject[:split_index]
# test_df = subject[split_index:]

# # Save the DataFrames to CSV files
# train_df.to_csv(f'C:/Users/username/OneDrive/Desktop/BGprediction/DiaTrend/train/{subj}_training_data.csv', index=False)
# test_df.to_csv(f'C:/Users/username/OneDrive/Desktop/BGprediction/DiaTrend/test/{subj}_testing_data.csv', index=False)

# # Optionally, confirm the operation
# print(f"Training data saved with {len(train_df)} records.")
# print(f"Testing data saved with {len(test_df)} records.")



In [4]:
# Assuming your DataFrame is named df
subject['date'] = pd.to_datetime(subject['date'])  # Convert 'date' column to datetime if not already
subject.sort_values('date', inplace=True)  # Sort the DataFrame by the 'date' column


In [11]:
# Assuming self.interval_timedelta is set, for example:
interval_timedelta = datetime.timedelta(minutes=6)  # Example timedelta of 6 minutes, providing a range for latency

# Create a list to store the results
res = []

# Initialize the first group
if not subject.empty:
    current_group = [subject.iloc[0]['mg/dl']]
    last_time = subject.iloc[0]['date']

# Iterate over rows in DataFrame starting from the second row
for index, row in subject.iloc[1:].iterrows():
    current_time = row['date']
    if (current_time - last_time) <= interval_timedelta:
        # If the time difference is within the limit, add to the current group
        current_group.append(row['mg/dl'])
    else:
        # Otherwise, start a new group
        res.append(current_group)
        current_group = [row['mg/dl']]
    last_time = current_time

# Add the last group if it's not empty
if current_group:
    res.append(current_group)


# Functions

In [2]:
def preprocess_DiaTrend(path):

    subject = pd.read_csv(path)
    subject['date'] = pd.to_datetime(subject['date'], errors='coerce')  # Convert 'date' column to datetime if not already
    print(subject['date'][0])
    subject.sort_values('date', inplace=True)  # Sort the DataFrame by the 'date' column

    # Assuming self.interval_timedelta is set, for example:
    interval_timedelta = datetime.timedelta(minutes=6)  # Example timedelta of 6 minutes, providing a range for latency

    # Create a list to store the results
    res = []

    # Initialize the first group
    if not subject.empty:
        current_group = [subject.iloc[0]['mg/dl']]
        last_time = subject.iloc[0]['date']

    # Iterate over rows in DataFrame starting from the second row
    for index, row in subject.iloc[1:].iterrows():
        current_time = row['date']
        if (current_time - last_time) <= interval_timedelta:
            # If the time difference is within the limit, add to the current group
            current_group.append(row['mg/dl'])
        else:
            # Otherwise, start a new group
            res.append(current_group)
            current_group = [row['mg/dl']]
        last_time = current_time

    # Add the last group if it's not empty
    if current_group:
        res.append(current_group)
    
    # Filter out groups with fewer than 10 glucose readings
    # res = [group for group in res if len(group) >= 10]

    return res

# For loop to generate res for train and test

In [16]:
# Fomulate a loop to create a list to include all the files in train and test datset and generate the res for each of them seperately
# C:/Users/username/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold1_training/processed_cgm_data_Subject12.csv
fold_num = 2

# Correct path with raw string
train_directory_path = f'C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold{fold_num}_training'

# List files without their extensions
train_file_names = [os.path.splitext(file)[0] for file in os.listdir(train_directory_path)
                    if os.path.isfile(os.path.join(train_directory_path, file))]

# Print the list of file names
print(train_file_names)


['processed_cgm_data_Subject1', 'processed_cgm_data_Subject10', 'processed_cgm_data_Subject11', 'processed_cgm_data_Subject2', 'processed_cgm_data_Subject23', 'processed_cgm_data_Subject24', 'processed_cgm_data_Subject25', 'processed_cgm_data_Subject26', 'processed_cgm_data_Subject27', 'processed_cgm_data_Subject28', 'processed_cgm_data_Subject29', 'processed_cgm_data_Subject3', 'processed_cgm_data_Subject30', 'processed_cgm_data_Subject31', 'processed_cgm_data_Subject32', 'processed_cgm_data_Subject33', 'processed_cgm_data_Subject34', 'processed_cgm_data_Subject35', 'processed_cgm_data_Subject36', 'processed_cgm_data_Subject37', 'processed_cgm_data_Subject38', 'processed_cgm_data_Subject39', 'processed_cgm_data_Subject4', 'processed_cgm_data_Subject40', 'processed_cgm_data_Subject41', 'processed_cgm_data_Subject42', 'processed_cgm_data_Subject43', 'processed_cgm_data_Subject44', 'processed_cgm_data_Subject45', 'processed_cgm_data_Subject46', 'processed_cgm_data_Subject47', 'processed_

In [4]:
cleaned_subjects = [s.replace("processed_cgm_data_", "") for s in train_file_names]

print(cleaned_subjects)

['Subject12', 'Subject13', 'Subject14', 'Subject15', 'Subject16', 'Subject17', 'Subject18', 'Subject19', 'Subject20', 'Subject21', 'Subject22', 'Subject23', 'Subject24', 'Subject25', 'Subject26', 'Subject27', 'Subject28', 'Subject29', 'Subject30', 'Subject31', 'Subject32', 'Subject33', 'Subject34', 'Subject35', 'Subject36', 'Subject37', 'Subject38', 'Subject39', 'Subject40', 'Subject41', 'Subject42', 'Subject43', 'Subject44', 'Subject45', 'Subject46', 'Subject47', 'Subject48', 'Subject49', 'Subject50', 'Subject51', 'Subject53', 'Subject54']


In [5]:
# Define the directory path
test_directory_path = r'C:\Users\baiyi\OneDrive\Desktop\Modify_GenBG\modified_diatrend_subset\fold1_test'  # Use a raw string for paths on Windows

# List files without their extensions
test_file_names = [os.path.splitext(file)[0] for file in os.listdir(test_directory_path)
              if os.path.isfile(os.path.join(test_directory_path, file))]

# Print the list of file names
print(test_file_names)

['processed_cgm_data_Subject1', 'processed_cgm_data_Subject10', 'processed_cgm_data_Subject11', 'processed_cgm_data_Subject2', 'processed_cgm_data_Subject3', 'processed_cgm_data_Subject4', 'processed_cgm_data_Subject5', 'processed_cgm_data_Subject6', 'processed_cgm_data_Subject7', 'processed_cgm_data_Subject8', 'processed_cgm_data_Subject9']


In [6]:
cleaned_test_subjects = [s.replace("processed_cgm_data_", "") for s in test_file_names]

print(cleaned_test_subjects)

['Subject1', 'Subject10', 'Subject11', 'Subject2', 'Subject3', 'Subject4', 'Subject5', 'Subject6', 'Subject7', 'Subject8', 'Subject9']


In [7]:
train_data = dict()
for subj in train_file_names:
    print(subj)
    subj_path = f'C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold1_training/{subj}.csv'
    reader = preprocess_DiaTrend(subj_path)
    train_data[subj] = reader

processed_cgm_data_Subject12
2019-06-09 00:04:51
processed_cgm_data_Subject13
2019-06-04 02:21:39
processed_cgm_data_Subject14
2019-09-05 00:01:34
processed_cgm_data_Subject15
2019-10-02 00:01:19
processed_cgm_data_Subject16
2019-12-25 00:02:52
processed_cgm_data_Subject17
2020-01-01 00:03:08
processed_cgm_data_Subject18
2019-11-24 00:04:24
processed_cgm_data_Subject19
2020-04-09 00:00:29
processed_cgm_data_Subject20
2020-05-14 00:00:34
processed_cgm_data_Subject21
2021-02-14 00:04:12
processed_cgm_data_Subject22
2020-09-16 00:03:53
processed_cgm_data_Subject23
2020-06-26 00:03:03
processed_cgm_data_Subject24
2021-03-14 00:02:30
processed_cgm_data_Subject25
2021-06-30 00:00:12
processed_cgm_data_Subject26
2021-02-26 00:03:54
processed_cgm_data_Subject27
2021-06-12 00:04:25
processed_cgm_data_Subject28
2021-11-06 00:03:22
processed_cgm_data_Subject29
2018-12-05 00:02:57
processed_cgm_data_Subject30
2019-05-25 00:04:25
processed_cgm_data_Subject31
2019-01-01 00:02:49
processed_cgm_data_S

In [9]:
# Have not been run
test_data = dict()
for subj in test_file_names:
    print(subj)
    subj_path = f'C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold1_test/{subj}.csv'
    reader = preprocess_DiaTrend(subj_path)
    test_data[subj] = reader

processed_cgm_data_Subject1
2015-12-05 00:00:51.037000
processed_cgm_data_Subject10
2018-09-30 00:03:02
processed_cgm_data_Subject11
2018-06-20 00:04:30
processed_cgm_data_Subject2
2015-12-05 00:01:23.363000
processed_cgm_data_Subject3
2016-08-25 00:39:41.582000
processed_cgm_data_Subject4
2017-06-16 00:04:15
processed_cgm_data_Subject5
2017-03-17 00:03:34
processed_cgm_data_Subject6
2017-11-11 00:03:09
processed_cgm_data_Subject7
2017-10-04 00:02:14
processed_cgm_data_Subject8
2018-08-14 04:16:19
processed_cgm_data_Subject9
2019-04-10 01:29:52


In [10]:
epoch = 80
sh = 6
ph = 6
path = "../diatrend_results"

In [11]:
# a dumb dataset instance "C:\Users\baiyi\OneDrive\Desktop\Modify_GenBG\modified_diatrend_subset\fold1_training\processed_cgm_data_Subject12.csv"
train_dataset = CGMSDataSeg(
    "diatrend", "C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold1_training/processed_cgm_data_Subject12.csv", 5
)
sampling_horizon = sh
prediction_horizon = ph
scale = 0.01
outtype = "Same"
# train on training dataset
# k_size, nblock, nn_size, nn_layer, learning_rate, batch_size, epoch, beta
with open(f'../diatrend_results/config.json') as json_file:
    config = json.load(json_file)
argv = (
    config["k_size"],
    config["nblock"],
    config["nn_size"],
    config["nn_layer"],
    config["learning_rate"],
    config["batch_size"],
    epoch,
    config["beta"],
)
l_type = config["loss"]
# test on patients data
outdir = os.path.join(path, f"ph_{prediction_horizon}_sh{sampling_horizon}_{l_type}")
if not os.path.exists(outdir):
    os.makedirs(outdir)
all_errs = []

2019-06-09 00:04:51
Reading 12 segments


In [None]:
# Loop
cleaned_subjects.sort()
standard = False  # do not use standard

train_pids = set(cleaned_subjects)
local_train_data = []
for k in train_pids:
    local_train_data += train_data["processed_cgm_data_" + k]
print(f"Pretrain data: {sum([sum(x) for x in local_train_data])}")

train_dataset.data = local_train_data
train_dataset.set_cutpoint = -1
train_dataset.reset(
    sampling_horizon,
    prediction_horizon,
    scale,
    100,
    False,
    outtype,
    1,
    standard,
)
regressor(train_dataset, *argv, l_type, outdir)

In [15]:
all_errs = []
for pid in cleaned_test_subjects: # First 9 as subset, can be an example
    # train_pids = set(cleaned_subjects)
    # local_train_data = []
    # for k in train_pids:
    #     local_train_data += train_data[k + "_training_data"]
    # print(f"Pretrain data: {sum([sum(x) for x in local_train_data])}")
    
    # train_dataset.data = local_train_data
    # train_dataset.set_cutpoint = -1
    # train_dataset.reset(
    #     sampling_horizon,
    #     prediction_horizon,
    #     scale,
    #     100,
    #     False,
    #     outtype,
    #     1,
    #     standard,
    # )
    # regressor(train_dataset, *argv, l_type, outdir)
    # Fine-tune and test
    target_test_dataset = CGMSDataSeg(
    "diatrend", f"C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold1_test/processed_cgm_data_{pid}.csv", 5
    )
    target_test_dataset.set_cutpoint = 1
    target_test_dataset.reset(
        sampling_horizon,
        prediction_horizon,
        scale,
        0.01,
        False,
        outtype,
        1,
        standard,
    )

    # target_train_dataset = CGMSDataSeg(
    # "ohio", f"C:/Users/baiyi/OneDrive/Desktop/Modify_GenBG/modified_diatrend_subset/fold1_training/{pid}.csv", 5
    # )
    # target_train_dataset.set_cutpoint = -1
    # target_train_dataset.reset(
    #     sampling_horizon,
    #     prediction_horizon,
    #     scale,
    #     100,
    #     False,
    #     outtype,
    #     1,
    #     standard,
    # )
    err, labels = test_ckpt(target_test_dataset, outdir)
    # errs = [err]
    # transfer_res = [labels]
    # for i in range(1, 2):
    #     err, labels = regressor_transfer(
    #         target_train_dataset,
    #         target_test_dataset,
    #         config["batch_size"],
    #         epoch,
    #         outdir,
    #         i,
    #     )
    #     errs.append(err)
    #     transfer_res.append(labels)
    # transfer_res = np.concatenate(transfer_res, axis=1)
    np.savetxt(
        f"{outdir}/{pid}.txt",
        [err],
        fmt="%.4f",
    )
    all_errs.append([str(pid), err]) 
all_errs = np.array(all_errs, dtype=object)  # Use dtype=object to handle mixed types
np.savetxt(f"{outdir}/errors.txt", all_errs, fmt="%s %.4f")
# label pair:(groundTruth, y_pred)

2015-12-05 00:00:51.037000
Reading 209 segments
Building dataset, requesting data from 0 to 209
############################ Data structure summary ############################
Hypo/no_hypo: 0/1
Found 209 continuous time series
Data shape: (8611, 6), Train/test: 1/8610
Train test ratio: 0.00
################################################################################
INFO:tensorflow:Restoring parameters from ../diatrend_results\ph_6_sh6_rmse\pretrain
2018-09-30 00:03:02
Reading 10 segments
Building dataset, requesting data from 0 to 10
############################ Data structure summary ############################
Hypo/no_hypo: 0/1
Found 10 continuous time series
Data shape: (11829, 6), Train/test: 1/11828
Train test ratio: 0.00
################################################################################
INFO:tensorflow:Restoring parameters from ../diatrend_results\ph_6_sh6_rmse\pretrain
2018-06-20 00:04:30
Reading 22 segments
Building dataset, requesting data from 0 to 22
###

In [None]:
all_errs

In [None]:
# Convert the second and third columns to floats
second_column = all_errs[:, 1].astype(float)
third_column = all_errs[:, 2].astype(float)

# Calculate the average
average_second_column = np.mean(second_column)
average_third_column = np.mean(third_column)

print("Average of the second column:", average_second_column)
print("Average of the third column:", average_third_column)

In [38]:
# Convert the second and third columns to floats
second_column = all_errs[:, 1].astype(float)
third_column = all_errs[:, 2].astype(float)

# Calculate the average
average_second_column = np.mean(second_column)
average_third_column = np.mean(third_column)

print("Average of the second column:", average_second_column)
print("Average of the third column:", average_third_column)

Average of the second column: 0.23931833875
Average of the third column: 0.23320475625


In [None]:
# Also, if the previous cell run into an issue but all result txt files are ready
# You can run this to evaluate:
import argparse
import json
import os
import pandas as pd
import os
import datetime
import numpy as np

# List all files and directories in the current directory
files_and_directories = os.listdir('.')

# Filter for files that end with .txt
txt_files = [file for file in files_and_directories if file.endswith('.txt')]


# Read the data from the text file
def calcuate_rmse(file):
    data = np.loadtxt(file)  # Make sure to replace 'data.txt' with your actual file path
    print(file)
    # Splitting the data into groundtruth and predictions
    groundtruth = data[:, 0]  # First column as ground truth (also same as third column)
    predictions_1 = data[:, 1]  # Second column as predictions from method 1
    predictions_2 = data[:, 3]  # Fourth column as predictions from method 2

    # Function to calculate RMSE
    def calculate_rmse(true_values, predictions):
        mse = np.mean((true_values - predictions) ** 2)
        rmse = np.sqrt(mse)
        return rmse

    # Calculate RMSE for each method
    rmse_method_1 = calculate_rmse(groundtruth, predictions_1)
    rmse_method_2 = calculate_rmse(groundtruth, predictions_2)

    print("RMSE for Method 1:", rmse_method_1)
    print("RMSE for Method 2:", rmse_method_2)
    return rmse_method_1


rmse_list = []
for f in txt_files[1:]:
    rmse1 = calcuate_rmse(f)
    print(rmse1)
    rmse_list.append(rmse1)

print(np.average(rmse_list))

print(rmse_list)