In [6]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

from load_xml_data import load_ohiot1dm_xml

In [7]:
# Load a single OhioT1DM training file. The loader's result is indexed by
# record-type name below; presumably it is a dict of pandas DataFrames
# (TODO confirm against load_xml_data).
xml_file = "../ohiot1dm/train/559-ws-training.xml"
dataframes = load_ohiot1dm_xml(xml_file)

# Bind each record type to its own name so later cells can use it directly.
df_glucose, df_basal, df_tempbasal, df_bolus, df_meal, df_step = (
    dataframes[record_type]
    for record_type in (
        'glucose_level', 'basal', 'tempbasal', 'bolus', 'meal', 'step',
    )
)

In [8]:
# Inspect start and end times for each dataframe.
#
# Every record type gets the same report, so it is driven by one table rather
# than six hand-copied stanzas. Tables read through a single 'timestamp'
# column use it for both bounds; the temp-basal and bolus tables are measured
# from the earliest 'ts_begin' to the latest 'ts_end'.
time_span_sections = [
    # (report title, dataframe, start column, end column)
    ("GLUCOSE LEVEL DATA", df_glucose, 'timestamp', 'timestamp'),
    ("BASAL INSULIN DATA", df_basal, 'timestamp', 'timestamp'),
    ("TEMPORARY BASAL DATA", df_tempbasal, 'ts_begin', 'ts_end'),
    ("BOLUS DATA", df_bolus, 'ts_begin', 'ts_end'),
    ("MEAL DATA", df_meal, 'timestamp', 'timestamp'),
    ("STEP DATA", df_step, 'timestamp', 'timestamp'),
]

for section_index, (title, frame, start_col, end_col) in enumerate(time_span_sections):
    if section_index > 0:
        print()  # blank line between sections, none after the last one

    # Compute each bound once and reuse it for the duration line.
    start_time = frame[start_col].min()
    end_time = frame[end_col].max()

    print("=" * 60)
    print(title)
    print("=" * 60)
    print(f"Start time: {start_time}")
    print(f"End time:   {end_time}")
    print(f"Duration:   {end_time - start_time}")
    print(f"Total records: {len(frame)}")


GLUCOSE LEVEL DATA
Start time: 2021-12-07 01:17:00
End time:   2022-01-17 23:56:00
Duration:   41 days 22:39:00
Total records: 10796

BASAL INSULIN DATA
Start time: 2021-12-07 00:00:00
End time:   2022-01-17 16:00:00
Duration:   41 days 16:00:00
Total records: 163

TEMPORARY BASAL DATA
Start time: 2021-12-07 04:49:28
End time:   2022-01-17 12:55:30
Duration:   41 days 08:06:02
Total records: 34

BOLUS DATA
Start time: 2021-12-07 01:08:04
End time:   2022-01-17 18:05:28
Duration:   41 days 16:57:24
Total records: 152

MEAL DATA
Start time: 2021-12-07 15:25:00
End time:   2022-01-17 17:30:00
Duration:   41 days 02:05:00
Total records: 150

STEP DATA
Start time: 2021-12-07 12:57:00
End time:   2022-01-17 23:59:00
Duration:   41 days 11:02:00
Total records: 12288


In [9]:
import os
import pandas as pd

# Collect the XML files in the train folder, sorted for a stable processing order
train_folder = "../ohiot1dm/train"
xml_files = sorted(
    entry for entry in os.listdir(train_folder) if entry.endswith('.xml')
)

print(f"Found {len(xml_files)} XML files in {train_folder}")
print("=" * 80)

# Record types that must all be non-empty for a file to count as fully valid
required_types = ('glucose_level', 'basal', 'tempbasal', 'bolus', 'meal', 'step')

# file name -> {'validity': ..., 'dataframes': ...}, or None when loading failed
file_validity = {}

for xml_file in xml_files:
    file_path = os.path.join(train_folder, xml_file)
    print(f"\nProcessing: {xml_file}")

    try:
        dataframes = load_ohiot1dm_xml(file_path)

        # A record type is "valid" when its dataframe has at least one row
        validity = {
            data_type: len(dataframes[data_type]) > 0
            for data_type in required_types
        }

        file_validity[xml_file] = dict(validity=validity, dataframes=dataframes)

        # Per-type status line with the record count
        for data_type, is_valid in validity.items():
            status = "✓" if is_valid else "✗"
            print(f"  {status} {data_type}: {len(dataframes[data_type])} records")

    except Exception as e:
        # Keep scanning the remaining files; None marks this one as unloadable
        print(f"  ERROR loading file: {e}")
        file_validity[xml_file] = None

print("\n" + "=" * 80)
print("SUMMARY: Files with ALL data types valid")
print("=" * 80)

# A file qualifies only if it loaded and every record type is non-empty
valid_files = [
    file_name
    for file_name, data in file_validity.items()
    if data is not None and all(data['validity'].values())
]
for file_name in valid_files:
    print(f"✓ {file_name}")

print(f"\nTotal files with all data types: {len(valid_files)}/{len(xml_files)}")


Found 12 XML files in ../ohiot1dm/train

Processing: 540-ws-training.xml
  ERROR loading file: float() argument must be a string or a number, not 'NoneType'

Processing: 544-ws-training.xml
  ERROR loading file: float() argument must be a string or a number, not 'NoneType'

Processing: 552-ws-training.xml
  ERROR loading file: float() argument must be a string or a number, not 'NoneType'

Processing: 559-ws-training.xml
  ✓ glucose_level: 10796 records
  ✓ basal: 163 records
  ✓ tempbasal: 34 records
  ✓ bolus: 152 records
  ✓ meal: 150 records
  ✓ step: 12288 records

Processing: 563-ws-training.xml
  ✓ glucose_level: 12124 records
  ✓ basal: 87 records
  ✓ tempbasal: 2 records
  ✓ bolus: 347 records
  ✓ meal: 129 records
  ✓ step: 11971 records

Processing: 567-ws-training.xml
  ERROR loading file: float() argument must be a string or a number, not 'NoneType'

Processing: 570-ws-training.xml
  ✓ glucose_level: 10982 records
  ✓ basal: 118 records
  ✓ tempbasal: 3 records
  ✓ bolus: 3

In [10]:
# Detailed time inspection for files with all valid data types.
#
# The per-record-type report is identical apart from its label, its key in the
# loaded dataframes and the columns that bound it, so those are listed once in
# a layout table and printed by a single inner loop.
print("\n" + "=" * 80)
print("DETAILED TIME ANALYSIS FOR FILES WITH ALL DATA TYPES")
print("=" * 80)

time_span_layout = [
    # (section label, dataframes key, start column, end column)
    ("GLUCOSE LEVEL", 'glucose_level', 'timestamp', 'timestamp'),
    ("BASAL", 'basal', 'timestamp', 'timestamp'),
    ("TEMPORARY BASAL", 'tempbasal', 'ts_begin', 'ts_end'),
    ("BOLUS", 'bolus', 'ts_begin', 'ts_end'),
    ("MEAL", 'meal', 'timestamp', 'timestamp'),
    ("STEP", 'step', 'timestamp', 'timestamp'),
]

for file_name in valid_files:
    print(f"\n{'=' * 80}")
    print(f"FILE: {file_name}")
    print('=' * 80)

    dfs = file_validity[file_name]['dataframes']

    for label, key, start_col, end_col in time_span_layout:
        frame = dfs[key]
        # Compute each bound once and reuse it for the duration line.
        start_time = frame[start_col].min()
        end_time = frame[end_col].max()

        print(f"\n{label}:")
        print(f"  Start: {start_time}")
        print(f"  End:   {end_time}")
        print(f"  Duration: {end_time - start_time}")
        print(f"  Records: {len(frame)}")



DETAILED TIME ANALYSIS FOR FILES WITH ALL DATA TYPES

FILE: 559-ws-training.xml

GLUCOSE LEVEL:
  Start: 2021-12-07 01:17:00
  End:   2022-01-17 23:56:00
  Duration: 41 days 22:39:00
  Records: 10796

BASAL:
  Start: 2021-12-07 00:00:00
  End:   2022-01-17 16:00:00
  Duration: 41 days 16:00:00
  Records: 163

TEMPORARY BASAL:
  Start: 2021-12-07 04:49:28
  End:   2022-01-17 12:55:30
  Duration: 41 days 08:06:02
  Records: 34

BOLUS:
  Start: 2021-12-07 01:08:04
  End:   2022-01-17 18:05:28
  Duration: 41 days 16:57:24
  Records: 152

MEAL:
  Start: 2021-12-07 15:25:00
  End:   2022-01-17 17:30:00
  Duration: 41 days 02:05:00
  Records: 150

STEP:
  Start: 2021-12-07 12:57:00
  End:   2022-01-17 23:59:00
  Duration: 41 days 11:02:00
  Records: 12288

FILE: 563-ws-training.xml

GLUCOSE LEVEL:
  Start: 2021-09-13 12:33:00
  End:   2021-10-28 23:56:00
  Duration: 45 days 11:23:00
  Records: 12124

BASAL:
  Start: 2021-09-13 09:40:35
  End:   2021-10-25 18:30:00
  Duration: 42 days 08:49:2

In [11]:
# Build one summary row per fully valid file so the files can be compared
# side by side in a single table.
summary_data = []

for file_name in valid_files:
    dfs = file_validity[file_name]['dataframes']
    # The patient id is the part of the file name before the first '-'
    patient_id = file_name.partition('-')[0]

    glucose_times = dfs['glucose_level']['timestamp']
    basal_times = dfs['basal']['timestamp']

    # Keyword order fixes the column order of the resulting table
    row = dict(
        patient_id=patient_id,
        file=file_name,
        glucose_start=glucose_times.min(),
        glucose_end=glucose_times.max(),
        glucose_records=len(dfs['glucose_level']),
        basal_start=basal_times.min(),
        basal_end=basal_times.max(),
        basal_records=len(dfs['basal']),
        tempbasal_records=len(dfs['tempbasal']),
        bolus_records=len(dfs['bolus']),
        meal_records=len(dfs['meal']),
        step_records=len(dfs['step']),
    )
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data)
print("\nSUMMARY TABLE:")
print(summary_df.to_string(index=False))



SUMMARY TABLE:
patient_id                file       glucose_start         glucose_end  glucose_records         basal_start           basal_end  basal_records  tempbasal_records  bolus_records  meal_records  step_records
       559 559-ws-training.xml 2021-12-07 01:17:00 2022-01-17 23:56:00            10796 2021-12-07 00:00:00 2022-01-17 16:00:00            163                 34            152           150         12288
       563 563-ws-training.xml 2021-09-13 12:33:00 2021-10-28 23:56:00            12124 2021-09-13 09:40:35 2021-10-25 18:30:00             87                  2            347           129         11971
       570 570-ws-training.xml 2021-12-07 16:29:00 2022-01-16 23:59:00            10982 2021-12-07 00:00:00 2022-01-16 17:00:00            118                  3            326           136         12335
       575 575-ws-training.xml 2021-11-17 12:04:00 2022-01-01 23:55:00            11866 2021-11-17 13:00:00 2022-01-01 19:00:00            126                 12   

# Inspect BrisT1D Open dataset

In [13]:
# Load one participant file from the Bris-T1D Open dataset and look at the
# rows that actually carry a carbohydrate entry.
df = pd.read_csv("../Bris-T1D Open/Bris-T1D Open/device_data/processed_state/P02.csv")

print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\n" + "="*80)

if 'carbs' not in df.columns:
    # Nothing to filter on; show what is available instead
    print("\nERROR: 'carbs' column not found in dataset")
    print("Available columns:", df.columns.tolist())
else:
    # Keep rows whose carbs value is present and different from zero
    carbs_col = df['carbs']
    df_with_carbs = df[carbs_col.notna() & carbs_col.ne(0)]

    print(f"\nRows with carbs (not empty and not zero): {len(df_with_carbs)}")
    print(f"Percentage of total: {len(df_with_carbs)/len(df)*100:.2f}%")
    print("\n" + "="*80)
    print("\nFirst 20 rows with carbs:")
    print(df_with_carbs.head(20))

    print("\n" + "="*80)
    print("\nCarbs statistics:")
    print(df_with_carbs['carbs'].describe())


Dataset shape: (174856, 10)

Column names:
['timestamp', 'bg', 'insulin', 'carbs', 'hr', 'dist', 'steps', 'cals', 'activity', 'device']


Rows with carbs (not empty and not zero): 642
Percentage of total: 0.37%


First 20 rows with carbs:
                timestamp  bg  insulin  carbs  hr  dist  steps  cals activity  \
253   2023-06-01 10:32:00 NaN      NaN   30.0 NaN   NaN    NaN   NaN      NaN   
284   2023-06-01 11:47:00 NaN      NaN   10.0 NaN   NaN    NaN   NaN      NaN   
347   2023-06-01 14:24:00 NaN      NaN   20.0 NaN   NaN    NaN   NaN      NaN   
448   2023-06-01 18:33:00 NaN      NaN   55.0 NaN   NaN    NaN   NaN      NaN   
503   2023-06-01 20:50:00 NaN   0.1807   23.0 NaN   NaN    NaN   NaN      NaN   
787   2023-06-02 08:24:00 NaN      NaN   50.0 NaN   NaN    NaN   NaN      NaN   
890   2023-06-02 11:13:00 NaN      NaN   20.0 NaN   NaN    NaN   NaN      NaN   
972   2023-06-02 13:29:00 NaN      NaN   50.0 NaN   NaN    NaN   NaN      NaN   
1753  2023-06-03 11:10:00 NaN   

In [14]:
# Look at the rows of `df` that carry a step count (present and non-zero).
# `df` is whatever the most recently executed loading cell bound it to.
print("\n" + "="*80)
print("INSPECTING STEPS DATA")
print("="*80)

if 'steps' not in df.columns:
    # Nothing to filter on; show what is available instead
    print("\nERROR: 'steps' column not found in dataset")
    print("Available columns:", df.columns.tolist())
else:
    # Keep rows whose steps value is present and different from zero
    steps_col = df['steps']
    df_with_steps = df[steps_col.notna() & steps_col.ne(0)]

    print(f"\nRows with steps (not empty and not zero): {len(df_with_steps)}")
    print(f"Percentage of total: {len(df_with_steps)/len(df)*100:.2f}%")
    print("\n" + "="*80)
    print("\nFirst 20 rows with steps:")
    print(df_with_steps.head(20))

    print("\n" + "="*80)
    print("\nSteps statistics:")
    print(df_with_steps['steps'].describe())

    # Most frequent step counts among the retained rows
    print("\n" + "="*80)
    print("\nSteps value distribution:")
    print(df_with_steps['steps'].value_counts().head(20))



INSPECTING STEPS DATA

Rows with steps (not empty and not zero): 11327
Percentage of total: 6.48%


First 20 rows with steps:
               timestamp  bg  insulin  carbs     hr   dist  steps   cals  \
770  2023-06-02 07:55:00 NaN      NaN    NaN   98.0   84.9  109.0   3.58   
772  2023-06-02 08:00:00 NaN      NaN    NaN  100.0   53.1   66.0   4.05   
776  2023-06-02 08:05:00 NaN      NaN    NaN    NaN   27.8   37.0   4.44   
781  2023-06-02 08:15:00 NaN      NaN    NaN   98.0  150.3  188.0   5.25   
788  2023-06-02 08:25:00 NaN      NaN    NaN    NaN  152.7  184.0   8.47   
797  2023-06-02 08:40:00 NaN      NaN    NaN  106.0   13.9   16.0   2.86   
803  2023-06-02 08:50:00 NaN      NaN    NaN   84.0   30.5   40.0   4.36   
810  2023-06-02 09:00:00 NaN      NaN    NaN   93.0   49.2   58.0   6.24   
816  2023-06-02 09:10:00 NaN      NaN    NaN  100.5  361.2  455.0   5.63   
821  2023-06-02 09:20:00 NaN      NaN    NaN   91.7  249.3  313.0   6.33   
824  2023-06-02 09:25:00 NaN      NaN

# Inspect HUPA-UCM

In [16]:
# Inspect the HUPA-UCM rows whose carb_input is present and non-zero
import pandas as pd

# The preprocessed HUPA files are read with ';' as the field separator
df = pd.read_csv("../HUPA-UCM/Preprocessed/HUPA0001P.csv", sep=';')

print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\n" + "="*80)

if 'carb_input' not in df.columns:
    # Nothing to filter on; show what is available instead
    print("\nERROR: 'carb_input' column not found in dataset")
    print("Available columns:", df.columns.tolist())
else:
    # Keep rows whose carb_input value is present and different from zero
    carb_input_col = df['carb_input']
    df_with_carbs = df[carb_input_col.notna() & carb_input_col.ne(0)]

    print(f"\nRows with carb_input (not empty and not zero): {len(df_with_carbs)}")
    print(f"Percentage of total: {len(df_with_carbs)/len(df)*100:.2f}%")
    print("\n" + "="*80)
    print("\nFirst 20 rows with carb_input:")
    print(df_with_carbs.head(20))

    print("\n" + "="*80)
    print("\nCarb_input statistics:")
    print(df_with_carbs['carb_input'].describe())

    # Most frequent carb_input values among the retained rows
    print("\n" + "="*80)
    print("\nCarb_input value distribution (top 20):")
    print(df_with_carbs['carb_input'].value_counts().head(20))

Dataset shape: (4096, 8)

Column names:
['time', 'glucose', 'calories', 'heart_rate', 'steps', 'basal_rate', 'bolus_volume_delivered', 'carb_input']


Rows with carb_input (not empty and not zero): 40
Percentage of total: 0.98%


First 20 rows with carb_input:
                     time  glucose  calories  heart_rate  steps  basal_rate  \
38    2018-06-13T21:50:00    119.0    9.3380  102.571429   52.0    0.029167   
186   2018-06-14T10:10:00    129.0    7.0035   91.908257   31.0    0.037500   
240   2018-06-14T14:40:00    148.0    8.5330   96.475000   22.0    0.037500   
289   2018-06-14T18:45:00    219.0   18.1930  111.237410  227.0    0.091667   
323   2018-06-14T21:35:00    136.0    4.5080   89.470085    0.0    0.058333   
488   2018-06-15T11:20:00    126.0    4.1860   75.698413    0.0    0.037500   
530   2018-06-15T14:50:00    129.0    5.4740   84.614035    0.0    0.037500   
559   2018-06-15T17:15:00    150.0    7.8085   93.038835   37.0    0.091667   
606   2018-06-15T21:10:00   

# Convert UCHTT1DM dataset

In [19]:
# Make sure openpyxl is available; pandas.read_excel reports it as a missing
# optional dependency when asked to read .xlsx files without it.
import importlib
import sys
import subprocess

# Pinned to the version the recorded run of this notebook installed, so a
# fresh environment resolves the same package.
OPENPYXL_REQUIREMENT = "openpyxl==3.1.5"

try:
    import openpyxl
    print("openpyxl is already installed")
except ImportError:
    print("Installing openpyxl...")
    try:
        # Try installing with --user flag for externally managed environments
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--user", OPENPYXL_REQUIREMENT])
        print("openpyxl installed successfully!")
    except subprocess.CalledProcessError:
        # If that fails, try with --break-system-packages.
        # NOTE(review): this flag overrides pip's externally-managed-environment
        # guard; a virtual environment would be the safer long-term fix.
        print("Trying alternative installation method...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--break-system-packages", OPENPYXL_REQUIREMENT])
        print("openpyxl installed successfully!")
    # The package appeared on disk after this interpreter started, so drop the
    # import system's cached directory listings before anything imports it.
    importlib.invalidate_caches()


Installing openpyxl...
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Collecting et-xmlfile
  Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
openpyxl installed successfully!


You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.


In [20]:
# Read one UCHTT1DM patient's Excel files and print a short overview of each
import pandas as pd

patient_id = "HT_01"
base_path = f"../UCHTT1DM/{patient_id}"

# Read the three Excel files ("Carbohidrates" is the spelling this cell reads)
df_carbs = pd.read_excel(f"{base_path}/Carbohidrates.xlsx")
df_igar = pd.read_excel(f"{base_path}/IGAR.xlsx")
df_steps = pd.read_excel(f"{base_path}/Steps.xlsx")

# Each table gets the same overview: shape, columns, first rows, dtypes
overview_sections = (
    ("CARBOHYDRATES DATA", df_carbs),
    ("IGAR (Glucose) DATA", df_igar),
    ("STEPS DATA", df_steps),
)

for position, (heading, frame) in enumerate(overview_sections):
    # Only the sections after the first are preceded by a blank line
    top_rule = "=" * 80 if position == 0 else "\n" + "=" * 80
    print(top_rule)
    print(heading)
    print("=" * 80)
    print(f"Shape: {frame.shape}")
    print(f"\nColumns: {frame.columns.tolist()}")
    print("\nFirst 10 rows:")
    print(frame.head(10))
    print("\nData types:")
    print(frame.dtypes)


CARBOHYDRATES DATA
Shape: (1721, 2)

Columns: ['Unnamed: 0', 'Value (g)']

First 10 rows:
           Unnamed: 0  Value (g)
0 2020-12-10 22:40:00      102.8
1 2020-12-10 22:45:00        0.0
2 2020-12-10 22:50:00        0.0
3 2020-12-10 22:55:00        0.0
4 2020-12-10 23:00:00        0.0
5 2020-12-10 23:05:00        0.0
6 2020-12-10 23:10:00        0.0
7 2020-12-10 23:15:00        0.0
8 2020-12-10 23:20:00        0.0
9 2020-12-10 23:25:00        0.0

Data types:
Unnamed: 0    datetime64[ns]
Value (g)            float64
dtype: object

IGAR (Glucose) DATA
Shape: (1721, 2)

Columns: ['Unnamed: 0', 'Value (g)']

First 10 rows:
           Unnamed: 0  Value (g)
0 2020-12-10 22:40:00   1.215528
1 2020-12-10 22:45:00   4.933160
2 2020-12-10 22:50:00   9.743197
3 2020-12-10 22:55:00  12.812123
4 2020-12-10 23:00:00  15.862592
5 2020-12-10 23:05:00  18.272020
6 2020-12-10 23:10:00  19.774606
7 2020-12-10 23:15:00  20.584073
8 2020-12-10 23:20:00  20.827517
9 2020-12-10 23:25:00  20.842448

Data t

In [21]:
# Show only the carbohydrate rows that hold a non-zero value
print("=" * 80)
print("CARBOHYDRATES - NON-ZERO VALUES ONLY")
print("=" * 80)

# List the columns so the automatic choice below can be checked by eye
print(f"Available columns: {df_carbs.columns.tolist()}")
print()

# Pick the first column whose name mentions "value" or "carb" (case-insensitive)
value_col = next(
    (
        name for name in df_carbs.columns
        if 'value' in name.lower() or 'carb' in name.lower()
    ),
    None,
)

if not value_col:
    print("\nERROR: Could not find value column")
    print("Please specify the correct column name")
else:
    # Keep rows whose value is present and different from zero
    values = df_carbs[value_col]
    df_carbs_nonzero = df_carbs[values.notna() & values.ne(0)]

    print(f"Using column: '{value_col}'")
    print(f"\nRows with non-zero values: {len(df_carbs_nonzero)}")
    print(f"Percentage of total: {len(df_carbs_nonzero)/len(df_carbs)*100:.2f}%")
    print("\n" + "=" * 80)
    print("\nFirst 20 rows with non-zero values:")
    print(df_carbs_nonzero.head(20))

    print("\n" + "=" * 80)
    print(f"\n{value_col} statistics (non-zero):")
    print(df_carbs_nonzero[value_col].describe())

    print("\n" + "=" * 80)
    print(f"\n{value_col} distribution (top 20 values):")
    print(df_carbs_nonzero[value_col].value_counts().head(20))


CARBOHYDRATES - NON-ZERO VALUES ONLY
Available columns: ['Unnamed: 0', 'Value (g)']

Using column: 'Value (g)'

Rows with non-zero values: 29
Percentage of total: 1.69%


First 20 rows with non-zero values:
              Unnamed: 0  Value (g)
0    2020-12-10 22:40:00     102.80
154  2020-12-11 11:30:00      54.90
174  2020-12-11 13:10:00      13.30
302  2020-12-11 23:50:00     109.80
430  2020-12-12 10:30:00      72.55
460  2020-12-12 13:00:00     107.03
525  2020-12-12 18:25:00      39.38
567  2020-12-12 21:55:00     101.20
585  2020-12-12 23:25:00      38.09
742  2020-12-13 12:30:00      62.28
764  2020-12-13 14:20:00      49.17
854  2020-12-13 21:50:00      78.42
1036 2020-12-14 13:00:00      36.88
1090 2020-12-14 17:30:00      35.43
1142 2020-12-14 21:50:00      21.52
1292 2020-12-15 10:20:00      72.55
1309 2020-12-15 11:45:00      20.30
1352 2020-12-15 15:20:00       5.83
1384 2020-12-15 18:00:00      30.88
1396 2020-12-15 19:00:00      30.88


Value (g) statistics (non-zero):
co

In [18]:
# Convert all XLSX files to CSV in UCHTT1DM subfolders.
# Each workbook is written next to its source as <same name>.csv; failures are
# collected and reported at the end instead of stopping the run.
import pandas as pd
import os
from pathlib import Path

base_path = "../UCHTT1DM"
XLSX_SUFFIX = '.xlsx'

# Get all subfolders (hidden directories are skipped), in a stable order
subfolders = sorted(
    f for f in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, f)) and not f.startswith('.')
)

print(f"Found {len(subfolders)} subfolders in UCHTT1DM")
print("=" * 80)

total_converted = 0
errors = []

for subfolder in subfolders:
    subfolder_path = os.path.join(base_path, subfolder)

    # Find all xlsx files in this subfolder. Sorted so the log is reproducible;
    # names starting with '~$' are the lock files Excel leaves beside an open
    # workbook and are not readable spreadsheets.
    xlsx_files = sorted(
        f for f in os.listdir(subfolder_path)
        if f.endswith(XLSX_SUFFIX) and not f.startswith('~$')
    )

    if not xlsx_files:
        print(f"\n{subfolder}: No XLSX files found")
        continue

    print(f"\n{subfolder}: Found {len(xlsx_files)} XLSX files")

    for xlsx_file in xlsx_files:
        xlsx_path = os.path.join(subfolder_path, xlsx_file)
        # Swap only the trailing extension; str.replace would also rewrite any
        # '.xlsx' occurring earlier in the file name.
        csv_file = xlsx_file[:-len(XLSX_SUFFIX)] + '.csv'
        csv_path = os.path.join(subfolder_path, csv_file)

        try:
            # Read the Excel file
            df = pd.read_excel(xlsx_path)

            # Save as CSV
            df.to_csv(csv_path, index=False)

            print(f"  ✓ {xlsx_file} → {csv_file} ({len(df)} rows, {len(df.columns)} columns)")
            total_converted += 1

        except Exception as e:
            # Record the failure and carry on with the remaining files
            error_msg = f"{subfolder}/{xlsx_file}: {str(e)}"
            errors.append(error_msg)
            print(f"  ✗ {xlsx_file}: ERROR - {str(e)}")

print("\n" + "=" * 80)
print("CONVERSION SUMMARY")
print("=" * 80)
print(f"Total files converted: {total_converted}")
print(f"Errors encountered: {len(errors)}")

if errors:
    print("\nErrors:")
    for error in errors:
        print(f"  - {error}")


Found 20 subfolders in UCHTT1DM

HT_01: Found 5 XLSX files
  ✗ IGAR.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
  ✗ Steps.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
  ✗ Carbohidrates.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
  ✗ Glucose.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
  ✗ Heart Rate.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

HT_02: Found 5 XLSX files
  ✗ IGAR.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
  ✗ Steps.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
  ✗ Carbohidrates.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
  ✗ Glucose.xlsx: ERROR - Missing optional dependency 'openpyxl'.  Us