In [54]:
import pandas as pd
import numpy as np
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
import ast
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
import keras.backend as K
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GRU, Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50

# Data Pre-Processing

### Load and Merge Datasets

In [55]:
# Load Datasets

# Training datasets
cgm_train = pd.read_csv('cgm_train.csv')
image_train = pd.read_csv('img_train.csv')
demo_viome_train = pd.read_csv('demo_viome_train.csv')
label_train = pd.read_csv('label_train.csv')

# Test datasets
cgm_test = pd.read_csv('cgm_test.csv')
image_test = pd.read_csv('img_test.csv')
demo_viome_test = pd.read_csv('demo_viome_test.csv')
label_test = pd.read_csv('label_test_breakfast_only.csv')

In [56]:
# Merge CGM and Image datasets
data_train = pd.merge(image_train, cgm_train, on=['Subject ID', 'Day'])

In [57]:
data_train

Unnamed: 0,Subject ID,Day,Image Before Breakfast,Image Before Lunch,Breakfast Time,Lunch Time,CGM Data
0,1,2,"[[[140, 122, 108], [135, 118, 104], [118, 104,...","[[[41, 152, 201], [77, 164, 205], [88, 157, 13...",2021-09-19 08:41:00,2021-09-19 12:24:00,"[('2021-09-19 08:20:00', 98.26666666666667), (..."
1,1,3,"[[[67, 58, 47], [59, 52, 41], [51, 45, 35], [4...","[[[40, 59, 77], [35, 56, 72], [20, 36, 47], [9...",2021-09-20 09:50:00,2021-09-20 15:20:00,"[('2021-09-20 09:10:00', 97.18333333333334), (..."
2,1,4,"[[[199, 195, 193], [198, 193, 192], [196, 192,...","[[[53, 44, 38], [51, 43, 36], [54, 47, 39], [4...",2021-09-21 09:34:00,2021-09-21 13:09:00,"[('2021-09-21 09:20:00', 107.36666666666666), ..."
3,1,5,"[[[149, 121, 80], [157, 128, 86], [159, 130, 8...","[[[30, 28, 28], [20, 18, 17], [31, 27, 23], [2...",2021-09-22 09:46:00,2021-09-22 13:50:00,"[('2021-09-22 09:25:00', 107.28333333333333), ..."
4,1,6,"[[[175, 184, 198], [192, 206, 219], [160, 165,...","[[[74, 85, 100], [59, 69, 81], [73, 84, 96], [...",2021-09-23 09:07:00,2021-09-23 13:17:00,"[('2021-09-23 08:55:00', 103.0), ('2021-09-23 ..."
...,...,...,...,...,...,...,...
319,7,6,"[[[68, 34, 35], [82, 60, 51], [63, 55, 38], [3...","[[[90, 77, 75], [92, 78, 75], [94, 83, 81], [9...",2021-12-18 08:52:00,2021-12-18 12:28:00,"[('2021-12-18 08:50:00', 101.36), ('2021-12-18..."
320,7,7,"[[[26, 26, 22], [17, 17, 13], [18, 19, 14], [9...","[[[17, 9, 8], [10, 7, 7], [3, 3, 4], [3, 3, 3]...",2021-12-19 08:43:00,2021-12-19 13:13:00,"[('2021-12-19 08:40:00', 100.68), ('2021-12-19..."
321,7,8,"[[[43, 37, 33], [42, 36, 31], [42, 37, 33], [4...","[[[122, 108, 107], [124, 110, 108], [124, 111,...",2021-12-20 09:06:00,2021-12-20 12:46:00,"[('2021-12-20 09:00:00', 104.04), ('2021-12-20..."
322,7,9,"[[[41, 38, 33], [41, 38, 33], [41, 38, 33], [4...","[[[59, 46, 32], [63, 51, 41], [57, 42, 28], [6...",2021-12-21 08:34:00,2021-12-21 12:38:00,"[('2021-12-21 08:25:00', 96.4), ('2021-12-21 0..."


### Pre-Process Food Pictures (Image Dataset)

In [58]:
def preprocess_img(data_train):

    # Load pretrained model
    base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
    pca = PCA(n_components=10)
    def extract_features(image):
        image = np.expand_dims(image, axis=0)  # Add batch dimension
        features = base_model.predict(image)
        return features.flatten()  # Flatten the output
        
    # Preprocess for Breakfast image
    image_before_breakfast_matrix = []
    null_breakfast = []

    for i in range(len(data_train)):
        string_matrix = data_train['Image Before Breakfast'][i]
        parsed_matrix = ast.literal_eval(string_matrix)
        matrix_array = np.array(parsed_matrix)
        print(f"Matrix shape: {matrix_array.shape}, index: {i}")
        if matrix_array.shape == (0,):
            null_breakfast.append(i)
        image_before_breakfast_matrix.append(matrix_array)

    data_train = data_train.drop(null_breakfast)
    data_train = data_train.reset_index(drop=True)
    a = [extract_features(image) for image in image_before_breakfast_matrix if image.size != 0]
    reduced_features_a = pca.fit_transform(a)
    data_train['Reduced_img_before_breakfast'] = [list(row) for row in reduced_features_a]

    # Preprocess for Lunch image
    image_before_lunch_matrix = []
    null_lunch = []

    for i in range(len(data_train)):
        string_matrix = data_train['Image Before Lunch'][i]
        parsed_matrix = ast.literal_eval(string_matrix)
        matrix_array = np.array(parsed_matrix)
        print(f"Matrix shape: {matrix_array.shape}, index: {i}")
        if matrix_array.shape == (0,):
            null_lunch.append(i)
        image_before_lunch_matrix.append(matrix_array)

    data_train = data_train.drop(null_lunch)
    data_train = data_train.reset_index(drop=True)
    b = [extract_features(image) for image in image_before_lunch_matrix if image.size != 0]
    reduced_features_b = pca.fit_transform(b)
    data_train['Reduced_img_before_lunch'] = [list(row) for row in reduced_features_b]
    data_train.drop(columns=['Image Before Breakfast', 'Image Before Lunch'], axis=1, inplace=True)

    return data_train

In [None]:
data_train = preprocess_img(data_train)

data_train

Matrix shape: (64, 64, 3), index: 0
Matrix shape: (64, 64, 3), index: 1
Matrix shape: (64, 64, 3), index: 2
Matrix shape: (64, 64, 3), index: 3
Matrix shape: (64, 64, 3), index: 4
Matrix shape: (64, 64, 3), index: 5
Matrix shape: (64, 64, 3), index: 6
Matrix shape: (64, 64, 3), index: 7
Matrix shape: (64, 64, 3), index: 8
Matrix shape: (64, 64, 3), index: 9
Matrix shape: (64, 64, 3), index: 10
Matrix shape: (64, 64, 3), index: 11
Matrix shape: (64, 64, 3), index: 12
Matrix shape: (64, 64, 3), index: 13
Matrix shape: (64, 64, 3), index: 14
Matrix shape: (64, 64, 3), index: 15
Matrix shape: (64, 64, 3), index: 16
Matrix shape: (64, 64, 3), index: 17
Matrix shape: (64, 64, 3), index: 18
Matrix shape: (64, 64, 3), index: 19
Matrix shape: (64, 64, 3), index: 20
Matrix shape: (64, 64, 3), index: 21
Matrix shape: (64, 64, 3), index: 22
Matrix shape: (64, 64, 3), index: 23
Matrix shape: (64, 64, 3), index: 24
Matrix shape: (64, 64, 3), index: 25
Matrix shape: (64, 64, 3), index: 26
Matrix shap

In [None]:
# # Placeholder for missing images (a blank black image)
# def create_placeholder_image(size=(64, 64, 3)):
#     return np.zeros(size, dtype=np.float32)  # Normalized [0, 1] range

# # Function to preprocess image data
# def preprocess_image(img_data, size=(64, 64)):
#     try:
#         img_array = np.array(img_data, dtype=np.uint8)  # Ensure valid data type

#         # Check for empty image
#         if img_array.size == 0 or img_array.ndim != 3 or img_array.shape[2] != 3:
#             raise ValueError(f"Invalid or empty image dimensions: {img_array.shape}")

#         img_resized = np.array(Image.fromarray(img_array).resize(size))  # Resize
#         img_normalized = img_resized / 255.0  # Normalize pixel values to [0, 1]
#         return img_normalized
#     except Exception as e:
#         print(f"Error preprocessing image: {e}")
#         return create_placeholder_image(size)

# # Preprocess the dataset
# def preprocess_dataset(data):
#     # Define placeholder image
#     placeholder_image = create_placeholder_image()

#     # Iterate over rows to preprocess images
#     breakfast_images = []
#     lunch_images = []

#     for index, row in data.iterrows():
#         # Handle missing breakfast images
#         if pd.isnull(row['Image Before Breakfast']) or row['Image Before Breakfast'] == '[]':  # Check for empty list or NaN
#             breakfast_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Breakfast'])  # Convert string to list
#                 breakfast_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, breakfast: {e}")
#                 breakfast_images.append(placeholder_image)

#         # Handle missing lunch images
#         if pd.isnull(row['Image Before Lunch']) or row['Image Before Lunch'] == '[]':  # Check for empty list or NaN
#             lunch_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Lunch'])  # Convert string to list
#                 lunch_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, lunch: {e}")
#                 lunch_images.append(placeholder_image)

#     # Add preprocessed images back to the dataset
#     data['Image Before Breakfast'] = breakfast_images
#     data['Image Before Lunch'] = lunch_images

#     return data

# # Apply preprocessing
# cgm_image_data_processed = preprocess_dataset(temp_data_processed)

# # Save the processed dataset if needed
# # processed_data.to_pickle("processed_img_train.pkl")  # Save in pickle format for further use
# print(cgm_image_data_processed)

### Pre-Process CGM Data (Time-Series Glucose Levels) - COMPLETE

In [None]:
# Updating missing values with mean

def preprocess_cgm(data_train):

    # Function to check if CGM Data is an empty array
    def is_cgm_data_empty(row):
        try:
            cgm_list = ast.literal_eval(row['CGM Data'])
            return len(cgm_list) == 0
        except:
            return True

    # Function to filter out rows where CGM Data is empty
    data_train = data_train[~data_train.apply(is_cgm_data_empty, axis=1)]

    # Get the HH:MM:SS breakfast and lunch times and calculate the mean for each subject ID
    data_train.loc[:, 'Breakfast Time'] = pd.to_datetime(data_train['Breakfast Time'], errors='coerce')
    data_train.loc[:, 'Lunch Time'] = pd.to_datetime(data_train['Lunch Time'], errors='coerce')

    def mean_time(times):
        total_seconds = sum([t.hour * 3600 + t.minute * 60 + t.second for t in times if pd.notna(t)])
        mean_seconds = total_seconds // len([t for t in times if pd.notna(t)])
        return pd.to_datetime(mean_seconds, unit='s').time()

    mean_times = data_train.groupby('Subject ID')[['Breakfast Time', 'Lunch Time']].apply(
        lambda group: pd.Series({
            'Breakfast Time': mean_time(group['Breakfast Time']),
            'Lunch Time': mean_time(group['Lunch Time'])
        })
    )

    mean_times = mean_times.reset_index()

    # Find the reference date for any row within the same subject:
    def get_reference_date(subject_id):
        day_2_breakfast_index = data_train[(data_train['Subject ID'] == subject_id) & (data_train['Day'] == 2)]['Breakfast Time'].first_valid_index()
        if day_2_breakfast_index is None or pd.isna(data_train.loc[day_2_breakfast_index, 'Breakfast Time']):
            day_3_breakfast_index = data_train[(data_train['Subject ID'] == subject_id) & (data_train['Day'] == 4)]['Breakfast Time'].first_valid_index()
            if day_3_breakfast_index is not None:
                reference_date = data_train.loc[day_3_breakfast_index, 'Breakfast Time']
                reference_day = 4
            else:
                reference_date = None
                reference_day = None
        else:
            reference_date = data_train.loc[day_2_breakfast_index, 'Breakfast Time']
            reference_day = 2
        
        return pd.Series([reference_date.date(), reference_day])

    mean_times[['Reference Date', 'Reference Day']] = mean_times['Subject ID'].apply(get_reference_date)

    # Update missing values
    def update_missing_breakfast_time(row, mean_times):
        subject_id = row['Subject ID']
        day = row['Day']
        
        mean_breakfast_time = mean_times.loc[mean_times['Subject ID'] == subject_id, 'Breakfast Time'].iloc[0]
        mean_reference_date = mean_times.loc[mean_times['Subject ID'] == subject_id, 'Reference Date'].iloc[0]
        reference_day = mean_times.loc[mean_times['Subject ID'] == subject_id, 'Reference Day'].iloc[0]
        current_date = mean_reference_date + pd.Timedelta(days=(day - reference_day))
        updated_breakfast_time = pd.to_datetime(current_date.strftime('%Y-%m-%d') + ' ' + mean_breakfast_time.strftime('%H:%M:%S'))
        row['Breakfast Time'] = updated_breakfast_time
        return row

    data_train = data_train.apply(
        lambda row: update_missing_breakfast_time(row, mean_times) if pd.isna(row['Breakfast Time']) else row, axis=1
    )

    def update_missing_lunch_time(row, mean_times):
        subject_id = row['Subject ID']
        day = row['Day']
        mean_lunch_time = mean_times.loc[mean_times['Subject ID'] == subject_id, 'Lunch Time'].iloc[0]
        mean_reference_date = mean_times.loc[mean_times['Subject ID'] == subject_id, 'Reference Date'].iloc[0]
        reference_day = mean_times.loc[mean_times['Subject ID'] == subject_id, 'Reference Day'].iloc[0]
        current_date = mean_reference_date + pd.Timedelta(days=(day - reference_day))
        updated_lunch_time = pd.to_datetime(current_date.strftime('%Y-%m-%d') + ' ' + mean_lunch_time.strftime('%H:%M:%S'))
        row['Lunch Time'] = updated_lunch_time
        return row

    data_train = data_train.apply(
        lambda row: update_missing_lunch_time(row, mean_times) if pd.isna(row['Lunch Time']) else row, axis=1
    )

    data_train['Time Between Meals'] = (data_train['Lunch Time'] - data_train['Breakfast Time']).dt.total_seconds()

    def safe_get_glucose_data(cgm_data):
        try:
            if cgm_data and isinstance(cgm_data, list) and all(isinstance(entry, tuple) and len(entry) >= 2 for entry in cgm_data):
                return sum([entry[1] for entry in cgm_data]) / len(cgm_data)
            else:
                return None
        except Exception as e:
            print(f"Error processing data: {e}")
            return None

    # Apply the function to the 'CGM Data' column
    Mean_Glucose = data_train['CGM Data'].apply(safe_get_glucose_data)
    Max_Glucose = data_train['CGM Data'].apply(
        lambda x: max([entry[1] for entry in x]) if x and isinstance(x, list) and all(isinstance(entry, tuple) and len(entry) >= 2 for entry in x) else None
    )
    Min_Glucose = data_train['CGM Data'].apply(
        lambda x: min([entry[1] for entry in x]) if x and isinstance(x, list) and all(isinstance(entry, tuple) and len(entry) >= 2 for entry in x) else None
    )
    Std_Glucose = data_train['CGM Data'].apply(
        lambda x: pd.Series([entry[1] for entry in x]).std() if x and isinstance(x, list) and all(isinstance(entry, tuple) and len(entry) >= 2 for entry in x) else None
    )
    data_train['CGM Data'] = data_train['CGM Data'].apply(ast.literal_eval)

    data_train['Mean_Glucose'] = data_train['CGM Data'].apply(
        lambda x: sum([entry[1] for entry in x]) / len(x) if x else None
    )
    data_train['Max_Glucose'] = data_train['CGM Data'].apply(
        lambda x: max([entry[1] for entry in x]) if x else None
    )
    data_train['Min_Glucose'] = data_train['CGM Data'].apply(
        lambda x: min([entry[1] for entry in x]) if x else None
    )
    data_train['Std_Glucose'] = data_train['CGM Data'].apply(
        lambda x: pd.Series([entry[1] for entry in x]).std() if x else None
    )

    return data_train

In [None]:
data_train = preprocess_cgm(data_train)

data_train

Unnamed: 0,Subject ID,Day,Breakfast Time,Lunch Time,CGM Data,Reduced_img_before_breakfast,Reduced_img_before_lunch,Time Between Meals,Mean_Glucose,Max_Glucose,Min_Glucose,Std_Glucose
0,1,2,2021-09-19 08:41:00,2021-09-19 12:24:00,"[(2021-09-19 08:20:00, 98.26666666666667), (20...","[-32.91928047158247, 13.888790849359642, 0.305...","[23.074055405498918, 9.36279840473365, -29.019...",13380.0,90.989097,141.816667,40.733333,21.618896
1,1,3,2021-09-20 09:50:00,2021-09-20 15:20:00,"[(2021-09-20 09:10:00, 97.18333333333334), (20...","[-26.219316263974584, 16.424703743708214, -0.4...","[-5.970847077816804, -17.233244496181673, -14....",19800.0,97.619082,118.083333,87.183333,6.088044
2,1,4,2021-09-21 09:34:00,2021-09-21 13:09:00,"[(2021-09-21 09:20:00, 107.36666666666666), (2...","[-17.86024001380985, 10.620841236259638, -16.3...","[-30.777239192648928, 12.674365094458542, -17....",12900.0,110.482796,139.900000,90.000000,12.068927
3,1,5,2021-09-22 09:46:00,2021-09-22 13:50:00,"[(2021-09-22 09:25:00, 107.28333333333333), (2...","[-22.16294018289692, -14.158064305457387, 0.06...","[0.9527481173828339, 9.93729447868345, -13.841...",14640.0,100.235590,126.000000,84.366667,10.515336
4,1,6,2021-09-23 09:07:00,2021-09-23 13:17:00,"[(2021-09-23 08:55:00, 103.0), (2021-09-23 09:...","[-34.523258800819974, -0.5308802734574567, 6.3...","[-13.064853292144731, 8.075026293004358, -38.5...",15000.0,105.868153,124.633333,92.316667,6.152499
...,...,...,...,...,...,...,...,...,...,...,...,...
286,7,6,2021-12-18 08:52:00,2021-12-18 12:28:00,"[(2021-12-18 08:50:00, 101.36), (2021-12-18 08...","[34.08229264951511, 81.75710130576871, 53.9032...","[52.317760336616395, 37.45572974952651, -37.56...",12960.0,111.874667,136.360000,86.320000,13.147774
287,7,7,2021-12-19 08:43:00,2021-12-19 13:13:00,"[(2021-12-19 08:40:00, 100.68), (2021-12-19 08...","[-4.772387612000207, 33.625020758590985, 6.841...","[-26.31511985939777, -1.6458231174627136, 6.21...",16200.0,118.940408,177.040000,81.680000,23.180273
288,7,8,2021-12-20 09:06:00,2021-12-20 12:46:00,"[(2021-12-20 09:00:00, 104.04), (2021-12-20 09...","[24.554002373388368, 56.19980844749697, 50.114...","[50.35121167943291, -25.185911034165546, -31.6...",13200.0,102.462222,126.360000,76.000000,13.187046
289,7,9,2021-12-21 08:34:00,2021-12-21 12:38:00,"[(2021-12-21 08:25:00, 96.4), (2021-12-21 08:3...","[2.5593543587830556, 34.02232430320033, 10.929...","[25.349492297530325, -26.763417976221163, -14....",14640.0,101.840000,159.360000,70.000000,24.678828


In [None]:
# # Function to check if CGM Data is an empty array
# def is_cgm_data_empty(row):
#     try:
#         cgm_list = ast.literal_eval(row['CGM Data'])
#         return len(cgm_list) == 0
#     except:
#         return True

# # Function to filter out rows where CGM Data is empty
# cgm_train = cgm_train[~cgm_train.apply(is_cgm_data_empty, axis=1)]

# # Handle missing breakfast and lunch times
# cgm_train['Breakfast Time'] = pd.to_datetime(cgm_train['Breakfast Time'], errors='coerce')
# cgm_train['Lunch Time'] = pd.to_datetime(cgm_train['Lunch Time'], errors='coerce')

# # Extract CGM data as list of tuples, convert to list of time series values
# cgm_train['CGM Data'] = cgm_train['CGM Data'].apply(lambda x: eval(x) if isinstance(x, str) else [])

# # Extract features from CGM data (flatten the time and glucose values)
# def extract_cgm_features(cgm_data):
#     times = [entry[0] for entry in cgm_data]
#     glucose_levels = [entry[1] for entry in cgm_data]
#     return times, glucose_levels

# cgm_train['CGM Times'], cgm_train['CGM Levels'] = zip(*cgm_train['CGM Data'].apply(extract_cgm_features))

# # Normalize glucose levels
# scaler = StandardScaler()
# cgm_train['CGM Levels'] = cgm_train['CGM Levels'].apply(lambda x: scaler.fit_transform(np.array(x).reshape(-1, 1)).flatten())

# # We need to pad the sequences to a fixed length for GRU input
# max_sequence_length = 300  # Define a maximum length for the sequences
# cgm_train['Padded CGM Levels'] = pad_sequences(cgm_train['CGM Levels'], maxlen=max_sequence_length, padding='post', value=0, dtype='float32').tolist()

# # Mask labels: We will use NaN or a predefined mask value for missing times -- 1
# cgm_train['Breakfast Time Masked'] = cgm_train['Breakfast Time'].isna().astype(int)
# cgm_train['Lunch Time Masked'] = cgm_train['Lunch Time'].isna().astype(int)

# # Prepare the target variable: encode the time values for breakfast and lunch
# def encode_times(time_column):
#     return (time_column - pd.Timestamp('2021-09-18')) // pd.Timedelta('1s')

# # Filter rows where both Breakfast and Lunch times are missing (i.e., both masks are 0)
# filtered_cgm_train = cgm_train[(cgm_train['Breakfast Time Masked'] == 0) & (cgm_train['Lunch Time Masked'] == 0)].copy()

# # Encode breakfast and lunch times only for rows where both are missing
# filtered_cgm_train['Breakfast Time Encoded'] = encode_times(filtered_cgm_train['Breakfast Time'])
# filtered_cgm_train['Lunch Time Encoded'] = encode_times(filtered_cgm_train['Lunch Time'])

# time_scaler = MinMaxScaler()

# # Reshape and scale both 'Breakfast Time Encoded' and 'Lunch Time Encoded'
# filtered_cgm_train[['Breakfast Time Encoded', 'Lunch Time Encoded']] = time_scaler.fit_transform(
#     filtered_cgm_train[['Breakfast Time Encoded', 'Lunch Time Encoded']]
# )

# X_train = np.array(filtered_cgm_train['Padded CGM Levels'].tolist())
# y_train = filtered_cgm_train[['Breakfast Time Encoded', 'Lunch Time Encoded']].values

# # Reshape X_train to (samples, time steps, features)
# X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))  # (samples, time steps, features)

# # Define the GRU model
# def create_gru_model(input_shape):
#     model = Sequential([
#         Input(shape=input_shape),
#         GRU(32, activation='relu'),
#         Dense(2)  # Output two values: breakfast and lunch times
#     ])
#     model.compile(optimizer=Adam(learning_rate=0.001, clipvalue=1.0), loss='mse')
#     return model

# # Create and compile the model
# model = create_gru_model((X_train_reshaped.shape[1], 1))

# # Train the model
# model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)


In [None]:
# def predict_meal_times(model, X):
#     X = np.array(X.tolist()) if isinstance(X, pd.Series) else np.array(X)
#     X_reshaped = X.reshape((X.shape[0], X.shape[1], 1))
#     predictions = model.predict(X_reshaped)
#     reference_date = pd.Timestamp('2021-09-18')

#     # Decode predictions back to original scale using inverse transformation
#     decoded_times = time_scaler.inverse_transform(predictions)

#     # Add the decoded seconds back to the reference date
#     decoded_breakfast_timestamps = reference_date + pd.to_timedelta(decoded_times[:, 0], unit='s')
#     decoded_lunch_timestamps = reference_date + pd.to_timedelta(decoded_times[:, 1], unit='s')

#     decoded_predictions = pd.DataFrame({
#         'Predicted Breakfast Time': decoded_breakfast_timestamps,
#         'Predicted Lunch Time': decoded_lunch_timestamps
#     })

#     return decoded_predictions

# # Extract rows with missing breakfast or lunch times
# missing_data = cgm_train[cgm_train['Breakfast Time'].isna() | cgm_train['Lunch Time'].isna()]

# # Ensure that `Padded CGM Levels` is included in `missing_data`
# predict_missing = missing_data['Padded CGM Levels']
# missing_data_copy = missing_data.copy()

# # Make predictions for missing breakfast and lunch times
# predicted_times = predict_meal_times(model, predict_missing)

# # Reset indices for both DataFrames to align by row order
# missing_data_copy = missing_data_copy.reset_index(drop=True)
# predicted_times = predicted_times.reset_index(drop=True)

# # Add the 'Predicted Breakfast Time' column
# missing_data_copy['Predicted Breakfast Time'] = predicted_times['Predicted Breakfast Time']
# missing_data_copy['Predicted Lunch Time'] = predicted_times['Predicted Lunch Time']

### Pre-Process Viome Data (Demographic Data) - COMPLETE

In [None]:
def preprocess_viome(demo_viome_train):
    array_of_means = []
    array_of_max = []
    array_of_min = []
    for i in range(36):
        list1 = demo_viome_train['Viome'][i].split(',')
        random_sum = 0
        random_max = -1000
        random_min = 1000
        for j in list1:
            temp_num = float(j)
            random_sum += temp_num
            if temp_num > random_max:
                random_max = temp_num
            if temp_num < random_min:
                random_min = temp_num
        array_of_means.append(random_sum/27)
        array_of_max.append(random_max)
        array_of_min.append(random_min)

    np_array_of_means = np.array(array_of_means)
    np_array_of_max = np.array(array_of_max)
    np_array_of_min = np.array(array_of_min)

    demo_viome_train['Viome_Mean'] = np_array_of_means
    demo_viome_train['Viome_Max'] = np_array_of_max
    demo_viome_train['Viome_Min'] = np_array_of_min
    demo_viome_train.drop(columns=['Viome'], inplace=True)

    demo_viome_train = pd.get_dummies(demo_viome_train, columns=['Race'], drop_first=False)

    return demo_viome_train

In [None]:
demo_viome_train = preprocess_viome(demo_viome_train)
data_train = pd.merge(data_train, demo_viome_train, on=['Subject ID'])

data_train

KeyError: 'Viome'

# Multimodal Training