In [1582]:
import pandas as pd
import numpy as np
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
import ast
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
import keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

# Data Pre-Processing

In [1583]:
# Load Datasets

# Training split: four CSV files, bound to their names in the order listed.
cgm_train, image_train, demo_viome_train, label_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'cgm_train.csv',
        'img_train.csv',
        'demo_viome_train.csv',
        'label_train.csv',
    )
)

# Test split: same layout, except the label file is 'label_test_breakfast_only.csv'.
cgm_test, image_test, demo_viome_test, label_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'cgm_test.csv',
        'img_test.csv',
        'demo_viome_test.csv',
        'label_test_breakfast_only.csv',
    )
)

### Pre-Process CGM Data (Time-Series Glucose Levels)

In [None]:
# Fill in missing breakfast and lunch times using each subject's mean time of day

# Function to check if CGM Data is an empty array
def is_cgm_data_empty(row):
    """Return True when a row's 'CGM Data' holds no readings.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'CGM Data'`` (e.g. a DataFrame row). The value
        is normally the string form of a Python list; an already-parsed
        sequence (list, tuple, ...) is accepted as well, so re-running the
        filter after the column has been parsed does not discard every row.

    Returns
    -------
    bool
        True if the value is an empty sequence or cannot be interpreted as
        one (missing key, NaN, malformed text, literal without a length);
        False otherwise.
    """
    try:
        cgm_value = row['CGM Data']
        if isinstance(cgm_value, (str, bytes)):
            # literal_eval only evaluates Python literals, never arbitrary code.
            cgm_value = ast.literal_eval(cgm_value)
        return len(cgm_value) == 0
    except Exception:
        # Best effort: anything unreadable counts as empty. `Exception` rather
        # than a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        return True

# Drop rows whose CGM Data is empty or unparseable. The explicit .copy() makes
# cgm_train an independent frame, so the column assignments below do not write
# to a slice of the unfiltered frame (which triggers SettingWithCopyWarning).
cgm_train = cgm_train[~cgm_train.apply(is_cgm_data_empty, axis=1)].copy()

# 1. Parse the breakfast and lunch timestamps; values that cannot be parsed
#    become NaT (errors='coerce'). The per-subject mean times are computed from these.
cgm_train['Breakfast Time'] = pd.to_datetime(cgm_train['Breakfast Time'], errors='coerce')
cgm_train['Lunch Time'] = pd.to_datetime(cgm_train['Lunch Time'], errors='coerce')

def mean_time(times):
    """Average the time-of-day component of a collection of timestamps.

    Parameters
    ----------
    times : iterable
        Datetime-like values exposing ``hour``, ``minute`` and ``second``.
        Entries for which ``pd.notna`` is False (NaT/None/NaN) are ignored.

    Returns
    -------
    datetime.time or NaT
        Mean time of day, floored to whole seconds. ``pd.NaT`` when there is
        no valid entry to average (instead of raising ZeroDivisionError).
    """
    # Seconds since midnight for every valid entry, computed in a single pass.
    seconds_of_day = [
        t.hour * 3600 + t.minute * 60 + t.second
        for t in times
        if pd.notna(t)
    ]
    if not seconds_of_day:
        return pd.NaT
    mean_seconds = sum(seconds_of_day) // len(seconds_of_day)
    # Interpreting the value as seconds from the epoch and taking .time()
    # yields the matching HH:MM:SS, since mean_seconds is below one day.
    return pd.to_datetime(mean_seconds, unit='s').time()

# Per-subject mean breakfast and lunch time of day: one row per 'Subject ID',
# with the group key turned back into a regular column by reset_index().
mean_times = (
    cgm_train
    .groupby('Subject ID')[['Breakfast Time', 'Lunch Time']]
    .apply(
        lambda subject_rows: pd.Series({
            meal_column: mean_time(subject_rows[meal_column])
            for meal_column in ('Breakfast Time', 'Lunch Time')
        })
    )
    .reset_index()
)

# 2. For each subject, pick a reference (date, day number) pair from a logged breakfast:
def get_reference_date(subject_id, data=None, candidate_days=(2, 4)):
    """Find a (calendar date, day number) anchor for one subject.

    The first candidate day on which the subject has a non-missing
    'Breakfast Time' is used; the date of that breakfast and the day number
    are returned so dates for other days can be derived by offset.

    Parameters
    ----------
    subject_id
        Value to match against the 'Subject ID' column.
    data : pandas.DataFrame, optional
        Frame with 'Subject ID', 'Day' and 'Breakfast Time' columns.
        Defaults to the notebook-level ``cgm_train``; pass another frame to
        reuse this on a different split.
    candidate_days : sequence, optional
        Day numbers to try, in priority order. Defaults to ``(2, 4)``.
        NOTE(review): the original fallback variable was named ``day_3`` but
        filtered on Day == 4 — confirm Day 4 is the intended fallback.

    Returns
    -------
    pandas.Series
        ``[reference_date, reference_day]``. Both entries are None when no
        candidate day has a breakfast time (instead of raising
        AttributeError on ``None.date()``).
    """
    frame = cgm_train if data is None else data
    subject_rows = frame[frame['Subject ID'] == subject_id]

    for day in candidate_days:
        # dropna() leaves only logged breakfasts, so the first entry is the
        # first valid one for that day; no separate NaT re-check is needed.
        breakfasts = subject_rows.loc[subject_rows['Day'] == day, 'Breakfast Time'].dropna()
        if not breakfasts.empty:
            return pd.Series([breakfasts.iloc[0].date(), day])

    return pd.Series([None, None])


# get_reference_date returns a two-element Series per subject; Series.apply
# expands those into a two-column frame, which fills the two new columns.
mean_times[['Reference Date', 'Reference Day']] = mean_times['Subject ID'].apply(get_reference_date)

# 3. Fill in the missing breakfast and lunch times in cgm_train
def update_missing_breakfast_time(row, mean_times):
    """Return a copy of ``row`` with 'Breakfast Time' imputed for its subject.

    The imputed timestamp combines the subject's reference date, shifted by
    ``row['Day'] - reference day`` days, with the subject's mean breakfast
    time of day.

    Parameters
    ----------
    row : pandas.Series
        Row with 'Subject ID', 'Day' and 'Breakfast Time' entries.
    mean_times : pandas.DataFrame
        Per-subject table with 'Subject ID', 'Breakfast Time',
        'Reference Date' and 'Reference Day' columns.

    Returns
    -------
    pandas.Series
        A new row; the input is left untouched, because functions passed to
        ``DataFrame.apply`` must not mutate the object they receive. When
        the subject is absent from ``mean_times`` or has no mean time or
        reference, 'Breakfast Time' is left as it was.
    """
    updated_row = row.copy()

    # One lookup of the subject's statistics instead of three identical masks.
    subject_stats = mean_times.loc[mean_times['Subject ID'] == row['Subject ID']]
    if subject_stats.empty:
        return updated_row

    mean_breakfast_time = subject_stats['Breakfast Time'].iloc[0]
    reference_date = subject_stats['Reference Date'].iloc[0]
    reference_day = subject_stats['Reference Day'].iloc[0]
    if pd.isna(mean_breakfast_time) or pd.isna(reference_date) or pd.isna(reference_day):
        # Nothing to impute from; keep the value missing rather than crash.
        return updated_row

    current_date = reference_date + pd.Timedelta(days=(row['Day'] - reference_day))
    updated_row['Breakfast Time'] = pd.to_datetime(
        current_date.strftime('%Y-%m-%d') + ' ' + mean_breakfast_time.strftime('%H:%M:%S')
    )
    return updated_row

# Impute breakfast only where it is missing; rows that already carry a
# breakfast timestamp pass through unchanged.
cgm_train = cgm_train.apply(
    lambda row: row if pd.notna(row['Breakfast Time']) else update_missing_breakfast_time(row, mean_times),
    axis='columns',
)

def update_missing_lunch_time(row, mean_times):
    """Return a copy of ``row`` with 'Lunch Time' imputed for its subject.

    The imputed timestamp combines the subject's reference date, shifted by
    ``row['Day'] - reference day`` days, with the subject's mean lunch time
    of day.

    Parameters
    ----------
    row : pandas.Series
        Row with 'Subject ID', 'Day' and 'Lunch Time' entries.
    mean_times : pandas.DataFrame
        Per-subject table with 'Subject ID', 'Lunch Time',
        'Reference Date' and 'Reference Day' columns.

    Returns
    -------
    pandas.Series
        A new row; the input is left untouched, because functions passed to
        ``DataFrame.apply`` must not mutate the object they receive. When
        the subject is absent from ``mean_times`` or has no mean time or
        reference, 'Lunch Time' is left as it was.
    """
    updated_row = row.copy()

    # One lookup of the subject's statistics instead of three identical masks.
    subject_stats = mean_times.loc[mean_times['Subject ID'] == row['Subject ID']]
    if subject_stats.empty:
        return updated_row

    mean_lunch_time = subject_stats['Lunch Time'].iloc[0]
    reference_date = subject_stats['Reference Date'].iloc[0]
    reference_day = subject_stats['Reference Day'].iloc[0]
    if pd.isna(mean_lunch_time) or pd.isna(reference_date) or pd.isna(reference_day):
        # Nothing to impute from; keep the value missing rather than crash.
        return updated_row

    current_date = reference_date + pd.Timedelta(days=(row['Day'] - reference_day))
    updated_row['Lunch Time'] = pd.to_datetime(
        current_date.strftime('%Y-%m-%d') + ' ' + mean_lunch_time.strftime('%H:%M:%S')
    )
    return updated_row

# Impute lunch only where it is missing; rows that already carry a lunch
# timestamp pass through unchanged.
cgm_train = cgm_train.apply(
    lambda row: row if pd.notna(row['Lunch Time']) else update_missing_lunch_time(row, mean_times),
    axis='columns',
)

# Show the resulting frame as the cell output.
cgm_train


Unnamed: 0,Subject ID,Day,Breakfast Time,Lunch Time,CGM Data
52,8,9,2021-12-21 09:00:00,2021-12-21 12:45:00,"[('2021-12-21 08:10:00', 124.58), ('2021-12-21..."
88,12,9,2022-03-02 09:53:22,2022-03-02 14:55:00,"[('2022-03-02 11:00:00', 139.04), ('2022-03-02..."
97,13,9,2022-03-12 08:03:52,2022-03-12 12:39:00,"[('2022-03-12 12:40:00', 103.0), ('2022-03-12 ..."
107,14,10,2022-04-08 08:30:00,2022-04-08 12:48:00,"[('2022-04-08 08:15:00', 131.58), ('2022-04-08..."
110,15,4,2022-04-02 09:26:00,2022-04-02 13:46:07,"[('2022-04-02 09:20:00', 105.0), ('2022-04-02 ..."
121,16,6,2022-04-07 07:53:00,2022-04-07 12:43:37,"[('2022-04-07 07:45:00', 110.23666666666666), ..."
139,19,6,2022-04-21 09:46:07,2022-04-21 14:05:00,"[('2022-04-21 13:30:00', 117.0), ('2022-04-21 ..."
172,26,3,2022-08-08 09:49:10,2022-08-08 12:58:00,"[('2022-08-08 12:45:00', 116.0), ('2022-08-08 ..."
187,28,9,2022-08-24 09:10:10,2022-08-24 13:04:01,"[('2022-08-24 09:00:00', 143.28), ('2022-08-24..."
189,29,2,2022-09-29 06:14:06,2022-09-29 11:56:25,"[('2022-09-29 05:45:00', 131.68), ('2022-09-29..."


In [1585]:
# NOTE(review): `missing_values_df` is not assigned in any cell visible in this
# part of the notebook, so this cell may raise NameError on a fresh kernel.
# Confirm where it is defined before relying on Restart & Run All.
missing_values_df

Unnamed: 0,Subject ID,Day,Breakfast Time,Lunch Time,CGM Data
52,8,9,2021-12-21 09:00:00,2021-12-21 12:45:00,"[('2021-12-21 08:10:00', 124.58), ('2021-12-21..."
88,12,9,2022-03-02 09:53:22,2022-03-02 14:55:00,"[('2022-03-02 11:00:00', 139.04), ('2022-03-02..."
97,13,9,2022-03-12 08:03:52,2022-03-12 12:39:00,"[('2022-03-12 12:40:00', 103.0), ('2022-03-12 ..."
107,14,10,2022-04-08 08:30:00,2022-04-08 12:48:00,"[('2022-04-08 08:15:00', 131.58), ('2022-04-08..."
110,15,4,2022-04-02 09:26:00,2022-04-02 13:46:07,"[('2022-04-02 09:20:00', 105.0), ('2022-04-02 ..."
121,16,6,2022-04-07 07:53:00,2022-04-07 12:43:37,"[('2022-04-07 07:45:00', 110.23666666666666), ..."
139,19,6,2022-04-21 09:46:07,2022-04-21 14:05:00,"[('2022-04-21 13:30:00', 117.0), ('2022-04-21 ..."
172,26,3,2022-08-08 09:49:10,2022-08-08 12:58:00,"[('2022-08-08 12:45:00', 116.0), ('2022-08-08 ..."
187,28,9,2022-08-24 09:10:10,2022-08-24 13:04:01,"[('2022-08-24 09:00:00', 143.28), ('2022-08-24..."
189,29,2,2022-09-29 06:14:06,2022-09-29 11:56:25,"[('2022-09-29 05:45:00', 131.68), ('2022-09-29..."


In [1586]:
# # Function to check if CGM Data is an empty array
# def is_cgm_data_empty(row):
#     try:
#         cgm_list = ast.literal_eval(row['CGM Data'])
#         return len(cgm_list) == 0
#     except:
#         return True

# # Function to filter out rows where CGM Data is empty
# cgm_train = cgm_train[~cgm_train.apply(is_cgm_data_empty, axis=1)]

# # Handle missing breakfast and lunch times
# cgm_train['Breakfast Time'] = pd.to_datetime(cgm_train['Breakfast Time'], errors='coerce')
# cgm_train['Lunch Time'] = pd.to_datetime(cgm_train['Lunch Time'], errors='coerce')

# # Extract CGM data as list of tuples, convert to list of time series values
# cgm_train['CGM Data'] = cgm_train['CGM Data'].apply(lambda x: eval(x) if isinstance(x, str) else [])

# # Extract features from CGM data (flatten the time and glucose values)
# def extract_cgm_features(cgm_data):
#     times = [entry[0] for entry in cgm_data]
#     glucose_levels = [entry[1] for entry in cgm_data]
#     return times, glucose_levels

# cgm_train['CGM Times'], cgm_train['CGM Levels'] = zip(*cgm_train['CGM Data'].apply(extract_cgm_features))

# # Normalize glucose levels
# scaler = StandardScaler()
# cgm_train['CGM Levels'] = cgm_train['CGM Levels'].apply(lambda x: scaler.fit_transform(np.array(x).reshape(-1, 1)).flatten())

Rows with empty CGM data have been removed. TODO: move this preprocessing into a function so it can be reused for the test set.

In [1587]:
# # We need to pad the sequences to a fixed length for GRU input
# max_sequence_length = 300  # Define a maximum length for the sequences
# cgm_train['Padded CGM Levels'] = pad_sequences(cgm_train['CGM Levels'], maxlen=max_sequence_length, padding='post', value=0, dtype='float32').tolist()

# # Mask labels: We will use NaN or a predefined mask value for missing times -- 1
# cgm_train['Breakfast Time Masked'] = cgm_train['Breakfast Time'].isna().astype(int)
# cgm_train['Lunch Time Masked'] = cgm_train['Lunch Time'].isna().astype(int)

# # Prepare the target variable: encode the time values for breakfast and lunch
# def encode_times(time_column):
#     return (time_column - pd.Timestamp('2021-09-18')) // pd.Timedelta('1s')

# # Keep rows where both Breakfast and Lunch times are present (i.e., both masks are 0)
# filtered_cgm_train = cgm_train[(cgm_train['Breakfast Time Masked'] == 0) & (cgm_train['Lunch Time Masked'] == 0)].copy()

# # Encode breakfast and lunch times only for rows where both are present
# filtered_cgm_train['Breakfast Time Encoded'] = encode_times(filtered_cgm_train['Breakfast Time'])
# filtered_cgm_train['Lunch Time Encoded'] = encode_times(filtered_cgm_train['Lunch Time'])

# time_scaler = MinMaxScaler()

# # Reshape and scale both 'Breakfast Time Encoded' and 'Lunch Time Encoded'
# filtered_cgm_train[['Breakfast Time Encoded', 'Lunch Time Encoded']] = time_scaler.fit_transform(
#     filtered_cgm_train[['Breakfast Time Encoded', 'Lunch Time Encoded']]
# )

# X_train = np.array(filtered_cgm_train['Padded CGM Levels'].tolist())
# y_train = filtered_cgm_train[['Breakfast Time Encoded', 'Lunch Time Encoded']].values

In [1588]:
# # Reshape X_train to (samples, time steps, features)
# X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))  # (samples, time steps, features)

# # Define the GRU model
# def create_gru_model(input_shape):
#     model = Sequential([
#         Input(shape=input_shape),
#         GRU(32, activation='relu'),
#         Dense(2)  # Output two values: breakfast and lunch times
#     ])
#     model.compile(optimizer=Adam(learning_rate=0.001, clipvalue=1.0), loss='mse')
#     return model

# # Create and compile the model
# model = create_gru_model((X_train_reshaped.shape[1], 1))

# # Train the model
# model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)


In [1589]:
# def predict_meal_times(model, X):
#     X = np.array(X.tolist()) if isinstance(X, pd.Series) else np.array(X)
#     X_reshaped = X.reshape((X.shape[0], X.shape[1], 1))
#     predictions = model.predict(X_reshaped)
#     reference_date = pd.Timestamp('2021-09-18')

#     # Decode predictions back to original scale using inverse transformation
#     decoded_times = time_scaler.inverse_transform(predictions)

#     # Add the decoded seconds back to the reference date
#     decoded_breakfast_timestamps = reference_date + pd.to_timedelta(decoded_times[:, 0], unit='s')
#     decoded_lunch_timestamps = reference_date + pd.to_timedelta(decoded_times[:, 1], unit='s')

#     decoded_predictions = pd.DataFrame({
#         'Predicted Breakfast Time': decoded_breakfast_timestamps,
#         'Predicted Lunch Time': decoded_lunch_timestamps
#     })

#     return decoded_predictions

# # Extract rows with missing breakfast or lunch times
# missing_data = cgm_train[cgm_train['Breakfast Time'].isna() | cgm_train['Lunch Time'].isna()]

# # Ensure that `Padded CGM Levels` is included in `missing_data`
# predict_missing = missing_data['Padded CGM Levels']
# missing_data_copy = missing_data.copy()

# # Make predictions for missing breakfast and lunch times
# predicted_times = predict_meal_times(model, predict_missing)

# # Reset indices for both DataFrames to align by row order
# missing_data_copy = missing_data_copy.reset_index(drop=True)
# predicted_times = predicted_times.reset_index(drop=True)

# # Add the 'Predicted Breakfast Time' column
# missing_data_copy['Predicted Breakfast Time'] = predicted_times['Predicted Breakfast Time']
# missing_data_copy['Predicted Lunch Time'] = predicted_times['Predicted Lunch Time']

In [1590]:
# np.savetxt('output.txt', y_breakfast_masked, delimiter=',')

In [1591]:
# class GRUPredictor(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers):
#         super(GRUPredictor, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, 1)
    
#     def forward(self, x):
#         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
#         out, _ = self.gru(x, h0)
#         out = self.fc(out[:, -1, :])
#         return out
    
# model = GRUPredictor(input_size=1, hidden_size=64, num_layers=2)
# criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# # Convert data to PyTorch tensors
# X_tensor = torch.FloatTensor(X.reshape(-1, max_sequence_length, 1))
# y_tensor = torch.FloatTensor(y_breakfast_masked_train)

# # Train the model
# num_epochs = 10
# for epoch in range(num_epochs):
#     outputs = model(X_tensor)
#     loss = criterion(outputs, y_tensor)
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
#     print(f'Epoch: {epoch} Loss: {loss.item()}')

In [1592]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import GRU, Dense, Masking

# # Build the GRU model
# model = Sequential()

# # Masking layer to ignore padding during training
# model.add(Masking(mask_value=0., input_shape=(max_sequence_length, 1)))

# # GRU layers
# model.add(GRU(128, return_sequences=False))
# model.add(Dense(64, activation='relu'))

# # Output layer for predicting breakfast and lunch times (regression problem)
# model.add(Dense(1))

# # Compile the model
# model.compile(optimizer='adam', loss='mean_squared_error')

# # Model summary
# model.summary()
 

In [1593]:
# # Prepare the data for training
# X = np.expand_dims(X, axis=-1)  # Add a channel dimension for GRU input

# # Split into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y_breakfast_masked, test_size=0.2, random_state=42)

# # Train the model
# history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# # Evaluate the model
# loss = model.evaluate(X_val, y_val)
# print(f'Validation Loss: {loss}')


In [1594]:
# # Predict missing breakfast times (masked values)
# predicted_breakfast_times = model.predict(X_val)

# predicted_breakfast_times

# Convert the predicted time in seconds back to datetime format
# predicted_breakfast_times = pd.to_datetime(predicted_breakfast_times, unit='s', origin='1970-01-01')

# # You can use a similar approach for lunch time prediction
# predicted_lunch_times = model.predict(X_val)
# predicted_lunch_times = pd.to_datetime(predicted_lunch_times, unit='s', origin='1970-01-01')


In [1595]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer

# # Load dataset with proper delimiter (ensure '\t' for tab-separated values)
# file_path = 'demo_viome_train.csv'
# data = pd.read_csv(file_path, delimiter='\t')

# # Recheck column parsing
# if len(data.columns) == 1:
#     # If all data is in a single column, try splitting with a comma
#     data = pd.read_csv(file_path, delimiter=',')

# # Verify column names
# print("Columns in dataset after re-parsing:", data.columns)

# # Split the `Viome` column into individual features
# viome_split = data['Viome'].str.split(',', expand=True).astype(float)
# viome_split.columns = [f"Viome_{i}" for i in range(viome_split.shape[1])]

# # Drop the original Viome column and merge new features
# data = pd.concat([data.drop(columns=['Viome']), viome_split], axis=1)

# # Impute missing values for numeric columns
# numeric_cols = data.select_dtypes(include=[np.number]).columns
# imputer = SimpleImputer(strategy='mean')
# data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

# # Normalize numeric data
# scaler = MinMaxScaler()
# data[numeric_cols] = scaler.fit_transform(data[numeric_cols])


# # Encode categorical columns
# categorical_cols = ['Gender', 'Race', 'Diabetes Status']
# encoder = OneHotEncoder(sparse_output=False, drop='first')  # Use sparse_output instead of sparse
# encoded_cats = pd.DataFrame(
#     encoder.fit_transform(data[categorical_cols]),
#     columns=encoder.get_feature_names_out(categorical_cols)
# )

# # Drop original categorical columns and merge encoded ones
# data = pd.concat([data.drop(columns=categorical_cols), encoded_cats], axis=1)

# # Final processed data
# print("Processed Data Shape:", data.shape)
# print("Processed Data Preview:")
# print(data.head())


In [1596]:
# import pandas as pd
# import numpy as np
# from PIL import Image

# # Load the dataset
# data = pd.read_csv("img_train.csv")  # Adjust the file path as necessary

# # Placeholder for missing images (a blank black image)
# def create_placeholder_image(size=(64, 64, 3)):
#     return np.zeros(size, dtype=np.float32)  # Normalized [0, 1] range

# # Function to preprocess image data
# def preprocess_image(img_data, size=(64, 64)):
#     try:
#         img_array = np.array(img_data, dtype=np.uint8)  # Ensure valid data type

#         # Check for empty image
#         if img_array.size == 0 or img_array.ndim != 3 or img_array.shape[2] != 3:
#             raise ValueError(f"Invalid or empty image dimensions: {img_array.shape}")

#         img_resized = np.array(Image.fromarray(img_array).resize(size))  # Resize
#         img_normalized = img_resized / 255.0  # Normalize pixel values to [0, 1]
#         return img_normalized
#     except Exception as e:
#         print(f"Error preprocessing image: {e}")
#         return create_placeholder_image(size)

# # Preprocess the dataset
# def preprocess_dataset(data):
#     # Define placeholder image
#     placeholder_image = create_placeholder_image()

#     # Create missingness indicators
#     data['Breakfast_Missing'] = data['Image Before Breakfast'].isnull().astype(int)
#     data['Lunch_Missing'] = data['Image Before Lunch'].isnull().astype(int)

#     # Iterate over rows to preprocess images
#     breakfast_images = []
#     lunch_images = []

#     for index, row in data.iterrows():
#         # Handle missing breakfast images
#         if pd.isnull(row['Image Before Breakfast']) or row['Image Before Breakfast'] == '[]':  # Check for empty list or NaN
#             breakfast_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Breakfast'])  # Convert string to list
#                 breakfast_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, breakfast: {e}")
#                 breakfast_images.append(placeholder_image)

#         # Handle missing lunch images
#         if pd.isnull(row['Image Before Lunch']) or row['Image Before Lunch'] == '[]':  # Check for empty list or NaN
#             lunch_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Lunch'])  # Convert string to list
#                 lunch_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, lunch: {e}")
#                 lunch_images.append(placeholder_image)

#     # Add preprocessed images back to the dataset
#     data['Processed_Breakfast_Images'] = breakfast_images
#     data['Processed_Lunch_Images'] = lunch_images

#     return data

# # Apply preprocessing
# processed_data = preprocess_dataset(data)

# # Save the processed dataset if needed
# # processed_data.to_pickle("processed_img_train.pkl")  # Save in pickle format for further use


In [1597]:
# import pandas as pd

# # Load the dataset
# label_data = pd.read_csv("label_train.csv")  # Adjust the file path as necessary

# # Step 1: Extract Output Labels
# output_labels = label_data[["Breakfast Calories", "Lunch Calories"]]

# # Step 2: Handle Missing Values in Labels
# # Replace missing values (if any) with the median
# output_labels = output_labels.fillna(output_labels.median())

# print(output_labels)

# # Step 3: Save the Extracted Labels
# # output_labels.to_csv("output_labels.csv", index=False)

# print("Output Labels Extracted and Saved!")
