In [390]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch
import ast
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Data Pre-Processing

In [391]:
# Load Datasets
# Each split is made of four CSV files; within a split they are read in the
# order listed in the tuple.

# Training datasets
cgm_train, image_train, demo_viome_train, label_train = (
    pd.read_csv(csv_name)
    for csv_name in (
        'cgm_train.csv',
        'img_train.csv',
        'demo_viome_train.csv',
        'label_train.csv',
    )
)

# Test datasets (the label file read here is the breakfast-only one)
cgm_test, image_test, demo_viome_test, label_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'cgm_test.csv',
        'img_test.csv',
        'demo_viome_test.csv',
        'label_test_breakfast_only.csv',
    )
)

### Pre-Process CGM Data (Time-Series Glucose Levels)

In [392]:
# Function to check if CGM Data is an empty array
def is_cgm_data_empty(row):
    """Return True when the row's 'CGM Data' field holds no usable readings.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by column name.

    Returns:
        bool: True if 'CGM Data' is missing, cannot be parsed as a Python
        literal, has no length, or is an empty sequence; False otherwise.
    """
    try:
        raw = row['CGM Data']
        # Strings are parsed as Python literals. Anything else (for example a
        # list produced by an earlier run of the parsing step) is measured
        # directly, so re-running the filter does not discard parsed rows.
        cgm_list = ast.literal_eval(raw) if isinstance(raw, str) else raw
        return len(cgm_list) == 0
    except (KeyError, TypeError, ValueError, SyntaxError, MemoryError, RecursionError):
        # Missing column, NaN/None, or malformed text: treat as "no data".
        # Listing the exceptions keeps KeyboardInterrupt/SystemExit propagating.
        return True

# Filter out rows where CGM Data is empty or unparseable.
# .copy() detaches the filtered frame so the column assignments below write to
# cgm_train itself rather than to a slice of the original frame
# (avoids pandas' SettingWithCopyWarning).
cgm_train = cgm_train[~cgm_train.apply(is_cgm_data_empty, axis=1)].copy()

# Handle missing breakfast and lunch times: values that cannot be parsed as
# datetimes are coerced to NaT instead of raising.
cgm_train['Breakfast Time'] = pd.to_datetime(cgm_train['Breakfast Time'], errors='coerce')
cgm_train['Lunch Time'] = pd.to_datetime(cgm_train['Lunch Time'], errors='coerce')

# Extract CGM data as a list of tuples.
# ast.literal_eval only accepts Python literals, whereas eval would execute any
# expression stored in the CSV. Values that are already lists/tuples are kept
# as-is so re-running this cell does not wipe them; anything else becomes [].
cgm_train['CGM Data'] = cgm_train['CGM Data'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str)
    else (x if isinstance(x, (list, tuple)) else [])
)

# Extract features from CGM data (flatten the time and glucose values)
def extract_cgm_features(cgm_data):
    """Split CGM entries into two parallel lists.

    Args:
        cgm_data: Iterable of indexable entries; element 0 of each entry is
            collected as the time and element 1 as the glucose level.

    Returns:
        tuple: (times, glucose_levels), each a list with one item per entry.
    """
    times = list(map(lambda entry: entry[0], cgm_data))
    glucose_levels = list(map(lambda entry: entry[1], cgm_data))
    return times, glucose_levels

# Split every parsed CGM series into a list of times and a list of glucose
# levels, stored in two new columns.
cgm_train['CGM Times'], cgm_train['CGM Levels'] = zip(
    *cgm_train['CGM Data'].apply(extract_cgm_features)
)

# Normalize glucose levels.
# fit_transform is called once per row, so the scaler is refit on each series
# and every series is standardized with its own statistics.
scaler = StandardScaler()
cgm_train['CGM Levels'] = cgm_train['CGM Levels'].apply(
    lambda levels: scaler.fit_transform(np.array(levels).reshape((-1, 1))).flatten()
)

Rows with empty CGM data have been removed. TODO: Move this preprocessing into a function so it can be reused for the test set.

In [393]:
# GRU input needs sequences of one fixed length, so every CGM series is padded
# (or truncated) to max_sequence_length time steps.
max_sequence_length = 300

# Pad sequences of CGM data: zeros are appended after the last reading
X = pad_sequences(
    cgm_train['CGM Levels'],
    maxlen=max_sequence_length,
    dtype='float32',
    padding='post',
    value=0,
)

# Missing-time flags: 1 where the meal time is missing (NaT), 0 where it is present
cgm_train['Breakfast Time Masked'] = cgm_train['Breakfast Time'].isnull().astype(int)
cgm_train['Lunch Time Masked'] = cgm_train['Lunch Time'].isnull().astype(int)

# Prepare the target variable: encode the time values for breakfast and lunch
def encode_times(time_column, origin='2019-09-18', unit='1s'):
    """Encode datetimes as whole `unit` steps elapsed since `origin`.

    Args:
        time_column: Datetime-like pandas Series (or scalar) to encode.
        origin: Reference point, anything accepted by pd.Timestamp.
            Defaults to '2019-09-18'.
        unit: Step size, anything accepted by pd.Timedelta. Defaults to '1s'.

    Returns:
        The floor-divided offset of each value from `origin`, in `unit` steps.
    """
    return (time_column - pd.Timestamp(origin)) // pd.Timedelta(unit)

# Encode both meal-time columns with encode_times and store them as new columns
cgm_train['Breakfast Time Encoded'] = encode_times(cgm_train['Breakfast Time'])
cgm_train['Lunch Time Encoded'] = encode_times(cgm_train['Lunch Time'])

# Regression targets: the encoded times as plain arrays
y_breakfast = cgm_train['Breakfast Time Encoded'].values
y_lunch = cgm_train['Lunch Time Encoded'].values

# Snapshot of the processed frame (index included)
cgm_train.to_csv('cleaned_data.csv')

# Where the "Masked" flag is not 0, substitute 0.0 for the target;
# elsewhere keep the encoded time.
y_breakfast_masked = np.where(
    cgm_train['Breakfast Time Masked'].eq(0), y_breakfast, 0.0
)
y_lunch_masked = np.where(
    cgm_train['Lunch Time Masked'].eq(0), y_lunch, 0.0
)

In [394]:
# Dump the masked breakfast targets to a text file for inspection
np.savetxt(fname='output.txt', X=y_breakfast_masked, delimiter=',')

In [395]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Input, Masking

# Build the GRU model.
# The input shape is declared with an explicit Input layer instead of passing
# input_shape= to the first layer, which Keras 3 warns against for Sequential
# models.
model = Sequential([
    # One feature (the glucose level) per time step
    Input(shape=(max_sequence_length, 1)),
    # Masking layer: time steps whose value equals 0. are skipped downstream
    Masking(mask_value=0.),
    # GRU layer returning only its final state
    GRU(128, return_sequences=False),
    Dense(64, activation='relu'),
    # Output layer: a single regression value per sequence
    Dense(1),
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Model summary
model.summary()
 

  super().__init__(**kwargs)


In [396]:
# Prepare the data for training.
# Add the trailing feature axis only while X is still 2-D, so that re-running
# this cell does not keep stacking extra axes onto X and break model.fit.
if X.ndim == 2:
    X = np.expand_dims(X, axis=-1)

# Split into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y_breakfast_masked, test_size=0.2, random_state=42)

# NOTE(review): y_breakfast_masked is passed to the MSE loss unchanged, i.e.
# raw encoded times plus the 0.0 placeholders substituted for missing values.
# Confirm whether placeholder rows should be excluded and the target rescaled.

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# Evaluate the model
loss = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}')


Epoch 1/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 158ms/step - loss: 8756759174840320.0000 - val_loss: 8291516506177536.0000
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 144ms/step - loss: 8895275628232704.0000 - val_loss: 8291516506177536.0000
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 139ms/step - loss: 8947575981867008.0000 - val_loss: 8291514358693888.0000
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 124ms/step - loss: 9159788705349632.0000 - val_loss: 8291514358693888.0000
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 127ms/step - loss: 9076891541569536.0000 - val_loss: 8291512211210240.0000
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - loss: 8986005201747968.0000 - val_loss: 8291511137468416.0000
Epoch 7/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - loss: 9225260113068032.000

In [397]:
# Predict breakfast times for the validation sequences (X_val)
predicted_breakfast_times = model.predict(X_val)

# Bare expression so the notebook displays the prediction array as cell output
predicted_breakfast_times

# Convert the predicted time in seconds back to datetime format
# NOTE(review): the disabled conversion below uses origin='1970-01-01', while
# encode_times measures seconds from 2019-09-18 — confirm the origin before
# re-enabling it.
# predicted_breakfast_times = pd.to_datetime(predicted_breakfast_times, unit='s', origin='1970-01-01')

# # You can use a similar approach for lunch time prediction
# predicted_lunch_times = model.predict(X_val)
# predicted_lunch_times = pd.to_datetime(predicted_lunch_times, unit='s', origin='1970-01-01')


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step 


array([[64.14637 ],
       [64.13842 ],
       [64.1451  ],
       [64.14146 ],
       [64.148834],
       [64.14213 ],
       [64.129715],
       [64.1469  ],
       [64.14532 ],
       [64.1467  ],
       [64.15348 ],
       [64.14712 ],
       [64.139626],
       [64.14457 ],
       [64.130035],
       [64.14807 ],
       [64.145996],
       [64.15392 ],
       [64.14655 ],
       [64.145294],
       [64.151794],
       [64.15048 ],
       [64.14708 ],
       [64.131996],
       [64.139366],
       [64.13807 ],
       [64.13343 ],
       [64.14701 ],
       [64.043335],
       [64.1446  ],
       [64.14582 ],
       [64.155266],
       [64.15049 ],
       [64.1486  ],
       [64.133804],
       [64.145485],
       [64.145424],
       [64.14951 ],
       [64.13933 ],
       [64.14805 ],
       [64.13708 ],
       [64.14672 ],
       [64.1444  ],
       [64.13642 ],
       [64.14362 ],
       [64.141396],
       [64.12594 ],
       [64.15031 ],
       [64.14416 ],
       [64.14529 ],


In [398]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer

# # Load dataset with proper delimiter (ensure '\t' for tab-separated values)
# file_path = 'demo_viome_train.csv'
# data = pd.read_csv(file_path, delimiter='\t')

# # Recheck column parsing
# if len(data.columns) == 1:
#     # If all data is in a single column, try splitting with a comma
#     data = pd.read_csv(file_path, delimiter=',')

# # Verify column names
# print("Columns in dataset after re-parsing:", data.columns)

# # Split the `Viome` column into individual features
# viome_split = data['Viome'].str.split(',', expand=True).astype(float)
# viome_split.columns = [f"Viome_{i}" for i in range(viome_split.shape[1])]

# # Drop the original Viome column and merge new features
# data = pd.concat([data.drop(columns=['Viome']), viome_split], axis=1)

# # Impute missing values for numeric columns
# numeric_cols = data.select_dtypes(include=[np.number]).columns
# imputer = SimpleImputer(strategy='mean')
# data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

# # Normalize numeric data
# scaler = MinMaxScaler()
# data[numeric_cols] = scaler.fit_transform(data[numeric_cols])


# # Encode categorical columns
# categorical_cols = ['Gender', 'Race', 'Diabetes Status']
# encoder = OneHotEncoder(sparse_output=False, drop='first')  # Use sparse_output instead of sparse
# encoded_cats = pd.DataFrame(
#     encoder.fit_transform(data[categorical_cols]),
#     columns=encoder.get_feature_names_out(categorical_cols)
# )

# # Drop original categorical columns and merge encoded ones
# data = pd.concat([data.drop(columns=categorical_cols), encoded_cats], axis=1)

# # Final processed data
# print("Processed Data Shape:", data.shape)
# print("Processed Data Preview:")
# print(data.head())


In [399]:
# import pandas as pd
# import numpy as np
# from PIL import Image

# # Load the dataset
# data = pd.read_csv("img_train.csv")  # Adjust the file path as necessary

# # Placeholder for missing images (a blank black image)
# def create_placeholder_image(size=(64, 64, 3)):
#     return np.zeros(size, dtype=np.float32)  # Normalized [0, 1] range

# # Function to preprocess image data
# def preprocess_image(img_data, size=(64, 64)):
#     try:
#         img_array = np.array(img_data, dtype=np.uint8)  # Ensure valid data type

#         # Check for empty image
#         if img_array.size == 0 or img_array.ndim != 3 or img_array.shape[2] != 3:
#             raise ValueError(f"Invalid or empty image dimensions: {img_array.shape}")

#         img_resized = np.array(Image.fromarray(img_array).resize(size))  # Resize
#         img_normalized = img_resized / 255.0  # Normalize pixel values to [0, 1]
#         return img_normalized
#     except Exception as e:
#         print(f"Error preprocessing image: {e}")
#         return create_placeholder_image(size)

# # Preprocess the dataset
# def preprocess_dataset(data):
#     # Define placeholder image
#     placeholder_image = create_placeholder_image()

#     # Create missingness indicators
#     data['Breakfast_Missing'] = data['Image Before Breakfast'].isnull().astype(int)
#     data['Lunch_Missing'] = data['Image Before Lunch'].isnull().astype(int)

#     # Iterate over rows to preprocess images
#     breakfast_images = []
#     lunch_images = []

#     for index, row in data.iterrows():
#         # Handle missing breakfast images
#         if pd.isnull(row['Image Before Breakfast']) or row['Image Before Breakfast'] == '[]':  # Check for empty list or NaN
#             breakfast_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Breakfast'])  # Convert string to list
#                 breakfast_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, breakfast: {e}")
#                 breakfast_images.append(placeholder_image)

#         # Handle missing lunch images
#         if pd.isnull(row['Image Before Lunch']) or row['Image Before Lunch'] == '[]':  # Check for empty list or NaN
#             lunch_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Lunch'])  # Convert string to list
#                 lunch_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, lunch: {e}")
#                 lunch_images.append(placeholder_image)

#     # Add preprocessed images back to the dataset
#     data['Processed_Breakfast_Images'] = breakfast_images
#     data['Processed_Lunch_Images'] = lunch_images

#     return data

# # Apply preprocessing
# processed_data = preprocess_dataset(data)

# # Save the processed dataset if needed
# # processed_data.to_pickle("processed_img_train.pkl")  # Save in pickle format for further use


In [400]:
# import pandas as pd

# # Load the dataset
# label_data = pd.read_csv("label_train.csv")  # Adjust the file path as necessary

# # Step 1: Extract Output Labels
# output_labels = label_data[["Breakfast Calories", "Lunch Calories"]]

# # Step 2: Handle Missing Values in Labels
# # Replace missing values (if any) with the median
# output_labels = output_labels.fillna(output_labels.median())

# print(output_labels)

# # Step 3: Save the Extracted Labels
# # output_labels.to_csv("output_labels.csv", index=False)

# print("Output Labels Extracted and Saved!")
