In [110]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch
import ast
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import datetime, timedelta
import torch.nn as nn

# Data Pre-Processing

In [111]:
# Load Datasets

# Training datasets
cgm_train = pd.read_csv('cgm_train.csv')
image_train = pd.read_csv('img_train.csv')
demo_viome_train = pd.read_csv('demo_viome_train.csv')
label_train = pd.read_csv('label_train.csv')

# Test datasets
cgm_test = pd.read_csv('cgm_test.csv')
image_test = pd.read_csv('img_test.csv')
demo_viome_test = pd.read_csv('demo_viome_test.csv')
label_test = pd.read_csv('label_test_breakfast_only.csv')

### Pre-Process CGM Data (Time-Series Glucose Levels)

In [112]:
# Function to check if CGM Data is an empty array
def is_cgm_data_empty(row):
    try:
        cgm_list = ast.literal_eval(row['CGM Data'])
        return len(cgm_list) == 0
    except:
        return True

# Function to filter out rows where CGM Data is empty
cgm_train = cgm_train[~cgm_train.apply(is_cgm_data_empty, axis=1)]

# Handle missing breakfast and lunch times
cgm_train['Breakfast Time'] = pd.to_datetime(cgm_train['Breakfast Time'], errors='coerce')
cgm_train['Lunch Time'] = pd.to_datetime(cgm_train['Lunch Time'], errors='coerce')

# Extract CGM data as list of tuples, convert to list of time series values
cgm_train['CGM Data'] = cgm_train['CGM Data'].apply(lambda x: eval(x) if isinstance(x, str) else [])

# Extract features from CGM data (flatten the time and glucose values)
def extract_cgm_features(cgm_data):
    times = [entry[0] for entry in cgm_data]
    glucose_levels = [entry[1] for entry in cgm_data]
    return times, glucose_levels

cgm_train['CGM Times'], cgm_train['CGM Levels'] = zip(*cgm_train['CGM Data'].apply(extract_cgm_features))

# Normalize glucose levels
scaler = StandardScaler()
cgm_train['CGM Levels'] = cgm_train['CGM Levels'].apply(lambda x: scaler.fit_transform(np.array(x).reshape(-1, 1)).flatten())

Rows with empty CGM data have been removed. TODO: Put in function to reuse for test

In [113]:
# We need to pad the sequences to a fixed length for GRU input
max_sequence_length = 300  # Define a maximum length for the sequences

# Pad sequences of CGM data
X = pad_sequences(cgm_train['CGM Levels'], maxlen=max_sequence_length, padding='post', value=0, dtype='float32')
cgm_train['Padded CGM Levels'] = pd.DataFrame(X).values.tolist()

# Mask labels: We will use NaN or a predefined mask value for missing times -- 1
cgm_train['Breakfast Time Masked'] = cgm_train['Breakfast Time'].isna().astype(int)
cgm_train['Lunch Time Masked'] = cgm_train['Lunch Time'].isna().astype(int)

# Prepare the target variable: encode the time values for breakfast and lunch
def encode_times(time_column):
    return (time_column - pd.Timestamp('2019-09-18')) // pd.Timedelta('1s')

cgm_train['Breakfast Time Encoded'] = encode_times(cgm_train['Breakfast Time'])
cgm_train['Lunch Time Encoded'] = encode_times(cgm_train['Lunch Time'])

# For the target, we want to predict encoded times where it's available
y_breakfast = cgm_train['Breakfast Time Encoded'].values
y_lunch = cgm_train['Lunch Time Encoded'].values

# Mask missing values in the target variables for masked prediction
y_breakfast_masked_train = cgm_train[cgm_train['Breakfast Time Masked'] == 0]
y_breakfast_masked_train = y_breakfast_masked_train['Breakfast Time Encoded']
x_breakfast_masked_train = cgm_train[cgm_train['Breakfast Time Masked'] == 0]
x_breakfast_masked_train = x_breakfast_masked_train['Padded CGM Levels']
y_breakfast_masked_test = cgm_train[cgm_train['Breakfast Time Masked'] == 1]
y_lunch_masked = np.where(cgm_train['Lunch Time Masked'] == 0, y_lunch, 00.0)

x_breakfast_masked_train

0      [0.33845457434654236, 0.1950594186782837, 0.29...
1      [-0.07194250077009201, 0.15369537472724915, 0....
2      [-0.25959381461143494, -0.10825366526842117, 0...
3      [0.673753023147583, 1.22025728225708, 1.237783...
4      [-0.4687596261501312, -0.37886980175971985, 0....
                             ...                        
319    [-0.8042100667953491, -0.5992313623428345, -0....
320    [-0.7918065786361694, -0.6305001974105835, -0....
321    [0.12031632661819458, 0.1935228854417801, 0.24...
322    [-0.22156518697738647, -0.04561636224389076, 0...
323    [-0.8055256009101868, -0.7820967435836792, -0....
Name: Padded CGM Levels, Length: 296, dtype: object

In [114]:
y_breakfast_masked_train

0      63276060.0
1      63366600.0
2      63452040.0
3      63539160.0
4      63623220.0
          ...    
319    71052720.0
320    71138580.0
321    71226360.0
322    71310840.0
323    71397840.0
Name: Breakfast Time Encoded, Length: 296, dtype: float64

In [115]:
# np.savetxt('output.txt', y_breakfast_masked, delimiter=',')

In [None]:
class GRUPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(GRUPredictor, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        return out
    
model = GRUPredictor(input_size=1, hidden_size=64, num_layers=2)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Convert data to PyTorch tensors
X_tensor = torch.FloatTensor(X.reshape(-1, max_sequence_length, 1))
y_tensor = torch.FloatTensor(y_breakfast_masked_train)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f'Epoch: {epoch} Loss: {loss.item()}')

AttributeError: 'Series' object has no attribute 'reshape'

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Masking

# Build the GRU model
model = Sequential()

# Masking layer to ignore padding during training
model.add(Masking(mask_value=0., input_shape=(max_sequence_length, 1)))

# GRU layers
model.add(GRU(128, return_sequences=False))
model.add(Dense(64, activation='relu'))

# Output layer for predicting breakfast and lunch times (regression problem)
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Model summary
model.summary()
 

  super().__init__(**kwargs)


In [None]:
# Prepare the data for training
X = np.expand_dims(X, axis=-1)  # Add a channel dimension for GRU input

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y_breakfast_masked, test_size=0.2, random_state=42)

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

# Evaluate the model
loss = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}')


Epoch 1/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 277ms/step - loss: 8916460554420224.0000 - val_loss: 8291516506177536.0000
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 202ms/step - loss: 9280918594256896.0000 - val_loss: 8291516506177536.0000
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 201ms/step - loss: 9132594046173184.0000 - val_loss: 8291514358693888.0000
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 191ms/step - loss: 8660609352597504.0000 - val_loss: 8291512211210240.0000
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 211ms/step - loss: 9007706060881920.0000 - val_loss: 8291511137468416.0000
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 204ms/step - loss: 9318790542131200.0000 - val_loss: 8291510063726592.0000
Epoch 7/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 205ms/step - loss: 9239529068167168.000

KeyboardInterrupt: 

In [None]:
# Predict missing breakfast times (masked values)
predicted_breakfast_times = model.predict(X_val)

predicted_breakfast_times

# Convert the predicted time in seconds back to datetime format
# predicted_breakfast_times = pd.to_datetime(predicted_breakfast_times, unit='s', origin='1970-01-01')

# # You can use a similar approach for lunch time prediction
# predicted_lunch_times = model.predict(X_val)
# predicted_lunch_times = pd.to_datetime(predicted_lunch_times, unit='s', origin='1970-01-01')


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step


array([[41.03498 ],
       [41.02909 ],
       [41.034218],
       [41.03254 ],
       [41.035957],
       [41.031616],
       [41.025402],
       [41.03465 ],
       [41.034233],
       [41.034958],
       [41.038597],
       [41.035507],
       [41.03218 ],
       [41.031498],
       [41.02701 ],
       [41.035076],
       [41.035027],
       [41.03981 ],
       [41.03463 ],
       [41.034374],
       [41.039078],
       [41.03743 ],
       [41.036057],
       [41.026268],
       [41.03254 ],
       [41.029503],
       [41.02926 ],
       [41.03536 ],
       [41.036808],
       [41.032738],
       [41.03444 ],
       [41.039345],
       [41.037926],
       [41.03646 ],
       [41.028667],
       [41.034866],
       [41.033653],
       [41.036583],
       [41.03086 ],
       [41.036552],
       [41.029762],
       [41.03369 ],
       [41.033985],
       [41.02951 ],
       [41.032887],
       [41.031998],
       [41.026478],
       [41.03766 ],
       [41.034203],
       [41.035088],


In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer

# # Load dataset with proper delimiter (ensure '\t' for tab-separated values)
# file_path = 'demo_viome_train.csv'
# data = pd.read_csv(file_path, delimiter='\t')

# # Recheck column parsing
# if len(data.columns) == 1:
#     # If all data is in a single column, try splitting with a comma
#     data = pd.read_csv(file_path, delimiter=',')

# # Verify column names
# print("Columns in dataset after re-parsing:", data.columns)

# # Split the `Viome` column into individual features
# viome_split = data['Viome'].str.split(',', expand=True).astype(float)
# viome_split.columns = [f"Viome_{i}" for i in range(viome_split.shape[1])]

# # Drop the original Viome column and merge new features
# data = pd.concat([data.drop(columns=['Viome']), viome_split], axis=1)

# # Impute missing values for numeric columns
# numeric_cols = data.select_dtypes(include=[np.number]).columns
# imputer = SimpleImputer(strategy='mean')
# data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

# # Normalize numeric data
# scaler = MinMaxScaler()
# data[numeric_cols] = scaler.fit_transform(data[numeric_cols])


# # Encode categorical columns
# categorical_cols = ['Gender', 'Race', 'Diabetes Status']
# encoder = OneHotEncoder(sparse_output=False, drop='first')  # Use sparse_output instead of sparse
# encoded_cats = pd.DataFrame(
#     encoder.fit_transform(data[categorical_cols]),
#     columns=encoder.get_feature_names_out(categorical_cols)
# )

# # Drop original categorical columns and merge encoded ones
# data = pd.concat([data.drop(columns=categorical_cols), encoded_cats], axis=1)

# # Final processed data
# print("Processed Data Shape:", data.shape)
# print("Processed Data Preview:")
# print(data.head())


In [None]:
# import pandas as pd
# import numpy as np
# from PIL import Image

# # Load the dataset
# data = pd.read_csv("img_train.csv")  # Adjust the file path as necessary

# # Placeholder for missing images (a blank black image)
# def create_placeholder_image(size=(64, 64, 3)):
#     return np.zeros(size, dtype=np.float32)  # Normalized [0, 1] range

# # Function to preprocess image data
# def preprocess_image(img_data, size=(64, 64)):
#     try:
#         img_array = np.array(img_data, dtype=np.uint8)  # Ensure valid data type

#         # Check for empty image
#         if img_array.size == 0 or img_array.ndim != 3 or img_array.shape[2] != 3:
#             raise ValueError(f"Invalid or empty image dimensions: {img_array.shape}")

#         img_resized = np.array(Image.fromarray(img_array).resize(size))  # Resize
#         img_normalized = img_resized / 255.0  # Normalize pixel values to [0, 1]
#         return img_normalized
#     except Exception as e:
#         print(f"Error preprocessing image: {e}")
#         return create_placeholder_image(size)

# # Preprocess the dataset
# def preprocess_dataset(data):
#     # Define placeholder image
#     placeholder_image = create_placeholder_image()

#     # Create missingness indicators
#     data['Breakfast_Missing'] = data['Image Before Breakfast'].isnull().astype(int)
#     data['Lunch_Missing'] = data['Image Before Lunch'].isnull().astype(int)

#     # Iterate over rows to preprocess images
#     breakfast_images = []
#     lunch_images = []

#     for index, row in data.iterrows():
#         # Handle missing breakfast images
#         if pd.isnull(row['Image Before Breakfast']) or row['Image Before Breakfast'] == '[]':  # Check for empty list or NaN
#             breakfast_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Breakfast'])  # Convert string to list
#                 breakfast_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, breakfast: {e}")
#                 breakfast_images.append(placeholder_image)

#         # Handle missing lunch images
#         if pd.isnull(row['Image Before Lunch']) or row['Image Before Lunch'] == '[]':  # Check for empty list or NaN
#             lunch_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Lunch'])  # Convert string to list
#                 lunch_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, lunch: {e}")
#                 lunch_images.append(placeholder_image)

#     # Add preprocessed images back to the dataset
#     data['Processed_Breakfast_Images'] = breakfast_images
#     data['Processed_Lunch_Images'] = lunch_images

#     return data

# # Apply preprocessing
# processed_data = preprocess_dataset(data)

# # Save the processed dataset if needed
# # processed_data.to_pickle("processed_img_train.pkl")  # Save in pickle format for further use


In [None]:
# import pandas as pd

# # Load the dataset
# label_data = pd.read_csv("label_train.csv")  # Adjust the file path as necessary

# # Step 1: Extract Output Labels
# output_labels = label_data[["Breakfast Calories", "Lunch Calories"]]

# # Step 2: Handle Missing Values in Labels
# # Replace missing values (if any) with the median
# output_labels = output_labels.fillna(output_labels.median())

# print(output_labels)

# # Step 3: Save the Extracted Labels
# # output_labels.to_csv("output_labels.csv", index=False)

# print("Output Labels Extracted and Saved!")
