In [266]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertForMaskedLM, BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch
import ast

# Data Pre-Processing

In [267]:
# Load Datasets

# Training datasets
cgm_train = pd.read_csv('cgm_train.csv')
image_train = pd.read_csv('img_train.csv')
demo_viome_train = pd.read_csv('demo_viome_train.csv')
label_train = pd.read_csv('label_train.csv')

# Test datasets
cgm_test = pd.read_csv('cgm_test.csv')
image_test = pd.read_csv('img_test.csv')
demo_viome_test = pd.read_csv('demo_viome_test.csv')
label_test = pd.read_csv('label_test_breakfast_only.csv')

### Pre-Process CGM Data (Time-Series Glucose Levels)

In [None]:
# Function to check if CGM Data is an empty array
def is_cgm_data_empty(row):
    try:
        cgm_list = ast.literal_eval(row['CGM Data'])
        return len(cgm_list) == 0
    except:
        return True

# Apply the function to filter out rows where CGM Data is empty
cgm_train = cgm_train[~cgm_train.apply(is_cgm_data_empty, axis=1)]

Rows with empty CGM data have been removed. TODO: Put in function to reuse for test

In [269]:
# Handle missing breakfast and lunch times
cgm_train['Breakfast Time'] = pd.to_datetime(cgm_train['Breakfast Time'], errors='coerce')
cgm_train['Lunch Time'] = pd.to_datetime(cgm_train['Lunch Time'], errors='coerce')

# Mask missing breakfast and lunch times for training
cgm_train['Breakfast Time'] = cgm_train['Breakfast Time'].fillna('MASK')
cgm_train['Lunch Time'] = cgm_train['Lunch Time'].fillna('MASK')

In [270]:
# Step 2: Preprocessing CGM data
# Convert the CGM Data into a usable format (e.g., flatten the time-value pairs)
cgm_train['CGM Data'] = cgm_train['CGM Data'].apply(eval).apply(lambda x: ' '.join([f'{t[0]}:{t[1]}' for t in x]))

In [None]:
# Step 3: Prepare for model training
class TimePredictionDataset(Dataset):
    def __init__(self, cgm_data, breakfast_times, lunch_times):
        self.cgm_data = cgm_data
        self.breakfast_times = breakfast_times
        self.lunch_times = lunch_times

    def __len__(self):
        return len(self.cgm_data)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(tokenizer.encode(self.cgm_data[idx]), dtype=torch.long),
            'labels': torch.tensor(tokenizer.encode(self.breakfast_times[idx]), dtype=torch.long),
            'labels_lunch': torch.tensor(tokenizer.encode(self.lunch_times[idx]), dtype=torch.long)
        }
    
# Tokenizer and model setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Prepare dataset and dataloaders
train_dataset = TimePredictionDataset(cgm_train['CGM Data'], cgm_train['Breakfast Time'], cgm_train['Lunch Time'])
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Step 4: Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

for epoch in range(3):  # Number of epochs
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        labels = batch['labels']
        labels_lunch = batch['labels_lunch']
        
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

model.save_pretrained('./time_prediction_model')
tokenizer.save_pretrained('./time_prediction_model')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

ValueError: Input 2023-09-02 08:08:29 is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [None]:
# Convert CGM Data from string to list of tuples
cgm_train['CGM Data'] = cgm_train['CGM Data'].apply(eval)

# Create a fixed time grid from 00:00 to 23:55 with 5-minute intervals
full_time_range = pd.date_range('2021-09-19 00:00', '2021-09-19 23:55', freq='5min')

def preprocess_subject_data(subject_data):
        breakfast_time = pd.to_datetime(subject_data['Breakfast Time'], errors='coerce')
        lunch_time = pd.to_datetime(subject_data['Lunch Time'], errors='coerce')

        cgm_df = pd.DataFrame(subject_data['CGM Data'], columns=['timestamp', 'glucose_level'])
        print(cgm_df)
        # cgm_df['timestamp'] = pd.to_datetime(cgm_df['timestamp'], errors='coerce')
        # cgm_df.dropna(subset=['timestamp'], inplace=True)
        # cgm_df.set_index('timestamp', inplace=True)
        # cgm_resampled = cgm_df.reindex(full_time_range, method=None)
        # cgm_resampled['glucose_level'] = cgm_resampled['glucose_level'].interpolate(method='time')

        # scaler = MinMaxScaler()
        # cgm_resampled['scaled_glucose'] = scaler.fit_transform(cgm_resampled[['glucose_level']])

        # fixed_length = 288
        # padded_glucose = pad_sequences(cgm_resampled[['scaled_glucose']].values.T, maxlen=fixed_length, padding='post', value=0)

        # return padded_glucose.flatten()

preprocessed_cgms = np.array(cgm_train.apply(preprocess_subject_data, axis=1))
# print(preprocessed_cgms)
    

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (<string>, line 1)

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer

# # Load dataset with proper delimiter (ensure '\t' for tab-separated values)
# file_path = 'demo_viome_train.csv'
# data = pd.read_csv(file_path, delimiter='\t')

# # Recheck column parsing
# if len(data.columns) == 1:
#     # If all data is in a single column, try splitting with a comma
#     data = pd.read_csv(file_path, delimiter=',')

# # Verify column names
# print("Columns in dataset after re-parsing:", data.columns)

# # Split the `Viome` column into individual features
# viome_split = data['Viome'].str.split(',', expand=True).astype(float)
# viome_split.columns = [f"Viome_{i}" for i in range(viome_split.shape[1])]

# # Drop the original Viome column and merge new features
# data = pd.concat([data.drop(columns=['Viome']), viome_split], axis=1)

# # Impute missing values for numeric columns
# numeric_cols = data.select_dtypes(include=[np.number]).columns
# imputer = SimpleImputer(strategy='mean')
# data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

# # Normalize numeric data
# scaler = MinMaxScaler()
# data[numeric_cols] = scaler.fit_transform(data[numeric_cols])


# # Encode categorical columns
# categorical_cols = ['Gender', 'Race', 'Diabetes Status']
# encoder = OneHotEncoder(sparse_output=False, drop='first')  # Use sparse_output instead of sparse
# encoded_cats = pd.DataFrame(
#     encoder.fit_transform(data[categorical_cols]),
#     columns=encoder.get_feature_names_out(categorical_cols)
# )

# # Drop original categorical columns and merge encoded ones
# data = pd.concat([data.drop(columns=categorical_cols), encoded_cats], axis=1)

# # Final processed data
# print("Processed Data Shape:", data.shape)
# print("Processed Data Preview:")
# print(data.head())


In [None]:
# import pandas as pd
# import numpy as np
# from PIL import Image

# # Load the dataset
# data = pd.read_csv("img_train.csv")  # Adjust the file path as necessary

# # Placeholder for missing images (a blank black image)
# def create_placeholder_image(size=(64, 64, 3)):
#     return np.zeros(size, dtype=np.float32)  # Normalized [0, 1] range

# # Function to preprocess image data
# def preprocess_image(img_data, size=(64, 64)):
#     try:
#         img_array = np.array(img_data, dtype=np.uint8)  # Ensure valid data type

#         # Check for empty image
#         if img_array.size == 0 or img_array.ndim != 3 or img_array.shape[2] != 3:
#             raise ValueError(f"Invalid or empty image dimensions: {img_array.shape}")

#         img_resized = np.array(Image.fromarray(img_array).resize(size))  # Resize
#         img_normalized = img_resized / 255.0  # Normalize pixel values to [0, 1]
#         return img_normalized
#     except Exception as e:
#         print(f"Error preprocessing image: {e}")
#         return create_placeholder_image(size)

# # Preprocess the dataset
# def preprocess_dataset(data):
#     # Define placeholder image
#     placeholder_image = create_placeholder_image()

#     # Create missingness indicators
#     data['Breakfast_Missing'] = data['Image Before Breakfast'].isnull().astype(int)
#     data['Lunch_Missing'] = data['Image Before Lunch'].isnull().astype(int)

#     # Iterate over rows to preprocess images
#     breakfast_images = []
#     lunch_images = []

#     for index, row in data.iterrows():
#         # Handle missing breakfast images
#         if pd.isnull(row['Image Before Breakfast']) or row['Image Before Breakfast'] == '[]':  # Check for empty list or NaN
#             breakfast_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Breakfast'])  # Convert string to list
#                 breakfast_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, breakfast: {e}")
#                 breakfast_images.append(placeholder_image)

#         # Handle missing lunch images
#         if pd.isnull(row['Image Before Lunch']) or row['Image Before Lunch'] == '[]':  # Check for empty list or NaN
#             lunch_images.append(placeholder_image)
#         else:
#             try:
#                 img_data = eval(row['Image Before Lunch'])  # Convert string to list
#                 lunch_images.append(preprocess_image(img_data))
#             except Exception as e:
#                 print(f"Error at index {index}, lunch: {e}")
#                 lunch_images.append(placeholder_image)

#     # Add preprocessed images back to the dataset
#     data['Processed_Breakfast_Images'] = breakfast_images
#     data['Processed_Lunch_Images'] = lunch_images

#     return data

# # Apply preprocessing
# processed_data = preprocess_dataset(data)

# # Save the processed dataset if needed
# # processed_data.to_pickle("processed_img_train.pkl")  # Save in pickle format for further use


In [None]:
# import pandas as pd

# # Load the dataset
# label_data = pd.read_csv("label_train.csv")  # Adjust the file path as necessary

# # Step 1: Extract Output Labels
# output_labels = label_data[["Breakfast Calories", "Lunch Calories"]]

# # Step 2: Handle Missing Values in Labels
# # Replace missing values (if any) with the median
# output_labels = output_labels.fillna(output_labels.median())

# print(output_labels)

# # Step 3: Save the Extracted Labels
# # output_labels.to_csv("output_labels.csv", index=False)

# print("Output Labels Extracted and Saved!")
