In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from chronos import ChronosPipeline

In [2]:
import random
# Seed every random number generator this notebook may touch with the same
# value so that repeated runs start from the same random state.
seed = 42
random.seed(seed)                 # Python's built-in `random` module
np.random.seed(seed)              # NumPy's global generator
torch.manual_seed(seed)           # PyTorch
torch.cuda.manual_seed_all(seed)  # PyTorch CUDA generators (when a GPU is used)


In [3]:
import pandas as pd
import torch

def load_datasets(train_path='loaded_data/a_patient_data_processed_cluster.parquet',
                  test_path='loaded_data/c_patient_data_processed_cluster.parquet'):
    """Read the training and test splits from Parquet files.

    Args:
        train_path: Location of the training Parquet file. Defaults to the
            path this notebook has always used.
        test_path: Location of the test Parquet file. Defaults to the path
            this notebook has always used.

    Returns:
        A ``(train_set, test_set)`` tuple of pandas DataFrames.
    """
    train_set = pd.read_parquet(train_path)
    test_set = pd.read_parquet(test_path)
    return train_set, test_set

train_df, test_df = load_datasets()


def _as_float_tensor(values):
    """Convert an array-like to a float32 torch tensor."""
    return torch.tensor(values, dtype=torch.float32)


# Row-level tensors: every column except the ICU type and the outcome is an
# input; the outcome becomes a float column vector (one row per input row).
# NOTE(review): "RecordID" is not dropped here, so it is part of X_train and
# X_test, whereas `feature_columns` below excludes it — confirm this is intended.
X_train = _as_float_tensor(train_df.drop(columns=["ICUType", "In-hospital_death"]).values)
X_test = _as_float_tensor(test_df.drop(columns=["ICUType", "In-hospital_death"]).values)
y_train = _as_float_tensor(train_df["In-hospital_death"].values).unsqueeze(1)
y_test = _as_float_tensor(test_df["In-hospital_death"].values).unsqueeze(1)

# Columns treated as per-timestep variables: everything except the patient
# identifier, the outcome and the ICU type, in the training frame's order.
_excluded_columns = {"RecordID", "In-hospital_death", "ICUType"}
feature_columns = [name for name in train_df.columns if name not in _excluded_columns]

def pad_to_fixed_length(tensor, length=49):
    """Zero-pad ``tensor`` along its first dimension up to ``length`` entries.

    The padding is created with the same dtype and device as ``tensor`` and
    with matching trailing dimensions, so the function works for 1-D series
    as well as 2-D ``(timesteps, features)`` inputs and never changes the
    dtype of the data.

    Args:
        tensor: Tensor with at least one dimension; dimension 0 is padded.
        length: Minimum size of dimension 0 in the result.

    Returns:
        ``tensor`` itself when it already has ``length`` or more entries
        (longer inputs are NOT truncated), otherwise a new tensor with zeros
        appended at the end of dimension 0.
    """
    missing = length - tensor.size(0)
    if missing <= 0:
        return tensor
    # new_zeros inherits dtype and device from `tensor`.
    padding = tensor.new_zeros((missing, *tensor.shape[1:]))
    return torch.cat([tensor, padding], dim=0)

def process_dataframe(df, feature_cols=None, length=49):
    """Build a padded per-patient feature table and one label per patient.

    Rows are grouped by ``"RecordID"``. For every patient the selected feature
    columns are converted to float32 and zero rows are appended until the
    patient has at least ``length`` rows (patients with more rows are kept in
    full, not truncated). The patient label is the maximum of
    ``"In-hospital_death"`` over the patient's rows, i.e. 1 if any row is 1.

    Args:
        df: DataFrame containing ``"RecordID"``, ``"In-hospital_death"`` and
            the feature columns.
        feature_cols: Columns to keep as features. Defaults to the
            module-level ``feature_columns`` list.
        length: Minimum number of rows per patient after padding.

    Returns:
        ``(final_df, final_labels)`` where ``final_df`` holds the float32
        feature columns followed by a ``"RecordID"`` column, and
        ``final_labels`` is a one-column DataFrame (``"In-hospital_death"``).
        Patients appear in groupby (sorted RecordID) order in both, so label
        row ``i`` belongs to the ``i``-th distinct RecordID of ``final_df``.
        An input without any rows yields two empty DataFrames with the same
        columns.
    """
    if feature_cols is None:
        feature_cols = feature_columns
    feature_cols = list(feature_cols)

    padded_blocks = []   # one (rows, n_features) float32 array per patient
    record_ids = []      # patient identifier for each block
    block_sizes = []     # number of rows in each block
    patient_labels = []

    for record_id, group in df.groupby("RecordID"):
        values = group[feature_cols].to_numpy(dtype=np.float32)
        n_rows = max(len(values), length)
        block = np.zeros((n_rows, len(feature_cols)), dtype=np.float32)
        block[:len(values)] = values  # remaining rows stay zero (padding)

        padded_blocks.append(block)
        record_ids.append(record_id)
        block_sizes.append(n_rows)
        # For the label, we assume that if any timestep indicates death,
        # the patient is labeled as death (1)
        patient_labels.append(group["In-hospital_death"].max())

    # Assemble the table once instead of building one DataFrame per patient.
    if padded_blocks:
        feature_matrix = np.concatenate(padded_blocks, axis=0)
    else:
        feature_matrix = np.zeros((0, len(feature_cols)), dtype=np.float32)

    final_df = pd.DataFrame(feature_matrix, columns=feature_cols)
    # Keep the RecordID intact: repeat each id once per row of its block.
    final_df['RecordID'] = np.repeat(
        np.asarray(record_ids), np.asarray(block_sizes, dtype=np.int64)
    )
    final_labels = pd.DataFrame(patient_labels, columns=["In-hospital_death"])

    return final_df, final_labels

# Pad and label both splits (training first, then test); each call returns a
# (features-with-RecordID, labels) pair.
(processed_train_data, labels_tensor_train), (processed_test_data, labels_tensor_test) = (
    process_dataframe(split) for split in (train_df, test_df)
)

In [4]:
# Sanity check: shapes of the padded feature table and of the label table,
# followed by the number of rows held for each patient.
print(*(frame.shape for frame in (processed_train_data, labels_tensor_train)))
processed_train_data.groupby('RecordID').size()

(195853, 42) (3997, 1)


RecordID
132539.0    49
132540.0    49
132541.0    49
132543.0    49
132545.0    49
            ..
142665.0    49
142667.0    49
142670.0    49
142671.0    49
142673.0    49
Length: 3997, dtype: int64

In [5]:
import pandas as pd
import torch
from chronos import ChronosPipeline

# Load the Chronos pipeline for time-series forecasting or embeddings.
# Place the model on the GPU when one is available; without an explicit
# device the model stays on the CPU. Half precision is kept on the GPU, while
# the CPU uses float32 because half-precision CPU inference is slow and not
# supported by every PyTorch build.
# NOTE: embeddings computed in float32 (CPU) differ slightly from float16 ones.
chronos_device = "cuda" if torch.cuda.is_available() else "cpu"
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-small",  # Use the Chronos model
    device_map=chronos_device,
    torch_dtype=torch.float16 if chronos_device == "cuda" else torch.float32,
)


In [6]:

def get_embedding_for_variable(variable_data):
    """Embed one univariate series with the module-level Chronos ``pipeline``.

    Args:
        variable_data: Object exposing ``.values`` (for example a pandas
            Series) with the numeric observations of a single variable.

    Returns:
        The embeddings returned by ``pipeline.embed`` for a batch containing
        only this series; the tokenizer state returned alongside is dropped.
        (Presumably shaped ``(1, n_tokens, d_model)`` — confirm against the
        Chronos documentation.)
    """
    series = torch.tensor(variable_data.values, dtype=torch.float32)
    # The pipeline is given a batch, so wrap the series in a leading batch axis.
    batch_of_one = torch.unsqueeze(series, 0)
    embeddings, _ = pipeline.embed(batch_of_one)
    return embeddings

def get_patient_embedding(patient_data, patient_id):
    """Compute one aggregated Chronos embedding for a patient.

    Every column except ``"RecordID"`` and ``"Time"`` is treated as a
    separate univariate series. All of the patient's series are embedded in a
    single batched ``pipeline.embed`` call (one row per variable) instead of
    one call per variable, and the embeddings are averaged across variables.

    Args:
        patient_data: DataFrame with a ``"RecordID"`` column and one column
            per variable.
        patient_id: Value of ``"RecordID"`` selecting the patient.

    Returns:
        The mean embedding over all variables, with a leading dimension of
        size 1 (the same shape the per-variable embeddings have).

    Raises:
        ValueError: If no rows match ``patient_id`` or there is no variable
            column to embed.
    """
    # Filter data for the specific patient
    patient_df = patient_data[patient_data['RecordID'] == patient_id]
    if patient_df.empty:
        raise ValueError(f"No rows found for RecordID {patient_id!r}")

    variable_columns = [
        column for column in patient_df.columns
        if column != "RecordID" and column != "Time"
    ]
    if not variable_columns:
        raise ValueError("patient_data contains no variable columns to embed")

    # Rows are time steps in the frame; transpose so that each batch row is
    # one variable's series: (n_variables, n_timesteps).
    context = torch.tensor(
        patient_df[variable_columns].to_numpy().T, dtype=torch.float32
    )

    # One forward pass for all variables; the tokenizer state is not needed.
    embeddings, _ = pipeline.embed(context)

    # Average across variables, keeping the leading batch axis of size 1.
    return embeddings.mean(dim=0, keepdim=True)


In [7]:
# Work on the padded training frame built above (nothing is read from disk here).
patient_data = processed_train_data

# IDs of the first five patients; uncomment the filter below to restrict a
# quick trial run to them.
first_5_record_ids = patient_data['RecordID'].unique()[:5]
#patient_data = patient_data[patient_data['RecordID'].isin(first_5_record_ids)]

# Show the frame followed by its column index.
print(patient_data, patient_data.columns, sep="\n")

             ALP       ALT       AST       Age   Albumin       BUN  Bilirubin  \
0      -0.138920 -0.123519 -0.118468 -0.596605  0.009690 -0.292460  -0.160848   
1      -0.138920 -0.123519 -0.118468 -0.596605  0.009690 -0.292460  -0.160848   
2      -0.138920 -0.123519 -0.118468 -0.596605  0.009690 -0.292460  -0.160848   
3      -0.138920 -0.123519 -0.118468 -0.596605  0.009690 -0.292460  -0.160848   
4      -0.138920 -0.123519 -0.118468 -0.596605  0.009690 -0.292460  -0.160848   
...          ...       ...       ...       ...       ...       ...        ...   
195848 -0.717583 -0.172539 -0.035137  0.781907 -2.180711 -0.044264  -0.230621   
195849 -0.717583 -0.172539 -0.035137  0.781907 -2.180711 -0.044264  -0.230621   
195850 -0.717583 -0.172539 -0.035137  0.781907 -2.180711 -0.044264  -0.230621   
195851  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   
195852  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   

        Cholesterol  Creati

In [8]:
# Loop through all unique patients (RecordID), compute their aggregated
# embeddings and persist them as a {RecordID: embedding} dictionary.
#
# The dictionary is written to disk only every CHECKPOINT_EVERY patients and
# once more when the loop ends, rather than after every patient: re-saving the
# whole (growing) dictionary on each iteration makes the total I/O quadratic
# in the number of patients. The `finally` clause still saves whatever has
# been computed if the run is interrupted or fails part-way.
EMBEDDINGS_PATH = 'train_pat_embeddings.pth'
CHECKPOINT_EVERY = 250

all_patient_embeddings = {}

try:
    for n_done, patient_id in enumerate(patient_data['RecordID'].unique(), start=1):
        # Get the aggregated embedding for the current patient
        patient_embedding = get_patient_embedding(patient_data, patient_id)

        # Store the aggregated embedding for each patient in a dictionary
        all_patient_embeddings[patient_id] = patient_embedding

        if n_done % CHECKPOINT_EVERY == 0:
            torch.save(all_patient_embeddings, EMBEDDINGS_PATH)
finally:
    # Do not overwrite an existing file with an empty dictionary.
    if all_patient_embeddings:
        torch.save(all_patient_embeddings, EMBEDDINGS_PATH)

KeyboardInterrupt: 

In [None]:
import torch

# Read back the saved object onto the CPU. Despite its name,
# `embeddings_tensor` holds the whole saved mapping, not a single tensor.
# NOTE: weights_only=False un-pickles arbitrary Python objects, which can run
# code embedded in the file — only load files you produced yourself.
embeddings_tensor = torch.load(
    'train_pat_embeddings.pth',
    map_location='cpu',
    weights_only=False,
)

# Number of stored entries.
print(len(embeddings_tensor))

# Optional check of every stored embedding's size (disabled):
#for patient_id, embedding in embeddings_tensor.items():
    #if isinstance(embedding, torch.Tensor):
        #print(f"Patient ID: {patient_id}, Embedding size: {embedding.size()}")
    #else:
        #print(f"Patient ID: {patient_id}, Embedding is not a tensor")
