In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense

# --- Part 1: Seed the RNGs and Load the Data ---
# Pin the NumPy and TensorFlow random seeds (reproducibility).
np.random.seed(42)
tf.random.set_seed(42)

# The cleaned CSV is addressed relative to the current working directory:
# one level up, then data/processed/.
print("Loading the processed data...")
processed_data_path = os.path.join(
    '..',
    'data',
    'processed',
    'cleaned_sepsis_data.csv',
)
df = pd.read_csv(processed_data_path)

# --- Part 2: Split Data for Training ---
# For an autoencoder, we would ideally train ONLY on the "normal" data.
# However, for this dataset, we don't have clear labels for what is normal.
# So, for a first attempt, we train on a large portion of the data (80%)
# and use the rest for testing.

# PatientID is kept separate from the model inputs for later analysis.
patient_ids = df['PatientID']
features_to_scale = df.drop('PatientID', axis=1)

# Split row POSITIONS (not the scaled matrix) so the scaler below can be
# fitted on the training rows alone. The positions also let later analysis
# map each train/test row back to patient_ids via .iloc.
train_idx, test_idx = train_test_split(
    np.arange(len(features_to_scale)),
    test_size=0.2,
    random_state=42,
)

# --- Part 3: Scale the Features ---
# Neural networks work best when input features are scaled to a similar range.
# StandardScaler standardizes each feature to mean 0 and variance 1.
#
# The scaler is fitted on the TRAINING rows only. Fitting it on the full
# dataset would let the held-out rows influence the mean/variance used to
# scale the training data (data leakage), which makes the test-set results
# look better than they would on genuinely unseen data.
scaler = StandardScaler()
scaler.fit(features_to_scale.iloc[train_idx])

# Apply the training-set statistics to every row; scaled_features keeps the
# original row order, so it stays aligned with patient_ids.
scaled_features = scaler.transform(features_to_scale)
X_train = scaled_features[train_idx]
X_test = scaled_features[test_idx]

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

print("\nData is now scaled and ready for the Autoencoder model.")

: 