In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
# Read the imputed table and discard the 'Unnamed: 0' column in one step
# (presumably an index that was written out with the CSV -- verify upstream).
df = pd.read_csv('imputed_test_data.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,subject_id,temp,WBC,HeartRate,RR,hourly_bin,sepsis
0,2,37.0,11.3,148.0,18.9,2138-07-17 20:00:00,0
1,2,37.0,11.3,144.0,18.9,2138-07-17 21:00:00,0
2,2,37.0,11.3,140.0,18.9,2138-07-17 22:00:00,0
3,3,37.0,11.3,87.0,16.0,2101-10-20 18:00:00,0
4,3,37.0,19.1,87.0,18.9,2101-10-20 19:00:00,0


In [7]:
# Number of consecutive rows that form one input window
# (create_sequences_and_labels slices data.iloc[i:i + sequence_length]).
sequence_length = 10
# Look-ahead horizon in hours: a window is labelled positive when a sepsis row
# has an hourly_bin within this many hours after the window's last hourly_bin.
num_hours_ahead = 5

In [9]:
# Parse hourly_bin into datetimes so it can be compared against Timestamps
# (create_sequences_and_labels relies on this for its look-ahead filter).
df = df.assign(hourly_bin=lambda frame: pd.to_datetime(frame['hourly_bin']))

In [10]:
def create_sequences_and_labels(data, sequence_length, num_hours_ahead):
    """Cut ``data`` into fixed-length feature windows with look-ahead labels.

    Each window is ``sequence_length`` consecutive rows of ``data`` (selected
    by position) restricted to the temp / WBC / HeartRate / RR columns.  A
    window's label is 1 when any row of ``data`` whose ``hourly_bin`` lies in
    the interval (window end, window end + ``num_hours_ahead`` hours] has a
    truthy ``sepsis`` value, otherwise 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns temp, WBC, HeartRate, RR, hourly_bin and
        sepsis.  Rows are used in the order given.
    sequence_length : int
        Number of rows per window.
    num_hours_ahead : int
        Look-ahead horizon in hours; it is also subtracted from the number of
        windows that are generated.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, labels)`` built with ``np.array`` from the collected
        windows and labels; both are empty arrays when ``data`` is too short
        to yield a window.
    """
    vital_columns = ['temp', 'WBC', 'HeartRate', 'RR']
    window_count = len(data) - sequence_length - num_hours_ahead + 1

    windows = []
    targets = []
    for start in range(window_count):
        window = data.iloc[start:start + sequence_length]
        windows.append(window[vital_columns].values)

        # Timestamp of the last row in this window.
        last_row = window.iloc[-1]
        window_end = pd.to_datetime(last_row['hourly_bin'])
        horizon_end = window_end + pd.Timedelta(hours=num_hours_ahead)

        # Rows strictly after the window end, up to and including the horizon.
        after_window = data['hourly_bin'] > window_end
        within_horizon = data['hourly_bin'] <= horizon_end
        upcoming = data[after_window & within_horizon]

        targets.append(int(bool(upcoming['sepsis'].any())))

    return np.array(windows), np.array(targets)


from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
# Split the dataset into a training set and a test set based on subject_id,
# so that all rows of a given subject end up in the same set.
FEATURE_COLS = ['temp', 'WBC', 'HeartRate', 'RR']

splitter = GroupShuffleSplit(test_size=0.2, n_splits=2, random_state=42)
# Only the first generated split is consumed.
train_inds, test_inds = next(splitter.split(df, groups=df['subject_id']))

# Take explicit copies: the scaled features are written back into these frames,
# and writing into a bare df.iloc[...] result triggers SettingWithCopyWarning.
train_data = df.iloc[train_inds].copy()
test_data = df.iloc[test_inds].copy()

# Standardise the features. The scaler is fitted on the training rows only and
# then re-used unchanged for the test rows.
scaler = StandardScaler()

train_features = scaler.fit_transform(train_data[FEATURE_COLS])
train_data[FEATURE_COLS] = train_features

test_features = scaler.transform(test_data[FEATURE_COLS])
test_data[FEATURE_COLS] = test_features


def build_subject_windows(split_data):
    """Build windows and labels subject by subject for one data split.

    Windows never span two subjects: each subject's rows are passed to
    create_sequences_and_labels on their own, using the notebook-level
    ``sequence_length`` and ``num_hours_ahead`` settings.

    Parameters
    ----------
    split_data : pandas.DataFrame
        Rows of one split; must contain subject_id, hourly_bin and the columns
        create_sequences_and_labels reads.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` with the windows and labels of all subjects concatenated in
        order of each subject's first appearance in ``split_data``.
    """
    windows, targets = [], []
    # sort=False keeps subjects in order of first appearance, matching a loop
    # over split_data['subject_id'].unique().
    for _, subject_data in split_data.groupby('subject_id', sort=False):
        # The windows are positional, so make sure the rows are chronological;
        # a stable sort leaves already-ordered data (and ties) untouched.
        subject_data = subject_data.sort_values('hourly_bin', kind='stable')
        subject_sequences, subject_labels = create_sequences_and_labels(
            subject_data, sequence_length, num_hours_ahead
        )
        windows.extend(subject_sequences)
        targets.extend(subject_labels)
    # Convert lists to numpy arrays
    return np.array(windows), np.array(targets)


# Create sequences and labels for the training and the test data
X_train, y_train = build_subject_windows(train_data)
X_test, y_test = build_subject_windows(test_data)