#### CSC 296S Deep Learning (Spring 2026)

#### Dr. Haiquan Chen, Dept of Computer Scicence

#### California State University, Sacramento



## Imports & Functions

In [None]:
#inport statements

import os
import tensorflow as tf
import sys
import sklearn as sk
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report, roc_auc_score
#testing variables, set True for more in depth results
testing = False
testingResults = False

In [179]:
#This is a function that returns x as an average for each field for x and the final sepsis value as y.  However, I believe window size is better
#EX:
# 0 0 0 | 0
# 0 1 2 | 1
#Becomes:
# 0 0.5 1 | 1

'''#modify the data
preparedPatients = []
for i in range (1,6001):
    temp = patients[f"p{i:06d}"]
    #drop unneccessary fields
    temp.drop(['Bilirubin_direct', 'Bilirubin_total', 'TroponinI', 'Fibrinogen', 'Unit1', 'Unit2', 'EtCO2'], axis=1, inplace=True)
    row = temp.mean() #using the average per patient if applicatble
    row = row.fillna(global_means) #if average can't be calculated per patient, use global average
    row['SepsisLabel'] = temp['SepsisLabel'].iloc[-1] #overwrite sepsis label with last label 
    
    preparedPatients.append(row)

#store as a dataframe
df_patients = pd.DataFrame(preparedPatients)

#xy split (not using to_xy)
y = df_patients['SepsisLabel'].values.astype(np.float32) 
x = df_patients.drop('SepsisLabel', axis=1).values.astype(np.float32)
'''

# calculate the global standard deviations (similar to encode numeric zscore)
def calc_global_stds(df, replace_zeros=True):
    stds = df.std()
    if replace_zeros:
        stds = stds.replace(0, 1)  #avoid division by zero for constant columns
    return stds


## Load In The Data

In [180]:
#considering that there are many more patients without sepsis than with sepsis,
#it will help balance results to get every instance of sepsis

#get every instance of patient with sepsis in both datasets, put in a library
patients = {}
j = 1

#dataset A
for i in range (0,20644):
    try:
        filename = f"CC-2019-Sepsis/training_setA/p{i:06d}.psv"
        temp = pd.read_csv(filename, sep='|')
        #if the patient has sepsis, store in library
        if temp['SepsisLabel'].iloc[-1] == 1:
            key = f"p{j:06d}"
            patients[key] = temp
            j += 1
    except FileNotFoundError as e:
        if testing == True:
            print(f"file not found p{i}.psv")
        
#dataset B
for i in range (100000,120001):
    try:
        filename = f"CC-2019-Sepsis/training_setB/p{i:06d}.psv"
        temp = pd.read_csv(filename, sep='|')
        #if the patient has sepsis, store in library
        if temp['SepsisLabel'].iloc[-1] == 1:
            key = f"p{j:06d}"
            patients[key] = temp
            j += 1
    except FileNotFoundError as e:
        if testing == True:
            print(f"file not found p{i}.psv")
        
#number of sepsis patients found        
numSepsisPatients = (len(patients))

#add in non sepsis patients from dataset A for a total of 6000
for i in range (0,20643):
    if j > 6000:
        break
    try:
        filename = f"CC-2019-Sepsis/training_setA/p{i:06d}.psv"
        temp = pd.read_csv(filename, sep='|')
        if temp['SepsisLabel'].iloc[-1] == 0:
            key = f"p{j:06d}"
            patients[key] = temp
            j += 1
    except FileNotFoundError as e:
        if testing == True:
            print(f"file not found p{i}.psv")

## Data Preparation

In [181]:
#get the global average for all data

all_dfs = []
#dataset A
for i in range(0, 20644):
    try:
        filename = f"CC-2019-Sepsis/training_setA/p{i:06d}.psv"
        temp = pd.read_csv(filename, sep='|')
        all_dfs.append(temp)
    except FileNotFoundError:
        if testing == True:
            print(f"file not found p{i}.psv")
#dataset B
for i in range (100000,120001):
    try:
        filename = f"CC-2019-Sepsis/training_setB/p{i:06d}.psv"
        temp = pd.read_csv(filename, sep='|')
        all_dfs.append(temp)
    except FileNotFoundError as e:
        if testing == True:
            print(f"file not found p{i}.psv")
global_df = pd.concat(all_dfs, ignore_index=True)
global_means = global_df.mean()
global_stds = calc_global_stds(global_df)

In [182]:
#view the calculated means & current number of patients loaded in
if testingResults == True:
    print(global_means)
    print(f"number of sepsis patients: {numSepsisPatients}")
    print(f"total number of patients: {len(patients)}")

HR                   84.581443
O2Sat                97.193955
Temp                 36.977228
SBP                 123.750465
MAP                  82.400100
DBP                  63.830556
Resp                 18.726498
EtCO2                32.957657
BaseExcess           -0.689919
HCO3                 24.075481
FiO2                  0.554839
pH                    7.378934
PaCO2                41.021869
SaO2                 92.654188
AST                 260.223385
BUN                  23.915452
Alkalinephos        102.483661
Calcium               7.557531
Chloride            105.827910
Creatinine            1.510699
Bilirubin_direct      1.836177
Glucose             136.932283
Lactate               2.646666
Magnesium             2.051450
Phosphate             3.544238
Potassium             4.135528
Bilirubin_total       2.114059
TroponinI             8.290099
Hct                  30.794093
Hgb                  10.430833
PTT                  41.231193
WBC                  11.446405
Fibrinog

In [183]:
#checkout x and y values for patient 1
if testingResults == True:
    print(x[1])
    print(y[1])

[ 9.0000000e+01  1.0000000e+02  3.7669998e+01  1.4900000e+02
  8.8000000e+01  6.0000000e+01  1.8726498e+01 -6.8991917e-01
  2.4075481e+01  4.0000001e-01  7.3789339e+00  4.1021870e+01
  9.2654190e+01  2.6022339e+02  2.3915453e+01  1.0248366e+02
  7.5575309e+00  1.0582791e+02  1.5106994e+00  1.3693228e+02
  2.6466660e+00  2.0514503e+00  3.5442376e+00  4.1355281e+00
  3.0794094e+01  1.0430833e+01  4.1231194e+01  1.1446405e+01
  1.9601392e+02  6.5790001e+01  1.0000000e+00 -2.0000000e-02
  2.8000000e+01  8.8000000e+01  1.0000000e+02  3.6977230e+01
  1.4100000e+02  8.3000000e+01  5.7000000e+01  1.8500000e+01
 -6.8991917e-01  2.4075481e+01  5.5483866e-01  7.3789339e+00
  4.1021870e+01  9.2654190e+01  2.6022339e+02  2.3915453e+01
  1.0248366e+02  7.5575309e+00  1.0582791e+02  1.5106994e+00
  1.3693228e+02  2.6466660e+00  2.0514503e+00  3.5442376e+00
  4.1355281e+00  3.0794094e+01  1.0430833e+01  4.1231194e+01
  1.1446405e+01  1.9601392e+02  6.5790001e+01  1.0000000e+00
 -2.0000000e-02  2.90000

In [184]:
#test script to find the biggest window size we can have without padding (it's 8)
if testing == True:
    min_rows = float('inf')
    min_file = None

    for i in range(0, 20644):
        try:
            filename = f"CC-2019-Sepsis/training_setA/p{i:06d}.psv"
            temp = pd.read_csv(filename, sep='|')
            if len(temp) < min_rows:
                min_rows = len(temp)
                min_file = f"p{i:06d}"
        except FileNotFoundError:
            pass

    print(f"Smallest file: {min_file} with {min_rows} rows")
    for i in range (100000,120001):
        try:
            filename = f"CC-2019-Sepsis/training_setB/p{i:06d}.psv"
            temp = pd.read_csv(filename, sep='|')
            if len(temp) < min_rows:
                min_rows = len(temp)
                min_file = f"p{i:06d}"
        except FileNotFoundError:
            pass

    print(f"Smallest file: {min_file} with {min_rows} rows")

In [185]:
#define how many time entries per patient
WINDOW_SIZE = 8 
#columns to be dropped
DROP_COLS = ['Bilirubin_direct', 'Bilirubin_total', 'TroponinI', 'Fibrinogen', 'Unit1', 'Unit2', 'EtCO2', 'SepsisLabel']

preparedPatients = []
labels = []

for i in range(1, 6001):
    temp = patients[f"p{i:06d}"]
    temp = temp.drop(DROP_COLS, axis=1) #drop unneccessary fields and SepsisLabel
    temp = temp.fillna(global_means) #fill NaNs with global means
    
    feat_cols = temp.columns
    temp = (temp - global_means[feat_cols]) / global_stds[feat_cols]
    
    #pad with global mean values for each column if patient has fewer than WINDOW_SIZE rows
    if len(temp) < WINDOW_SIZE:
        padding = pd.DataFrame(
            np.zeros((WINDOW_SIZE - len(temp), len(feat_cols))),
            columns=feat_cols
        )
        temp = pd.concat([temp, padding], ignore_index=True)
    
    #take WINDOW_SIZE rows from end and flatten to 1D
    window = temp.iloc[-WINDOW_SIZE:].values.astype(np.float32)
    preparedPatients.append(window.flatten())
    
    #overwrite sepsis label with last label 
    labels.append(patients[f"p{i:06d}"]['SepsisLabel'].iloc[-1])

#set x and y values (not using to_xy())
x = np.array(preparedPatients, dtype=np.float32)
y = np.array(labels, dtype=np.float32)

In [186]:
#view the shape of x and y values as well as values for first entry of x
if testingResults == True:
    print(f"x shape: {x.shape}\ny shape: {y.shape}")
    print(f"values for first entry of x: {x[1]}")

x shape: (6000, 264)
y shape: (6000,)
values for first entry of x: [ 3.1275502e-01  9.5543689e-01  8.9968693e-01  1.0868638e+00
  3.4267443e-01 -2.7447355e-01  0.0000000e+00  0.0000000e+00
  0.0000000e+00 -1.3920323e-02  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  2.3071408e-01  8.8772053e-01  3.4577957e-01
  3.4648962e-02  1.9731653e-01  9.5543689e-01  0.0000000e+00
  7.4250448e-01  3.6709663e-02 -4.8943472e-01 -4.4427078e-02
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0

In [187]:
#train test split, 80/20 split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42, shuffle=True)

In [188]:
#contrast original dataframe sizes with split dataframe sizes
if testingResults == True:
    print(f"x_test shape: {x_test.shape}\nx_train shape: {x_train.shape}")
    print(f"y_test shape: {y_test.shape}\ny_train shape: {y_train.shape}")

x_test shape: (1200, 264)
x_train shape: (4800, 264)
y_test shape: (1200,)
y_train shape: (4800,)


## Some notes
-The data is now cleaned, split into x/y and then split into train/test for both x and y<br><br>
-The approach is to take out a window size from each file (first 8 entries) to use as x, filling in NaN values with either the local or global mean<br><br>
-The y value is the final SepsisLabel in the dataset, as that value will tell us if the patient had sepsis or not<br><br>
-This approach lines up very well with EC 4, as the window size functionality is already implemented.  Padding has also been implemented if necessary to test window sizes greater than 8 (the smallest files only have 8 time entries)<br><br>
-The data chosen specifically brings in ALL instances of sepsis patients from both datasets (roughly 2900) and pairs it with a similar amount of non sepsis patients (roughly 3100).  This technically could qualify as undersampling the non-sepsis data, which would fulfull EC 1 if we wanted to contrast it with an additional method of reading in the data that just takes the first 6000 entries instead of scanning for specific entries.  That being said, if there was some other EC you would rather do instead let me know<br><br>
-Ensure you put the unzipped data file CC-2019-Sepsis in the same directory as CSC296SProj1.ipynb<br><br>
-At this stage, assuming there are no bugs the data should be ready to be put into some models!

## Running Models

In [None]:
#first model to test data preproc is working (taken from a previous project)

checkpointer = ModelCheckpoint("fcn-best_weights.keras", verbose=0, save_best_only=True)
best_val_loss = float('inf')
best_model = None
for i in range(5):
    print(f"training FCN: {i}")
    model = Sequential()
    model.add(Dense(64, input_dim=x_train.shape[1], activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(8, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=SGD(learning_rate=0.001, momentum=0.01), metrics=['accuracy'])
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10, verbose=2, mode='min', restore_best_weights=True)
    history = model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=64, callbacks=[monitor, checkpointer], verbose=1, epochs=10000)

    #track best model across runs by val loss
    val_loss = min(history.history['val_loss'])
    print(f"  Run {i} best val loss: {val_loss:.4f}")
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
        print(f"  New best model (val loss: {best_val_loss:.4f})")

#evaluation
print("\n--- Final Evaluation on Test Set ---")
y_pred_prob = best_model.predict(x_test)
y_pred = (y_pred_prob >= 0.5).astype(int).flatten()
print(classification_report(y_test, y_pred, target_names=['No Sepsis', 'Sepsis']))
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_prob):.4f}")

training FCN: 0
Epoch 1/10000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4863 - loss: 0.7756 - val_accuracy: 0.4708 - val_loss: 0.7127
Epoch 2/10000
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 830us/step - accuracy: 0.4913 - loss: 0.7611 - val_accuracy: 0.4867 - val_loss: 0.7026
Epoch 3/10000
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 848us/step - accuracy: 0.4902 - loss: 0.7428 - val_accuracy: 0.4992 - val_loss: 0.6960
Epoch 4/10000
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 826us/step - accuracy: 0.5002 - loss: 0.7406 - val_accuracy: 0.5058 - val_loss: 0.6911
Epoch 5/10000
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step - accuracy: 0.4965 - loss: 0.7307 - val_accuracy: 0.5158 - val_loss: 0.6876
Epoch 6/10000
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 799us/step - accuracy: 0.5017 - loss: 0.7247 - val_accuracy: 0.5250 - val_loss: 0.6847
Epoch 7/10000
[1m75/75[0

## Evaluation