### This is where the model is trained.

For the POC level the following will not be addressed:
1. Hyperparameter tuning
2. kNN pre-imputer
3. Modifications and advancements to the model
4. Use of temporal data

In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Load the dataset
file_path = 'data/dat.csv'  # Replace with actual path if different
ehr_data = pd.read_csv(file_path)

In [7]:
# drop rows with >50% missing values
cleaned_data = ehr_data.select_dtypes(include=['number']).dropna(thresh=ehr_data.shape[1] * 0.5)

# Fill missing values with column means
cleaned_data.fillna(cleaned_data.mean(), inplace=True)

# Remove duplicates, completely empty and unnecessary columns
cleaned_data = cleaned_data.drop_duplicates().dropna(axis=1, how='all')
cleaned_data = cleaned_data.loc[:, cleaned_data.columns != 'Unnamed: 0']

# Display the cleaned data head for verification
print("Cleaned Data Sample:")
print(cleaned_data.head())

Cleaned Data Sample:
   inpatient.number  visit.times  body.temperature  pulse  respiration  \
0            857781            1              36.7     87           19   
1            743087            1              36.8     95           18   
2            866418            2              36.5     98           18   
3            775928            1              36.0     73           19   
4            810128            1              35.0     88           19   

   systolic.blood.pressure  diastolic.blood.pressure        map  weight  \
0                      102                        64  76.666667    50.0   
1                      150                        70  96.666667    51.0   
2                      102                        67  78.666667    70.0   
3                      110                        74  86.000000    65.0   
4                      134                        62  86.000000    76.0   

   height  ...  carboxyhemoglobin  body.temperature.blood.gas  \
0    1.64  ...    

# Converting data to a tensor

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ehr_tensor = torch.tensor(cleaned_data.values, dtype=torch.float32).to(device)

print(f"EHR tensor shape: {ehr_tensor.shape}")

EHR tensor shape: torch.Size([1987, 150])


# Normalizing the data