# TensorFlow Testing Field

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the dataset
data = pd.read_csv("mimic_mean_final.csv")

In [None]:
data = data[['Time_Zone', 'gender', 'age', 'race', 'Lactate', 'los']]

In [3]:
display(data)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,race,Base Excess,Lactate,pCO2,...,Haptoglobin,Bilirubin Direct,Thyroxine (T4) Free,Sedimentation Rate,CK-MB,Amylase,PEEP set (cmH2O),Central Venous Pressure (mmHg),hospital_expire_flag,los
0,1,10004733,27411876,1,M,51,UNKNOWN,0.0,0.80,38.0,...,294.0,,,,,,5.000000,3.0,0,8.357373
1,2,10004733,27411876,2,M,51,UNKNOWN,0.0,0.75,38.0,...,294.0,,,,,,5.000000,3.0,0,8.357373
2,3,10004733,27411876,3,M,51,UNKNOWN,0.0,0.80,38.0,...,294.0,,,,,,5.000000,3.0,0,8.357373
3,4,10004733,27411876,4,M,51,UNKNOWN,0.0,0.75,38.0,...,294.0,,,,,,5.000000,3.0,0,8.357373
4,5,10004733,27411876,5,M,51,UNKNOWN,0.0,0.75,38.0,...,294.0,,,,,,5.000000,3.0,0,8.357373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55787,55788,19999987,23865745,12,F,57,UNKNOWN,1.0,,44.0,...,,,,,43.0,,5.833333,,0,1.937847
55788,55789,19999987,23865745,13,F,57,UNKNOWN,1.0,,44.0,...,,,,,45.5,,5.000000,,0,1.937847
55789,55790,19999987,23865745,14,F,57,UNKNOWN,1.0,,44.0,...,,,,,43.0,,5.833333,,0,1.937847
55790,55791,19999987,23865745,15,F,57,UNKNOWN,1.0,,44.0,...,,,,,44.0,,5.000000,,0,1.937847


In [None]:
# Filtering rows where Time_Zone column equals 1
data = data[data['Time_Zone'] == 1]

In [None]:
"""
Keep only features with numeric values
because I want to impute only the
"""
numeric_data = data.select_dtypes(include='number')

# Select only the categorical columns
categorical_data = data.select_dtypes(exclude='number')

# We specify the imputer.
imputer = IterativeImputer(estimator=RandomForestRegressor(), random_state=0)

# Replace missing values with estimated values based on the iterative imputation process.
# Wrap the iteration in tqdm to show progress bar
imputed_data = []
for row in tqdm(numeric_data.values, desc="Imputing Values", total=len(numeric_data)):
    imputed_row = imputer.fit_transform(row.reshape(1, -1))
    imputed_data.append(imputed_row.ravel())

# Convert the imputed data array back into a pandas DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=numeric_data.columns)

# Merge categorical_data with imputed_df
imputed_full_df = pd.concat([categorical_data.reset_index(drop=True), imputed_df.reset_index(drop=True)], axis=1)

Imputing Values:   4%|▍         | 2452/55792 [4:48:52<101:55:10,  6.88s/it]

In [None]:
# Export the merged DataFrame to a CSV file
imputed_full_df.to_csv('imputed_mimic_86_features.csv', index=False)

In [None]:
"""
Impute without process bar
"""


# Keep only features with numeric values 
numeric_data = data.select_dtypes(include='number')

# Select only the categorical columns
categorical_data = data.select_dtypes(exclude='number')

#  We specify the imputer.
imputer = IterativeImputer(estimator=RandomForestRegressor(), random_state=0)

# Replace missing values with estimated values based on the iterative imputation process.
imputed_data = imputer.fit_transform(numeric_data)

# We convert the imputed data array back into a pandas DataFrame 
imputed_df = pd.DataFrame(imputed_data, columns=numeric_data.columns)

# Merge categorical_data with imputed_df
imputed_full_df = pd.concat([categorical_data.reset_index(drop=True), imputed_df.reset_index(drop=True)], axis=1)