In [3]:
# Import all the required libraries.
#
# The environment variables are assigned first, before any third-party import:
# joblib (pulled in by scikit-learn) reads JOBLIB_START_METHOD while it is being
# imported, so assigning it after the imports has no effect.
import os

# Fix multiprocessing issue for Windows
os.environ["NUMBA_NUM_THREADS"] = "1"  # presumably read by numba if a dependency imports it -- TODO confirm
os.environ["JOBLIB_START_METHOD"] = "spawn"

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Silence every warning for the rest of the session (this also hides pandas'
# SettingWithCopyWarning and library deprecation notices in later cells).
warnings.filterwarnings('ignore')

In [4]:
import torch

In [5]:

# Read the INPUTEVENTS_MV table from its .csv.gz file, report the
# (rows, columns) shape, and preview the first 20 records.
INPUTEVENTS_MV = pd.read_csv(filepath_or_buffer="INPUTEVENTS_MV.csv.gz")
print(INPUTEVENTS_MV.shape)
INPUTEVENTS_MV.head(n=20)

(3618991, 31)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,...,TOTALAMOUNTUOM,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,ORIGINALAMOUNT,ORIGINALRATE
0,241,27063,139787,223259.0,2133-02-05 06:29:00,2133-02-05 08:45:00,225166,6.774532,mEq,,...,ml,0,0,1,Rewritten,,RN,2133-02-05 12:52:00,10.0,0.05
1,242,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 06:30:00,225944,28.132997,ml,30.142497,...,ml,0,0,0,FinishedRunning,,,,28.132998,30.255817
2,243,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 06:30:00,225166,2.8133,mEq,,...,ml,0,0,0,FinishedRunning,,,,2.8133,0.050426
3,244,27063,139787,223259.0,2133-02-03 12:00:00,2133-02-03 12:01:00,225893,1.0,dose,,...,ml,0,0,2,Rewritten,RN,,2133-02-03 17:06:00,1.0,1.0
4,245,27063,139787,223259.0,2133-02-03 12:00:00,2133-02-03 12:01:00,220949,100.0,ml,,...,ml,0,0,2,Rewritten,RN,,2133-02-03 17:06:00,100.0,0.0
5,246,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 07:03:00,225944,44.333333,ml,29.88764,...,ml,0,0,0,Rewritten,,,,44.333332,30.0
6,247,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 07:03:00,225166,4.433333,mEq,,...,ml,0,0,0,Rewritten,,,,4.433333,0.05
7,248,27063,139787,223259.0,2133-02-05 09:43:00,2133-02-05 12:30:00,225944,83.600207,ml,30.036002,...,ml,0,0,0,Changed,,,,100.0,30.14892
8,249,27063,139787,223259.0,2133-02-05 09:43:00,2133-02-05 12:30:00,225166,8.360021,mEq,,...,ml,0,0,0,Changed,,,,10.000001,0.050248
9,250,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 05:35:00,225944,0.5006,ml,30.036002,...,ml,0,0,0,Rewritten,,,,27.533001,30.14892


In [6]:
# Count the null entries per column and show only the columns that have any.
missing = INPUTEVENTS_MV.isna().sum()
has_missing = missing.gt(0)
print(missing.loc[has_missing])

ICUSTAY_ID                       1164
RATE                          1576139
RATEUOM                       1576139
SECONDARYORDERCATEGORYNAME     983428
TOTALAMOUNT                    534027
TOTALAMOUNTUOM                 530354
COMMENTS_EDITEDBY             3369730
COMMENTS_CANCELEDBY           3499629
COMMENTS_DATE                 3250366
dtype: int64


In [7]:
# Drop duplicates: remove rows that are exact copies of an earlier row.
# The frame is modified in place, so INPUTEVENTS_MV keeps referring to the same
# object and nothing is returned.
INPUTEVENTS_MV.drop_duplicates(inplace=True)

In [8]:
# Print a structural summary of the frame: index range, column names and dtypes.
INPUTEVENTS_MV.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3618991 entries, 0 to 3618990
Data columns (total 31 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   ROW_ID                         int64  
 1   SUBJECT_ID                     int64  
 2   HADM_ID                        int64  
 3   ICUSTAY_ID                     float64
 4   STARTTIME                      object 
 5   ENDTIME                        object 
 6   ITEMID                         int64  
 7   AMOUNT                         float64
 8   AMOUNTUOM                      object 
 9   RATE                           float64
 10  RATEUOM                        object 
 11  STORETIME                      object 
 12  CGID                           int64  
 13  ORDERID                        int64  
 14  LINKORDERID                    int64  
 15  ORDERCATEGORYNAME              object 
 16  SECONDARYORDERCATEGORYNAME     object 
 17  ORDERCOMPONENTTYPEDESCRIPTION  object 
 18  OR

In [9]:
# Parse the timestamp columns from strings into pandas datetime values.
# NOTE(review): STORETIME is also reported as object dtype by .info() but is
# not converted here -- confirm whether it should be parsed as well.
for date_col in ('STARTTIME', 'ENDTIME', 'COMMENTS_DATE'):
    INPUTEVENTS_MV[date_col] = pd.to_datetime(INPUTEVENTS_MV[date_col])

In [10]:
# Keep only the rows in which every essential column is present.
#
# .copy() makes the result an independent frame. dropna() returns a row subset
# of INPUTEVENTS_MV, and the later cells assign new/overwritten columns to
# INPUTEVENTS_MV_cleaned; without the copy pandas flags those writes as
# assignments to a possible slice (SettingWithCopyWarning).
essential_cols = ['AMOUNT', 'RATE', 'ISOPENBAG']
INPUTEVENTS_MV_cleaned = INPUTEVENTS_MV.dropna(subset=essential_cols).copy()

In [11]:
# Preview the first rows that remain after the missing-value filter.
INPUTEVENTS_MV_cleaned.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,...,TOTALAMOUNTUOM,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,ORIGINALAMOUNT,ORIGINALRATE
1,242,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 06:30:00,225944,28.132997,ml,30.142497,...,ml,0,0,0,FinishedRunning,,,NaT,28.132998,30.255817
5,246,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 07:03:00,225944,44.333333,ml,29.88764,...,ml,0,0,0,Rewritten,,,NaT,44.333332,30.0
7,248,27063,139787,223259.0,2133-02-05 09:43:00,2133-02-05 12:30:00,225944,83.600207,ml,30.036002,...,ml,0,0,0,Changed,,,NaT,100.0,30.14892
9,250,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 05:35:00,225944,0.5006,ml,30.036002,...,ml,0,0,0,Rewritten,,,NaT,27.533001,30.14892
13,254,27063,139787,223259.0,2133-02-05 07:31:00,2133-02-05 18:00:00,225158,104.833335,ml,10.0,...,ml,0,0,0,Stopped,,,NaT,250.0,10.0


In [12]:
# Encode categorical features: every object-dtype column is replaced by its
# integer category code (missing values are encoded as -1).
#
# The code -> label lookup for each column is kept in `category_levels` so the
# encoded (and later synthetic) values can be mapped back to their original
# labels; it used to be discarded. Looking the dict up via globals() means that
# re-running this cell, when no object columns are left, keeps the mapping.
#
# The encoded frame is built with .assign(), which returns a new frame with the
# columns in their original positions, instead of writing column-by-column into
# a frame that may be a slice of INPUTEVENTS_MV.
cat_cols = INPUTEVENTS_MV_cleaned.select_dtypes(include='object').columns
category_levels = globals().get('category_levels', {})
encoded_cols = {}
for col in cat_cols:
    as_category = INPUTEVENTS_MV_cleaned[col].astype('category')
    category_levels[col] = as_category.cat.categories  # position i is the label for code i
    encoded_cols[col] = as_category.cat.codes
INPUTEVENTS_MV_cleaned = INPUTEVENTS_MV_cleaned.assign(**encoded_cols)


In [13]:
# Report the size of the cleaned table and the cardinality of any column that
# is still object-typed (an empty Series means every such column was encoded).
n_rows, n_cols = INPUTEVENTS_MV_cleaned.shape
print("Rows:", n_rows)
print("Columns:", n_cols)
object_columns = INPUTEVENTS_MV_cleaned.select_dtypes(include='object')
cardinality = object_columns.nunique()
print(cardinality.sort_values(ascending=False))


Rows: 2042852
Columns: 31
Series([], dtype: float64)


In [14]:
# Min-max scale the AMOUNT column into a new AMOUNT_scaled column.
# The scaler is fitted on the whole cleaned table and kept in `scaler`.
scaler = MinMaxScaler()
has_amount = 'AMOUNT' in INPUTEVENTS_MV_cleaned.columns
if has_amount:
    amount_frame = INPUTEVENTS_MV_cleaned[['AMOUNT']]  # 2-D input, as the scaler expects
    INPUTEVENTS_MV_cleaned['AMOUNT_scaled'] = scaler.fit_transform(amount_frame)

In [15]:
# Write the cleaned table to disk as CSV, leaving out the index column.
INPUTEVENTS_MV_cleaned.to_csv(path_or_buf="INPUTEVENTS_MV_cleaned.csv", index=False)



In [16]:
!pip install sdv



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# Generate metadata: let SDV detect the column description from the cleaned
# frame itself.
# NOTE(review): the frame passed here still holds identifier columns such as
# ROW_ID as well as both AMOUNT and AMOUNT_scaled -- confirm all of them are
# meant to be modelled.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(INPUTEVENTS_MV_cleaned)

# Initialize CTGAN with the detected metadata and a 30-epoch training budget
synthesizer = CTGANSynthesizer(metadata=metadata, epochs=30)

# Fit synthesizer on the full cleaned table (no subsampling and no random seed
# is set in this cell, so training may be slow and is not reproducible).
synthesizer.fit(INPUTEVENTS_MV_cleaned)

# Generate 1000 synthetic samples and save them as CSV without the index column
synthetic_INPUTEVENTS_MV = synthesizer.sample(num_rows=1000)
synthetic_INPUTEVENTS_MV.to_csv("synthetic_INPUTEVENTS_MV.csv", index=False)


In [None]:
# Overlay the kernel-density estimate of one column for the real and the
# synthetic table on a single set of axes.
col = 'AMOUNT_scaled'
fig, ax = plt.subplots(figsize=(10, 5))

for frame, label in (
    (INPUTEVENTS_MV_cleaned, 'Real'),
    (synthetic_INPUTEVENTS_MV, 'Synthetic'),
):
    sns.kdeplot(frame[col], label=label, fill=True, alpha=0.5, ax=ax)

ax.set_title(f"Distribution of {col}: Real vs Synthetic")
ax.legend()
plt.show()


In [None]:
# Train the same classifier on the real table and on the synthetic table and
# compare hold-out accuracy for predicting `target` from `features`.

# Choose columns with minimal NaNs and good representation
target = 'ISOPENBAG'
features = ['AMOUNT_scaled', 'RATE', 'CANCELREASON', 'STATUSDESCRIPTION']  # adjust as needed

# Fixed seed for both the train/test splits and the forests, so the reported
# accuracies are the same on every run (they used to change run to run).
RANDOM_STATE = 42

# Encode again if not already: any feature that is still object-typed is
# replaced by its integer category code.
for col in features:
    if INPUTEVENTS_MV_cleaned[col].dtype == 'object':
        INPUTEVENTS_MV_cleaned[col] = INPUTEVENTS_MV_cleaned[col].astype('category').cat.codes
    if synthetic_INPUTEVENTS_MV[col].dtype == 'object':
        synthetic_INPUTEVENTS_MV[col] = synthetic_INPUTEVENTS_MV[col].astype('category').cat.codes

# Real data: 70/30 split, fit, score on the held-out real rows
Xr = INPUTEVENTS_MV_cleaned[features]
yr = INPUTEVENTS_MV_cleaned[target]
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, test_size=0.3, random_state=RANDOM_STATE
)
model_r = RandomForestClassifier(random_state=RANDOM_STATE).fit(Xr_train, yr_train)
acc_real = accuracy_score(yr_test, model_r.predict(Xr_test))

# Synthetic data: same procedure on the synthetic rows.
# NOTE(review): this model is scored on held-out *synthetic* rows, not on real
# rows -- confirm that is the intended comparison.
Xs = synthetic_INPUTEVENTS_MV[features]
ys = synthetic_INPUTEVENTS_MV[target]
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    Xs, ys, test_size=0.3, random_state=RANDOM_STATE
)
model_s = RandomForestClassifier(random_state=RANDOM_STATE).fit(Xs_train, ys_train)
acc_synth = accuracy_score(ys_test, model_s.predict(Xs_test))

print(f"Model accuracy on real data:     {acc_real:.2f}")
print(f"Model accuracy on synthetic data:{acc_synth:.2f}")


In [None]:
import matplotlib.pyplot as plt

# Accuracy values to plot.
# The model-comparison cell stores its results in `acc_real` and `acc_synth`.
# This cell used to read `real_acc`, `syn_acc` and `vae_acc`, none of which is
# assigned anywhere in the notebook, so it failed with NameError on a fresh
# kernel.
real_acc = acc_real          # Accuracy of the model trained and tested on real data
gan_acc = acc_synth          # Accuracy of the model trained and tested on CTGAN data
# No TVAE model is trained in the visible notebook; its bar is drawn only when
# a `vae_acc` value already exists in the session.
vae_acc = globals().get('vae_acc')

# (short name for the title, bar label, accuracy, bar colour)
candidates = [
    ('Real', 'Real Data (Original)', real_acc, 'skyblue'),
    ('CTGAN', 'CTGAN (Synthetic)', gan_acc, 'orange'),
    ('TVAE', 'TVAE (Synthetic)', vae_acc, 'green'),
]
available = [entry for entry in candidates if entry[2] is not None]

# Prepare bar chart data
labels = [entry[1] for entry in available]
accuracy_scores = [entry[2] for entry in available]
bar_colors = [entry[3] for entry in available]

# Plotting
plt.figure(figsize=(10, 6))
bars = plt.bar(labels, accuracy_scores, color=bar_colors)

# Chart elements; the title names only the models that are actually plotted
compared = ' vs '.join(entry[0] for entry in available)
plt.title(f'Model Accuracy Comparison: {compared}', fontsize=14, fontweight='bold')
plt.ylabel('Accuracy Score')
# Accuracy lies in [0, 1]: start at 0 so low bars are not clipped and leave a
# little headroom above 1 for the value labels.
plt.ylim(0, 1.1)
plt.grid(axis='y', linestyle='--', alpha=0.4)

# Annotate each bar with its accuracy, just above the top of the bar
for i, acc in enumerate(accuracy_scores):
    plt.text(i, acc + 0.01, f"{acc:.2f}", ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()