In [17]:
#IMPORT ALL THE REQUIRED LIBERARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')
%matplotlib inline

# STEP 2: Fix multiprocessing issues (Windows)
os.environ["NUMBA_NUM_THREADS"] = "1"
os.environ["JOBLIB_START_METHOD"] = "spawn"




In [18]:
import torch

In [19]:

# Load the dataset
LABEVENTS = pd.read_csv("LABEVENTS.csv.gz")
print(LABEVENTS.shape)
LABEVENTS.head()

(27854055, 9)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,281,3,,50820,2101-10-12 16:07:00,7.39,7.39,units,
1,282,3,,50800,2101-10-12 18:17:00,ART,,,
2,283,3,,50802,2101-10-12 18:17:00,-1,-1.0,mEq/L,
3,284,3,,50804,2101-10-12 18:17:00,22,22.0,mEq/L,
4,285,3,,50808,2101-10-12 18:17:00,0.93,0.93,mmol/L,abnormal


In [20]:
LABEVENTS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27854055 entries, 0 to 27854054
Data columns (total 9 columns):
 #   Column      Dtype  
---  ------      -----  
 0   ROW_ID      int64  
 1   SUBJECT_ID  int64  
 2   HADM_ID     float64
 3   ITEMID      int64  
 4   CHARTTIME   object 
 5   VALUE       object 
 6   VALUENUM    float64
 7   VALUEUOM    object 
 8   FLAG        object 
dtypes: float64(2), int64(3), object(4)
memory usage: 1.9+ GB


In [21]:
# Check missing values
missing = LABEVENTS.isnull().sum()
print(missing[missing > 0])

HADM_ID      5609021
VALUE           1349
VALUENUM     2921220
VALUEUOM     3100249
FLAG        18009889
dtype: int64


In [22]:
LABEVENTS.shape

(27854055, 9)

In [23]:
# Drop rows with missing key values (adjust as needed)
required_cols = ['ROW_ID']
LABEVENTS_cleaned = LABEVENTS.dropna(subset=required_cols)

In [24]:
LABEVENTS_cleaned.fillna({
    'VALUENUM': np.nan,
    'VALUEUOM': 'UNKNOWN',
    'FLAG': 'UNKNOWN',
    'HADM_ID': -1
}, inplace=True)

In [25]:
# Encode categorical columns
cat_cols = LABEVENTS_cleaned.select_dtypes(include='object').columns
for col in cat_cols:
    LABEVENTS_cleaned[col] = LABEVENTS_cleaned[col].astype('category').cat.codes

In [26]:
# Drop duplicates
LABEVENTS.drop_duplicates(inplace=True)

In [27]:
# drop rows where essential columns are missing
essential_cols = ['CHARTTIME', 'VALUE', 'VALUENUM','VALUEUOM','FLAG'] 
LABEVENTS_cleaned = LABEVENTS.dropna(subset=essential_cols)

In [28]:
# Save cleaned file
LABEVENTS_cleaned.to_csv("LABEVENTS_cleaned.csv", index=False)

In [29]:
!pip install sdv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [30]:
# Imports
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

LABEVENTS_cleaned = pd.read_csv("LABEVENTS_cleaned.csv")

#  Generate metadata for SINGLE TABLE
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(LABEVENTS_cleaned)

# Initialize synthesizer (no enforce_minimal in this version)
synthesizer = CTGANSynthesizer(
    metadata=metadata,
    epochs=30,
)

#  Fit the synthesizer to the data
synthesizer.fit(LABEVENTS_cleaned)

synthetic_same_size = synthesizer.sample(num_rows=len(LABEVENTS_cleaned))
synthetic_data = synthesizer.sample(num_rows=1000)

#  Save synthetic data
synthetic_data.to_csv("synthetic_LABEVENTS.csv", index=False)

# Preview
synthetic_data.head()


MemoryError: Unable to allocate 516. GiB for an array with shape (9582656, 57786) and data type bool

In [None]:
print("Synthesizer fitted:", synthesizer._fitted)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Example: Compare distributions of a categorical column
column = 'FLAG'

plt.figure(figsize=(10,5))

# Real data
sns.countplot(x=column, data=LABEVENTS_cleaned, color='blue', label='Real', alpha=0.5)

# Synthetic data
sns.countplot(x=column, data=synthetic_data, color='red', label='Synthetic', alpha=0.5)

plt.legend()
plt.title(f"Real vs Synthetic: {column}")
plt.xticks(rotation=45)
plt.show()

In [None]:
#  Load synthetic dataset
import pandas as pd
import os
synthetic = pd.read_csv("synthetic_LABEVENTS.csv")

#   Setup for evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Choose features and target
features = ['ITEMID','CHARTTIME','VALUE']
target = 'FLAG'

# Ensure encoding consistency
for col in features:
    LABEVENTS_cleaned[col] = LABEVENTS_cleaned[col].astype('category').cat.codes
    synthetic_data[col] = synthetic_data[col].astype('category').cat.codes

# Real data model
X_real = LABEVENTS_cleaned[features]
y_real = LABEVENTS_cleaned[target]
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_real, y_real, test_size=0.3)

model_real = RandomForestClassifier()
model_real.fit(Xr_train, yr_train)
real_acc = accuracy_score(yr_test, model_real.predict(Xr_test))

# Synthetic data model
X_syn = synthetic_data[features]
y_syn = synthetic_data[target]
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X_syn, y_syn, test_size=0.3)

model_syn = RandomForestClassifier()
model_syn.fit(Xs_train, ys_train)
syn_acc = accuracy_score(ys_test, model_syn.predict(Xs_test))

print(f" Model accuracy on real data:     {real_acc:.2f}")
print(f" Model accuracy on synthetic data:  {syn_acc:.2f}")



In [None]:

#  Import TVAESynthesizer
from sdv.single_table import TVAESynthesizer

#  Initialize VAE synthesizer
vae_synthesizer = TVAESynthesizer(
    metadata=metadata,
    epochs=10
)

# Fit the VAE synthesizer
vae_synthesizer.fit(LABEVENTS_cleaned)

#  Sample synthetic data using VAE
synthetic_data_vae = vae_synthesizer.sample(num_rows=1000)

#  Save VAE synthetic data
synthetic_data_vae.to_csv('synthetic_LABEVENTS_vae.csv', index=False)

#  Preview VAE synthetic data
synthetic_data_vae.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load your synthetic data generated from VAE
synthetic_vae = pd.read_csv('synthetic_LABEVENTS_vae.csv')

# Preprocessing: convert categorical columns to numeric codes
features = ['ITEMID','CHARTTIME','VALUE']
target = 'FLAG'

# Make sure all datasets are prepared
for col in features:
    LABEVENTS_cleaned[col] = LABEVENTS_cleaned[col].astype('category').cat.codes
    synthetic_vae[col] = synthetic_vae[col].astype('category').cat.codes

# Real data model (already trained if you want, but re-training to be safe)
X_real = LABEVENTS_cleaned[features]
y_real = LABEVENTS_cleaned[target]
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_real, y_real, test_size=0.3, random_state=42)

model_real = RandomForestClassifier(random_state=42)
model_real.fit(Xr_train, yr_train)
real_acc = accuracy_score(yr_test, model_real.predict(Xr_test))

# VAE Synthetic data model
X_vae = synthetic_vae[features]
y_vae = synthetic_vae[target]
Xv_train, Xv_test, yv_train, yv_test = train_test_split(X_vae, y_vae, test_size=0.3, random_state=42)

model_vae = RandomForestClassifier(random_state=42)
model_vae.fit(Xv_train, yv_train)
vae_acc = accuracy_score(yv_test, model_vae.predict(Xv_test))

# Show results
print(f" Model accuracy on real data:       {real_acc:.2f}")
print(f" Model accuracy on VAE synthetic data: {vae_acc:.2f}")


In [None]:
import matplotlib.pyplot as plt

#  accuracy values
# These are from your previous results:
real_acc = real_acc          # Accuracy on real data using real training
vae_acc = vae_acc           # Accuracy on synthetic VAE data
gan_acc = syn_acc           # Accuracy on synthetic GAN (CTGAN) data

# Prepare bar chart data
labels = ['Real Data (Original)', 'CTGAN (Synthetic)', 'TVAE (Synthetic)']
accuracy_scores = [real_acc, gan_acc, vae_acc]

# Plotting
plt.figure(figsize=(10, 6))
bars = plt.bar(labels, accuracy_scores, color=['skyblue', 'orange', 'green'])

#  chart elements
plt.title('Model Accuracy Comparison: Real vs CTGAN vs TVAE', fontsize=14, fontweight='bold')
plt.ylabel('Accuracy Score')
plt.ylim(0.2, 1.5)
plt.grid(axis='y', linestyle='--', alpha=0.4)

# Annotate bars
for i, acc in enumerate(accuracy_scores):
    plt.text(i, acc + 0.01, f"{acc:.2f}", ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()
