In [1]:
# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# Fix multiprocessing issue on Windows
os.environ["NUMBA_NUM_THREADS"] = "1"
os.environ["JOBLIB_START_METHOD"] = "spawn"

# LOAD DATA
MICROBIOLOGYEVENTS = pd.read_csv("MICROBIOLOGYEVENTS.csv.gz")
print("Original shape:", MICROBIOLOGYEVENTS.shape)
MICROBIOLOGYEVENTS.head(20)


Original shape: (631726, 16)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,SPEC_ITEMID,SPEC_TYPE_DESC,ORG_ITEMID,ORG_NAME,ISOLATE_NUM,AB_ITEMID,AB_NAME,DILUTION_TEXT,DILUTION_COMPARISON,DILUTION_VALUE,INTERPRETATION
0,744,96,170324,2156-04-13 00:00:00,2156-04-13 14:18:00,70021.0,BRONCHOALVEOLAR LAVAGE,80026.0,PSEUDOMONAS AERUGINOSA,1.0,,,,,,
1,745,96,170324,2156-04-20 00:00:00,2156-04-20 13:10:00,70062.0,SPUTUM,,,,,,,,,
2,746,96,170324,2156-04-20 00:00:00,2156-04-20 16:00:00,70012.0,BLOOD CULTURE,,,,,,,,,
3,747,96,170324,2156-04-20 00:00:00,,70012.0,BLOOD CULTURE,,,,,,,,,
4,748,96,170324,2156-04-20 00:00:00,,70079.0,URINE,,,,,,,,,
5,749,96,170324,2156-04-21 00:00:00,2156-04-21 14:00:00,70062.0,SPUTUM,,,,,,,,,
6,750,101,175533,2196-09-26 00:00:00,,70079.0,URINE,,,,,,,,,
7,751,101,175533,2196-09-27 00:00:00,2196-09-27 00:00:00,70012.0,BLOOD CULTURE,,,,,,,,,
8,752,101,175533,2196-09-27 00:00:00,2196-09-27 00:00:00,70012.0,BLOOD CULTURE,,,,,,,,,
9,753,101,175533,2196-09-27 00:00:00,2196-09-27 00:00:00,70062.0,SPUTUM,,,,,,,,,


In [2]:
import torch

In [3]:
# Drop duplicates
MICROBIOLOGYEVENTS.drop_duplicates(inplace=True)

# Convert dates
MICROBIOLOGYEVENTS['CHARTDATE'] = pd.to_datetime(MICROBIOLOGYEVENTS['CHARTDATE'], errors='coerce')
MICROBIOLOGYEVENTS['CHARTTIME'] = pd.to_datetime(MICROBIOLOGYEVENTS['CHARTTIME'], errors='coerce')


In [4]:
# Drop rows with missing key values (adjust as needed)
required_cols = ['SPEC_TYPE_DESC', 'ORG_NAME', 'INTERPRETATION']
MICROBIOLOGYEVENTS_cleaned = MICROBIOLOGYEVENTS.dropna(subset=required_cols)


In [5]:
# Fill  optional fields
MICROBIOLOGYEVENTS_cleaned.fillna({
    'AB_NAME': 'UNKNOWN',
    'DILUTION_TEXT': 'UNKNOWN',
    'DILUTION_COMPARISON': 'UNKNOWN',
    'DILUTION_VALUE': 0,
    'AB_ITEMID': -1,
    'ISOLATE_NUM': 0
}, inplace=True)



In [6]:
# Encode categorical columns
cat_cols = MICROBIOLOGYEVENTS_cleaned.select_dtypes(include='object').columns
for col in cat_cols:
    MICROBIOLOGYEVENTS_cleaned[col] = MICROBIOLOGYEVENTS_cleaned[col].astype('category').cat.codes



In [7]:
# Scale numeric column (e.g., DILUTION_VALUE)
if MICROBIOLOGYEVENTS_cleaned['DILUTION_VALUE'].notnull().sum() > 0:
    scaler = MinMaxScaler()
    MICROBIOLOGYEVENTS_cleaned['DILUTION_VALUE_SCALED'] = scaler.fit_transform(MICROBIOLOGYEVENTS_cleaned[['DILUTION_VALUE']])



In [8]:
# Save cleaned version
MICROBIOLOGYEVENTS_cleaned.to_csv("microbiologyevents_cleaned.csv", index=False)
print("Cleaned data shape:", MICROBIOLOGYEVENTS_cleaned.shape)

Cleaned data shape: (275834, 17)


In [9]:
# Load cleaned data
MICROBIOLOGYEVENTS_cleaned = pd.read_csv("microbiologyevents_cleaned.csv")



In [10]:
# Generate metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(MICROBIOLOGYEVENTS_cleaned)

# Initialize CTGAN
synthesizer = CTGANSynthesizer(metadata=metadata, epochs=300)
synthesizer.fit(MICROBIOLOGYEVENTS_cleaned)

# Generate synthetic data
synthetic_MICROBIOLOGYEVENTS = synthesizer.sample(num_rows=1000)
synthetic_MICROBIOLOGYEVENTS.to_csv("synthetic_microbiology.csv", index=False)


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.47 GiB. GPU 0 has a total capacity of 23.66 GiB of which 1.47 GiB is free. Process 1057776 has 314.00 MiB memory in use. Process 1078468 has 560.00 MiB memory in use. Process 1090717 has 310.00 MiB memory in use. Process 1165899 has 3.12 GiB memory in use. Process 1202020 has 634.00 MiB memory in use. Process 1191989 has 5.02 GiB memory in use. Process 1203679 has 6.77 GiB memory in use. Including non-PyTorch memory, this process has 4.88 GiB memory in use. Of the allocated memory 4.52 GiB is allocated by PyTorch, and 117.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Example: Compare ORGANISM counts
column = 'ORG_NAME'

plt.figure(figsize=(12,5))
sns.countplot(x=column, data=MICROBIOLOGYEVENTS_cleaned, color='blue', alpha=0.5, label='Real')
sns.countplot(x=column, data=synthetic_MICROBIOLOGYEVENTS, color='red', alpha=0.5, label='Synthetic')
plt.xticks(rotation=45)
plt.title(f"Distribution Comparison: {column}")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Let's say we predict INTERPRETATION from SPEC_TYPE_DESC and ORG_NAME
features = ['SPEC_TYPE_DESC', 'ORG_NAME']
target = 'INTERPRETATION'

# Encode again if needed
for col in features + [target]:
    MICROBIOLOGYEVENTS_cleaned[col] = MICROBIOLOGYEVENTS_cleaned[col].astype('category').cat.codes
    synthetic_MICROBIOLOGYEVENTS[col] = synthetic_MICROBIOLOGYEVENTS[col].astype('category').cat.codes

# Train model on real data
Xr, yr = MICROBIOLOGYEVENTS_cleaned[features], MICROBIOLOGYEVENTS_cleaned[target]
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.3)
model_real = RandomForestClassifier().fit(Xr_train, yr_train)
acc_real = accuracy_score(yr_test, model_real.predict(Xr_test))

# Train model on synthetic data
Xs, ys = synthetic_MICROBIOLOGYEVENTS[features], synthetic_MICROBIOLOGYEVENTS[target]
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, ys, test_size=0.3)
model_syn = RandomForestClassifier().fit(Xs_train, ys_train)
acc_syn = accuracy_score(ys_test, model_syn.predict(Xs_test))

print(f"Real model accuracy:     {acc_real:.2f}")
print(f"Synthetic model accuracy:{acc_syn:.2f}")


In [None]:
from sdv.single_table import TVAESynthesizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Define features and target (from D_CPT structure)
features = ['SPEC_TYPE_DESC', 'ORG_NAME']
target_col = 'INTERPRETATION'

# Create binary classification target
MICROBIOLOGYEVENTS_cleaned['target'] = (MICROBIOLOGYEVENTS_cleaned[target_col] > MICROBIOLOGYEVENTS_cleaned[target_col].median()).astype(int)

# Regenerate metadata (if needed)
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(MICROBIOLOGYEVENTS_cleaned.drop(columns=['target']))

# Fit TVAE (drop target column before fitting)
vae_synthesizer = TVAESynthesizer(metadata=metadata, epochs=30)
vae_synthesizer.fit(MICROBIOLOGYEVENTS_cleaned.drop(columns=['target']))

# Sample synthetic data
synthetic_data_vae = vae_synthesizer.sample(num_rows=1000)

# Add target to synthetic data
synthetic_data_vae['target'] = (synthetic_data_vae[target_col] > MICROBIOLOGYEVENTS_cleaned[target_col].median()).astype(int)

# Encode categorical features
for col in features:
    MICROBIOLOGYEVENTS_cleaned[col] = MICROBIOLOGYEVENTS_cleaned[col].astype('category').cat.codes
    synthetic_data_vae[col] = synthetic_data_vae[col].astype('category').cat.codes

# Train/test split
X_real = MICROBIOLOGYEVENTS_cleaned[features]
y_real = MICROBIOLOGYEVENTS_cleaned['target']
X_vae = synthetic_data_vae[features]
y_vae = synthetic_data_vae['target']

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_real, y_real, test_size=0.3, random_state=42)
Xv_train, Xv_test, yv_train, yv_test = train_test_split(X_vae, y_vae, test_size=0.3, random_state=42)

# Train models
model_real = RandomForestClassifier(random_state=42)
model_real.fit(Xr_train, yr_train)
real_acc = accuracy_score(yr_test, model_real.predict(Xr_test))

model_vae = RandomForestClassifier(random_state=42)
model_vae.fit(Xv_train, yv_train)
vae_acc = accuracy_score(yv_test, model_vae.predict(Xv_test))

# Results
print(f"Model accuracy on REAL D_CPT data: {real_acc:.2f}")
print(f"Model accuracy on VAE SYNTHETIC D_CPT: {vae_acc:.2f}")


In [None]:
import matplotlib.pyplot as plt

#  accuracy values
# These are from your previous results:
real_acc = real_acc          # Accuracy on real data using real training
vae_acc = vae_acc           # Accuracy on synthetic VAE data
gan_acc = acc_syn           # Accuracy on synthetic GAN (CTGAN) data

# Prepare bar chart data
labels = ['Real Data (Original)', 'CTGAN (Synthetic)', 'TVAE (Synthetic)']
accuracy_scores = [real_acc, gan_acc, vae_acc]

# Plotting
plt.figure(figsize=(10, 6))
bars = plt.bar(labels, accuracy_scores, color=['skyblue', 'orange', 'green'])

#  chart elements
plt.title('Model Accuracy Comparison: Real vs CTGAN vs TVAE', fontsize=14, fontweight='bold')
plt.ylabel('Accuracy Score')
plt.ylim(0.2, 1.5)
plt.grid(axis='y', linestyle='--', alpha=0.4)

# Annotate bars
for i, acc in enumerate(accuracy_scores):
    plt.text(i, acc + 0.01, f"{acc:.2f}", ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()