In [None]:
pip install pandas scipy




In [None]:
import torch
# Report CUDA availability and, only when a GPU exists, its name.
# torch.cuda.get_device_name(0) raises on CPU-only runtimes, so it is guarded.
torch.cuda.is_available(), (torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)




(True, 'Tesla T4')

In [None]:

# Prefer the GPU when the runtime exposes one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)


Using device: cuda


In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset paths
# used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from typing import List, Optional

def dataset_sanity_check(
    df: pd.DataFrame,
    target_col: str,
    interval_cols: Optional[List[str]] = None,
    max_unique_for_cat: int = 50
) -> None:
    """
    Print a human-readable summary of ``df``:
     - shape and dtype counts
     - missing values per column
     - target class distribution (percentages)
     - suggested numeric (interval) vs categorical feature split
     - a size warning for very small / very large datasets

    Args:
      df: raw DataFrame
      target_col: name of the target column
      interval_cols: optional list of numeric feature names; if None they are
                     inferred as every numeric-dtype column except the target
      max_unique_for_cat: only used when ``interval_cols`` is None. Object
                     columns with more unique values than this are reported
                     separately as high-cardinality instead of categorical.

    Raises:
      ValueError: if ``target_col`` is not a column of ``df`` (checked before
                  anything is printed).
    """
    # Fail fast so a typo in the target name does not produce a half-printed report.
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in DataFrame!")

    print("─── DATASET SANITY CHECK ───")
    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} cols")
    print("\nColumn dtypes:")
    print(df.dtypes.value_counts().to_string(), "\n")

    # Missing values
    missing = df.isna().sum()
    if missing.any():
        print("Missing values:")
        print(missing[missing > 0].sort_values().to_string(), "\n")
    else:
        print("No missing values.\n")

    # Target distribution. Format each value first and add the trailing newline
    # once, rather than concatenating "%\n" onto every element of the Series.
    print(f"Target distribution ({target_col}):")
    target_pct = df[target_col].value_counts(normalize=True).mul(100).round(2)
    print(target_pct.map("{:.2f}%".format).to_string(), "\n")

    # Feature type suggestions
    high_card_cols: List[str] = []
    if interval_cols is None:
        # Infer numeric features by dtype; the target is never a feature.
        interval_cols = [
            c for c in df.select_dtypes(include=[np.number]).columns
            if c != target_col
        ]
        object_cols = set(df.select_dtypes(include=["object"]).columns)
        cat_cols = []
        for c in df.columns:
            if c == target_col or c in interval_cols:
                continue
            # High-cardinality object columns (IDs, free text) make poor
            # one-hot features, so they are flagged instead of suggested.
            if c in object_cols and df[c].nunique() > max_unique_for_cat:
                high_card_cols.append(c)
            else:
                cat_cols.append(c)
    else:
        # User-provided split: everything else (minus the target) is categorical.
        interval_cols = list(interval_cols)
        unknown = [c for c in interval_cols if c not in df.columns]
        if unknown:
            print(f"  Warning: interval cols not found in DataFrame: {unknown}")
        cat_cols = [
            c for c in df.columns
            if c not in interval_cols and c != target_col
        ]

    print(f"Suggested numeric (interval) cols ({len(interval_cols)}): {interval_cols}")
    print(f"Suggested categorical cols ({len(cat_cols)}): {cat_cols}\n")
    if high_card_cols:
        print(
            f"High-cardinality object cols (> {max_unique_for_cat} unique, "
            f"not suggested as categorical) ({len(high_card_cols)}): {high_card_cols}\n"
        )

    # Warn about very small or very large datasets
    if df.shape[0] < 100:
        print("  Warning: fewer than 100 samples—GANs may overfit or collapse.")
    elif df.shape[0] > 200_000:
        print("  Warning: very large dataset—training may be slow.")

    print("──────────────────────────────\n")


# ─── Example usage ───
if __name__ == "__main__":
    # Raw Adult dataset stored on the mounted Drive.
    adult_csv = "/content/drive/MyDrive/Katabatic/Data/Adult/adult-official.csv"
    df = pd.read_csv(adult_csv)

    # Name the interval columns explicitly instead of relying on dtype inference.
    explicit_interval_cols = [
        "age",
        "education-num",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
    ]
    dataset_sanity_check(df, target_col="class", interval_cols=explicit_interval_cols)


─── DATASET SANITY CHECK ───
Shape: 48842 rows × 15 cols

Column dtypes:
object    15 

No missing values.

Target distribution (class):
class
<=50K    76.07%\n
>50K     23.93%\n
Name: proportion, dtype: object
Suggested numeric (interval) cols (5): ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Suggested categorical cols (9): ['workclass', 'fnlwgt', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

──────────────────────────────



In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os

def interval_to_mid(x):
    """Convert an interval string to its midpoint, e.g. '20-30' => 25.0.

    Accepted forms:
      * plain numbers / numeric strings ('37', 4.5)      -> float(x)
      * two bounds separated by '-', '–', ',' or 'to', optionally wrapped in
        brackets/quotes ('(20.5-30.5]', '(20, 30]')      -> (lo + hi) / 2
      * half-open intervals with one infinite bound
        ('(-inf-21.5]', '(61.5-inf)')                    -> the finite bound

    Missing values are returned unchanged; anything that cannot be parsed
    (including an interval whose bounds are both infinite) yields ``pd.NA``.
    """
    if pd.isna(x):
        return x

    # Plain numbers need no interval parsing.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # A signed number or +/-inf. The separator is matched explicitly between
    # the two bounds so that the hyphen in '20-30' is not mistaken for the
    # minus sign of the upper bound.
    number = r'[-+]?(?:inf|(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?)'
    match = re.search(
        rf'({number})\s*(?:-|–|,|to)\s*({number})',
        str(x),
        flags=re.IGNORECASE,
    )
    if match is None:
        return pd.NA

    lo, hi = float(match.group(1)), float(match.group(2))
    infinity = float("inf")
    lo_open, hi_open = abs(lo) == infinity, abs(hi) == infinity
    if lo_open and hi_open:
        return pd.NA          # no finite bound to fall back on
    if lo_open:
        return hi
    if hi_open:
        return lo
    return (lo + hi) / 2

def preprocess_dataset(file_path, target_column, interval_cols=None, output_path="processed_data.csv"):
    """Load a CSV and turn it into a purely numeric modelling table.

    Steps: strip column names, convert interval-string columns to midpoints,
    label-encode the target, standard-scale numeric columns, one-hot encode
    the remaining object/category columns, and concatenate
    ``[numeric | one-hot | target]``.

    Args:
        file_path: path of the CSV to read.
        target_column: name of the target column (after whitespace stripping
            of the header).
        interval_cols: columns holding interval strings. ``None`` auto-detects
            object columns containing a ``<digits>-<digits>`` pattern; an
            empty list disables interval conversion entirely.
        output_path: accepted for backward compatibility. NOTE: this function
            does not write any file; the caller must save the returned frame.

    Returns:
        ``(df_processed, numeric_cols, cat_cols, epochs, target_column,
        numeric_cols)`` — ``numeric_cols`` appears twice because existing
        callers unpack six values.

    Raises:
        KeyError: if ``target_column`` is not present in the file.
    """
    # Load dataset
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()

    # Surface a missing target immediately with a readable message
    # (same exception type the later column lookup would raise).
    if target_column not in df.columns:
        raise KeyError(f"Target column '{target_column}' not found in {file_path}")

    # Auto-detect interval columns if not provided
    if interval_cols is None:
        object_cols = df.select_dtypes(include=['object']).columns
        interval_cols = [
            col for col in object_cols
            if df[col].astype(str).str.contains(r'\d+\s*-\s*\d+').any()
        ]

    # Convert intervals to midpoints. interval_to_mid returns pd.NA for
    # unparseable cells, which would leave the column as object dtype and send
    # it to one-hot encoding below; coercing keeps it numeric (NaN for bad cells).
    for col in interval_cols:
        df[col] = pd.to_numeric(df[col].apply(interval_to_mid), errors='coerce')

    # Separate types (the target is never scaled)
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if target_column in numeric_cols:
        numeric_cols.remove(target_column)

    # Encode target as integer class ids
    le = LabelEncoder()
    df[target_column] = le.fit_transform(df[target_column])

    # Scale numeric columns if they exist
    if numeric_cols:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        df_numeric = df[numeric_cols].reset_index(drop=True)
    else:
        df_numeric = pd.DataFrame()

    # Identify categorical columns (the target is already integer-encoded here,
    # so it is not picked up as object/category)
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # One-hot encode categorical columns if they exist
    if cat_cols:
        df_cat = pd.get_dummies(df[cat_cols], drop_first=False).astype(int).reset_index(drop=True)
    else:
        df_cat = pd.DataFrame()

    # Target column
    df_target = df[[target_column]].reset_index(drop=True)

    # Final dataset: numeric features, then one-hot features, target last
    df_processed = pd.concat([df_numeric, df_cat, df_target], axis=1)

    # Hyperparameter adjustment: more epochs for larger datasets
    num_rows = df_processed.shape[0]
    epochs = 100 if num_rows < 50000 else 150
    print(f"📊 Dataset size: {num_rows} rows → EPOCHS = {epochs}")

    return df_processed, numeric_cols, cat_cols, epochs, target_column,numeric_cols
# Example usage: run the preprocessing pipeline on the Adult dataset.
data_path = "/content/drive/MyDrive/Katabatic/Data/Adult/adult-official.csv"
target = "class"
output_file = "processed_data.csv"

# An empty list (rather than None) skips interval auto-detection as well.
# Candidate interval columns for this dataset:
#   ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
interval_columns = []

_preprocess_outputs = preprocess_dataset(
    data_path,
    target,
    interval_columns,
    output_file,
)
(
    df_processed,
    numeric_cols,
    cat_cols,
    EPOCHS,
    TARGET_COL,
    NUMERIC_COLS,
) = _preprocess_outputs


📊 Dataset size: 48842 rows → EPOCHS = 100


In [None]:
import re
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the raw Adult dataset from the mounted Drive.
raw_adult_path = "/content/drive/MyDrive/Katabatic/Data/Adult/adult-official.csv"
df = pd.read_csv(raw_adult_path)

# Interval → midpoint: columns whose interval strings are converted below.
interval_cols = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
def interval_to_mid(x):
    """Convert an interval string to its midpoint, e.g. '20-30' => 25.0.

    Accepted forms:
      * plain numbers / numeric strings ('37', 4.5)      -> float(x)
      * two bounds separated by '-', '–', ',' or 'to', optionally wrapped in
        brackets/quotes ('(20.5-30.5]', '(20, 30]')      -> (lo + hi) / 2
      * half-open intervals with one infinite bound
        ('(-inf-21.5]', '(61.5-inf)')                    -> the finite bound

    Missing values are returned unchanged; anything that cannot be parsed
    (including an interval whose bounds are both infinite) yields ``pd.NA``.
    """
    if pd.isna(x):
        return x

    # Plain numbers need no interval parsing.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # A signed number or +/-inf. The separator is matched explicitly between
    # the two bounds so that the hyphen in '20-30' is not mistaken for the
    # minus sign of the upper bound.
    number = r'[-+]?(?:inf|(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?)'
    match = re.search(
        rf'({number})\s*(?:-|–|,|to)\s*({number})',
        str(x),
        flags=re.IGNORECASE,
    )
    if match is None:
        return pd.NA

    lo, hi = float(match.group(1)), float(match.group(2))
    infinity = float("inf")
    lo_open, hi_open = abs(lo) == infinity, abs(hi) == infinity
    if lo_open and hi_open:
        return pd.NA          # no finite bound to fall back on
    if lo_open:
        return hi
    if hi_open:
        return lo
    return (lo + hi) / 2

# Replace every interval string with its numeric midpoint, column by column.
for col in interval_cols:
    df[col] = df[col].apply(interval_to_mid)

# Encode target as integer class ids.
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

# Scale numeric features to zero mean / unit variance.
scaler = StandardScaler()
scaler.fit(df[interval_cols])
df[interval_cols] = scaler.transform(df[interval_cols])

# Quick look: the interval columns should be floats, the target integer-encoded.
for report in (
    df[interval_cols].dtypes,
    df['class'].value_counts(),
    df[interval_cols].head(),
    df['class'].head(),
):
    print(report)


age               float64
education-num     float64
capital-gain      float64
capital-loss      float64
hours-per-week    float64
dtype: object
class
0    37155
1    11687
Name: count, dtype: int64
        age  education-num  capital-gain  capital-loss  hours-per-week
0  0.024301       0.169881     -1.185489     -0.206255        0.274596
1 -0.242654       0.169881     -0.189421     -0.206255       -1.632827
2  0.024301       0.169881     -0.189421     -0.206255        0.274596
3 -0.242654      -1.925737     -0.189421     -0.206255        0.274596
4  0.138711       0.169881     -0.189421     -0.206255        0.274596
0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64


In [None]:
import numpy as np
# One-hot encode the remaining categorical columns and persist the result.
cat_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Dummy columns come back as booleans; cast to 0/1 so downstream code sees
# purely numeric data.
df_cat = pd.get_dummies(df[cat_cols], drop_first=False).astype(int)

# Numeric features and the target are kept as they are.
df_numeric = df[interval_cols]
df_target = df[['class']]

# Number of dummy columns produced per categorical feature, so a GAN can
# split its output vector into one group per feature.
cat_group_sizes = [
    pd.get_dummies(df[col], drop_first=False).shape[1]
    for col in cat_cols
]

# Feature matrix / label vector for modelling.
X = np.concatenate([df_numeric.values, df_cat.values], axis=1)
y = df_target.to_numpy().ravel()

# Align the three pieces on a fresh 0..n-1 index before joining them side by side.
df_numeric, df_cat, df_target = (
    part.reset_index(drop=True) for part in (df_numeric, df_cat, df_target)
)
df_processed = pd.concat([df_numeric, df_cat, df_target], axis=1)

out_path = "/content/drive/MyDrive/Katabatic/Data/Adult/preprocessed4_adult.csv"
df_processed.to_csv(out_path, index=False)
print("Preprocessed data saved to:", out_path)


Preprocessed data saved to: /content/drive/MyDrive/Katabatic/Data/Adult/preprocessed4_adult.csv


In [None]:
# Installs & Imports
!pip install torch torchvision scipy scikit-learn xgboost --quiet

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.linear_model    import LogisticRegression
from sklearn.neural_network  import MLPClassifier
from sklearn.ensemble        import RandomForestClassifier
from xgboost                 import XGBClassifier
from sklearn.model_selection import KFold
from scipy.stats             import wasserstein_distance
from scipy.spatial.distance  import jensenshannon

# Hyperparameters
PREPROCESSED_PATH = "/content/drive/MyDrive/Katabatic/Data/Adult/preprocessed4_adult.csv"
LATENT_DIM        = 100    # size of the generator's noise vector
BATCH_SIZE        = 64
EPOCHS            = 100    # medium dataset (overrides any EPOCHS set by an earlier cell)
REPEATS           = 3      # number of CV repetitions
FOLDS             = 2      # folds per repetition
SYN_RATIO         = 0.5    # synthetic / real train size
NUMERIC_COLS      = ["age","education-num","capital-gain","capital-loss","hours-per-week"]
TARGET_COL        = "class"
SEED              = 42     # seed for NumPy and torch random generators

# GAN training and synthetic sampling are stochastic; seed both libraries so a
# fresh "Restart & Run All" starts from the same random state.
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# CR-GAN Definitions
class Generator(nn.Module):
    """MLP generator: noise of size ``z_dim`` -> ``out_dim`` values in [-1, 1].

    Hidden widths are 256 -> 512 -> 256 with ReLU activations; the output
    layer is followed by Tanh, which bounds every output to [-1, 1].
    """

    def __init__(self, z_dim, out_dim):
        super().__init__()
        widths = (z_dim, 256, 512, 256)
        layers = []
        # Hidden stack: one Linear + ReLU pair per consecutive width pair.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Output projection squashed by Tanh.
        layers += [nn.Linear(widths[-1], out_dim), nn.Tanh()]
        self.net = nn.Sequential(*layers)

    def forward(self, z):
        """Map a batch of noise vectors to synthetic rows."""
        return self.net(z)

class Discriminator(nn.Module):
    """MLP discriminator: a row of ``in_dim`` features -> probability in (0, 1).

    Hidden widths are 512 -> 256 -> 128 with ReLU activations; the single
    output unit passes through a Sigmoid.
    """

    def __init__(self, in_dim):
        super().__init__()
        widths = (in_dim, 512, 256, 128)
        layers = []
        # Hidden stack: one Linear + ReLU pair per consecutive width pair.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Single sigmoid unit scoring the row.
        layers += [nn.Linear(widths[-1], 1), nn.Sigmoid()]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Score a batch of rows."""
        return self.net(x)

def train_cramer_gan(G, D, loader, epochs):
    """Adversarially train generator ``G`` against discriminator ``D``.

    NOTE: despite the name, the objective implemented here is the standard
    binary-cross-entropy GAN loss (real -> 1, fake -> 0 for D; fake -> 1 for
    G), not a Cramér distance.

    Args:
        G: generator module; it is fed noise of width ``LATENT_DIM`` (module
           global), so it must have been built with that input size.
        D: discriminator module returning probabilities (BCELoss is applied
           directly to its output).
        loader: iterable yielding ``(features, label)`` batches; the label
           element is ignored.
        epochs: number of passes over ``loader``.

    Returns:
        The trained ``(G, D)`` pair, moved to the global ``device``.

    Raises:
        ValueError: if ``loader`` yields no batches during an epoch (the
            original code failed with an unbound-variable error when printing).
    """
    G, D = G.to(device), D.to(device)
    optg = optim.Adam(G.parameters(), lr=2e-4)
    optd = optim.Adam(D.parameters(), lr=2e-4)
    loss_fn = nn.BCELoss()
    for ep in range(1, epochs+1):
        # Reset per epoch so an empty loader is detected instead of silently
        # reusing (or failing on) loss values from a previous epoch.
        lossd = lossg = None
        for real_batch, _ in loader:
            real_batch = real_batch.to(device)
            bsz = real_batch.size(0)
            # — D step: fakes are detached so only D receives gradients
            optd.zero_grad()
            z      = torch.randn(bsz, LATENT_DIM, device=device)
            fake   = G(z).detach()
            d_real = D(real_batch)
            d_fake = D(fake)
            lossd  = loss_fn(d_real, torch.ones_like(d_real)) + \
                     loss_fn(d_fake, torch.zeros_like(d_fake))
            lossd.backward();  optd.step()
            # — G step: fresh noise, G is rewarded when D labels its output real
            optg.zero_grad()
            z     = torch.randn(bsz, LATENT_DIM, device=device)
            fake2 = G(z)
            dg    = D(fake2)
            lossg = loss_fn(dg, torch.ones_like(dg))
            lossg.backward(); optg.step()
        if lossd is None:
            raise ValueError("train_cramer_gan: loader yielded no batches; nothing to train on.")
        # Progress report uses the losses of the last batch of the epoch.
        if ep%20==0 or ep==1 or ep==epochs:
            print(f" Ep {ep}/{epochs}  D_loss={lossd.item():.4f}  G_loss={lossg.item():.4f}")
    return G, D

def generate_synthetic(G, n_samples):
    """Sample ``n_samples`` rows from generator ``G`` as a NumPy array.

    The generator is moved to the global ``device`` and switched to eval
    mode; noise of width ``LATENT_DIM`` is drawn in a single batch and no
    gradients are tracked.
    """
    generator = G.to(device).eval()
    with torch.no_grad():
        noise = torch.randn(n_samples, LATENT_DIM, device=device)
        samples = generator(noise)
    return samples.cpu().numpy()

def compute_tstr_all(X_real, y_real, X_syn, y_syn):
    """Train-on-Synthetic / Test-on-Real accuracy for four classifiers.

    Each model is fitted on ``(X_syn, y_syn)`` and scored on
    ``(X_real, y_real)``.

    Returns:
        dict mapping the model tag ("LR", "MLP", "RF", "XGB") to accuracy in
        percent.
    """
    classifiers = {
        "LR":  LogisticRegression(max_iter=5000),
        "MLP": MLPClassifier(hidden_layer_sizes=(128,64), max_iter=1000),
        "RF":  RandomForestClassifier(n_estimators=200),
        "XGB": XGBClassifier(eval_metric="logloss"),
    }
    scores = {}
    for tag, model in classifiers.items():
        model.fit(X_syn, y_syn)
        scores[tag] = model.score(X_real, y_real)*100.0
    return scores

def compute_jsd_wd(X_real, X_syn, num_idx, bins=50):
    """Average Jensen-Shannon distance and Wasserstein distance per column.

    Args:
        X_real: 2-D array of real rows.
        X_syn: 2-D array of synthetic rows with the same column layout.
        num_idx: iterable of column indices to compare.
        bins: number of histogram bins used for the Jensen-Shannon term.

    Returns:
        ``(mean_js, mean_wd)`` over the selected columns; ``(nan, nan)`` when
        ``num_idx`` is empty.

    Notes:
        * Both histograms share one set of bin edges spanning the combined
          range of the real and synthetic column. With independently chosen
          edges, bin k of one histogram covers a different value range than
          bin k of the other, so the comparison is meaningless (a shifted
          copy of the data would score 0).
        * ``scipy.spatial.distance.jensenshannon`` returns the Jensen-Shannon
          *distance*, i.e. the square root of the divergence.
    """
    jsd_list, wd_list = [], []
    for i in num_idx:
        real_col, syn_col = X_real[:, i], X_syn[:, i]
        lo = min(real_col.min(), syn_col.min())
        hi = max(real_col.max(), syn_col.max())
        p_real, _ = np.histogram(real_col, bins=bins, range=(lo, hi), density=True)
        p_syn,  _ = np.histogram(syn_col,  bins=bins, range=(lo, hi), density=True)
        jsd_list.append(jensenshannon(p_real, p_syn))
        wd_list.append(wasserstein_distance(real_col, syn_col))
    if not jsd_list:
        return float("nan"), float("nan")
    return np.mean(jsd_list), np.mean(wd_list)

# Load Preprocessed Data
df = pd.read_csv(PREPROCESSED_PATH)
# Feature columns in file order, without the target.
feature_cols = df.columns.drop(TARGET_COL)
X_full = df[feature_cols].values.astype(np.float32)
y_full = df[TARGET_COL].values.astype(int)
# Positions of the numeric columns inside X_full. They are looked up in the
# feature columns (not in df.columns) so they stay correct even when the
# target is not the last column of the file.
num_idx = [feature_cols.get_loc(c) for c in NUMERIC_COLS]

# 3×(2-Fold CV)
tstr_scores = {m:[] for m in ["LR","MLP","RF","XGB"]}
jsd_scores, wd_scores = [], []

# The GAN is trained on [features | label] jointly so that synthetic labels
# are generated together with their features. Drawing labels at random
# (independently of X_syn) would make TSTR accuracy meaningless.
# Labels are divided by the largest class id so they fit the generator's
# Tanh output range; for a 0/1 target this is a no-op.
label_scale = max(int(y_full.max()), 1)

for rep in range(1, REPEATS+1):
    # A fresh shuffle per repetition; reusing one KFold with a fixed seed
    # would yield identical splits in every repeat.
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42 + rep - 1)
    for fold,(train_idx, test_idx) in enumerate(kf.split(X_full),1):
        print(f"\n— Rep {rep}/{REPEATS}  Fold {fold}/{FOLDS} —")
        X_tr, X_te = X_full[train_idx], X_full[test_idx]
        y_tr, y_te = y_full[train_idx], y_full[test_idx]

        # Append the (scaled) label as the last training column.
        y_col = (y_tr.astype(np.float32) / label_scale).reshape(-1, 1)
        XY_tr = np.hstack([X_tr, y_col]).astype(np.float32)
        loader = DataLoader(
            TensorDataset(torch.from_numpy(XY_tr), torch.from_numpy(y_tr)),
            batch_size=BATCH_SIZE, shuffle=True
        )
        # train
        G = Generator(LATENT_DIM, XY_tr.shape[1])
        D = Discriminator(XY_tr.shape[1])
        G, D = train_cramer_gan(G, D, loader, epochs=EPOCHS)
        # synth: split generated rows back into features and a rounded class id
        n_syn = int(SYN_RATIO * len(X_tr))
        XY_syn = generate_synthetic(G, n_syn)
        X_syn = XY_syn[:, :-1]
        y_syn = np.clip(np.rint(XY_syn[:, -1] * label_scale), 0, label_scale).astype(int)
        # metrics
        if np.array_equal(np.unique(y_syn), np.unique(y_tr)):
            tstrs = compute_tstr_all(X_te, y_te, X_syn, y_syn)
            for m,sc in tstrs.items(): tstr_scores[m].append(sc)
        else:
            # Classifiers cannot be fitted when the generator failed to
            # produce every class; skip TSTR for this fold.
            print("  Warning: synthetic labels do not cover all classes; TSTR skipped for this fold.")
        js, wd = compute_jsd_wd(X_te, X_syn, num_idx)
        jsd_scores.append(js);  wd_scores.append(wd)

# Report CV Results: mean and standard deviation over all repeats and folds.
print("\n=== CV Results (mean ± std) ===")
for model_name in ("LR", "MLP", "RF", "XGB"):
    fold_scores = np.array(tstr_scores[model_name])
    mean_acc = fold_scores.mean()
    std_acc = fold_scores.std()
    print(f" • {model_name:4s} TSTR = {mean_acc:.2f}% ± {std_acc:.2f}%")

jsd_mean, jsd_std = np.mean(jsd_scores), np.std(jsd_scores)
wd_mean, wd_std = np.mean(wd_scores), np.std(wd_scores)
print(f" • JSD = {jsd_mean:.4f} ± {jsd_std:.4f}")
print(f" • WD  = {wd_mean:.4f} ± {wd_std:.4f}")

# Train on Full & Save Final Synthetic
# The GAN is retrained on all rows with the label appended as the last column,
# so each synthetic row carries a label generated together with its features
# (labels drawn at random would be unrelated to the synthetic features).
# Labels are divided by the largest class id to fit the generator's Tanh
# output range; for a 0/1 target this is a no-op.
label_scale_full = max(int(y_full.max()), 1)
y_col_full = (y_full.astype(np.float32) / label_scale_full).reshape(-1, 1)
XY_full = np.hstack([X_full, y_col_full]).astype(np.float32)

full_loader = DataLoader(
    TensorDataset(torch.from_numpy(XY_full), torch.from_numpy(y_full)),
    batch_size=BATCH_SIZE, shuffle=True
)
Gf = Generator(LATENT_DIM, XY_full.shape[1])
Df = Discriminator(XY_full.shape[1])
Gf, Df = train_cramer_gan(Gf, Df, full_loader, epochs=EPOCHS)

# generate SYN_RATIO × (number of real rows) synthetic rows
n_final = int(SYN_RATIO * len(X_full))
XYf_syn = generate_synthetic(Gf, n_final)
Xf_syn = XYf_syn[:, :-1]
yf_syn = np.clip(
    np.rint(XYf_syn[:, -1] * label_scale_full), 0, label_scale_full
).astype(int)

# Feature names in the same order used to build X_full (target removed by
# name rather than by assuming it is the last column).
cols = df.drop(columns=[TARGET_COL]).columns
syn_df = pd.DataFrame(Xf_syn, columns=cols)
syn_df[TARGET_COL] = yf_syn
out_path = "/content/drive/MyDrive/Katabatic/Data/Adult/synthetic4_adult_final.csv"
syn_df.to_csv(out_path, index=False)
print(f"\nSaved final synthetic dataset ({n_final} rows) to:\n  {out_path}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m115.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m93.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━