In [None]:
# 1. Install the ARFF parser
!pip install liac-arff --quiet

# 2. Load and convert
import arff
import pandas as pd

# Adjust the path if needed
arff_path = "/content/drive/MyDrive/Katabatic/Data/Credit/credit-a.arff"
csv_path  = "/content/drive/MyDrive/Katabatic/Data/Credit/credit.csv"

# Parse the ARFF
with open(arff_path, 'r') as f:
    arff_data = arff.load(f)

# Build a DataFrame
columns = [attr[0] for attr in arff_data['attributes']]
df = pd.DataFrame(arff_data['data'], columns=columns)

# Save out as CSV
df.to_csv(csv_path, index=False)
print(f"Saved CSV to {csv_path}")


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for liac-arff (setup.py) ... [?25l[?25hdone
Saved CSV to /content/drive/MyDrive/Katabatic/Data/Credit/credit.csv


In [None]:
import re
import numpy as np
import pandas as pd

def interval_to_mid(x):
    """
    Parses strings like '10-30', '-inf-100', '' into a single float:
     - If two numbers: returns their average
     - If one number: returns it
     - If no numbers or blank: returns np.nan
    """
    s = str(x)
    if pd.isna(x) or not s.strip():
        return np.nan
    nums = re.findall(r"-?\d+\.?\d*", s)
    if not nums:
        return np.nan
    vals = list(map(float, nums))
    return vals[0] if len(vals) == 1 else sum(vals) / len(vals)

# Load your CSV
in_path = "/content/drive/MyDrive/Katabatic/Data/Credit/credit.csv"
df = pd.read_csv(in_path)
print("Raw shape:", df.shape)

# Columns that hold intervals
interval_cols = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

# Apply midpoint conversion, now .astype(float) will work
for col in interval_cols:
    df[col] = df[col].apply(interval_to_mid).astype(float)
print("After interval→midpoint:\n", df[interval_cols].head())

# Map the target to 0/1 (adjust if your actual labels differ)
df['class'] = df['class'].map({'n': 0, 'y': 1})

# One-hot encode the rest
cat_cols = [c for c in df.columns if c not in interval_cols + ['class']]
df_cat  = pd.get_dummies(df[cat_cols], drop_first=False).astype(int)

# Recombine
df_processed = pd.concat([
    df[interval_cols].reset_index(drop=True),
    df_cat.reset_index(drop=True),
    df[['class']].reset_index(drop=True)
], axis=1)

print("Processed shape:", df_processed.shape)

# Save
out_path = "/content/drive/MyDrive/Katabatic/Data/Credit/credit_preprocessed.csv"
df_processed.to_csv(out_path, index=False)
print("✅ Saved preprocessed data to:", out_path)


Raw shape: (690, 16)
After interval→midpoint:
       A2      A3    A8  A11    A14    A15
0 -38.96 -4.2075  1.02 -1.0  105.0 -492.0
1  38.96  4.2075  1.02  2.5 -105.0  492.0
2 -38.96 -4.2075  1.02 -0.5  105.0  492.0
3 -38.96 -4.2075  1.02  2.5 -105.0 -492.0
4 -38.96  4.2075  1.02 -0.5  105.0 -492.0
Processed shape: (690, 52)
✅ Saved preprocessed data to: /content/drive/MyDrive/Katabatic/Data/Credit/credit_preprocessed.csv


In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/drive/MyDrive/Katabatic/Data/Credit/credit_preprocessed.csv")

# 1) Features are everything except the final 'class' column
X = df.drop(columns=["class"]).values.astype(np.float32)

# 2) Labels are the single integer column
y = df["class"].astype(int).values

print("X shape:", X.shape)   # e.g. (690, 51)
print("y unique:", np.unique(y))  # should be [0,1]


X shape: (690, 51)
y unique: [0 1]


In [None]:
# 0) Installs & Imports
!pip install torch torchvision scipy scikit-learn xgboost --quiet

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.linear_model    import LogisticRegression
from sklearn.neural_network  import MLPClassifier
from sklearn.ensemble        import RandomForestClassifier
from xgboost                 import XGBClassifier
from sklearn.model_selection import KFold
from scipy.stats             import wasserstein_distance
from scipy.spatial.distance  import jensenshannon

# 1) Hyperparameters & Paths
PREPROCESSED_PATH = "/content/drive/MyDrive/Katabatic/Data/Credit/credit_preprocessed.csv"
LATENT_DIM        = 100
BATCH_SIZE        = 64
EPOCHS            = 100    # “medium” dataset
REPEATS           = 3
FOLDS             = 2
SYN_RATIO         = 0.5    # synthetic = 50% of train fold
TARGET_COL        = "class"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("🖥️  Running on", device)


# 2) CR-GAN Definitions (no final Sigmoid; we’ll use BCEWithLogitsLoss)
class Generator(nn.Module):
    def __init__(self, z_dim, out_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z_dim,256), nn.ReLU(),
            nn.Linear(256,512),   nn.ReLU(),
            nn.Linear(512,256),   nn.ReLU(),
            nn.Linear(256,out_dim),
            nn.Tanh()             # keep [-1,1] range if you like
        )
    def forward(self, z):
        return self.net(z)

class Discriminator(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim,512), nn.ReLU(),
            nn.Linear(512,256),   nn.ReLU(),
            nn.Linear(256,128),   nn.ReLU(),
            nn.Linear(128,1)      # **no** Sigmoid here
        )
    def forward(self, x):
        return self.net(x).view(-1,1)  # raw logits


def train_cramer_gan(G, D, loader, epochs):
    G, D = G.to(device), D.to(device)
    opt_g = optim.Adam(G.parameters(), lr=1e-4)   # lower LR for stability
    opt_d = optim.Adam(D.parameters(), lr=2e-4)
    loss_fn = nn.BCEWithLogitsLoss()

    for ep in range(1, epochs+1):
        last_d, last_g = 0.0, 0.0
        for xb, _ in loader:
            xb = xb.to(device)
            bsz = xb.size(0)

            # Discriminator step
            opt_d.zero_grad()
            z      = torch.randn(bsz, LATENT_DIM, device=device)
            fake   = G(z).detach()
            d_real = D(xb)
            d_fake = D(fake)
            # real→1, fake→0
            ld = loss_fn(d_real, torch.ones_like(d_real)) + \
                 loss_fn(d_fake, torch.zeros_like(d_fake))
            ld.backward(); opt_d.step()

            # Generator step
            opt_g.zero_grad()
            z2     = torch.randn(bsz, LATENT_DIM, device=device)
            fake2  = G(z2)
            dg     = D(fake2)
            lg = loss_fn(dg, torch.ones_like(dg))
            lg.backward(); opt_g.step()

            last_d, last_g = ld.item(), lg.item()

        if ep==1 or ep%20==0 or ep==epochs:
            print(f"Epoch {ep:3d}/{epochs}  D_loss={last_d:.4f}  G_loss={last_g:.4f}")

    return G


def generate_synthetic(G, n_samples):
    G = G.to(device).eval()
    with torch.no_grad():
        z = torch.randn(n_samples, LATENT_DIM, device=device)
        return G(z).cpu().numpy()


# 3) Metrics
def compute_tstr_all(Xr, yr, Xs, ys, n_classes):
    out = {}
    lr  = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=5000)
    lr .fit(Xs, ys); out['LR']  = lr.score(Xr, yr)*100

    mlp = MLPClassifier(hidden_layer_sizes=(128,64), max_iter=1000)
    mlp.fit(Xs, ys); out['MLP'] = mlp.score(Xr, yr)*100

    rf  = RandomForestClassifier(n_estimators=200)
    rf .fit(Xs, ys); out['RF']  = rf.score(Xr, yr)*100

    xgb = XGBClassifier(
        objective = 'binary:logistic' if n_classes==2 else 'multi:softprob',
        num_class = n_classes if n_classes>2 else None,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(Xs, ys); out['XGB'] = xgb.score(Xr, yr)*100

    return out

def compute_jsd_wd(Xr, Xs):
    jsd_list, wd_list = [], []
    for i in range(Xr.shape[1]):
        p_real,_ = np.histogram(Xr[:,i], bins=50, density=True)
        p_syn,_  = np.histogram(Xs[:,i], bins=50, density=True)
        jsd_list.append(jensenshannon(p_real, p_syn))
        wd_list .append(wasserstein_distance(Xr[:,i], Xs[:,i]))
    return np.mean(jsd_list), np.mean(wd_list)


# 4) Load Preprocessed Credit
df      = pd.read_csv(PREPROCESSED_PATH)
X_full  = df.drop(columns=[TARGET_COL]).values.astype(np.float32)
y_full  = df[TARGET_COL].values.astype(int)
n_classes = len(np.unique(y_full))
print(f"→ {len(df)} rows, {X_full.shape[1]} feats, {n_classes} classes")


# 5) 3×(2-Fold CV)
kf     = KFold(n_splits=FOLDS, shuffle=True, random_state=0)
tstrs  = {m:[] for m in ['LR','MLP','RF','XGB']}
jsds, wds = [], []

for rep in range(1, REPEATS+1):
    for fold, (tr, te) in enumerate(kf.split(X_full), start=1):
        print(f"\n► Rep {rep}/{REPEATS} · Fold {fold}/{FOLDS}")
        Xtr, Xte = X_full[tr], X_full[te]
        ytr, yte = y_full[tr], y_full[te]

        loader = DataLoader(
            TensorDataset(torch.from_numpy(Xtr), torch.from_numpy(ytr)),
            batch_size=BATCH_SIZE, shuffle=True
        )

        # train GAN
        G = train_cramer_gan(
            Generator(LATENT_DIM, Xtr.shape[1]),
            Discriminator(Xtr.shape[1]),
            loader, epochs=EPOCHS
        )

        # synth 50% of train
        n_syn = int(SYN_RATIO * len(Xtr))
        Xs    = generate_synthetic(G, n_syn)
        ys    = np.random.choice(ytr, size=n_syn, replace=True)

        # zero-impute any NaN/∞
        Xs = np.nan_to_num(Xs, nan=0., posinf=0., neginf=0.)
        Xte= np.nan_to_num(Xte,nan=0., posinf=0., neginf=0.)

        # TSTR
        res = compute_tstr_all(Xte, yte, Xs, ys, n_classes)
        for m,sc in res.items():
            tstrs[m].append(sc)

        # JSD & WD
        js, wd = compute_jsd_wd(Xte, Xs)
        jsds.append(js); wds.append(wd)


# 6) Report CV
print("\n=== CV Results (mean ± std) ===")
for m in ['LR','MLP','RF','XGB']:
    arr = np.array(tstrs[m])
    print(f" • {m:4s} TSTR = {arr.mean():.2f}% ± {arr.std():.2f}%")
print(f" • JSD = {np.mean(jsds):.4f} ± {np.std(jsds):.4f}")
print(f" • WD  = {np.mean(wds):.4f} ± {np.std(wds):.4f}")


# 7) Retrain on Full & Save 50%-sized Final Synthetic
full_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_full), torch.from_numpy(y_full)),
    batch_size=BATCH_SIZE, shuffle=True
)
Gf = train_cramer_gan(
    Generator(LATENT_DIM, X_full.shape[1]),
    Discriminator(X_full.shape[1]),
    full_loader, epochs=EPOCHS
)

n_final = int(SYN_RATIO * len(X_full))
Xf_syn  = generate_synthetic(Gf, n_final)
yf_syn  = np.random.choice(y_full, size=n_final, replace=True)
Xf_syn  = np.nan_to_num(Xf_syn, nan=0., posinf=0., neginf=0.)

syn_df       = pd.DataFrame(Xf_syn, columns=df.columns[:-1])
syn_df[TARGET_COL] = yf_syn
outp = "/content/drive/MyDrive/Katabatic/Data/Credit/credit_synthetic_final.csv"
syn_df.to_csv(outp, index=False)
print(f"\n✅ Final synthetic ({n_final} rows) saved to:\n  {outp}")


🖥️  Running on cpu
→ 690 rows, 51 feats, 2 classes

► Rep 1/3 · Fold 1/2
Epoch   1/100  D_loss=nan  G_loss=nan
Epoch  20/100  D_loss=nan  G_loss=nan
Epoch  40/100  D_loss=nan  G_loss=nan
Epoch  60/100  D_loss=nan  G_loss=nan
Epoch  80/100  D_loss=nan  G_loss=nan
Epoch 100/100  D_loss=nan  G_loss=nan


Parameters: { "use_label_encoder" } are not used.




► Rep 1/3 · Fold 2/2
Epoch   1/100  D_loss=nan  G_loss=nan
Epoch  20/100  D_loss=nan  G_loss=nan
Epoch  40/100  D_loss=nan  G_loss=nan
Epoch  60/100  D_loss=nan  G_loss=nan
Epoch  80/100  D_loss=nan  G_loss=nan
Epoch 100/100  D_loss=nan  G_loss=nan


Parameters: { "use_label_encoder" } are not used.




► Rep 2/3 · Fold 1/2
Epoch   1/100  D_loss=nan  G_loss=nan
Epoch  20/100  D_loss=nan  G_loss=nan
Epoch  40/100  D_loss=nan  G_loss=nan
Epoch  60/100  D_loss=nan  G_loss=nan
Epoch  80/100  D_loss=nan  G_loss=nan
Epoch 100/100  D_loss=nan  G_loss=nan


Parameters: { "use_label_encoder" } are not used.




► Rep 2/3 · Fold 2/2
Epoch   1/100  D_loss=nan  G_loss=nan
Epoch  20/100  D_loss=nan  G_loss=nan
Epoch  40/100  D_loss=nan  G_loss=nan
Epoch  60/100  D_loss=nan  G_loss=nan
Epoch  80/100  D_loss=nan  G_loss=nan
Epoch 100/100  D_loss=nan  G_loss=nan


Parameters: { "use_label_encoder" } are not used.




► Rep 3/3 · Fold 1/2
Epoch   1/100  D_loss=nan  G_loss=nan
Epoch  20/100  D_loss=nan  G_loss=nan
Epoch  40/100  D_loss=nan  G_loss=nan
Epoch  60/100  D_loss=nan  G_loss=nan
Epoch  80/100  D_loss=nan  G_loss=nan
Epoch 100/100  D_loss=nan  G_loss=nan


Parameters: { "use_label_encoder" } are not used.




► Rep 3/3 · Fold 2/2
Epoch   1/100  D_loss=nan  G_loss=nan
Epoch  20/100  D_loss=nan  G_loss=nan
Epoch  40/100  D_loss=nan  G_loss=nan
Epoch  60/100  D_loss=nan  G_loss=nan
Epoch  80/100  D_loss=nan  G_loss=nan
Epoch 100/100  D_loss=nan  G_loss=nan


Parameters: { "use_label_encoder" } are not used.




=== CV Results (mean ± std) ===
 • LR   TSTR = 55.51% ± 0.43%
 • MLP  TSTR = 52.51% ± 6.90%
 • RF   TSTR = 55.51% ± 0.43%
 • XGB  TSTR = 55.51% ± 0.43%
 • JSD = 0.8152 ± 0.0165
 • WD  = 12.7183 ± 0.0004
Epoch   1/100  D_loss=nan  G_loss=nan
Epoch  20/100  D_loss=nan  G_loss=nan
Epoch  40/100  D_loss=nan  G_loss=nan
Epoch  60/100  D_loss=nan  G_loss=nan
Epoch  80/100  D_loss=nan  G_loss=nan
Epoch 100/100  D_loss=nan  G_loss=nan

✅ Final synthetic (345 rows) saved to:
  /content/drive/MyDrive/Katabatic/Data/Credit/credit_synthetic_final.csv
