In [None]:

!pip install liac-arff --quiet

# 2. Load and convert
import arff
import pandas as pd

# Adjust the paths as needed
arff_path = "/content/drive/MyDrive/Katabatic/Data/Nursery/nursery 1.arff"
csv_path  = "/content/drive/MyDrive/Katabatic/Data/Nursery/nursery.csv"

# 3. Parse the ARFF
with open(arff_path, 'r') as f:
    arff_data = arff.load(f)

# 4. Build a DataFrame
columns = [attr[0] for attr in arff_data['attributes']]
df      = pd.DataFrame(arff_data['data'], columns=columns)

# 5. Save out as CSV
df.to_csv(csv_path, index=False)  # <-- remove the "//7" here!
print(f"✅ Saved CSV to {csv_path}")

✅ Saved CSV to /content/drive/MyDrive/Katabatic/Data/Nursery/nursery.csv


In [None]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 1. Load & inspect
file_path = '/content/drive/MyDrive/Katabatic/Data/Nursery/nursery.csv'
df = pd.read_csv(file_path)

# 2. Features vs target
feature_cols = df.columns[:-1].tolist()
target_col   = 'class'

# 3. Encode target to 0/1/2/… integers
le = LabelEncoder()
df[target_col] = le.fit_transform(df[target_col])

# 4. One‐hot encode features → 0/1 ints
df_cat = pd.get_dummies(df[feature_cols], drop_first=False)
df_cat = df_cat.astype(int)

# 5. Record each categorical group’s size
cat_group_sizes = [df[col].nunique() for col in feature_cols]

# 6. Build final DataFrame & save
df_processed = pd.concat([
    df_cat.reset_index(drop=True),
    df[target_col].reset_index(drop=True).rename('class')
], axis=1)

out_path = '/content/drive/MyDrive/Katabatic/Data/Nursery/preprocessed_nursery.csv'
df_processed.to_csv(out_path, index=False)
print(f"✅ Preprocessed data (0/1) saved to: {out_path}")
print("➤ cat_group_sizes:", cat_group_sizes)


✅ Preprocessed data (0/1) saved to: /content/drive/MyDrive/Katabatic/Data/Nursery/preprocessed_nursery.csv
➤ cat_group_sizes: [3, 5, 4, 4, 3, 2, 3, 3]


In [None]:
#make sure each original feature sums to 1 across its one-hot columns:
for col in feature_cols:
    # grab the dummy columns for that feature
    dummies = [c for c in df_cat.columns if c.startswith(col + '_')]
    assert (df_cat[dummies].sum(axis=1) == 1).all()

# 2) confirm target is integer coded
print(df_processed['class'].value_counts())


class
0    4320
1    4266
3    4044
4     328
2       2
Name: count, dtype: int64


In [None]:
# -------------------- 0. Installs & Imports --------------------
!pip install torch torchvision scipy scikit-learn xgboost --quiet

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.linear_model    import LogisticRegression
from sklearn.neural_network  import MLPClassifier
from sklearn.ensemble        import RandomForestClassifier
from xgboost                 import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from scipy.spatial.distance  import jensenshannon
from scipy.stats             import wasserstein_distance

# -------------------- 1. Hyperparameters & Paths --------------------
PREPROCESSED_PATH = "/content/drive/MyDrive/Katabatic/Data/Nursery/preprocessed_nursery.csv"
LATENT_DIM        = 100
BATCH_SIZE        = 64
EPOCHS            = 100    # per spec for small/medium
REPEATS           = 3
FOLDS             = 2
SYN_RATIO         = 0.5    # 50% synthetic of train size
TARGET_COL        = "class"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("▶ Running on", device)

# -------------------- 2. CR-GAN Model Definitions --------------------
class Generator(nn.Module):
    def __init__(self, z_dim, out_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z_dim,256), nn.ReLU(),
            nn.Linear(256,512),   nn.ReLU(),
            nn.Linear(512,256),   nn.ReLU(),
            nn.Linear(256,out_dim),
            nn.Tanh()
        )
    def forward(self, z): return self.net(z)

class Discriminator(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim,512), nn.ReLU(),
            nn.Linear(512,256),   nn.ReLU(),
            nn.Linear(256,128),   nn.ReLU(),
            nn.Linear(128,1),     nn.Sigmoid()
        )
    def forward(self, x): return self.net(x)

def train_cramer_gan(G, D, loader, epochs):
    G, D = G.to(device), D.to(device)
    opt_g = optim.Adam(G.parameters(), lr=2e-4)
    opt_d = optim.Adam(D.parameters(), lr=2e-4)
    loss_fn = nn.BCELoss()
    for ep in range(1, epochs+1):
        ld, lg = 0.0, 0.0
        for xb, _ in loader:
            xb = xb.to(device); bsz = xb.size(0)
            # — D step
            opt_d.zero_grad()
            z     = torch.randn(bsz, LATENT_DIM, device=device)
            fake  = G(z).detach()
            ld    = loss_fn(D(xb),   torch.ones_like(D(xb))) + \
                    loss_fn(D(fake), torch.zeros_like(D(fake)))
            ld.backward(); opt_d.step()
            # — G step
            opt_g.zero_grad()
            z2    = torch.randn(bsz, LATENT_DIM, device=device)
            fake2 = G(z2)
            lg    = loss_fn(D(fake2), torch.ones_like(D(fake2)))
            lg.backward(); opt_g.step()
        if ep==1 or ep%20==0 or ep==epochs:
            print(f"  Epoch {ep}/{epochs}  D_loss={ld:.4f}  G_loss={lg:.4f}")
    return G

def generate_synthetic(G, n):
    G = G.to(device).eval()
    with torch.no_grad():
        z = torch.randn(n, LATENT_DIM, device=device)
        return G(z).cpu().numpy()

# -------------------- 3. Metrics --------------------
def compute_tstr_all(Xr, yr, Xs, ys):
    out = {}
    lr  = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=5000)
    mlp = MLPClassifier(hidden_layer_sizes=(128,64), max_iter=1000)
    rf  = RandomForestClassifier(n_estimators=200)
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

    for name, clf in [("LR",lr), ("MLP",mlp), ("RF",rf), ("XGB",xgb)]:
        clf.fit(Xs, ys)
        out[name] = clf.score(Xr, yr)*100
    return out

def compute_jsd_wd(Xr, Xs):
    # fully categorical → return zero
    return 0.0, 0.0

def ensure_all_classes(Xs, ys, classes, Xtr, ytr):
    missing = set(classes) - set(np.unique(ys))
    if missing:
        for c in missing:
            idx = np.where(ytr==c)[0][0]
            Xs  = np.vstack([Xs, Xtr[idx:idx+1]])
            ys  = np.hstack([ys, [c]])
    return Xs, ys

# -------------------- 4. Load Preprocessed Nursery --------------------
df     = pd.read_csv(PREPROCESSED_PATH)
X_full = df.drop(columns=[TARGET_COL]).values.astype(np.float32)
y_full = df[TARGET_COL].values.astype(int)
all_cls = np.unique(y_full)
print(f"▶ Loaded {len(df)} rows, {X_full.shape[1]} features, classes = {all_cls}")

# -------------------- 5. 3×(2-Fold StratifiedCV) --------------------
skf   = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)
tstrs = {m:[] for m in ['LR','MLP','RF','XGB']}
jsds, wds = [], []

for rep in range(1, REPEATS+1):
    for fold, (tr, te) in enumerate(skf.split(X_full, y_full), start=1):
        print(f"\n▷ Rep {rep}/{REPEATS} · Fold {fold}/{FOLDS}")
        Xtr, Xte = X_full[tr], X_full[te]
        ytr, yte = y_full[tr], y_full[te]

        loader = DataLoader(
            TensorDataset(torch.from_numpy(Xtr), torch.from_numpy(ytr)),
            batch_size=BATCH_SIZE, shuffle=True
        )

        G = train_cramer_gan(
            Generator(LATENT_DIM, Xtr.shape[1]),
            Discriminator(Xtr.shape[1]),
            loader, epochs=EPOCHS
        )

        n_syn = int(SYN_RATIO * len(Xtr))
        Xs    = generate_synthetic(G, n_syn)
        ys    = np.random.choice(ytr, size=n_syn, replace=True)

        # *** This is the only new line ***
        Xs, ys = ensure_all_classes(Xs, ys, all_cls, Xtr, ytr)

        t_res = compute_tstr_all(Xte, yte, Xs, ys)
        for m, sc in t_res.items(): tstrs[m].append(sc)

        js, wd = compute_jsd_wd(Xte, Xs)
        jsds.append(js); wds.append(wd)

# -------------------- 6. Report CV Results --------------------
print("\n=== CV Results (mean ± std) ===")
for m in ['LR','MLP','RF','XGB']:
    arr = np.array(tstrs[m])
    print(f" • {m:4s} TSTR = {arr.mean():.2f}% ± {arr.std():.2f}%")
print(f" • JSD = {np.mean(jsds):.4f} ± {np.std(jsds):.4f}")
print(f" • WD  = {np.mean(wds):.4f} ± {np.std(wds):.4f}")

# -------------------- 7. Retrain Full & Save Synthetic --------------------
full_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_full), torch.from_numpy(y_full)),
    batch_size=BATCH_SIZE, shuffle=True
)
Gf = train_cramer_gan(
    Generator(LATENT_DIM, X_full.shape[1]),
    Discriminator(X_full.shape[1]),
    full_loader, epochs=EPOCHS
)

n_final = int(SYN_RATIO * len(X_full))
Xf      = generate_synthetic(Gf, n_final)
yf      = np.random.choice(y_full, size=n_final, replace=True)
Xf, yf  = ensure_all_classes(Xf, yf, all_cls, X_full, y_full)

syn_df       = pd.DataFrame(Xf, columns=df.columns[:-1])
syn_df[TARGET_COL] = yf
out_path     = "/content/drive/MyDrive/Katabatic/Data/Nursery/synthetic_nursery_final.csv"
syn_df.to_csv(out_path, index=False)
print(f"\n✅ Final synthetic ({len(syn_df)} rows) saved to:\n  {out_path}")


▶ Running on cuda
▶ Loaded 12960 rows, 27 features, classes = [0 1 2 3 4]

▷ Rep 1/3 · Fold 1/2
  Epoch 1/100  D_loss=0.1747  G_loss=2.6384
  Epoch 20/100  D_loss=0.0964  G_loss=2.9975
  Epoch 40/100  D_loss=0.6011  G_loss=1.9851
  Epoch 60/100  D_loss=0.6976  G_loss=3.0092
  Epoch 80/100  D_loss=0.9122  G_loss=2.0498
  Epoch 100/100  D_loss=0.2935  G_loss=3.4371


Parameters: { "use_label_encoder" } are not used.




▷ Rep 1/3 · Fold 2/2
  Epoch 1/100  D_loss=0.1906  G_loss=2.1762
  Epoch 20/100  D_loss=0.2836  G_loss=2.2121
  Epoch 40/100  D_loss=0.8973  G_loss=2.0247
  Epoch 60/100  D_loss=0.7665  G_loss=2.3363
  Epoch 80/100  D_loss=1.4778  G_loss=1.9949
  Epoch 100/100  D_loss=0.3122  G_loss=2.2812


Parameters: { "use_label_encoder" } are not used.




▷ Rep 2/3 · Fold 1/2
  Epoch 1/100  D_loss=1.0745  G_loss=0.6577
  Epoch 20/100  D_loss=0.4465  G_loss=2.1971
  Epoch 40/100  D_loss=0.7571  G_loss=1.3199
  Epoch 60/100  D_loss=0.4052  G_loss=2.2853
  Epoch 80/100  D_loss=0.4184  G_loss=3.8349
  Epoch 100/100  D_loss=0.3619  G_loss=2.5207


Parameters: { "use_label_encoder" } are not used.




▷ Rep 2/3 · Fold 2/2
  Epoch 1/100  D_loss=0.7249  G_loss=1.5297
  Epoch 20/100  D_loss=1.6203  G_loss=3.3977
  Epoch 40/100  D_loss=0.4617  G_loss=2.6178
  Epoch 60/100  D_loss=0.5842  G_loss=1.4953
  Epoch 80/100  D_loss=0.4449  G_loss=1.7838
  Epoch 100/100  D_loss=0.3360  G_loss=2.4956


Parameters: { "use_label_encoder" } are not used.




▷ Rep 3/3 · Fold 1/2
  Epoch 1/100  D_loss=0.7856  G_loss=1.2085
  Epoch 20/100  D_loss=0.5127  G_loss=1.6985
  Epoch 40/100  D_loss=0.6554  G_loss=1.5560
  Epoch 60/100  D_loss=1.1353  G_loss=2.5727
  Epoch 80/100  D_loss=0.9417  G_loss=1.3404
  Epoch 100/100  D_loss=0.2031  G_loss=2.7692


Parameters: { "use_label_encoder" } are not used.




▷ Rep 3/3 · Fold 2/2
  Epoch 1/100  D_loss=0.4087  G_loss=2.1599
  Epoch 20/100  D_loss=0.6336  G_loss=1.8926
  Epoch 40/100  D_loss=0.6927  G_loss=3.3008
  Epoch 60/100  D_loss=0.5921  G_loss=2.1852
  Epoch 80/100  D_loss=0.3510  G_loss=1.8082
  Epoch 100/100  D_loss=0.5509  G_loss=2.8592


Parameters: { "use_label_encoder" } are not used.




=== CV Results (mean ± std) ===
 • LR   TSTR = 32.64% ± 5.69%
 • MLP  TSTR = 32.44% ± 3.28%
 • RF   TSTR = 31.01% ± 2.59%
 • XGB  TSTR = 34.54% ± 4.47%
 • JSD = 0.0000 ± 0.0000
 • WD  = 0.0000 ± 0.0000
  Epoch 1/100  D_loss=0.6030  G_loss=2.3094
  Epoch 20/100  D_loss=0.4812  G_loss=2.3373
  Epoch 40/100  D_loss=0.5238  G_loss=2.9237
  Epoch 60/100  D_loss=0.0130  G_loss=4.9876
  Epoch 80/100  D_loss=0.0036  G_loss=6.6024
  Epoch 100/100  D_loss=0.0005  G_loss=7.9298

✅ Final synthetic (6481 rows) saved to:
  /content/drive/MyDrive/Katabatic/Data/Nursery/synthetic_nursery_final.csv
