In [1]:
# Install required libraries
!pip install xgboost --quiet

# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import networkx as nx
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from torch.utils.data import DataLoader, TensorDataset
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance
from random import choice
import os
try:
    from google.colab import files
except ImportError:
    files = None

# Load multiple datasets with environment flexibility
def load_data():
    """Load one or more CSV datasets from a Colab upload or from local paths.

    When the ``google.colab`` ``files`` helper was imported, ``files.upload()``
    is called and every returned file name is read with ``pd.read_csv``.
    Otherwise the user is prompted for a comma-separated list of paths.

    Returns:
        A list of ``(DataFrame, filename)`` tuples, one per file that was read.
        In the prompt branch, blank entries are ignored and entries that do
        not point to an existing file are skipped with a printed warning
        (previously they were dropped silently).
    """
    if files:
        uploaded = files.upload()
        return [(pd.read_csv(fname), fname) for fname in uploaded]

    raw_entries = input("Enter CSV file paths separated by commas: ").split(',')
    datasets = []
    for raw in raw_entries:
        path = raw.strip()  # strip once instead of three times per entry
        if not path:
            continue
        if not os.path.isfile(path):
            # Keep the best-effort behaviour (skip), but tell the user about it.
            print(f"Warning: '{path}' is not a file and will be skipped.")
            continue
        datasets.append((pd.read_csv(path), path))
    return datasets

# Preprocessing functions
def identify_categorical_columns(df, target_col):
    """Pick the feature columns that should be treated as categorical.

    Object-dtype columns (excluding ``target_col``) are returned when any
    exist. Only if there are none does the function fall back to every
    non-target column with at most 15 distinct values.
    """
    object_columns = df.select_dtypes(include='object').columns
    by_dtype = [name for name in object_columns if name != target_col]
    if by_dtype:
        return by_dtype

    # Fallback: no object columns, so use low cardinality as the criterion.
    low_cardinality = []
    for name in df.columns:
        if name != target_col and df[name].nunique() <= 15:
            low_cardinality.append(name)
    return low_cardinality

def encode_columns(df, cat_cols, target_col):
    """Label-encode the categorical columns and the target column in place.

    Each column is cast to ``str`` before fitting, and the encoded values are
    written back into ``df`` (the caller's DataFrame is modified).

    Returns:
        dict mapping column name -> the ``LabelEncoder`` fitted on that column.
    """
    fitted = {}
    columns_to_encode = cat_cols + [target_col]
    for name in columns_to_encode:
        encoder = LabelEncoder()
        as_text = df[name].astype(str)
        df[name] = encoder.fit_transform(as_text)
        fitted[name] = encoder
    return fitted

def split_data(df, target_col, test_size=0.2):
    """Split ``df`` into train/test features and labels, stratified on the target.

    Returns whatever ``train_test_split`` returns for ``(X, y)``; the split is
    made with ``random_state=42``.
    """
    labels = df[target_col]
    features = df.drop(columns=[target_col])
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=42,
    )

def scale_features(X_train, X_test):
    """Standardise features with a scaler fitted on the training split only.

    Returns:
        ``(X_train_scaled, X_test_scaled)``; the test split is transformed
        with the statistics learned from ``X_train``.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)
    return train_scaled, test_scaled

def preprocess_data(df):
    """Run the full preprocessing pipeline on one dataset.

    The last column of ``df`` is taken as the target. Categorical columns and
    the target are label-encoded (``df`` itself is modified by that step),
    the data is split into train/test, and the features are standardised.

    Returns:
        ``(X_train_scaled, X_test_scaled, y_train, y_test, cat_cols,
        X_train, target_col, encoders)`` where ``X_train`` is the unscaled
        training split.
    """
    target_col = df.columns[-1]
    cat_cols = identify_categorical_columns(df, target_col)
    encoders = encode_columns(df, cat_cols, target_col)

    X_train, X_test, y_train, y_test = split_data(df, target_col)
    scaled_train, scaled_test = scale_features(X_train, X_test)

    return (
        scaled_train,
        scaled_test,
        y_train,
        y_test,
        cat_cols,
        X_train,
        target_col,
        encoders,
    )

# KGE Functions
def build_graph(df, cat_cols):
    """Build an undirected co-occurrence graph over categorical values.

    Every distinct value of every column in ``cat_cols`` becomes a node named
    ``"<column>:<int value>"``. Two nodes from different columns are linked
    when their values appear together in at least one row of ``df``.

    The edges are unweighted, so only the distinct value pairs of each column
    pair matter. They are extracted with ``drop_duplicates`` instead of
    visiting every row and every column pair in a Python loop. The node set,
    node insertion order and edge set match the row-by-row construction; only
    the order in which a node's neighbours are stored can differ.

    Args:
        df: DataFrame whose ``cat_cols`` hold values convertible with ``int``.
        cat_cols: list of column names to include.

    Returns:
        The resulting ``nx.Graph``.
    """
    from itertools import combinations

    G = nx.Graph()
    for col in cat_cols:
        G.add_nodes_from(f"{col}:{int(val)}" for val in df[col].unique())

    for left, right in combinations(cat_cols, 2):
        distinct_pairs = df[[left, right]].drop_duplicates()
        G.add_edges_from(
            (f"{left}:{int(a)}", f"{right}:{int(b)}")
            for a, b in distinct_pairs.itertuples(index=False, name=None)
        )
    return G

def build_cooccurrence_matrix(G, walk_length, num_walks, window):
    """Count node co-occurrences along random walks over ``G``.

    From every node, ``num_walks`` walks of ``walk_length`` nodes are sampled
    (a walk stays in place when the current node has no neighbours). For each
    position in a walk, every other position at most ``window`` steps away
    adds one to the corresponding matrix cell.

    Returns:
        ``(co_matrix, vocab_index)``: a square float array indexed by node
        position, and the dict mapping node -> position.
    """
    vocab = list(G.nodes())
    vocab_index = {node: position for position, node in enumerate(vocab)}
    n_nodes = len(vocab)
    co_matrix = np.zeros((n_nodes, n_nodes))

    def sample_walk(origin):
        path = [origin]
        for _ in range(walk_length - 1):
            current = path[-1]
            options = list(G.neighbors(current))
            path.append(choice(options) if options else current)
        return path

    for origin in vocab:
        for _ in range(num_walks):
            # Translate the walk to matrix positions once, then count pairs.
            positions = [vocab_index[step] for step in sample_walk(origin)]
            n_steps = len(positions)
            for center, row in enumerate(positions):
                start = max(0, center - window)
                stop = min(n_steps, center + window + 1)
                for other in range(start, stop):
                    if other == center:
                        continue
                    co_matrix[row, positions[other]] += 1
    return co_matrix, vocab_index

def generate_embeddings(co_matrix, embedding_dim):
    """Reduce the co-occurrence matrix to node embeddings with PCA.

    The number of components is capped at the smaller matrix dimension.

    Returns:
        The PCA-transformed matrix, or ``None`` when the usable dimension is 0.
    """
    n_components = min(embedding_dim, min(co_matrix.shape))
    if n_components != 0:
        reducer = PCA(n_components=n_components)
        return reducer.fit_transform(co_matrix)
    return None

def create_kge_matrix(df, cat_cols, embeddings, vocab_index, embedding_dim):
    """Build one KGE vector per row by averaging its categorical node embeddings.

    For each row, the embedding of node ``"<col>:<int value>"`` is looked up
    for every column in ``cat_cols`` (unknown nodes fall back to index 0) and
    the vectors are averaged. Rows are zero-padded on the right when the
    embeddings are narrower than ``embedding_dim``.

    The lookup is done once per distinct value of each column and the average
    is accumulated column by column, instead of looping over rows in Python.

    Args:
        df: DataFrame containing the ``cat_cols`` columns.
        cat_cols: list of categorical column names.
        embeddings: 2-D array of node embeddings, or ``None``.
        vocab_index: dict mapping node name -> row of ``embeddings``.
        embedding_dim: requested (minimum) output width.

    Returns:
        A 2-D array with ``len(df)`` rows. It is all zeros with
        ``embedding_dim`` columns when ``embeddings`` is ``None`` or
        ``cat_cols`` is empty, and keeps its 2-D shape for an empty ``df``.
    """
    n_rows = len(df)
    if embeddings is None or not cat_cols:
        return np.zeros((n_rows, embedding_dim))

    embeddings = np.asarray(embeddings)
    native_dim = embeddings.shape[1]
    # Keep floating dtypes as they are; anything else is averaged in float64.
    if np.issubdtype(embeddings.dtype, np.floating):
        out_dtype = embeddings.dtype
    else:
        out_dtype = np.float64

    kge = np.zeros((n_rows, max(native_dim, embedding_dim)), dtype=out_dtype)
    if n_rows == 0:
        return kge

    total = np.zeros((n_rows, native_dim), dtype=out_dtype)
    for col in cat_cols:
        column = df[col]
        # One dict lookup per distinct value rather than one per row.
        lookup = {
            val: vocab_index.get(f"{col}:{int(val)}", 0)
            for val in column.unique()
        }
        node_ids = column.map(lookup).to_numpy(dtype=np.intp)
        total += embeddings[node_ids]

    # Columns beyond native_dim stay zero, which is the right-padding.
    kge[:, :native_dim] = total / len(cat_cols)
    return kge

def build_kge_matrix(df, cat_cols, embedding_dim=64, walk_length=10, num_walks=20, window=3):
    """Compute the per-row KGE matrix for ``df`` from its categorical columns.

    Pipeline: value graph -> random-walk co-occurrence counts -> PCA node
    embeddings -> per-row average. With no categorical columns an all-zero
    ``(len(df), embedding_dim)`` array is returned instead.
    """
    if cat_cols:
        graph = build_graph(df, cat_cols)
        counts, node_index = build_cooccurrence_matrix(graph, walk_length, num_walks, window)
        node_vectors = generate_embeddings(counts, embedding_dim)
        return create_kge_matrix(df, cat_cols, node_vectors, node_index, embedding_dim)
    return np.zeros((len(df), embedding_dim))

# MEG Model Components
class MappingNetwork(nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) mapping a KGE vector to ``embed_dim``."""

    def __init__(self, kge_dim, embed_dim):
        super().__init__()
        layers = [
            nn.Linear(kge_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, kge):
        """Project ``kge`` through the MLP and return the result."""
        projected = self.net(kge)
        return projected

class MEGModel(nn.Module):
    """Reconstruction model that fuses feature and KGE embeddings.

    Features and KGE vectors are each embedded to ``embed_dim``, summed,
    passed through a 2-layer Transformer encoder (4 heads) and decoded back
    to ``input_dim``.
    """

    def __init__(self, input_dim, kge_dim, embed_dim=128):
        super().__init__()
        self.feature_embed = nn.Linear(input_dim, embed_dim)
        self.kge_embed = MappingNetwork(kge_dim, embed_dim)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4),
            num_layers=2,
        )
        decoder_layers = [
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x, kge):
        """Return the decoded reconstruction for features ``x`` and vectors ``kge``."""
        fused = self.feature_embed(x) + self.kge_embed(kge)
        # A leading axis of size 1 is added for the encoder and removed again.
        # NOTE(review): with the layer's default layout this presumably makes
        # each sample a length-1 sequence — confirm against the PyTorch docs.
        as_sequence = fused.unsqueeze(0)
        encoded = self.encoder(as_sequence).squeeze(0)
        return self.decoder(encoded)

# Train MEG Model
def train_meg_model(X_tensor, kge_tensor, epochs):
    """Train a ``MEGModel`` to reconstruct ``X_tensor`` given ``kge_tensor``.

    Uses shuffled mini-batches of 64, Adam (lr=0.001) and MSE between the
    model output and the input features. After each epoch the sum of the
    per-batch losses is printed.

    Returns:
        The trained model (left in training mode).
    """
    dataset = TensorDataset(X_tensor, kge_tensor)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    model = MEGModel(X_tensor.shape[1], kge_tensor.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for features, kge_batch in loader:
            optimizer.zero_grad()
            reconstruction = model(features, kge_batch)
            batch_loss = loss_fn(reconstruction, features)
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss:.4f}")
    return model

# Generate Synthetic Data
def generate_synthetic_data(model, X_train_scaled, kge_tensor, ratio=0.5, return_indices=False):
    """Generate synthetic feature rows by decoding Gaussian noise with sampled KGE rows.

    The model is switched to eval mode, fed standard-normal noise in place of
    real features together with KGE vectors drawn uniformly (with
    replacement) from ``kge_tensor``, and its output is returned as NumPy.

    Args:
        model: callable taking ``(features, kge)`` tensors; must support ``eval()``.
        X_train_scaled: 2-D array used only for its row count and width.
        kge_tensor: tensor of KGE vectors to sample from.
        ratio: number of synthetic rows as a fraction of ``len(X_train_scaled)``
            (default 0.5, the previously hard-coded value).
        return_indices: when true, also return the sampled row indices of
            ``kge_tensor`` so callers can align other per-row data (e.g.
            labels) with the generated rows.

    Returns:
        The synthetic array, or ``(synthetic, indices)`` when
        ``return_indices`` is true.
    """
    model.eval()
    n_rows, n_features = len(X_train_scaled), X_train_scaled.shape[1]
    size = int(ratio * n_rows)
    # Draw order (noise first, then indices) is kept so seeded runs reproduce.
    noise = torch.randn((size, n_features))
    indices = torch.randint(0, len(kge_tensor), (size,))
    with torch.no_grad():
        synthetic = model(noise, kge_tensor[indices])
    synthetic = synthetic.numpy()
    if return_indices:
        return synthetic, indices.numpy()
    return synthetic

# Evaluation: JSD and WD

def compute_jsd(p, q):
    """Return the Jensen-Shannon divergence between two histograms.

    Both inputs are normalised to sum to 1 and the squared
    ``jensenshannon`` distance is returned. When two 1-D histograms differ in
    length (for example ``np.bincount`` outputs where the highest class is
    missing from one side), the shorter one is zero-padded on the right
    instead of failing on the shape mismatch.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    if p.ndim == 1 and q.ndim == 1 and p.size != q.size:
        width = max(p.size, q.size)
        # Missing trailing bins mean a count of zero.
        p = np.pad(p, (0, width - p.size))
        q = np.pad(q, (0, width - q.size))
    return jensenshannon(p / np.sum(p), q / np.sum(q)) ** 2

def compute_wd(p, q):
    """Return the Wasserstein distance between the samples ``p`` and ``q``."""
    distance = wasserstein_distance(u_values=p, v_values=q)
    return distance

def evaluate_synthetic_data(X_real, y_real, X_synth, y_synth):
    """Train classifiers on synthetic data and score them on real data (TSTR).

    Four classifiers are each fitted on ``(X_synth, y_synth)`` in three
    repeated runs and evaluated on the two stratified folds of the real data.
    Metrics per evaluation: accuracy, weighted precision/recall/F1, the
    Jensen-Shannon divergence between true and predicted class-count
    histograms, and the Wasserstein distance between true and predicted
    labels.

    Args:
        X_real: real feature array, indexable by an integer index array.
        y_real: real labels as a NumPy array of non-negative integers.
        X_synth: synthetic feature array used for training.
        y_synth: labels used for training.

    Returns:
        DataFrame indexed by classifier name with the mean of every metric.
    """
    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=500),
        "Random Forest": RandomForestClassifier(),
        "MLP": MLPClassifier(max_iter=300),
        "XGBoost": XGBClassifier(eval_metric='logloss')
    }
    metric_names = ["Accuracy", "Precision", "Recall", "F1 Score", "JSD", "WD"]
    results = {name: {m: [] for m in metric_names} for name in classifiers}

    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    # The splitter has a fixed integer seed, so the folds are computed once
    # and reused for every run and classifier.
    folds = [test_index for _, test_index in skf.split(X_real, y_real)]

    for _ in range(3):
        for name, clf in classifiers.items():
            # Training data does not depend on the fold: fit once per run
            # rather than once per fold.
            clf.fit(X_synth, y_synth)
            scores = results[name]
            for test_index in folds:
                X_test, y_test = X_real[test_index], y_real[test_index]
                y_pred = clf.predict(X_test)

                scores["Accuracy"].append(accuracy_score(y_test, y_pred))
                scores["Precision"].append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
                scores["Recall"].append(recall_score(y_test, y_pred, average='weighted', zero_division=0))
                scores["F1 Score"].append(f1_score(y_test, y_pred, average='weighted', zero_division=0))

                # Use a common number of bins so both histograms have the same
                # length even when the highest class is never predicted.
                n_bins = int(max(np.max(y_test), np.max(y_pred))) + 1
                true_hist = np.bincount(y_test, minlength=n_bins)
                pred_hist = np.bincount(y_pred, minlength=n_bins)
                scores["JSD"].append(compute_jsd(true_hist, pred_hist))
                scores["WD"].append(compute_wd(y_test, y_pred))

    return pd.DataFrame({name: {k: np.mean(v) for k, v in met.items()} for name, met in results.items()}).T

# Main
if __name__ == "__main__":
    # End-to-end run: for every loaded CSV, preprocess, train the MEG model,
    # generate synthetic rows, evaluate them and write them to disk.
    data_list = load_data()
    for df, fname in data_list:
        print(f"\nProcessing '{fname}'...")
        (X_train_scaled, X_test_scaled, y_train, y_test,
         cat_cols, X_train, target_col, encoders) = preprocess_data(df)

        # KGE vectors are computed on the unscaled training split.
        train_df = X_train.copy()
        train_df[target_col] = y_train
        kge_matrix = build_kge_matrix(train_df, cat_cols)

        X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
        kge_tensor = torch.tensor(kge_matrix, dtype=torch.float32)

        # Larger datasets get more training epochs.
        if len(df) > 40000:
            epochs = 150
        else:
            epochs = 100
        meg_model = train_meg_model(X_tensor, kge_tensor, epochs)
        X_synth = generate_synthetic_data(meg_model, X_train_scaled, kge_tensor)
        # NOTE(review): labels are taken positionally from the start of
        # y_train; they are not tied to the KGE rows sampled when the
        # synthetic features were generated — confirm this is intended.
        y_synth = y_train.to_numpy()[:len(X_synth)]

        results_df = evaluate_synthetic_data(X_test_scaled, y_test.to_numpy(), X_synth, y_synth)
        print("\nTSTR Evaluation with JSD and WD:\n", results_df)

        # Save the synthetic features plus labels next to the notebook.
        out_path = f"synthetic_{os.path.basename(fname)}"
        synth_df = pd.DataFrame(X_synth, columns=df.columns[:-1])
        synth_df[target_col] = y_synth
        synth_df.to_csv(out_path, index=False)
        print(f"Synthetic dataset saved to '{out_path}'")


Saving csv_result-connect-4.csv to csv_result-connect-4.csv

Processing 'csv_result-connect-4.csv'...




Epoch 1/150 | Loss: 70.4954
Epoch 2/150 | Loss: 11.1727
Epoch 3/150 | Loss: 6.1901
Epoch 4/150 | Loss: 12.4904
Epoch 5/150 | Loss: 1.5468
Epoch 6/150 | Loss: 1.2503
Epoch 7/150 | Loss: 13.6652
Epoch 8/150 | Loss: 13.2959
Epoch 9/150 | Loss: 2.0253
Epoch 10/150 | Loss: 1.9747
Epoch 11/150 | Loss: 0.9710
Epoch 12/150 | Loss: 0.9352
Epoch 13/150 | Loss: 5.7652
Epoch 14/150 | Loss: 9.0691
Epoch 15/150 | Loss: 3.5325
Epoch 16/150 | Loss: 4.4638
Epoch 17/150 | Loss: 5.2313
Epoch 18/150 | Loss: 0.9060
Epoch 19/150 | Loss: 0.7676
Epoch 20/150 | Loss: 0.6994
Epoch 21/150 | Loss: 13.9529
Epoch 22/150 | Loss: 2.8751
Epoch 23/150 | Loss: 0.9942
Epoch 24/150 | Loss: 1.5553
Epoch 25/150 | Loss: 0.5440
Epoch 26/150 | Loss: 1.1162
Epoch 27/150 | Loss: 9.2926
Epoch 28/150 | Loss: 6.3466
Epoch 29/150 | Loss: 2.5333
Epoch 30/150 | Loss: 2.4816
Epoch 31/150 | Loss: 0.9240
Epoch 32/150 | Loss: 0.9403
Epoch 33/150 | Loss: 0.7885
Epoch 34/150 | Loss: 4.3300
Epoch 35/150 | Loss: 3.1240
Epoch 36/150 | Loss: 32




TSTR Evaluation with JSD and WD:
                      Accuracy  Precision    Recall  F1 Score       JSD  \
Logistic Regression  0.658304   0.433364  0.658304  0.522659  0.136151   
Random Forest        0.657909   0.488737  0.657909  0.522891  0.133525   
MLP                  0.576574   0.504103  0.576574  0.528469  0.019255   
XGBoost              0.645352   0.513675  0.645352  0.536092  0.085570   

                           WD  
Logistic Regression  0.437167  
Random Forest        0.436279  
MLP                  0.199033  
XGBoost              0.394982  
Synthetic dataset saved to 'synthetic_csv_result-connect-4.csv'
