In [3]:
from google.colab import files
uploaded = files.upload()


Saving car.csv to car.csv


In [5]:
# Install required libraries
!pip install xgboost --quiet

# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from torch.utils.data import DataLoader, TensorDataset
from random import choice

# Load dataset
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)
print(f"Loaded {filename} | Shape: {df.shape}")

# Target column
target_col = df.columns[-1]

# Detect categorical columns
cat_cols = [col for col in df.select_dtypes(include='object').columns if col != target_col]
if len(cat_cols) == 0:
    cat_cols = [col for col in df.columns if col != target_col and df[col].nunique() <= 15]
    if len(cat_cols) > 0:
        print(f"Inferred categorical columns: {cat_cols}")
    else:
        print("No categorical columns found, proceeding without KGE")
        cat_cols = []

# Label encoding
encoders = {}
for col in cat_cols + [target_col]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

# Feature/target split
X = df.drop(columns=[target_col])
y = df[target_col]

# Split into train and test (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KGE Matrix builder
def build_kge_matrix(df, cat_cols, embedding_dim=64, walk_length=10, num_walks=20, window=3):
    if not cat_cols:
        print("KGE skipped due to no categorical features")
        return np.zeros((len(df), embedding_dim))

    G = nx.Graph()
    for col in cat_cols:
        for val in df[col].unique():
            G.add_node(f"{col}:{int(val)}")

    for _, row in df[cat_cols].iterrows():
        for i in range(len(cat_cols)):
            for j in range(i + 1, len(cat_cols)):
                G.add_edge(f"{cat_cols[i]}:{int(row[cat_cols[i]])}", f"{cat_cols[j]}:{int(row[cat_cols[j]])}")

    def random_walk(g, start, wl):
        walk = [start]
        for _ in range(wl - 1):
            nbrs = list(g.neighbors(walk[-1]))
            walk.append(choice(nbrs) if nbrs else walk[-1])
        return walk

    vocab = list(G.nodes())
    vocab_index = {n: i for i, n in enumerate(vocab)}
    co_matrix = np.zeros((len(vocab), len(vocab)))

    for node in vocab:
        for _ in range(num_walks):
            walk = random_walk(G, node, walk_length)
            for i, tgt in enumerate(walk):
                for j in range(max(0, i - window), min(len(walk), i + window + 1)):
                    if i != j:
                        co_matrix[vocab_index[tgt], vocab_index[walk[j]]] += 1

    safe_dim = min(embedding_dim, min(co_matrix.shape))
    if safe_dim == 0:
        print("PCA skipped, returning zero matrix")
        return np.zeros((len(df), embedding_dim))

    embeddings = PCA(n_components=safe_dim).fit_transform(co_matrix)

    row_kges = []
    for _, row in df.iterrows():
        vecs = []
        for col in cat_cols:
            node = f"{col}:{int(row[col])}"
            vecs.append(embeddings[vocab_index.get(node, 0)])
        avg_vec = np.mean(vecs, axis=0)
        if safe_dim < embedding_dim:
            avg_vec = np.pad(avg_vec, (0, embedding_dim - safe_dim))
        row_kges.append(avg_vec)

    print("KGE matrix created")
    return np.array(row_kges)

# Build KGE from training data
train_df = X_train.copy()
train_df[target_col] = y_train
kge_matrix = build_kge_matrix(train_df, cat_cols, embedding_dim=64)

# Define MEG architecture
class MappingNetwork(nn.Module):
    def __init__(self, kge_dim, embed_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(kge_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )
    def forward(self, kge):
        return self.net(kge)

class OriginalMEG(nn.Module):
    def __init__(self, input_dim, kge_dim, embed_dim):
        super().__init__()
        self.token_embed = nn.Linear(input_dim, embed_dim)
        self.kge_mapper = MappingNetwork(kge_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.decoder = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, input_dim)
        )
    def forward(self, x, kge):
        x_embed = self.token_embed(x)
        kge_embed = self.kge_mapper(kge)
        fused = x_embed + kge_embed
        encoded = self.encoder(fused.unsqueeze(0)).squeeze(0)
        return self.decoder(encoded)

# Prepare tensors
X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
KGE_tensor = torch.tensor(kge_matrix, dtype=torch.float32)

dataset = TensorDataset(X_tensor, KGE_tensor)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Train MEG
meg = OriginalMEG(input_dim=X_tensor.shape[1], kge_dim=KGE_tensor.shape[1], embed_dim=128)
opt = torch.optim.Adam(meg.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

meg.train()
for epoch in range(100):
    total = 0
    for xb, kb in loader:
        out = meg(xb, kb)
        loss = loss_fn(out, xb)
        opt.zero_grad()
        loss.backward()
        opt.step()
        total += loss.item()
    print(f"Epoch {epoch+1}/100 | Loss: {total:.4f}")

# Generate synthetic data
meg.eval()
with torch.no_grad():
    noise = torch.randn((len(X_train), X_tensor.shape[1]))
    kge_sample = KGE_tensor[torch.randint(0, len(KGE_tensor), (len(X_train),))]
    synthetic = meg(noise, kge_sample)

X_synth = synthetic.numpy()
y_synth = y_train.to_numpy()
print("Synthetic data shape:", X_synth.shape)

# TSTR Evaluation: Train classifiers on synthetic, test on real
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "MLP": MLPClassifier(max_iter=300),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}

results = {}
for name, clf in models.items():
    try:
        clf.fit(X_synth, y_synth)
        y_pred = clf.predict(X_test_scaled)
        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
            "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
            "F1 Score": f1_score(y_test, y_pred, average='weighted', zero_division=0)
        }
    except Exception as e:
        results[name] = {"Error": str(e)}

# Display results
results_df = pd.DataFrame(results).T
print("TSTR Benchmark (Train on Synthetic, Test on Real):")
print(results_df)

# Save synthetic data
synth_df = pd.DataFrame(X_synth, columns=X.columns)
synth_df[target_col] = y_synth
synth_df.to_csv("synthetic_meg_output.csv", index=False)


Loaded car.csv | Shape: (1728, 7)
KGE matrix created




Epoch 1/100 | Loss: 3.0264
Epoch 2/100 | Loss: 0.6101
Epoch 3/100 | Loss: 0.4337
Epoch 4/100 | Loss: 0.3737
Epoch 5/100 | Loss: 0.3028
Epoch 6/100 | Loss: 0.2636
Epoch 7/100 | Loss: 0.2384
Epoch 8/100 | Loss: 0.2081
Epoch 9/100 | Loss: 0.2023
Epoch 10/100 | Loss: 0.1941
Epoch 11/100 | Loss: 0.1795
Epoch 12/100 | Loss: 0.1602
Epoch 13/100 | Loss: 0.1461
Epoch 14/100 | Loss: 0.1605
Epoch 15/100 | Loss: 0.1395
Epoch 16/100 | Loss: 0.1335
Epoch 17/100 | Loss: 0.1232
Epoch 18/100 | Loss: 0.1213
Epoch 19/100 | Loss: 0.1042
Epoch 20/100 | Loss: 0.1158
Epoch 21/100 | Loss: 0.0910
Epoch 22/100 | Loss: 0.0792
Epoch 23/100 | Loss: 0.0779
Epoch 24/100 | Loss: 0.0706
Epoch 25/100 | Loss: 0.0605
Epoch 26/100 | Loss: 0.0581
Epoch 27/100 | Loss: 0.0547
Epoch 28/100 | Loss: 0.0576
Epoch 29/100 | Loss: 0.0508
Epoch 30/100 | Loss: 0.0471
Epoch 31/100 | Loss: 0.0411
Epoch 32/100 | Loss: 0.0400
Epoch 33/100 | Loss: 0.0354
Epoch 34/100 | Loss: 0.0350
Epoch 35/100 | Loss: 0.0325
Epoch 36/100 | Loss: 0.0314
E



TSTR Benchmark (Train on Synthetic, Test on Real):
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.699422   0.489191  0.699422  0.575715
MLP                  0.684971   0.486109  0.684971  0.568655
Random Forest        0.699422   0.584441  0.699422  0.596458
XGBoost              0.656069   0.563622  0.656069  0.600058
