In [None]:
# Ask the user to upload the dataset files through Colab's upload helper.
# `uploaded` is read later via `.keys()` to obtain the uploaded filenames.
from google.colab import files
uploaded = files.upload()


Saving shuttle.trn to shuttle (2).trn
Saving shuttle.tst to shuttle (2).tst


In [None]:
# STEP 0: Install required packages
!pip install xgboost --quiet

# STEP 1: Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import networkx as nx
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from torch.utils.data import DataLoader, TensorDataset
from random import choice

# STEP 2: File detection
filenames = list(uploaded.keys())
if len(filenames) != 2:
    raise ValueError("Upload exactly two files (train and test).")
# By default the first uploaded file is the training set and the second the test set.
train_file, test_file = filenames
# Upload order is not a reliable signal: when the extensions unambiguously say
# the roles are reversed (.tst listed before .trn), swap them.
if train_file.lower().endswith('.tst') and test_file.lower().endswith('.trn'):
    train_file, test_file = test_file, train_file
print(f"Train file: {train_file}, Test file: {test_file}")

# STEP 3: Helper to load .csv or space-separated .trn/.tst
def load_file(file):
    """Read a header-less data file into a DataFrame.

    Args:
        file: Path of the file to read. Names ending in ``.trn`` or ``.tst``
            (case-insensitive) are parsed as whitespace-separated columns;
            anything else is parsed as comma-separated.

    Returns:
        A ``pandas.DataFrame`` with default integer column labels.
    """
    if file.lower().endswith(('.trn', '.tst')):
        # sep=r"\s+" is the supported spelling of the deprecated
        # delim_whitespace=True (which emits a FutureWarning on recent pandas).
        return pd.read_csv(file, header=None, sep=r"\s+")
    return pd.read_csv(file, header=None)

# Load both splits (train first, then test) and give them identical generic column names.
df_train, df_test = (load_file(path) for path in (train_file, test_file))
generic_names = [f"col_{idx}" for idx in range(df_train.shape[1])]
df_train.columns = generic_names
df_test.columns = df_train.columns
print(f"Train shape: {df_train.shape}, Test shape: {df_test.shape}")

# STEP 4: Target column (the last column of the training frame)
target_col = df_train.columns[-1]

# STEP 5: Categorical columns
# Prefer object-typed columns; if there are none, fall back to treating
# low-cardinality (<= 15 distinct values) feature columns as categorical.
object_cols = df_train.select_dtypes(include='object').columns
cat_cols = [name for name in object_cols if name != target_col]
if cat_cols:
    print("Detected categorical columns:", cat_cols)
else:
    cat_cols = [
        name
        for name in df_train.columns
        if name != target_col and df_train[name].nunique() <= 15
    ]
    print("Inferred categorical columns:", cat_cols)

# STEP 6: Label encode with safe handling of unseen labels
def safe_label_transform(le, series):
    """Encode ``series`` with a fitted encoder, mapping unseen labels to ``"<UNK>"``.

    Values are compared as strings against ``le.classes_``. Any value not among
    the known classes is replaced by the token ``"<UNK>"`` before encoding.

    The encoder is only modified when it has to be: ``"<UNK>"`` is appended to
    ``le.classes_`` solely when at least one unseen label is present and the
    token is not already a class. An encoder that already covers every value is
    left untouched, so it does not gain a spurious extra class.

    Args:
        le: Fitted encoder exposing ``classes_`` and ``transform``.
        series: ``pandas.Series`` of raw values to encode.

    Returns:
        Whatever ``le.transform`` returns for the cleaned series.
    """
    unknown_token = "<UNK>"
    known_labels = set(le.classes_)
    as_text = series.astype(str)
    unseen = ~as_text.isin(known_labels)
    if unseen.any():
        as_text = as_text.where(~unseen, unknown_token)
        if unknown_token not in known_labels:
            le.classes_ = np.append(le.classes_, unknown_token)
    return le.transform(as_text)

# One encoder per categorical column plus the target: fit on the training
# values (as strings), then encode the test values with unseen-label handling.
encoders = {col: LabelEncoder() for col in cat_cols + [target_col]}
for col, le in encoders.items():
    df_train[col] = le.fit_transform(df_train[col].astype(str))
    df_test[col] = safe_label_transform(le, df_test[col])

# STEP 7: Split + Scale
X_train, y_train = df_train.drop(columns=[target_col]), df_train[target_col]
X_test, y_test = df_test.drop(columns=[target_col]), df_test[target_col]
# Scaling statistics come from the training features only and are reused for the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# STEP 8: KGE
def build_kge_matrix(df, cat_cols, embedding_dim=64, walk_length=10, num_walks=20, window=3, df_eval=None):
    """Build graph-walk embeddings for categorical values and average them per row.

    A graph is built over ``"<column>:<code>"`` nodes; two nodes are linked when
    their values occur together in a row of ``df``. Random walks over the graph
    fill a windowed co-occurrence matrix, which PCA reduces to at most
    ``embedding_dim`` components. A row's KGE vector is the mean of the
    embeddings of its categorical values, zero-padded up to ``embedding_dim``.
    Values never seen in ``df`` contribute a zero vector instead of borrowing
    the embedding of an unrelated node.

    The walks use ``random.choice`` and are therefore not reproducible unless
    the caller seeds the ``random`` module beforehand.

    Args:
        df: Frame the graph is built from; its rows produce the first matrix.
            Values in ``cat_cols`` must be convertible with ``int()``.
        cat_cols: Names of the categorical columns. When empty, two all-zero
            matrices are returned.
        embedding_dim: Width of the returned matrices.
        walk_length: Number of nodes in each random walk.
        num_walks: Number of walks started from every node.
        window: Co-occurrence window radius along a walk.
        df_eval: Frame whose rows produce the second matrix. When omitted, the
            notebook-level ``df_test`` is used if it exists (the behaviour of
            the original implementation); otherwise ``df`` itself is used.

    Returns:
        Tuple ``(kge_df, kge_eval)`` of arrays shaped
        ``(len(df), embedding_dim)`` and ``(len(df_eval), embedding_dim)``.
    """
    from itertools import combinations

    if df_eval is None:
        # Backward compatibility: callers that pass only `df` historically got
        # the second matrix computed from the module-level `df_test`.
        df_eval = globals().get("df_test", df)

    def zero_pair():
        # Each matrix is sized by its own frame (the second one used to be
        # sized by `df`, giving it the wrong number of rows).
        return (
            np.zeros((len(df), embedding_dim)),
            np.zeros((len(df_eval), embedding_dim)),
        )

    if not cat_cols:
        print("KGE skipped — no categorical columns found.")
        return zero_pair()

    cat_cols = list(cat_cols)

    G = nx.Graph()
    for col in cat_cols:
        for val in df[col].unique():
            G.add_node(f"{col}:{int(val)}")
    # Repeated rows add no new edges, so only distinct value combinations are
    # visited (first occurrences, in their original order).
    distinct_rows = df[cat_cols].drop_duplicates()
    for values in distinct_rows.itertuples(index=False, name=None):
        labels = [f"{col}:{int(val)}" for col, val in zip(cat_cols, values)]
        for left, right in combinations(labels, 2):
            G.add_edge(left, right)

    def random_walk(graph, start, length):
        walk = [start]
        for _ in range(length - 1):
            nbrs = list(graph.neighbors(walk[-1]))
            # An isolated node has nowhere to go; the walk stays in place.
            walk.append(choice(nbrs) if nbrs else walk[-1])
        return walk

    vocab = list(G.nodes())
    vocab_index = {n: i for i, n in enumerate(vocab)}
    co_matrix = np.zeros((len(vocab), len(vocab)))
    for node in vocab:
        for _ in range(num_walks):
            walk = random_walk(G, node, walk_length)
            for i, tgt in enumerate(walk):
                for j in range(max(0, i - window), min(len(walk), i + window + 1)):
                    if i != j:
                        co_matrix[vocab_index[tgt], vocab_index[walk[j]]] += 1

    # PCA cannot produce more components than the matrix has rows/columns.
    safe_dim = min(embedding_dim, min(co_matrix.shape))
    if safe_dim == 0:
        return zero_pair()
    embeddings = PCA(n_components=safe_dim).fit_transform(co_matrix)

    def get_kge(df_input):
        summed = np.zeros((len(df_input), safe_dim))
        for col in cat_cols:
            node_ids = np.fromiter(
                (vocab_index.get(f"{col}:{int(val)}", -1) for val in df_input[col].to_numpy()),
                dtype=np.intp,
                count=len(df_input),
            )
            seen = node_ids >= 0
            # Unseen values (id -1) are skipped, i.e. they contribute zeros.
            summed[seen] += embeddings[node_ids[seen]]
        averaged = summed / len(cat_cols)
        if safe_dim < embedding_dim:
            averaged = np.pad(averaged, ((0, 0), (0, embedding_dim - safe_dim)))
        return averaged

    print(f"KGE dim used: {safe_dim}")
    return get_kge(df), get_kge(df_eval)

# Build the per-row KGE feature matrices (train first, test second); both are
# all zeros when `cat_cols` is empty.
kge_train, kge_test = build_kge_matrix(df_train, cat_cols, embedding_dim=64)

# STEP 9: MEG model
class MappingNetwork(nn.Module):
    """Small MLP that projects KGE vectors of width ``kge_dim`` to ``embed_dim``.

    The stack is Linear -> ReLU -> Linear, stored as ``self.net``.
    """

    def __init__(self, kge_dim, embed_dim):
        super().__init__()
        project_in = nn.Linear(kge_dim, embed_dim)
        activation = nn.ReLU()
        project_out = nn.Linear(embed_dim, embed_dim)
        self.net = nn.Sequential(project_in, activation, project_out)

    def forward(self, kge):
        """Return the projected KGE tensor."""
        mapped = self.net(kge)
        return mapped

class OriginalMEG(nn.Module):
    """Autoencoder-style model that fuses feature and KGE embeddings.

    Features and KGE vectors are each projected to ``embed_dim``, summed,
    passed through a Transformer encoder and decoded back to ``input_dim``.

    Args:
        input_dim: Width of the feature vectors (also the output width).
        kge_dim: Width of the KGE vectors.
        embed_dim: Shared embedding width; must be divisible by ``nhead``.
        nhead: Number of attention heads per encoder layer (default 4, the
            value that used to be hard-coded).
        num_layers: Number of stacked encoder layers (default 2, the value
            that used to be hard-coded).

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``nhead``.
    """

    def __init__(self, input_dim, kge_dim, embed_dim, nhead=4, num_layers=2):
        super().__init__()
        if embed_dim % nhead != 0:
            # Fail early with a clear message instead of deep inside the attention module.
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by nhead ({nhead})."
            )
        self.token_embed = nn.Linear(input_dim, embed_dim)
        self.kge_mapper = MappingNetwork(kge_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.decoder = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, input_dim)
        )

    def forward(self, x, kge):
        """Reconstruct ``x`` from the fused feature/KGE embedding.

        Assumes ``x`` and ``kge`` are 2-D ``(batch, width)`` tensors, as passed
        by the training loop in this notebook.
        """
        x_embed = self.token_embed(x)
        kge_embed = self.kge_mapper(kge)
        fused = x_embed + kge_embed
        # unsqueeze(0) adds a leading axis of size 1. The encoder layer is built
        # without batch_first, so (per PyTorch's default layout) that axis is the
        # sequence axis: every row is a length-1 sequence.
        # NOTE(review): with a length-1 sequence, attention cannot mix
        # information across rows — confirm this is intended.
        encoded = self.encoder(fused.unsqueeze(0)).squeeze(0)
        return self.decoder(encoded)

# STEP 10: Train MEG
SEED = 42          # seeds torch's global RNG so this cell is repeatable
NUM_EPOCHS = 100   # single source of truth for the loop and the progress message
BATCH_SIZE = 64
torch.manual_seed(SEED)

X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
KGE_tensor = torch.tensor(kge_train, dtype=torch.float32)
train_loader = DataLoader(TensorDataset(X_tensor, KGE_tensor), batch_size=BATCH_SIZE, shuffle=True)

meg = OriginalMEG(input_dim=X_tensor.shape[1], kge_dim=KGE_tensor.shape[1], embed_dim=128)
opt = torch.optim.Adam(meg.parameters(), lr=0.001)
# Reconstruction objective: the model is trained to reproduce its own input features.
loss_fn = nn.MSELoss()
meg.train()
for epoch in range(NUM_EPOCHS):
    total = 0.0  # sum of the per-batch losses for this epoch
    for xb, kb in train_loader:
        pred = meg(xb, kb)
        loss = loss_fn(pred, xb)
        opt.zero_grad()
        loss.backward()
        opt.step()
        total += loss.item()
    # The denominator follows NUM_EPOCHS (it used to be a hard-coded "/10"
    # that disagreed with the 100-epoch loop).
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {total:.4f}")

# STEP 11: Generate synthetic data
# Switch the model to evaluation mode and disable gradient tracking for generation.
meg.eval()
with torch.no_grad():
    # Gaussian noise with the same shape as the training features replaces the
    # real inputs; the training KGE vectors are kept as the second input.
    noise = torch.randn_like(X_tensor)
    X_synth = meg(noise, KGE_tensor).numpy()
# Synthetic row i is paired with the label of training row i.
# NOTE(review): the only label-related signal reaching row i is KGE_tensor[i];
# when the KGE matrix is all zeros (no categorical columns) the synthetic
# features look independent of their labels — confirm this is intended.
y_synth = y_train.to_numpy()

# STEP 12: TSTR benchmark
# Each classifier is fitted on the synthetic set and scored on the real, scaled test set.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "MLP": MLPClassifier(max_iter=300),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}

# Metric name -> scorer taking (y_true, y_pred); the multi-class metrics are weighted averages.
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": lambda truth, guess: precision_score(truth, guess, average='weighted', zero_division=0),
    "Recall": lambda truth, guess: recall_score(truth, guess, average='weighted', zero_division=0),
    "F1 Score": lambda truth, guess: f1_score(truth, guess, average='weighted', zero_division=0),
}

results = {}
for name, clf in models.items():
    y_pred = clf.fit(X_synth, y_synth).predict(X_test_scaled)
    results[name] = {label: score(y_test, y_pred) for label, score in metric_fns.items()}

print("TSTR Benchmark Results (Train on Synthetic, Test on Real):")
display(pd.DataFrame(results).T)

# STEP 13: Save output
from google.colab import files

# Synthetic features under the training column names, plus the paired labels in a `label` column.
synth_df = pd.DataFrame(X_synth, columns=X_train.columns).assign(label=y_synth)
synth_df.to_csv("synthetic_meg_output.csv", index=False)

# Hand the written CSV to Colab's download helper.
files.download("synthetic_meg_output.csv")


Train file: shuttle (2).trn, Test file: shuttle (2).tst
Train shape: (43500, 10), Test shape: (14500, 10)
Inferred categorical columns: []


  return pd.read_csv(file, header=None, delim_whitespace=True)
  return pd.read_csv(file, header=None, delim_whitespace=True)


KGE skipped — no categorical columns found.




Epoch 1/10 | Loss: 235.3090
Epoch 2/10 | Loss: 230.0693
Epoch 3/10 | Loss: 223.0230
Epoch 4/10 | Loss: 244.3756
Epoch 5/10 | Loss: 237.4734
Epoch 6/10 | Loss: 225.3831
Epoch 7/10 | Loss: 236.8140
Epoch 8/10 | Loss: 211.6128
Epoch 9/10 | Loss: 190.8128
Epoch 10/10 | Loss: 189.0975
TSTR Benchmark Results (Train on Synthetic, Test on Real):


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.79131,0.6376,0.79131,0.699738
MLP,0.791586,0.626609,0.791586,0.699502
Random Forest,0.695862,0.679938,0.695862,0.686596
XGBoost,0.745379,0.650431,0.745379,0.691721


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

