# In [None]:  (Jupyter notebook cell marker left over from export)
import random
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx, to_networkx
from torch_geometric.nn import GCNConv

# Reproducibility: seed every RNG this script draws from with one shared value.
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Device: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load data
def load_data(path):
    """Read a feature CSV and split it into a feature table and a label column.

    The ``id`` column is discarded and the ``target`` column is returned
    separately; pandas raises ``KeyError`` if either column is missing.

    Args:
        path: Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        A ``(X, y)`` tuple where ``X`` is the DataFrame of all remaining
        columns and ``y`` is the ``target`` Series.
    """
    frame = pd.read_csv(path)
    frame = frame.drop(columns='id')
    labels = frame['target']
    feature_table = frame.drop(columns='target')
    return feature_table, labels

# Load the three splits from CSV files in the current working directory.
# Each call yields a (features, labels) pair; see load_data for the columns
# that are dropped.
train_X, train_y = load_data('Train_selected_features.csv')
val1_X, val1_y = load_data('Val1_selected_features.csv')
val2_X, val2_y = load_data('Val2_selected_features.csv')

# Graph construction
def build_graph(features, targets, threshold=0.95):
    """Build an undirected similarity graph with one node per sample.

    Nodes are numbered ``0 .. len(targets) - 1`` in row order. Each node stores
    its feature row under ``feature`` and its integer label under ``target``.
    Two distinct nodes are joined when the cosine similarity of their feature
    rows is strictly greater than ``threshold``; that similarity is stored on
    the edge as ``weight``.

    Note that the full ``n x n`` similarity matrix is materialised in memory.

    Args:
        features: DataFrame of per-sample features, one row per node.
        targets: Series of labels aligned row-for-row with ``features``.
        threshold: Similarity cut-off; only pairs strictly above it are linked.

    Returns:
        A ``networkx.Graph`` containing every sample as a node (including
        samples without neighbours) plus the weighted similarity edges.

    Raises:
        ValueError: If ``features`` and ``targets`` differ in length.
    """
    if len(features) != len(targets):
        raise ValueError(
            f"features has {len(features)} rows but targets has {len(targets)}"
        )

    # Pull the data out of pandas once instead of paying for .iloc per row.
    feature_matrix = features.to_numpy()
    labels = targets.to_numpy()

    sim_matrix = cosine_similarity(features)
    # Cosine similarity is symmetric, so the strict upper triangle (k=1) lists
    # every undirected pair exactly once and already excludes self-pairs.
    pairs = np.argwhere(np.triu(sim_matrix > threshold, k=1))

    G = nx.Graph()
    G.add_nodes_from(
        (idx, {'feature': row, 'target': int(label)})
        for idx, (row, label) in enumerate(zip(feature_matrix, labels))
    )
    # Cast endpoints to plain ints so edge keys have the same type as node ids.
    G.add_weighted_edges_from(
        (int(i), int(j), sim_matrix[i, j]) for i, j in pairs
    )
    return G

def to_pyg_data(G):
    """Convert a graph produced by ``build_graph`` into a PyG ``Data`` object.

    The result carries:
      * ``x``: float tensor of node features, one row per node,
      * ``y``: long tensor of node labels,
      * ``edge_attr``: 1-D float tensor of edge similarities aligned with
        ``edge_index`` (``None`` when the graph has no weighted edges), so it
        can be passed directly as ``edge_weight`` to the model,
      * ``train_mask``: all-True boolean mask (every node is supervised).

    Args:
        G: ``networkx.Graph`` whose nodes have ``feature`` and ``target``
            attributes and whose edges have a ``weight`` attribute.

    Returns:
        The ``Data`` object moved to the module-level ``device``.
    """
    data = from_networkx(G)
    node_ids = list(G.nodes())

    # Stack the rows into a single ndarray first; building a tensor from a
    # list of separate ndarrays is much slower.
    data.x = torch.tensor(
        np.array([G.nodes[i]['feature'] for i in node_ids]), dtype=torch.float
    )
    data.y = torch.tensor([G.nodes[i]['target'] for i in node_ids], dtype=torch.long)

    # from_networkx stores each edge attribute under its own name, so the
    # similarities arrive as ``data.weight`` and ``edge_attr`` is never filled
    # in automatically. Copy them over (as float32, flattened to [num_edges])
    # so the model actually receives the edge weights.
    if 'weight' in data:
        data.edge_attr = torch.as_tensor(data.weight, dtype=torch.float).reshape(-1)
    elif 'edge_attr' not in data:
        data.edge_attr = None

    data.train_mask = torch.ones(data.num_nodes, dtype=torch.bool)  # Full supervision
    return data.to(device)

# Build one independent similarity graph per split (default threshold 0.95)
# and convert each to a PyG Data object on `device`. The graphs are built
# separately, so no edges connect samples from different splits.
Train_data = to_pyg_data(build_graph(train_X, train_y))
Val1_data = to_pyg_data(build_graph(val1_X, val1_y))
Val2_data = to_pyg_data(build_graph(val2_X, val2_y))

# GCN model
class GCN(nn.Module):
    """Two graph-convolution layers followed by a linear classification head.

    Args:
        in_features: Number of input features per node.
        hidden_dim: Width of both graph-convolution layers.
        out_classes: Number of output classes (size of the logit vector).
        dropout: Dropout probability applied between the two convolutions.
            Defaults to 0.5.
    """

    def __init__(self, in_features, hidden_dim, out_classes, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(in_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, out_classes)

    def forward(self, x, edge_index, edge_weight=None):
        """Return per-node class logits.

        Args:
            x: Node feature tensor.
            edge_index: Edge list forwarded to both convolution layers.
            edge_weight: Optional per-edge weights forwarded to both
                convolution layers; ``None`` leaves the edges unweighted.

        Returns:
            Unnormalised class scores, one row per node.
        """
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        # Dropout is applied only after the first convolution.
        x = self.dropout(x)
        x = F.relu(self.conv2(x, edge_index, edge_weight))
        return self.fc(x)

# Initialize model
# Input width comes from the training features; the class count is inferred
# from the largest label present in the training split.
num_features = Train_data.x.shape[1]
num_classes = Train_data.y.max().item() + 1
model = GCN(num_features, hidden_dim=256, out_classes=num_classes)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Early stopping
class EarlyStopping:
    """Signal when a monitored loss has stopped improving.

    Call the instance once per epoch with the current loss. A call counts as
    an improvement only when the loss is more than ``delta`` below the best
    value seen so far; after ``patience`` consecutive calls without an
    improvement, ``should_stop`` becomes ``True``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        delta: Minimum decrease required to count as an improvement.
    """

    def __init__(self, patience=5, delta=0.01):
        self.patience, self.delta = patience, delta
        # Lowest loss accepted as an improvement so far.
        self.best_score = float('inf')
        # Consecutive calls since the last improvement.
        self.counter = 0
        # Set once patience is exhausted; never reset by this class.
        self.should_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` and update the stop flag."""
        improved = val_loss < self.best_score - self.delta
        if improved:
            self.best_score, self.counter = val_loss, 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.should_stop = True

# Shared stopper for the training loop, using the defaults patience=5, delta=0.01.
early_stopper = EarlyStopping()

# Training loop
def evaluate(data):
    """Score the module-level ``model`` on one graph without tracking gradients.

    Puts ``model`` into eval mode (it is not switched back here) and restricts
    all metrics to the nodes selected by ``data.train_mask``.

    Args:
        data: Graph object exposing ``x``, ``edge_index``, ``edge_attr``,
            ``y`` and ``train_mask``.

    Returns:
        A ``(loss, accuracy, predictions, probabilities)`` tuple: the
        cross-entropy loss and accuracy as Python floats, the predicted class
        index per masked node, and the softmax probabilities per masked node.
    """
    model.eval()
    mask = data.train_mask
    with torch.no_grad():
        logits = model(data.x, data.edge_index, data.edge_attr)
        labels = data.y[mask]
        loss_value = F.cross_entropy(logits[mask], labels).item()
        masked_preds = logits.argmax(dim=1)[mask]
        accuracy = (masked_preds == labels).float().mean().item()
        masked_probs = F.softmax(logits, dim=1)[mask]
    return loss_value, accuracy, masked_preds, masked_probs

# Per-epoch history, one entry per completed epoch; used for the curves below.
train_losses, val1_losses, val2_losses = [], [], []
train_accs, val1_accs, val2_accs = [], [], []

# Full-batch training: each epoch performs exactly one optimiser step on the
# whole training graph, for at most 100 epochs.
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(Train_data.x, Train_data.edge_index, Train_data.edge_attr)
    loss = F.cross_entropy(out[Train_data.train_mask], Train_data.y[Train_data.train_mask])
    loss.backward()
    optimizer.step()

    # Re-score all three splits in eval mode (dropout disabled) after the step.
    # The tr_*/v1_*/v2_* names are module-level, so after the loop they hold the
    # results of the last executed epoch and are reused when saving outputs.
    tr_loss, tr_acc, tr_pred, tr_probs = evaluate(Train_data)
    v1_loss, v1_acc, v1_pred, v1_probs = evaluate(Val1_data)
    v2_loss, v2_acc, v2_pred, v2_probs = evaluate(Val2_data)

    train_losses.append(tr_loss)
    val1_losses.append(v1_loss)
    val2_losses.append(v2_loss)

    train_accs.append(tr_acc)
    val1_accs.append(v1_acc)
    val2_accs.append(v2_acc)

    print(f"Epoch {epoch+1}: Train Loss={tr_loss:.4f}, Val1 Loss={v1_loss:.4f}, Val2 Loss={v2_loss:.4f}")
    # Early stopping monitors the Val1 loss only; Val2 is tracked but never
    # influences when training ends.
    early_stopper(v1_loss)
    if early_stopper.should_stop:
        print("Early stopping triggered.")
        break

# Save model
# NOTE(review): this stores the weights from the last executed epoch; no
# best-epoch checkpoint is kept or restored when early stopping fires.
torch.save(model.state_dict(), 'GCN_model_final.pth')

# Save predictions and probabilities
def save_outputs(filename_prefix, preds, probs):
    """Write predictions and class probabilities to ``<prefix>_outputs.csv``.

    The CSV has a ``predictions`` column followed by one ``class_<i>`` column
    per probability column, and no index column.

    Args:
        filename_prefix: Text placed before ``_outputs.csv`` in the file name.
        preds: 1-D tensor of predicted class indices.
        probs: 2-D tensor of per-class probabilities, one row per prediction.
    """
    prediction_frame = pd.DataFrame({'predictions': preds.cpu().numpy()})
    class_columns = [f'class_{i}' for i in range(probs.shape[1])]
    probability_frame = pd.DataFrame(probs.cpu().numpy(), columns=class_columns)
    combined = pd.concat([prediction_frame, probability_frame], axis=1)
    combined.to_csv(f'{filename_prefix}_outputs.csv', index=False)

# Dump the last-epoch predictions and probabilities for every split.
for split_prefix, split_preds, split_probs in (
    ("Train", tr_pred, tr_probs),
    ("Val1", v1_pred, v1_probs),
    ("Val2", v2_pred, v2_probs),
):
    save_outputs(split_prefix, split_preds, split_probs)

# Save classification report
def save_report(data, preds, name):
    """Write a classification report to ``<name>_classification_report.csv``.

    Args:
        data: Graph object whose ``y`` labels, restricted by ``train_mask``,
            serve as the ground truth.
        preds: Predicted class indices for the masked nodes, in the same order.
        name: Text placed before ``_classification_report.csv`` in the file
            name.
    """
    true_labels = data.y[data.train_mask].cpu()
    predicted_labels = preds.cpu()
    metrics = classification_report(true_labels, predicted_labels, output_dict=True)
    # Transpose so that each class / average becomes a row and each metric a column.
    report_table = pd.DataFrame(metrics).transpose()
    report_table.to_csv(f"{name}_classification_report.csv")

# One classification report per split, based on the last-epoch predictions.
for report_data, report_preds, report_name in (
    (Train_data, tr_pred, "Train"),
    (Val1_data, v1_pred, "Val1"),
    (Val2_data, v2_pred, "Val2"),
):
    save_report(report_data, report_preds, report_name)

# Plot losses and accuracies
def _save_curves(curves, ylabel, title, path):
    """Plot labelled per-epoch series on one figure and save it to ``path``.

    Args:
        curves: Iterable of ``(label, values)`` pairs, drawn in order.
        ylabel: Label for the y axis.
        title: Figure title.
        path: Output image file name.
    """
    plt.figure(figsize=(10,5))
    for label, values in curves:
        plt.plot(values, label=label)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.legend()
    plt.title(title)
    plt.savefig(path)
    plt.close()


_save_curves(
    [
        ("Train Loss", train_losses),
        ("Val1 Loss", val1_losses),
        ("Val2 Loss", val2_losses),
    ],
    ylabel="Loss",
    title="Loss Curves",
    path="loss_curves.png",
)

_save_curves(
    [
        ("Train Acc", train_accs),
        ("Val1 Acc", val1_accs),
        ("Val2 Acc", val2_accs),
    ],
    ylabel="Accuracy",
    title="Accuracy Curves",
    path="accuracy_curves.png",
)

# Save Graph Visualizations (Only Connected Nodes)
def save_graph_image(pyg_data, name):
    """Draw the connected part of a graph and save it as ``<name>_graph.png``.

    Nodes without any edge are left out of the picture. The remaining nodes
    are placed with a spring layout (fixed seed 42) and coloured by their
    label in ``pyg_data.y``.

    Args:
        pyg_data: PyG data object providing the edges and the ``y`` labels.
        name: Used in the figure title and as the output file name prefix.
    """
    graph = to_networkx(pyg_data, to_undirected=True)
    isolated_nodes = list(nx.isolates(graph))
    graph.remove_nodes_from(isolated_nodes)  # keep connected nodes only
    layout = nx.spring_layout(graph, seed=42)

    labels = pyg_data.y.cpu().numpy()
    draw_options = dict(
        # Colours follow the graph's node iteration order, as nx.draw expects.
        node_color=[labels[node] for node in graph.nodes()],
        cmap=plt.cm.Set3,
        node_size=60,
        edge_color='gray',
        alpha=0.8,
        with_labels=False,
    )

    plt.figure(figsize=(8, 8))
    nx.draw(graph, layout, **draw_options)
    plt.title(f"{name} Graph (Connected Nodes Only)")
    plt.tight_layout()
    plt.savefig(f"{name}_graph.png")
    plt.close()

# Render one picture per split.
for graph_data, graph_name in (
    (Train_data, "Train"),
    (Val1_data, "Val1"),
    (Val2_data, "Val2"),
):
    save_graph_image(graph_data, graph_name)
