In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install torch_geometric

In [None]:
import os
import glob
import torch
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn as nn
import torch.nn.functional as F

# Directory that is searched for serialized graph files.
GRAPH_DIR = "/kaggle/input/graphs/pyg_graphs/"

# Only files matching "*_kmeans*.pt" are collected; sorting makes the order deterministic.
kmeans_pattern = os.path.join(GRAPH_DIR, "*_kmeans*.pt")
pt_files = glob.glob(kmeans_pattern)
pt_files.sort()

print(f"Found {len(pt_files)} graph files in {GRAPH_DIR}.")

In [None]:
# One dict per loaded graph: file/subject/task metadata, summary statistics and the graph itself.
graph_info_list = []

# Maps the task token found in each file name to the integer class label.
task_label_map = {"WM": 0, "EMOTION": 1, "GAMBLING": 2, "MOTOR": 3, "RELATIONAL": 4, "SOCIAL": 5, "LANGUAGE": 6}

for pt_path in pt_files:
    fname = os.path.basename(pt_path)
    parts = fname.split("_")

    # The subject id is read from token 0 and the task from token 2;
    # names with fewer than three "_"-separated tokens cannot be parsed.
    if len(parts) < 3:
        continue

    subject_id = parts[0]
    task_str = parts[2]

    if task_str not in task_label_map:
        # Tasks without a label are skipped
        continue

    # The files hold pickled graph objects rather than plain tensors, so the
    # restricted unpickler (the default since PyTorch 2.6) must be switched off.
    # SECURITY: full unpickling can execute arbitrary code -- only load trusted files.
    data = torch.load(pt_path, weights_only=False)

    # .cpu() keeps the NumPy conversion working even if the tensors were saved on a GPU.
    edge_index = data.edge_index.cpu().numpy()
    if data.edge_attr is not None:
        edge_weight = data.edge_attr.cpu().numpy().flatten()
    else:
        # No edge attributes: every edge gets weight 1
        edge_weight = np.ones(edge_index.shape[1])

    # Undirected weighted graph. add_edge creates missing endpoints on its own,
    # and a repeated (u, v) / (v, u) pair keeps the weight that was inserted last.
    G_nx = nx.Graph()
    for i in range(edge_index.shape[1]):
        G_nx.add_edge(edge_index[0, i], edge_index[1, i], weight=float(edge_weight[i]))

    # Only nodes that take part in at least one edge are counted here.
    n_nodes = G_nx.number_of_nodes()
    n_edges = G_nx.number_of_edges()
    density = nx.density(G_nx)

    weights = [d["weight"] for _, _, d in G_nx.edges(data=True)]
    avg_weight = np.mean(weights) if weights else 0

    # Node strength = sum of the weights of all edges incident to the node.
    strength_list = [
        sum(d["weight"] for _, _, d in G_nx.edges(node, data=True))
        for node in G_nx.nodes()
    ]
    avg_strength = np.mean(strength_list) if strength_list else 0

    label = task_label_map[task_str]
    graph_info = {
        "filepath": pt_path,
        "subject": subject_id,
        "task": task_str,
        "label": label,
        "n_nodes": n_nodes,
        "n_edges": n_edges,
        "density": density,
        "avg_weight": avg_weight,
        "avg_strength": avg_strength,
        "pyg_data": data,  # store the actual PyG graph
    }
    graph_info_list.append(graph_info)

# Report the tasks that are actually eligible instead of a hard-coded subset.
print(f"Loaded {len(graph_info_list)} graphs ({'/'.join(task_label_map)}).")


In [None]:
# One row per loaded graph (metadata, summary statistics and the graph object).
df = pd.DataFrame(graph_info_list)
print(df.head())

# Draw one task-grouped box plot per summary metric.
boxplot_specs = (
    ('avg_weight', "Average edge weight by task"),
    ('avg_strength', "Average strength by task"),
)
for metric_col, plot_title in boxplot_specs:
    df[['task', metric_col]].boxplot(by='task', column=metric_col, grid=False)
    plt.title(plot_title)
    plt.suptitle("")  # drop the automatic "Boxplot grouped by ..." super-title
    plt.show()

In [None]:
# Attach the integer class label to every stored graph as its target `y`.
all_graphs = []
for info in graph_info_list:
    graph = info["pyg_data"]
    graph.y = torch.tensor([info["label"]], dtype=torch.long)
    all_graphs.append(graph)

print(f"Number of labeled graphs for GNN: {len(all_graphs)}")

# 80/20 train/test split over a seeded shuffle of the graph indices.
n_total = len(all_graphs)
train_size = int(0.8 * n_total)
test_size = n_total - train_size

rng = np.random.default_rng(42)
indices = np.arange(n_total)
rng.shuffle(indices)

train_indices, test_indices = indices[:train_size], indices[train_size:]

train_dataset = [all_graphs[k] for k in train_indices]
test_dataset = [all_graphs[k] for k in test_indices]

print(f"Train set size: {len(train_dataset)}, Test set size: {len(test_dataset)}")

from torch_geometric.loader import DataLoader

# Mini-batches of two graphs; only the training loader reshuffles every epoch.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=2)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=2)

class GCNGraphClassifier(nn.Module):
    """Graph-level classifier: two GCN layers, mean pooling over nodes, one linear head."""

    def __init__(self, in_channels=1, hidden_dim=32, num_classes=2):
        """
        Args:
            in_channels: number of feature columns per node (1 if data.x is [num_nodes, 1]).
            hidden_dim: output width of both GCN layers.
            num_classes: number of logits produced per graph.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.lin = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        """
        Args:
            x: node features, [total_nodes_in_batch, in_channels].
            edge_index: connectivity, [2, E].
            batch: [total_nodes_in_batch], the graph each node belongs to.

        Returns:
            Logits of shape [num_graphs, num_classes].
        """
        hidden = x
        # Each GCN layer is followed by a ReLU
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden, edge_index))

        # Collapse node embeddings into one vector per graph: [num_graphs, hidden_dim]
        pooled = global_mean_pool(hidden, batch)
        return self.lin(pooled)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Infer the node-feature width from the first graph (falls back to 1 when it has no x).
sample_data = all_graphs[0]
in_channels = sample_data.x.shape[1] if sample_data.x is not None else 1

# One logit per entry of task_label_map. The labels assigned from that map run
# from 0 to len(task_label_map) - 1, and CrossEntropyLoss rejects any target
# that is >= the number of logits, so a 2-class head cannot be used here.
num_classes = len(task_label_map)
model = GCNGraphClassifier(in_channels=in_channels, hidden_dim=32, num_classes=num_classes)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

def train_epoch(loader):
    """Run one optimisation pass over `loader` and return the mean loss per graph.

    Uses the notebook-level `model`, `optimizer`, `criterion` and `device`.

    Args:
        loader: iterable of batches exposing x, edge_index, batch, y and num_graphs.

    Returns:
        Sum of per-batch losses weighted by batch size, divided by the number of
        graphs actually processed; 0 when the loader yields no graphs.
    """
    model.train()
    total_loss = 0.0
    seen_graphs = 0
    for batch_data in loader:
        batch_data = batch_data.to(device)
        optimizer.zero_grad()
        out = model(batch_data.x, batch_data.edge_index, batch_data.batch)
        loss = criterion(out, batch_data.y)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its size so a smaller final batch is not over-counted.
        total_loss += loss.item() * batch_data.num_graphs
        seen_graphs += batch_data.num_graphs
    # Normalise by the graphs that were really seen (robust to drop_last / samplers)
    # and mirror eval_accuracy's guard against an empty loader.
    return total_loss / seen_graphs if seen_graphs > 0 else 0

@torch.no_grad()  # inference only: skip autograd bookkeeping to save memory and time
def eval_accuracy(loader):
    """Return the fraction of graphs in `loader` whose arg-max prediction equals the label.

    Uses the notebook-level `model` and `device`; the model is switched to eval mode.

    Args:
        loader: iterable of batches exposing x, edge_index, batch, y and num_graphs.

    Returns:
        correct / total, or 0 when the loader yields no graphs.
    """
    model.eval()
    correct = 0
    total = 0
    for batch_data in loader:
        batch_data = batch_data.to(device)
        out = model(batch_data.x, batch_data.edge_index, batch_data.batch)
        preds = out.argmax(dim=1)
        correct += (preds == batch_data.y).sum().item()
        total += batch_data.num_graphs
    return correct / total if total > 0 else 0

epochs = 100
for epoch in range(1, epochs + 1):
    # One optimisation pass, then accuracy on the train and test loaders (in that order)
    loss = train_epoch(train_loader)
    acc_train, acc_test = (eval_accuracy(split) for split in (train_loader, test_loader))
    print(f"Epoch {epoch:02d}, Loss={loss:.4f}, TrainAcc={acc_train:.3f}, TestAcc={acc_test:.3f}")

In [None]:
import matplotlib.pyplot as plt

def data_to_adjacency(data: Data):
    """
    Build a dense, symmetric (n_nodes x n_nodes) float32 NumPy adjacency matrix
    from a PyG Data object.

    Entries are taken from data.edge_attr (flattened) when it is present;
    otherwise every edge gets weight 1. Cells without an edge stay 0.
    """
    edge_index = data.edge_index.cpu().numpy()
    has_weights = data.edge_attr is not None
    edge_weight = (
        data.edge_attr.cpu().numpy().flatten()
        if has_weights
        else np.ones(edge_index.shape[1], dtype=np.float32)
    )

    size = data.num_nodes
    adjacency = np.zeros((size, size), dtype=np.float32)

    # Each column of edge_index is one (source, target) pair.
    for k, (src, dst) in enumerate(edge_index.T):
        # Undirected graph: mirror the weight across the diagonal
        adjacency[src, dst] = adjacency[dst, src] = edge_weight[k]
    return adjacency

tasks_of_interest = ["WM", "EMOTION", 'GAMBLING', 'MOTOR', 'RELATIONAL', 'SOCIAL', 'LANGUAGE']
sample_size_per_task = 3

# Gather (task, subject, graph) triples: the first few rows of each task, in list order.
plot_graphs = []
for tsk in tasks_of_interest:
    subset = df[df["task"] == tsk].head(sample_size_per_task)
    plot_graphs.extend(zip(subset["task"], subset["subject"], subset["pyg_data"]))

n_panels = len(plot_graphs)
if n_panels == 0:
    print("No graphs found for the specified tasks.")
else:
    # One heat map per selected graph, stacked vertically.
    fig, axes = plt.subplots(n_panels, 1, figsize=(6, 4*n_panels))
    if n_panels == 1:
        axes = [axes]  # a single Axes is returned bare; wrap it so zip() works

    for ax, (task_str, subj, gdata) in zip(axes, plot_graphs):
        # Fixed colour range, assuming the weights are correlations in -1..1
        im = ax.imshow(data_to_adjacency(gdata), cmap="bwr", vmin=-1, vmax=1)
        ax.set_title(f"Subject: {subj}, Task: {task_str}, Nodes={gdata.num_nodes}")
        ax.set_xlabel("Node")
        ax.set_ylabel("Node")
        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)

    plt.tight_layout()
    plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def _show_task_plot(title, xlabel, ylabel):
    """Label the current axes, tilt the x tick labels by 45 degrees, tighten the layout and render."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Edge count per task
plt.figure(figsize=(6,4))
sns.boxplot(data=df, x="task", y="n_edges")
_show_task_plot("Distribution of #Edges across Tasks", "Task", "#Edges")

# Density per task: box plot with the individual graphs overlaid as points
plt.figure(figsize=(6,4))
sns.boxplot(data=df, x="task", y="density", color="lightgray")
sns.stripplot(data=df, x="task", y="density", color="red", alpha=0.6)
_show_task_plot("Graph Density across Tasks", "Task", "Density")

# Per-task mean of the selected metric
metric = "avg_strength"
df_agg = df.groupby("task")[metric].mean().reset_index()

plt.figure(figsize=(6,4))
sns.barplot(data=df_agg, x="task", y=metric, color="skyblue")
_show_task_plot(f"Mean {metric} by Task", "Task", f"{metric}")

# Distribution of the average edge weight per task
plt.figure(figsize=(6,4))
sns.violinplot(data=df, x="task", y="avg_weight")
_show_task_plot("Distribution of Avg Weight by Task", "Task", "Average Edge Weight")
