In [None]:
from torch_geometric.datasets import OGB_MAG

In [41]:
import json
from pathlib import Path
from mt.helper import flatten
from datetime import datetime
from dateutil.parser import parse
from typing import Any
import networkx as nx
from torch_geometric.data import Data, Batch
import requests
import re
import torch
from torch.utils.data import Dataset
from torch_geometric.loader import DataLoader
import pickle
import concurrent.futures
import math
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import networkx as nx
import numpy as np
from sklearn.model_selection import train_test_split
from functools import cached_property
from mt.definitions import REPO_DIR
from sklearn.preprocessing import MinMaxScaler

In [53]:
# all, cfg, ch22, dg, ast
# also npy features
g_types = {"ast": 1, "ch22": 2, "cfg": 3, "dg": 4, "cfg_ast": 5}
repo_dir = REPO_DIR / "pytorch/vision"

In [54]:
class GraphDataset(Dataset):
    def __init__(self, repo_dir: Path, graph_type: str) -> None:
        self.graph_type = graph_type
        self.repo_dir = repo_dir
        with open(repo_dir / "maps.json") as f:
            self.maps = json.load(f)
        
        self.node_type = len(self.maps[graph_type]["nodes"])
        self.g_type_node = len(g_types) - 1
        self.g_type_edge = len(g_types) if graph_type != "all" else len(g_types) + 1
        self.edge_type = len(self.maps[graph_type]["edges"])

        self.root_dir = repo_dir / "pts"
        with open(self.repo_dir / "residuals.pkl", "rb") as f:
            self.targets = pickle.load(f)
        self.files = list(self.root_dir.glob(f"{graph_type}_*.pt"))
        self.num_samples = len(self.files)

        target_values = list(self.targets.values())
        self.scaler = MinMaxScaler()
        self.scaler.fit([[target] for target in target_values])

        # Scale the targets and store them in a dictionary
        self.scaled_targets = {key: self.scaler.transform([[value]])[0][0] for key, value in self.targets.items()}

    def __len__(self) -> int:
        100
        # return self.num_samples

    def __getitem__(self, idx: int) -> tuple[str, Batch, torch.FloatTensor]:
        commit_sha, graph = torch.load(self.root_dir / f"{self.graph_type}_{idx}.pt")
        scaled_target = self.scaled_targets[commit_sha]
        graph.y = torch.tensor(scaled_target, dtype=torch.float32)
        return commit_sha, graph

In [55]:
dataset = GraphDataset(repo_dir, "dg")

In [59]:
dataset[1]

('c9eab681e4bd8800fd0169c26e225b64f696cb8a',
 Data(x=[3507, 2], edge_index=[2, 3382], edge_attr=[3382, 2], y=0.5616480708122253))

In [60]:
i_s = {}
for i in range(100): 
    c_s, _ = dataset[i]
    i_s[i] = c_s

In [63]:
with open(repo_dir / 'idx_to_sha.pkl', "wb") as f:
    pickle.dump(i_s, f)

In [139]:
dataset.__class__.__name__

'GraphDataset'

In [136]:
dataset[0][1].x[:, 1].max()

tensor(4)

In [140]:
import random

# Define the total number of indexes
total_indexes = 1000

# Generate a list of indexes from 0 to 999
indexes = list(range(total_indexes))

# Randomly sample 100 indexes from the list
sampled_indexes = random.sample(indexes, 100)

print(sampled_indexes)


[231, 333, 627, 324, 99, 879, 831, 319, 136, 47, 513, 673, 934, 211, 147, 305, 765, 335, 871, 557, 395, 297, 970, 302, 497, 572, 288, 25, 462, 67, 774, 316, 456, 163, 807, 289, 138, 680, 329, 779, 975, 176, 760, 492, 762, 510, 294, 579, 190, 993, 151, 240, 146, 905, 743, 539, 690, 85, 260, 158, 617, 619, 900, 654, 672, 300, 89, 797, 257, 81, 474, 182, 906, 675, 963, 506, 910, 264, 325, 307, 734, 568, 755, 663, 76, 56, 315, 388, 268, 600, 290, 251, 573, 756, 49, 744, 84, 887, 142, 772]


In [None]:
import pickle

with open("/home/fox/projects/general/learning-portfolio/university/masters/masters-thesis/data/repos/pytorch/vision/sample.pkl", "wb") as f:
    pickle.dump(sampled_indexes, f)

In [137]:
dataset.g_type_node, dataset.g_type_edge, dataset.edge_type, dataset.node_type

(5, 6, 17, 230)

In [8]:
data = dataset[0][1]
data

Data(x=[961808, 2], edge_index=[2, 573692], edge_attr=[573692, 2], y=0.5556620955467224)

In [82]:
data.num_nodes

3507

In [83]:
num_nodes = data.num_nodes
indices = torch.randperm(num_nodes)
train_idx = indices[:int(0.8 * num_nodes)]
val_idx = indices[int(0.8 * num_nodes):int(0.9 * num_nodes)]
test_idx = indices[int(0.9 * num_nodes):]

In [84]:
from torch_geometric.loader import NeighborLoader

In [101]:
# Create loaders using NeighborLoader
train_loader = NeighborLoader(
    data,
    num_neighbors=[1000] * 2,  # Adjust number of neighbors as needed
    batch_size=1000,
    input_nodes=train_idx,
    shuffle=True,
)

In [119]:
# Example iteration through the loader
for batch in train_loader:
    print(batch)
    print(batch)
    break

Data(x=[351, 2], edge_index=[2, 301], edge_attr=[301, 2], y=0.5556620955467224, n_id=[351], e_id=[301], input_id=[240], batch_size=240)
Data(x=[351, 2], edge_index=[2, 301], edge_attr=[301, 2], y=0.5556620955467224, n_id=[351], e_id=[301], input_id=[240], batch_size=240)


In [11]:
dataset[0][1]

Data(x=[2958276, 2], edge_index=[2, 4742679], edge_attr=[4742679, 2])

In [39]:
loader = NeighborLoader(
    dataset[0][1],
    # Sample 30 neighbors for each node for 2 iterations
    num_neighbors=[1000] * 3,
    # Use a batch size of 128 for sampling training nodes
    batch_size=128,
    input_nodes=len(dataset[0][1]),
)

NameError: name 'NeighborLoader' is not defined

In [11]:
from torch.utils.data import random_split
from torch_geometric.loader import DataLoader

In [10]:
import torch
from torch_geometric.data import Data, Batch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool, AttentionalAggregation, GATv2Conv, GCNConv, GINEConv, SAGEConv
from torch.nn import Sequential, Linear, ReLU

In [12]:
class NodeFeatureEmbedding(nn.Module):
    def __init__(self, num_x, num_y, emb_x, emb_y):
        super(NodeFeatureEmbedding, self).__init__()
        self.embedding_x = nn.Embedding(num_x, emb_x)
        self.embedding_y = nn.Embedding(num_y, emb_y)
        
    def forward(self, x):
        # Assuming x is of shape (num_nodes, 2), where each entry is a category index
        x_embedding = self.embedding_x(x[:, 0])
        y_embedding = self.embedding_y(x[:, 1])
        # Concatenate the embeddings along the last dimension
        return torch.cat([x_embedding, y_embedding], dim=-1)

class EdgeFeatureEmbedding(nn.Module):
    def __init__(self, num_x, num_y, emb_x, emb_y):
        super(EdgeFeatureEmbedding, self).__init__()
        self.embedding_x = nn.Embedding(num_x, emb_x)
        self.embedding_y = nn.Embedding(num_y, emb_y)
        
    def forward(self, edge_attr):
        # Assuming edge_attr is of shape (num_edges, 2), where each entry is a category index
        x_embedding = self.embedding_x(edge_attr[:, 0])
        y_embedding = self.embedding_y(edge_attr[:, 1])
        # Concatenate the embeddings along the last dimension
        return torch.cat([x_embedding, y_embedding], dim=-1)

In [13]:
node_emb_model = NodeFeatureEmbedding(dataset.node_type, dataset.g_type_node, 56, 8)
edge_emb_model = EdgeFeatureEmbedding(dataset.edge_type, dataset.g_type_edge, 56, 8)

In [15]:
commit_sha, data = dataset[0]

In [16]:
data

Data(x=[961808, 2], edge_index=[2, 573692], edge_attr=[573692, 2], y=0.5556620955467224)

In [17]:
emb_x = node_emb_model(data.x)
edge_index = data.edge_index
emb_edge_attr = edge_emb_model(data.edge_attr)
emb_x.shape, emb_edge_attr.shape
batch = data.batch

In [37]:
# Simple
class GCNModel(nn.Module):
    def __init__(self, node_embedding_dim, edge_embedding_dim, hidden_channels, out_channels=1):
        super(GCNModel, self).__init__()
        self.conv1 = GCNConv(node_embedding_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        
        self.edge_mlp = nn.Sequential(
            nn.Linear(edge_embedding_dim, hidden_channels),
            nn.ReLU(),
            nn.Linear(hidden_channels, 1)  # Transform edge attributes to scalar weights
        )
        
        self.regressor = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_attr, batch):
        edge_weight = self.edge_mlp(edge_attr).squeeze()  # Transform edge attributes to edge weights

        x = F.relu(self.conv1(x, edge_index, edge_weight))
        x = F.relu(self.conv2(x, edge_index, edge_weight))
        x = F.relu(self.conv3(x, edge_index, edge_weight))
        
        x = global_mean_pool(x, batch)
        
        return self.regressor(x).squeeze()

In [38]:
model = GCNModel(64, 64, 124)
model(emb_x, edge_index, emb_edge_attr, batch)

tensor(-0.3304, grad_fn=<SqueezeBackward0>)

In [12]:
# Attention based
class GATModel(nn.Module):
    def __init__(self, node_embedding_dim, edge_embedding_dim, hidden_channels, agg_hidden_channels, out_channels=1, num_heads=2):
        super(GATModel, self).__init__()
        self.conv1 = GATv2Conv(node_embedding_dim, hidden_channels, heads=num_heads, concat=True, edge_dim=edge_embedding_dim)
        self.conv2 = GATv2Conv(hidden_channels * num_heads, hidden_channels, heads=num_heads, concat=True, edge_dim=edge_embedding_dim)
        self.conv3 = GATv2Conv(hidden_channels * num_heads, hidden_channels, heads=1, concat=False, edge_dim=edge_embedding_dim)
        
        self.gate_nn = nn.Sequential(
            nn.Linear(hidden_channels, agg_hidden_channels),
            nn.ReLU(),
            nn.Linear(agg_hidden_channels, 1)
        )

        self.attention_aggregation = AttentionalAggregation(gate_nn=self.gate_nn)
        
        self.regressor = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_attr, batch):
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x = F.relu(self.conv2(x, edge_index, edge_attr))
        x = self.conv3(x, edge_index, edge_attr)
        
        x = self.attention_aggregation(x, batch)
        
        return self.regressor(x).squeeze()

In [13]:
# Structure, sequence based
class GINEModel(nn.Module):
    def __init__(self, node_embedding_dim, edge_embedding_dim, hidden_channels, lstm_hidden_dim, out_channels=1):
        super(GINEModel, self).__init__()
        nn1 = Sequential(Linear(node_embedding_dim, hidden_channels), ReLU(), Linear(hidden_channels, hidden_channels))
        nn2 = Sequential(Linear(hidden_channels, hidden_channels), ReLU(), Linear(hidden_channels, hidden_channels))
        nn3 = Sequential(Linear(hidden_channels, hidden_channels), ReLU(), Linear(hidden_channels, hidden_channels))
        
        self.conv1 = GINEConv(nn1, edge_dim=edge_embedding_dim)
        self.conv2 = GINEConv(nn2, edge_dim=edge_embedding_dim)
        self.conv3 = GINEConv(nn3, edge_dim=edge_embedding_dim)
        
        self.lstm = nn.LSTM(hidden_channels, lstm_hidden_dim, batch_first=True)
        
        self.regressor = nn.Linear(lstm_hidden_dim, out_channels)

    def forward(self, x, edge_index, edge_attr, batch):
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x = F.relu(self.conv2(x, edge_index, edge_attr))
        x = F.relu(self.conv3(x, edge_index, edge_attr))
        
        # Aggregate node embeddings into a graph-level embedding
        x = global_mean_pool(x, batch)
        
        # Expand dimensions to fit LSTM input requirements: (batch_size, seq_len, input_dim)
        x = x.unsqueeze(1)  # Assuming each graph is treated as a single sequence
        
        # Pass through LSTM
        x, _ = self.lstm(x)
        
        # Use the output of the last LSTM cell
        x = x[:, -1, :]
        
        return self.regressor(x).squeeze()

In [52]:
graph_types = [""]

for graph_type in graph_types:
    dataset = GraphDataset(repo_dir, graph_type)
    node_emb_model = NodeFeatureEmbedding(dataset.node_type, dataset.g_type_node, 56, 8)
    edge_emb_model = EdgeFeatureEmbedding(dataset.edge_type, dataset.g_type_edge, 56, 8)
    model1, model2, model3 = GCNModel(64, 64, 1), GATModel(64, 64, 1, 1), GINEModel(64, 64, 1, 1)

    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1)
    test_loader = DataLoader(test_dataset, batch_size=1)

    for commit_sha, graph, target in train_loader:
        emb_x = node_emb_model(graph.x)
        edge_index = graph.edge_index
        emb_edge_attr = edge_emb_model(graph.edge_attr)
        batch = graph.batch
        print(model1(emb_x, edge_index, emb_edge_attr, batch).view(-1))
        print(target)
        break

tensor([-0.0061], grad_fn=<ViewBackward0>)
tensor([0.5656])


In [None]:
def train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=50, patience=10):
    model.to(device)
    best_val_loss = float('inf')
    patience_counter = 0
    train_losses = []
    val_losses = []
    
    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for commit_sha, graph, target in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            graph = graph.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            emb_x = node_emb_model(graph.x.to(device))
            edge_index = graph.edge_index.to(device)
            emb_edge_attr = edge_emb_model(graph.edge_attr.to(device))
            batch = graph.batch.to(device)
            output = model(emb_x, edge_index, emb_edge_attr, batch)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for commit_sha, graph, target in val_loader:
                graph = graph.to(device)
                target = target.to(device)
                emb_x = node_emb_model(graph.x.to(device))
                edge_index = graph.edge_index.to(device)
                emb_edge_attr = edge_emb_model(graph.edge_attr.to(device))
                batch = graph.batch.to(device)
                output = model(emb_x, edge_index, emb_edge_attr, batch)
                val_loss += criterion(output, target).item()
        avg_val_loss = val_loss / len(val_loader)
        val_losses.append(avg_val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

        # Step the scheduler
        scheduler.step(avg_val_loss)

    # Load the best model
    model.load_state_dict(torch.load('best_model.pt'))
    
    return model, train_losses, val_losses

In [None]:
repo_dir = Path('/path/to/repo')
graph_types = ["ast", "cfg", "ddg", "pdg", "cdg"]  # Example graph types
models = {
    "GCNModel": GCNModel(64, 64, 100, 1),
    "GATModel": GATModel(64, 64, 100, 100, 1),
    "GINEModel": GINEModel(64, 64, 100, 100, 1)
}

results = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for graph_type in graph_types:
    dataset = GraphDataset(repo_dir, graph_type)
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)
    test_loader = DataLoader(test_dataset, batch_size=32)

    for model_name, model_class in models.items():
        # Initialize node and edge embeddings
        node_emb_model = NodeFeatureEmbedding(dataset.node_type, dataset.g_type_node, 56, 8).to(device)
        edge_emb_model = EdgeFeatureEmbedding(dataset.edge_type, dataset.g_type_edge, 56, 8).to(device)
        
        # Initialize model, optimizer, and loss criterion
        model = model_class.to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()
        
        # Train the model
        print(f'Training {model_name} on {graph_type} dataset...')
        model, train_losses, val_losses = train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=50)
        
        # Evaluate on test set
        model.eval()
        test_true = []
        test_pred = []
        with torch.no_grad():
            for commit_sha, graph, target in test_loader:
                graph = graph.to(device)
                target = target.to(device)
                emb_x = node_emb_model(graph.x.to(device))
                edge_index = graph.edge_index.to(device)
                emb_edge_attr = edge_emb_model(graph.edge_attr.to(device))
                batch = graph.batch.to(device)
                output = model(emb_x, edge_index, emb_edge_attr, batch)
                test_true.append(target.cpu().numpy())
                test_pred.append(output.cpu().numpy())
        
        test_true = np.concatenate(test_true)
        test_pred = np.concatenate(test_pred)
        mse = mean_squared_error(test_true, test_pred)
        
        # Save results
        results.append({
            "model": model_name,
            "graph_type": graph_type,
            "test_true": test_true,
            "test_pred": test_pred,
            "mse": mse
        })
        
        # Save the model
        torch.save(model.state_dict(), f'{model_name}_{graph_type}.pt')

# Save all results to a file
with open('results.pkl', 'wb') as f:
    pickle.dump(results, f)