In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import RGCNConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix

# ==============================
# Step 1: Load Data and Build Graph Structures
# ==============================

# Read the raw event table from features.csv.
df = pd.read_csv("features.csv")

# Give every distinct actor_name a dense integer id and store it per row.
actor_names = df['actor_name'].unique()
user_mapping = dict(zip(actor_names, range(len(actor_names))))
df['user_id'] = df['actor_name'].map(user_mapping)

# Map every distinct repo_index value from the CSV to a dense row position.
unique_repos = df['repo_index'].unique()
repo_mapping = dict(zip(unique_repos, range(len(unique_repos))))

# Node counts for the two sides of the bipartite graph.
num_users = len(user_mapping)
num_repos = len(unique_repos)

# Distinct event types found in the data.
event_types = df['event_type'].unique()
num_event_types = len(event_types)

# 1a. Build binary repo-user bipartite adjacency matrices (one per event type).
# The matrices are assembled directly in sparse form. Allocating a dense
# (num_repos x num_users) array for every event type and filling it row by row
# with iterrows() needs memory proportional to repos * users * event types.
_edge_repo_rows = df['repo_index'].map(repo_mapping).to_numpy(dtype=np.intp)
_edge_user_cols = df['user_id'].to_numpy(dtype=np.intp)
_edge_event_types = df['event_type'].to_numpy()

repo_user_sparse_matrices = {}
for event in event_types:
    # Boolean selector for the rows of df that belong to this event type.
    selected = _edge_event_types == event
    adjacency = csr_matrix(
        (
            np.ones(int(selected.sum())),
            (_edge_repo_rows[selected], _edge_user_cols[selected]),
        ),
        shape=(num_repos, num_users),
    )
    # Building from coordinates sums repeated (repo, user) pairs; reset the
    # stored values to 1 so the matrix stays binary.
    adjacency.data[:] = 1.0
    repo_user_sparse_matrices[event] = adjacency

# 1b. Build the repository feature matrix (sum of event counts per repo).
# Column j accumulates num_event for event_types[j].
event_column = {event: col for col, event in enumerate(event_types)}
repo_feature_matrix = np.zeros((num_repos, num_event_types))
# np.add.at is unbuffered, so rows of df that hit the same (repo, event) cell
# are all accumulated; a plain fancy-indexed += would keep only one of them.
np.add.at(
    repo_feature_matrix,
    (
        df['repo_index'].map(repo_mapping).to_numpy(dtype=np.intp),
        df['event_type'].map(event_column).to_numpy(dtype=np.intp),
    ),
    df['num_event'].to_numpy(),
)

# Global z-score normalization (all values together)
global_mean = repo_feature_matrix.mean()
global_std = repo_feature_matrix.std()
if global_std == 0:
    # All entries are identical; dividing by zero would turn every feature
    # into NaN, so leave the centred (all-zero) matrix unscaled instead.
    global_std = 1.0
repo_feature_matrix = (repo_feature_matrix - global_mean) / global_std

# Convert to torch tensor
repo_features = torch.tensor(repo_feature_matrix, dtype=torch.float)

# Initialize user features as zeros (same number of features as repo features)
user_features = torch.zeros((num_users, num_event_types), dtype=torch.float)

# 1c. Build the global (combined) edge index for the bipartite graph.
# Nodes are combined so that 0..num_repos-1 are repos and
# num_repos..num_repos+num_users-1 are users.

edge_indices_list = []
edge_types_list = []

for event_idx, event in enumerate(event_types):
    # Get the sparse matrix for this event type and convert to COO format
    coo = repo_user_sparse_matrices[event].tocoo()
    # Sources are repo indices (already in 0..num_repos-1); targets are user
    # indices shifted by num_repos into the combined node numbering.
    rows = coo.row
    cols = coo.col + num_repos
    # Stack into a single (2, E) ndarray before handing it to torch: calling
    # torch.tensor() on a Python list of ndarrays is a slow path that PyTorch
    # warns about.
    edge_index_ru = torch.tensor(np.stack([rows, cols]), dtype=torch.long)
    edge_indices_list.append(edge_index_ru)
    edge_types_list.append(torch.full((edge_index_ru.shape[1],), event_idx, dtype=torch.long))

# Concatenate all repo->user edges and corresponding types
if edge_indices_list:
    edge_index_ru = torch.cat(edge_indices_list, dim=1)
    edge_type_ru = torch.cat(edge_types_list, dim=0)
else:
    # torch.cat rejects an empty list; fall back to an edgeless graph.
    edge_index_ru = torch.empty((2, 0), dtype=torch.long)
    edge_type_ru = torch.empty((0,), dtype=torch.long)

# For the reverse edges (user->repo), swap the source and target rows.
edge_index_ur = edge_index_ru[[1, 0]]  # source is user (global index), target is repo
edge_type_ur = edge_type_ru.clone()  # same relation type as the forward edge

# ==============================
# Step 2: Load Labels and Prepare Train/Test Masks
# ==============================

# Read the per-repo labels from label.csv.
df_labels = pd.read_csv("label.csv")

# Every repo starts at -1 ("no label"). Rows of label.csv whose repo_index is
# not a key of repo_mapping are skipped.
repo_labels = np.full(num_repos, -1)
for _, label_row in df_labels.iterrows():
    target_pos = repo_mapping.get(label_row['repo_index'])
    if target_pos is None:
        continue
    repo_labels[target_pos] = label_row['label']
repo_labels = torch.tensor(repo_labels, dtype=torch.long)

# Split only the repos that received a label (label != -1): 70% train, 30% test.
valid_idx = torch.nonzero(repo_labels != -1, as_tuple=True)[0]
train_idx, test_idx = train_test_split(
    valid_idx.cpu().numpy(),
    test_size=0.3,
    random_state=42,
)

# Boolean masks over all repo rows, True where the repo belongs to the split.
train_mask = torch.zeros(num_repos, dtype=torch.bool)
test_mask = torch.zeros_like(train_mask)
train_mask[train_idx] = True
test_mask[test_idx] = True

# ==============================
# Step 3: Combine Node Features into a Single Tensor
# ==============================
# Row layout follows the combined node numbering: repo rows first, user rows after.
# Resulting shape: (num_repos + num_users, num_event_types).
x = torch.cat((repo_features, user_features))
# ==============================
# Step 4: Define the Bipartite R-GCN Model
# ==============================

class BipartiteRGCN(nn.Module):
    """Two-layer R-GCN that classifies the repository nodes of a repo/user graph.

    Node features go through a linear projection, then one RGCNConv over the
    repo->user edges and one RGCNConv over the user->repo edges. The leading
    repository rows of the result are fed to a linear classifier.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of the projection and of both RGCNConv layers.
        out_channels: Number of logits produced per repository.
        num_relations: Number of relations passed to each RGCNConv.
        dropout_p: Dropout probability applied after each hidden activation.
        num_repos: Number of leading rows of ``x`` that are repository nodes.
            When None (the default), the module-level ``num_repos`` is looked
            up at call time, so existing callers keep working unchanged.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_relations,
                 dropout_p=0.5, num_repos=None):
        super().__init__()
        # Initial projection from input to hidden dimensions.
        self.mlp = nn.Linear(in_channels, hidden_channels)
        # First layer: aggregate from repos to users (repo->user edges)
        self.conv1 = RGCNConv(hidden_channels, hidden_channels, num_relations)
        # Second layer: aggregate from users to repos (user->repo edges)
        self.conv2 = RGCNConv(hidden_channels, hidden_channels, num_relations)
        # Classifier for repository embeddings.
        self.classifier = nn.Linear(hidden_channels, out_channels)
        self.relu = nn.ReLU()
        # Dropout layer.
        self.dropout = nn.Dropout(p=dropout_p)
        # Explicit repo count; None means "read the module-level num_repos".
        self.num_repos = num_repos

    def forward(self, x, edge_index_ru, edge_index_ur, edge_type_ru, edge_type_ur):
        """Return classifier logits for the repository nodes.

        Args:
            x: Node feature tensor with repository rows first.
            edge_index_ru: Edge index passed to the first RGCNConv (repo->user).
            edge_index_ur: Edge index passed to the second RGCNConv (user->repo).
            edge_type_ru: Relation id per edge of ``edge_index_ru``.
            edge_type_ur: Relation id per edge of ``edge_index_ur``.

        Returns:
            The classifier output for the first ``num_repos`` rows.
        """
        # Initial projection and activation.
        x = self.dropout(self.relu(self.mlp(x)))

        # Propagate information from repos to users.
        x = self.conv1(x, edge_index_ru, edge_type_ru)
        x = self.dropout(self.relu(x))

        # Propagate information from users to repos.
        x = self.conv2(x, edge_index_ur, edge_type_ur)
        x = self.dropout(self.relu(x))

        # Prefer the count given at construction; otherwise fall back to the
        # module-level num_repos that the model originally relied on.
        repo_count = self.num_repos if self.num_repos is not None else num_repos
        repo_emb = x[:repo_count]
        return self.classifier(repo_emb)

# Build the model: one input feature and one relation per event type,
# 128 hidden units, and 2 output logits per repository.
model = BipartiteRGCN(
    in_channels=num_event_types,
    hidden_channels=128,
    out_channels=2,
    num_relations=num_event_types,
)
# Adam with L2 weight decay, and cross-entropy over the output logits.
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

# ==============================
# Step 5: Training and Testing Functions
# ==============================

def train():
    """Run one full-graph optimisation step and return the training loss.

    The loss is computed only on the repository rows selected by train_mask.

    Returns:
        The loss value as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    logits = model(x, edge_index_ru, edge_index_ur, edge_type_ru, edge_type_ur)
    train_loss = criterion(logits[train_mask], repo_labels[train_mask])
    train_loss.backward()
    optimizer.step()
    return train_loss.item()

def test():
    """Evaluate the model and return accuracy on the repos selected by test_mask.

    The forward pass runs in eval mode with gradient tracking disabled.
    """
    model.eval()
    with torch.no_grad():
        logits = model(x, edge_index_ru, edge_index_ur, edge_type_ru, edge_type_ur)
    predicted = logits.argmax(dim=1)
    true_labels = repo_labels[test_mask].cpu()
    return accuracy_score(true_labels, predicted[test_mask].cpu())

# ==============================
# Step 6: Training Loop
# ==============================
epochs = 300
for epoch in range(epochs):
    loss = train()
    # Evaluate and report only on every fifth epoch.
    if epoch % 5 != 0:
        continue
    acc = test()
    print(f"Epoch {epoch}: Loss = {loss:.4f}, Test Accuracy = {acc:.4f}")

# One last evaluation after the final optimisation step.
final_acc = test()
print(f"Final Test Accuracy: {final_acc:.4f}")

Traceback (most recent call last):
  File "/Users/delione/.vscode/extensions/ms-python.python-2025.2.0-darwin-arm64/python_files/python_server.py", line 133, in exec_user_input
    retval = callable_(user_input, user_globals)
  File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'pandas'

