In [1]:
pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910454 sha256=9f2d415b3e1823e59f2bb0d3a734d29ef976fcba1d44527ef2ba164a26b9e9db
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch-geometric
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.3.1


In [2]:
import pandas as pd
import torch
from torch_geometric.data import Data

# Load the CSV data
df = pd.read_csv('/content/updated_authors.csv')




In [3]:
import re
# import pandas as pd


def extract_author_id(author_instance):
    match = re.search(r'author_id: "([^"]+)"', author_instance)
    if match:
        return match.group(1)
    else:
        return None  # Return None if no match is found

# Apply the extract_author_id function to the 'a' column and create a new column 'author_id'
df['author_id'] = df['a'].apply(extract_author_id)

# Print the DataFrame with extracted author_id values
print(df['author_id'])


0      authorID_9a049_b03f6_fc40b_fcf2f_13632
1      authorID_1be00_34108_2e25c_4e251_ca671
2      authorID_c2356_069e9_d1e79_ca924_37815
3      authorID_3635a_91e3d_a857f_7847f_68185
4      authorID_cba28_b89eb_85949_7f544_956d6
                        ...                  
342    authorID_d29d5_3701d_3c859_e29e1_b9002
343    authorID_d8658_0a57f_7bf54_2e852_02283
344    authorID_d7cda_a5ca0_58207_6c8e7_72cce
345    authorID_6db6e_b4af1_e18ab_81d38_78e44
346    authorID_7acc6_84a84_8a9b9_54959_fdd22
Name: author_id, Length: 347, dtype: object


In [4]:
# Define the author for which you want to predict co-authors (Author_X)
# author_x = 'authorID_766cb_53c75_3baed_ac5dc_78259'
index_to_select = 100  # Change this to the desired row number

# Get the author_id from the selected row
author_x = df.loc[index_to_select, 'author_id']
# Create a label column based on whether each row's 'author_id' matches Author_X
df['label'] = (df['author_id'] == author_x).astype(int)
labels = df['label'].tolist()

# Now, the 'label' column contains binary labels (1 for Author_X, 0 for others)

Create a Graph Representation:
You'll need to construct a graph representation from the 'a' and 'b' columns of your DataFrame. In this case, 'a' and 'b' represent co-author relationships. You can create an edge list from this data.

In [5]:
all_authors = df['a'].unique().tolist() + df['coauthors'].unique().tolist()

# Create a dictionary to map author IDs to unique numerical indices
author_id_to_index = {author_id: index for index, author_id in enumerate(all_authors)}

# Map 'a' and 'b' columns to numerical indices
df['a_numeric'] = df['a'].map(author_id_to_index)
df['b_numeric'] = df['coauthors'].map(author_id_to_index)

# Extract the numerical edge list
edge_list = df[['a_numeric', 'b_numeric']].values.tolist()

edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
# Create a PyTorch Geometric Data object
data = Data(edge_index=edge_index)
data.y = labels

# Create a placeholder tensor for node features
num_nodes = len(all_authors)  # Number of nodes in the graph
num_features = 64  # Number of features (adjust as needed)
data.x = torch.randn(num_nodes, num_features)  # Placeholder node features




Create a NetworkX Graph:
You can convert the edge list into a NetworkX graph, which is a common format for working with graphs in Python. NetworkX can be used to perform various graph operations.

In [6]:
import networkx as nx

# Create a NetworkX graph from the edge list
G = nx.Graph()
G.add_edges_from(edge_list)


GNNModel is a custom GNN model class that inherits from nn.Module.
The model consists of two GCN layers (self.conv1 and self.conv2), but you can adjust the number of layers and hidden units as needed.
The forward method defines the forward pass of the model. It applies the first GCN layer, a ReLU activation, and then the second GCN layer.

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNNModel(nn.Module):
    def __init__(self, num_nodes, num_features, num_classes):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(num_features, 64)  # GCN layer with 64 output channels
        self.conv2 = GCNConv(64, num_classes)   # GCN layer with output size equal to the number of classes

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # Apply the first GCN layer followed by a ReLU activation
        x = self.conv1(x, edge_index)
        x = F.relu(x)

        # Apply the second GCN layer
        x = self.conv2(x, edge_index)

        return x

# Instantiate the GNN model
num_nodes = len(all_authors)  # The total number of authors in  dataset
num_features = 64
num_classes = 1  # Adjust to the number of classes in your classification task
model = GNNModel(num_nodes, num_features, num_classes)



In this modified code, we use the length of data.edge_index[0] to determine the number of nodes since the edge index implicitly defines the nodes. We then generate synthetic labels for authors and proceed with the data split.

This approach assumes that you don't have node features, and your GNN will operate solely based on the graph structure (edge information).

2. Define Loss Function and Optimizer:

You need to choose an appropriate loss function and optimizer for your specific task. For example, if you are performing binary classification, you can use binary cross-entropy loss and the Adam optimizer.

In [8]:
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with a learning rate


In [9]:
# Determine the number of nodes in your graph (based on data or labels)
num_nodes = len(data.y)  # Assuming labels are correctly aligned

# Split your data into training, validation, and test sets

import random

# Define the split ratios
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Create a list of indices for all nodes
num_nodes = len(data.y)
all_indices = list(range(num_nodes))

# Determine the number of nodes for each split
train_size = int(train_ratio * num_nodes)
val_size = int(val_ratio * num_nodes)
test_size = num_nodes - train_size - val_size

# Ensure that train_size does not exceed the maximum valid index
train_size = min(train_size, num_nodes - 1)

# Shuffle the indices randomly
random.shuffle(all_indices[:train_size])

# Split the indices into train, validation, and test sets
train_indices = all_indices[:train_size]
val_indices = all_indices[train_size:train_size + val_size]
test_indices = all_indices[train_size + val_size:]

# Extract the labels for each set
train_labels = [labels[i] for i in train_indices]
val_labels = [labels[i] for i in val_indices]
test_labels = [labels[i] for i in test_indices]


In [10]:
print(len(train_indices))
print(len(train_labels))
print(num_nodes)


277
277
347


In [11]:
# Debugging: Print the maximum index in train_indices
max_index = max(train_indices)
print("Max Index in train_indices:", max_index)

# Print the number of nodes
print("Number of Nodes in Dataset:", num_nodes)


Max Index in train_indices: 276
Number of Nodes in Dataset: 347


In [12]:
num_epochs = 100
train_labels_tensor = torch.zeros((data.size(0),))

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    output = model(data)
    output = torch.squeeze(output)



    # Print a subset of output and train_labels_tensor for debugging
    print("Output:", output[:10])  # Print the first 10 elements
    print("Train labels:", train_labels_tensor[:10])  # Print the first 10 labels

    # Ensure that train_labels_tensor matches the size of output
    train_labels_tensor = train_labels_tensor.view(-1)

    # Debugging: Print a subset of train_indices
    # print("Train indices:", train_indices[:10])  # Print the first 10 indices

    # Ensure that train_indices are within bounds
    train_indices = [idx for idx in train_indices if idx < output.size(0)]

    # Calculate the binary cross-entropy loss using train_labels_tensor
    train_loss = criterion(output[train_indices], train_labels_tensor[train_indices])

    # Backpropagation
    train_loss.backward()
    optimizer.step()

    # Print loss for monitoring
    print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {train_loss.item()}')


Output: tensor([ 1.7591,  1.0912,  2.2726, -1.9933,  2.0105,  0.5205, -1.0657,  1.2347,
        -0.2799,  0.7778], grad_fn=<SliceBackward0>)
Train labels: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Epoch [1/100], Training Loss: 1.2398618459701538
Output: tensor([ 1.6680,  1.0309,  2.1955, -2.0781,  1.9436,  0.4658, -1.0899,  1.1793,
        -0.3546,  0.7258], grad_fn=<SliceBackward0>)
Train labels: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Epoch [2/100], Training Loss: 1.1954072713851929
Output: tensor([ 1.5769,  0.9716,  2.1201, -2.1635,  1.8755,  0.4105, -1.1155,  1.1241,
        -0.4285,  0.6745], grad_fn=<SliceBackward0>)
Train labels: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Epoch [3/100], Training Loss: 1.1519497632980347
Output: tensor([ 1.4873,  0.9127,  2.0457, -2.2471,  1.8065,  0.3556, -1.1426,  1.0680,
        -0.5023,  0.6236], grad_fn=<SliceBackward0>)
Train labels: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
Epoch [4/100], Training Loss: 1.10952

In [13]:
print(output.size())
print(train_labels_tensor.size())

torch.Size([676])
torch.Size([676])


In [14]:
import numpy as np
model.eval()

# Forward pass to generate predictions for all nodes
with torch.no_grad():
    predictions = model(data)
    predictions = torch.sigmoid(predictions)  # Apply sigmoid activation for probability scores

# Convert predictions to a numpy array for easier sorting
predictions = predictions.cpu().numpy()

# Sort predictions in descending order
sorted_indices = np.argsort(predictions[:, 0])[::-1]  # Sort by the first column (probability of being similar)

# Ensure that sorted_indices do not exceed the length of data.y
sorted_indices = sorted_indices[sorted_indices < len(data.y)]

# Get the top 5 authors (if data.y has at least 5 elements)
if len(data.y) >= 5:
    num_top_authors = min(5, len(sorted_indices))  # Ensure we don't go beyond the length of sorted_indices
    top_authors_indices = sorted_indices[:num_top_authors]
    top_5_authors = [data.y[i] for i in top_authors_indices]

    # Print the top 5 authors
    print("Top 5 Authors Who Could Co-Author:")
    for author in top_5_authors:
        print(author)
else:
    print("Not enough authors in data.y to predict the top 5.")


Top 5 Authors Who Could Co-Author:
0
0
0
0
0


In [15]:
# Create a dictionary to map numerical indices to author_ids
index_to_author_id = {index: author_id for author_id, index in author_id_to_index.items()}

# Get the top 5 authors (if data.y has at least 5 elements)
if len(data.y) >= 5:
    num_top_authors = min(5, len(sorted_indices))  # Ensure we don't go beyond the length of sorted_indices
    top_authors_indices = sorted_indices[:num_top_authors]
    top_5_authors = [index_to_author_id[i] for i in top_authors_indices]

    # Print the top 5 authors and their author_ids
    print("Top 5 Authors Who Could Co-Author:")
    for author_index, author_id in zip(top_authors_indices, top_5_authors):
        print(f"Author Index: {author_index}, Author ID: {author_id}")


else:
    print("Not enough authors in data.y to predict the top 5.")


Top 5 Authors Who Could Co-Author:
Author Index: 305, Author ID: (:Author {Feature140: "0",Feature146: "0",Feature145: "0",Feature148: "0",Feature147: "0",Feature90: "0",Feature142: "0",Feature141: "0",Feature92: "0",Feature144: "0",Feature143: "0",Feature91: "0",Feature94: "0",Feature93: "0",Feature96: "0",Feature95: "0",Feature98: "0",Feature139: "0",Feature138: "0",Feature97: "0",Feature99: "0",Feature151: "0",Feature150: "0",Feature157: "0",Feature156: "0",Feature159: "0",Feature158: "0",Feature153: "0",Feature152: "0",Feature81: "0",Feature155: "0",Feature80: "0",Feature154: "0",Feature83: "0",Feature82: "0",Feature85: "0",Feature84: "0",Feature87: "0",Feature149: "0",Feature86: "0",Feature89: "0",Feature88: "0",Feature124: "0",Feature123: "0",Feature126: "0",Feature125: "0",Feature120: "0",Feature70: "0",Feature122: "0",Feature121: "0",Feature72: "0",Feature71: "0",Feature74: "0",Feature73: "0",Feature117: "0",Feature76: "0",Feature75: "0",Feature116: "0",Feature119: "0",Feature7

In [16]:
if len(data.y) >= 5:
    num_top_authors = min(5, len(top_authors_indices))
    print("For author:",author_x)

    print("Top 5 Authors Who Could Co-Author:")
    for author_index in top_authors_indices:
        if author_index < len(df):
            author_id = df.loc[author_index, 'author_id']
            print(f"Author Index: {author_index}, Author ID: {author_id}")
        else:
            print(f"Author Index: {author_index}, Author ID: Not found in DataFrame")
else:
    print("Not enough authors in data.y to predict the top 5.")


For author: authorID_3c152_85c04_fff40_024bb_8714b
Top 5 Authors Who Could Co-Author:
Author Index: 305, Author ID: authorID_82c01_ce15b_431d4_20eb6_a1feb
Author Index: 293, Author ID: authorID_8ede6_b2634_3305e_05c3c_0029f
Author Index: 291, Author ID: authorID_0e17d_aca5f_3e175_f448b_acace
Author Index: 319, Author ID: authorID_090d3_859ff_6840b_2280f_4708c
Author Index: 345, Author ID: authorID_6db6e_b4af1_e18ab_81d38_78e44


In [17]:
torch.save(model.state_dict(), 'model.pth')

In [18]:
model.load_state_dict(torch.load('model.pth'))

<All keys matched successfully>

In [19]:
!pip freeze > requirements.txt
