In [1]:
import torch

def format_pytorch_version(version):
    return version.split('+')[0]

def format_cuda_version(version):
    return 'cu' + version.replace('.', '')

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)
CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric
!pip install torch_geometric

AttributeError: 'NoneType' object has no attribute 'replace'

In [None]:
import pandas as pd

# Read a TSV file with default parameters
movieDataFrame = pd.read_csv('movies.data.tsv', sep='\t')#
movieLabelsFrame= pd.read_csv('movies.labels.tsv', sep='\t')
personDataFrame= pd.read_csv('person.data.tsv', sep='\t')#
personLabelsFrame = pd.read_csv('person.labels.tsv', sep='\t')
sharedMovieEdgesFrame = pd.read_csv('shares_movies.edge.tsv', sep='\t')
worksOnEdgesFrame = pd.read_csv('works_on.edge.tsv', sep='\t')


#Prepare HeteroData object

# Node features for Person
# Extract columns from DataFrame

# Remove 'nm' prefix and convert remaining part to integers
personDataFrame['NameID'] = personDataFrame['NameID'].str.replace('nm', '').astype(int)
name_id = personDataFrame['NameID'].values.astype(int)

# Replace missing values with a placeholder (-1 in this example)
birth_year = personDataFrame['BirthYear'].replace('\\N', -1)
# Convert BirthYear to integer
birth_year = birth_year.astype(int)

# Define a custom function to convert binary strings to integers with arbitrary precision
def binary_string_to_int(binary_string):
    return int(binary_string, 2)

# Apply the custom function to each element of the 'PrimaryProfessions' column
primary_professions = personDataFrame['PrimaryProfessions'].apply(binary_string_to_int)

# Convert the resulting Series to a NumPy array
primary_professions = primary_professions.values

movies_worked_on = personDataFrame['MoviesWorkedOn'].values.astype(int)  # Convert MoviesWorkedOn to integer


# Create tensors with appropriate data types
x_name_id = torch.tensor(name_id, dtype=torch.int32)
x_birth_year = torch.tensor(birth_year, dtype=torch.int32)
x_primary_professions = torch.tensor(primary_professions, dtype=torch.int64)
x_movies_worked_on = torch.tensor(movies_worked_on, dtype=torch.int32)

# Concatenate tensors to form the feature matrix
x_person = torch.stack([x_name_id, x_birth_year, x_primary_professions, x_movies_worked_on], dim=1)

# Now x_person contains all the features for each person, where each row represents a person and each column represents a feature

print(x_person)


tensor([[            2,          1924,      50855936,             3],
        [            8,          1924, 1099645847552,             2],
        [           25,          1929,         32778,             6],
        ...,
        [      9990915,            -1,          8192,             1],
        [      9991049,            -1,     134227968,             1],
        [      9992085,            -1, 1099511627776,             1]])


In [None]:

# Node features for Movie
# Extract columns from DataFrame

# Remove 'tt' prefix and convert remaining part to integers
movieDataFrame['TitleID'] = movieDataFrame['TitleID'].str.replace('tt', '').astype(int)
title_id = movieDataFrame['TitleID'].values.astype(int)

# Convert releaseYear to integer
release_year = movieDataFrame['ReleaseYear'].values.astype(int)
runtime_minutes = movieDataFrame['RuntimeMinutes'].values.astype(int)
avg_ratings = movieDataFrame['AvgRatings'].values.astype(float)
num_ratings = movieDataFrame['NumRatings'].values.astype(int)


# Apply the custom function to each element of the 'PrimaryProfessions' column
genres = movieDataFrame['Genres'].apply(binary_string_to_int)

# Convert the resulting Series to a NumPy array
#genres = primary_professions.values


# Create tensors with appropriate data types
x_title_id = torch.tensor(title_id, dtype=torch.int32)
x_release_year = torch.tensor(release_year , dtype=torch.int32)
x_runtime_minutes = torch.tensor(runtime_minutes, dtype=torch.int32)
x_avg_ratings = torch.tensor(avg_ratings, dtype=torch.float)
x_num_ratings = torch.tensor(num_ratings, dtype=torch.int32)
x_genres = torch.tensor(genres, dtype=torch.int32)

# Concatenate tensors to form the feature matrix
x_movie = torch.stack([x_title_id, x_release_year, x_runtime_minutes, x_avg_ratings, x_num_ratings, x_genres], dim=1)

# Now x_movie contains all the features for each movie, where each row represents a person and each column represents a feature
torch.set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=None, profile=None, sci_mode=False)
print(x_movie)




tensor([[35423.0000,  2001.0000,   118.0000,     6.4000, 89039.0000, 14680064.0000],
        [118589.0000,  2001.0000,   104.0000,     2.4000, 24085.0000, 3147776.0000],
        [118694.0000,  2000.0000,    98.0000,     8.1000, 166670.0000, 3145728.0000],
        ...,
        [9893250.0000,  2020.0000,   118.0000,     6.4000, 145883.0000, 26214400.0000],
        [9898858.0000,  2020.0000,    88.0000,     5.2000, 14566.0000, 58720256.0000],
        [9900782.0000,  2019.0000,   145.0000,     8.4000, 42380.0000, 51380224.0000]])


In [None]:
#Load labels for person and movie
#movieLabelsFrame= pd.read_csv('movies.labels.tsv', sep='\t')
#personLabelsFrame = pd.read_csv('person.labels.tsv', sep='\t')

titleID_label = title_id
movieTitleName = movieLabelsFrame['TitleName'].values.astype(str)

personID_label = name_id
personTitleName = personLabelsFrame['PrimaryName'].values.astype(str)






In [None]:
# sharedMovieEdgesFrame = pd.read_csv('shares_movies.edge.tsv', sep='\t')

#Edge index for person to person edges

# Extract relevant columns from sharedMovieEdgesFrame (adjust as needed)
source_nodes = sharedMovieEdgesFrame['NameID1']
target_nodes = sharedMovieEdgesFrame['NameID2']

# Convert 'NameID' columns to integers after removing the 'nm' prefix
source_nodes = source_nodes.astype(str).str.replace('nm', '').astype(int)
target_nodes = target_nodes.astype(str).str.replace('nm', '').astype(int)

# Combine source and target nodes into a single tensor
edge_index_person_to_person = torch.tensor([source_nodes.values, target_nodes.values], dtype=torch.int64)

print("Edge index for person-to-person edges:")
print(edge_index_person_to_person)

#Edge attributes for person to person
edge_attr_person_to_person_temp = sharedMovieEdgesFrame['NumShared']

# Convert edge attributes to tensor
edge_attr_person_to_person = torch.tensor(edge_attr_person_to_person_temp.values, dtype=torch.int32)

print(edge_attr_person_to_person)


Edge index for person-to-person edges:
tensor([[    212,     212,     212,  ..., 5701899, 5701899, 5701899],
        [   1396,    6142,    3712,  ..., 7992231, 7807469, 8325456]])
tensor([1, 1, 1,  ..., 1, 1, 1], dtype=torch.int32)


  edge_index_person_to_person = torch.tensor([source_nodes.values, target_nodes.values], dtype=torch.int64)


In [None]:
# worksOnEdgesFrame = pd.read_csv('works_on.edge.tsv', sep='\t')
#Edge index for person to movie edges

# Extract relevant columns from worksOnEdgesFrame (adjust as needed)
source_nodes_ptm = worksOnEdgesFrame['NameID']
target_nodes_ptm = worksOnEdgesFrame['TitleID']

# Convert 'NameID' columns to integers after removing the 'nm' prefix
source_nodes_ptm = source_nodes_ptm.astype(str).str.replace('nm', '').astype(int)

target_nodes_ptm = target_nodes_ptm.astype(str).str.replace('tt', '').astype(int)

# Combine source and target nodes into a single tensor
edge_index_person_to_movie = torch.tensor([source_nodes_ptm.values, target_nodes_ptm.values], dtype=torch.int64)

print("Edge index for person-to-movie edges:")
print(edge_index_person_to_movie)

#Edge attributes for person to movie
edge_attr_person_to_movie_temp = worksOnEdgesFrame['Category']

# Convert edge attributes to tensor
edge_attr_person_to_movie = torch.tensor(edge_attr_person_to_movie_temp.values, dtype=torch.int32)

print(edge_attr_person_to_movie)



Edge index for person-to-movie edges:
tensor([[    212,  413168,     630,  ..., 4237148, 7594199, 5701899],
        [  35423,   35423,   35423,  ..., 9900782, 9900782, 9900782]])
tensor([ 2,  8,  8,  ..., 10,  5, 12], dtype=torch.int32)


In [None]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()

# Assign node features for each node type
data['x', 'person'] = x_person
data['x', 'movie'] = x_movie

# Assign edge indices and edge attributes for each edge type
data['edge_index', ('person', 'person')] = edge_index_person_to_person
data['edge_attr', ('person', 'person')] = edge_attr_person_to_person

data['edge_index', ('person', 'movie')] = edge_index_person_to_movie
data['edge_attr', ('person', 'movie')] = edge_attr_person_to_movie

# You can also add target labels or ground truth values if applicable
# For example:
# data['y', 'person'] = y_person
# data['y', 'movie'] = y_movie

print(data)



  from .autonotebook import tqdm as notebook_tqdm


HeteroData(
  (x, person)=[55911, 4],
  (x, movie)=[6560, 6],
  (edge_index, (person, person))=[2, 1241937],
  (edge_attr, (person, person))=[1241937],
  (edge_index, (person, movie))=[2, 144422],
  (edge_attr, (person, movie))=[144422]
)


In [None]:
# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

print(data)

HeteroData(
  (x, person)=[55911, 4],
  (x, movie)=[6560, 6],
  (edge_index, (person, person))=[2, 1241937],
  (edge_attr, (person, person))=[1241937],
  (edge_index, (person, movie))=[2, 144422],
  (edge_attr, (person, movie))=[144422]
)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import MessagePassing

class HeteroGNN(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super(HeteroGNN, self).__init__(aggr='add')
        self.lin = nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index, edge_attr):
        return self.propagate(edge_index, x=x, edge_attr=edge_attr)

    def message(self, x_j, edge_attr):
        return x_j

    def update(self, aggr_out):
        return self.lin(aggr_out)

class HeteroClusteringGNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_relations):
        super(HeteroClusteringGNN, self).__init__()
        self.conv1 = HeteroGNN(in_channels, hidden_channels)
        self.conv2 = HeteroGNN(hidden_channels, out_channels)

    def forward(self, data):
        x = data['x']
        edge_indices = data['edge_indices']
        edge_attrs = data['edge_attrs']

        x = F.relu(self.conv1(x, edge_indices, edge_attrs))
        x = F.relu(self.conv2(x, edge_indices, edge_attrs))
        return x

model = HeteroClusteringGNN(in_channels=10, hidden_channels=64, out_channels=10, num_relations=1)
output = model(data)



ValueError: `MessagePassing.propagate` only supports integer tensors of shape `[2, num_messages]`, `torch_sparse.SparseTensor` or `torch.sparse.Tensor` for argument `edge_index`.

In [None]:
import torch
import torch.nn as nn
from torch_geometric.data import HeteroData
import torch.nn.functional as F
from sklearn.cluster import KMeans

# class HeteroGNN(nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels, num_clusters):
#         super(HeteroGNN, self).__init__()
#         self.lin1_person = nn.Linear(in_channels_person, hidden_channels)
#         self.lin1_movie = nn.Linear(in_channels_movie, hidden_channels)
#         self.lin2 = nn.Linear(hidden_channels, out_channels)
#         self.num_clusters = num_clusters
#         self.lin_classifier = nn.Linear(out_channels, num_clusters)

#     def forward(self, data):
#         _x_person = data['x', 'person']  # Features for person nodes
#         _x_movie = data['x', 'movie']    # Features for movie nodes
#         _edge_index_person_to_person = data['edge_index', ('person', 'person')]
#         _edge_index_person_to_movie = data['edge_index', ('person', 'movie')]

#         print(_x_person)

#         # Perform message passing and aggregation for person nodes
#         _x_person = torch.relu(self.lin1_person(_x_person))
#         _x_person = self.message_passing(_x_person, _edge_index_person_to_person)

#         # Perform message passing and aggregation for movie nodes
#         _x_movie = torch.relu(self.lin1_movie(_x_movie))
#         _x_movie = self.message_passing(_x_movie, _edge_index_person_to_movie)

#         # Concatenate node embeddings from different types of nodes
#         x = torch.cat([_x_person, _x_movie], dim=0)

#         # Perform final linear layer and activation
#         x = torch.relu(self.lin2(x))

#         # Perform clustering using KMeans
#         kmeans = KMeans(n_clusters=self.num_clusters)
#         cluster_labels = kmeans.fit_predict(x.detach().numpy())
#         cluster_labels = torch.tensor(cluster_labels, dtype=torch.long)

#         # Pseudo-labels for nodes
#         pseudo_labels = F.one_hot(cluster_labels, num_classes=self.num_clusters)

#         # Classification loss
#         logits = self.lin_classifier(x)
#         classification_loss = F.cross_entropy(logits, cluster_labels)

#         return classification_loss, pseudo_labels

#     def message_passing(self, x, edge_index):
#         # Simple message passing function
#         return torch.matmul(torch.sparse_coo_tensor(edge_index, x[edge_index[0]]), x)


def forward(self, data):
    # Extract node features and edge indices for each edge type

    edge_index_person_to_person = data['edge_index', ('person', 'person')]  # Edge indices for person-to-person edges
    edge_index_person_to_movie = data['edge_index', ('person', 'movie')]    # Edge indices for person-to-movie edges

    # Perform message passing and aggregation for each type of edge
    x_person_to_person = F.relu(self.conv_person_to_person(x_person, edge_index_person_to_person))
    x_person_to_movie = F.relu(self.conv_person_to_movie(x_person, edge_index_person_to_movie))

    # Optionally, perform further operations based on the node features

    return x_person_to_person, x_person_to_movie



# Define the HeteroGNN architecture
class HeteroGNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(HeteroGNN, self).__init__()
        # Define separate message passing layers for different types of edges
        self.conv_person_to_person = HeteroGraphConv(in_channels, hidden_channels, aggr='mean')
        self.conv_person_to_movie = HeteroGraphConv(in_channels, hidden_channels, aggr='mean')
        # Output layer for each node type
        self.lin_person = nn.Linear(hidden_channels, out_channels)
        self.lin_movie = nn.Linear(hidden_channels, out_channels)

    def forward(self, data):
        # Extract node features and edge indices for each edge type
        x = data['x']
        edge_index_person_to_person = data['edge_index_person_to_person']
        edge_index_person_to_movie = data['edge_index_person_to_movie']

        # Message passing and aggregation for person-to-person edges
        x_person_to_person = torch.relu(self.conv_person_to_person(x, edge_index_person_to_person))
        # Message passing and aggregation for person-to-movie edges
        x_person_to_movie = torch.relu(self.conv_person_to_movie(x, edge_index_person_to_movie))

        # Apply output layers for each node type
        out_person = self.lin_person(x_person_to_person)
        out_movie = self.lin_movie(x_person_to_movie)

        return out_person, out_movie


# Assuming x_person and x_movie are the feature matrices for person and movie nodes
in_channels_person = x_person.size(1)  # Number of features for person nodes
in_channels_movie = x_movie.size(1)    # Number of features for movie nodes
in_channels_combined = in_channels_person + in_channels_movie  # Total combined dimensionality

print(in_channels_person)

# Initialize the HeteroGNN model
model = HeteroGNN(in_channels=in_channels_combined, hidden_channels=64, out_channels=in_channels_combined, num_clusters=4)

# # Example HeteroData object (replace with your actual data)
# data = HeteroData()
# data['x'] = {...}  # Node features
# data['edge_index'] = {...}  # Edge indices

# Define unsupervised objective function and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

num_epochs = 50

# Define a custom function to convert binary strings to integers with arbitrary precision
def unsupervised_objective(binary_string):
    return int(binary_string, 2)


# Training loop
for epoch in range(num_epochs):
    # Forward pass
    output = model(data)

    # Compute loss (unsupervised objective)
    loss = unsupervised_objective(output)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print loss for monitoring convergence
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')







ModuleNotFoundError: No module named 'torch_geometric'

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import to_networkx
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define the Graph Convolutional Network (GCN) model
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x


# Preprocess your data if needed (e.g., normalization)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(movieData)  # Scale your data

# Convert data to PyTorch tensors
x = torch.tensor(data_scaled, dtype=torch.float)
edge_index = ...  # Construct the edge index for your graph
edge_index = torch.tensor(edge_index, dtype=torch.long)

# Define model parameters
input_dim = data_scaled.shape[1]  # Dimensionality of input features
hidden_dim = 256  # Dimensionality of hidden layers
output_dim = 128  # Dimensionality of output features
n_clusters = 5  # Number of clusters

# Initialize the GCN model
model = GCN(input_dim, hidden_dim, output_dim)

# Set model to training mode
model.train()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(100):  # Example: 100 epochs
    # Forward pass
    embeddings = model(x, edge_index)

    # Compute cluster assignments (e.g., using KMeans on embeddings)
    kmeans = KMeans(n_clusters=n_clusters)
    clusters = kmeans.fit_predict(embeddings.detach().numpy())

    # Compute loss (e.g., Kullback-Leibler divergence)
    # Example: loss = KLDivergenceLoss(embeddings, clusters)

    # Backward pass and optimization
    optimizer.zero_grad()
    # loss.backward()
    optimizer.step()

# Print cluster assignments
print(clusters)









# Add nodes with features
num_actors = 10
num_movies = 5

actor_features = np.random.rand(num_actors, 5)  # Example: 10 actors with 5-dimensional feature vectors
movie_features = np.random.rand(num_movies, 5)  # Example: 5 movies with 5-dimensional feature vectors

for i, features in enumerate(actor_features):
    actor_id = f"actor_{i}"
    graph.add_node(actor_id, type='actor', features=features)

for i, features in enumerate(movie_features):
    movie_id = f"movie_{i}"
    graph.add_node(movie_id, type='movie', features=features)

# Add edges with features
for actor_id in graph.nodes():
    if graph.nodes[actor_id]['type'] == 'actor':
        for movie_id in np.random.choice(list(graph.nodes()), size=np.random.randint(1, 4)):
            if graph.nodes[movie_id]['type'] == 'movie':
                edge_features = np.random.rand(3)  # Example: 3-dimensional feature vector for the edge
                graph.add_edge(actor_id, movie_id, type='acted_in', features=edge_features)

# Convert nodes and edges into PyTorch tensors
node_features = torch.tensor([graph.nodes[node]['features'] for node in graph.nodes], dtype=torch.float)
edges = [(edge[0], edge[1]) for edge in graph.edges]  # Consider only edge connections for simplicity
edge_features = torch.tensor([graph.edges[edge]['features'] for edge in graph.edges], dtype=torch.float)
edge_index = torch.tensor([[int(edge[0][6:]), int(edge[1][6:])] for edge in graph.edges], dtype=torch.long).t().contiguous()

# Create PyTorch Geometric Data object
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_features)

# Define a GNN model
class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# Instantiate the GNN model
model = GNN(input_dim=node_features.size(1), hidden_dim=64)

# Perform graph embedding using the GNN model
with torch.no_grad():
    embeddings = model(data).numpy()

print("Embeddings:", embeddings)

# Perform clustering using K-means on learned embeddings
kmeans = KMeans(n_clusters=2)  # Define number of clusters
clusters = kmeans.fit_predict(embeddings)
print("Clusters:", clusters)