In [1]:
# Install necessary dependencies (takes a long time)
!pip install torch torch_scatter torch_sparse torch_geometric graphdatascience

In [2]:
import os
import pandas as pd
from graphdatascience import GraphDataScience
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, RGCNConv
from torch_geometric.transforms import RandomNodeSplit
import random
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer




In [3]:
# Set seeds for consistent results
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [4]:
# Get Neo4j DB URI, credentials and name from environment if applicable
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
NEO4J_AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "pleaseletmein"),
)

gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)

HeteroData(
  movie={
    x=[4278, 3066],
    y=[4278],
    train_mask=[4278],
    val_mask=[4278],
    test_mask=[4278],
  },
  director={ x=[2081, 3066] },
  actor={ x=[5257, 3066] },
  (movie, to, director)={ edge_index=[2, 4278] },
  (movie, to, actor)={ edge_index=[2, 12828] },
  (director, to, movie)={ edge_index=[2, 4278] },
  (actor, to, movie)={ edge_index=[2, 12828] }
)

In [5]:
def fetch_data(query):
    return gds.run_cypher(query)

In [5]:
def load_node(cypher, index_col, encoders=None, **kwargs):
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher)
    df.set_index(index_col, inplace=True)
    # Define node mapping
    mapping = {index: i for i, index in enumerate(df.index.unique())}
    # Define node features
    x = None
    if encoders is not None:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)

    return x, mapping

In [7]:
def load_edge(cypher, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher)
    # Define edge index
    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    edge_index = torch.tensor([src, dst])
    # Define edge features
    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr

ImportError: 'SparseTensor' requires 'torch-sparse'

In [8]:
from sentence_transformers import SentenceTransformer

class SequenceEncoder:
    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, df):
        x = self.model.encode(df.values, show_progress_bar=True,
                              convert_to_tensor=True, device=self.device)
        return x.cpu()
    
class WeekdayEncoder:
    def __init__(self, sep='|'):
        self.sep = sep

    def __call__(self, df):
        genres = set(col for col in df.values)
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, col in enumerate(df.values):
            for genre in col.split(self.sep):
                x[i, mapping[genre]] = 1
        return x

In [9]:
recipe_query = """
MATCH (a:Recipe)
RETURN a.recipieId as recipeId,
       a.openaiEmbeddings as embedding
"""

recipe_x, recipe_mapping = load_node(
    recipe_query, 
    index_col='recipeId', 
    encoders={
        "embedding": lambda x: torch.Tensor(x)
    }
)

In [10]:
recipe_x.shape, len(recipe_mapping)

In [11]:
ingredient_query = """
MATCH (i:Ingredient)
RETURN DISTINCT ID(i) as ingredientId, i.title as title
"""
ingredient_x, ingredient_mapping = load_node(
    ingredient_query, 
    index_col='ingredientId', 
    encoders={
        'title': SequenceEncoder()
    }
)

In [12]:
ingredient_x.shape, len(ingredient_mapping)

dict_values([tensor([[   0,    1,    2,  ..., 4275, 4276, 4277],
        [ 789,  680, 1757,  ..., 1781,  166,  399]]), tensor([[   0,    0,    0,  ..., 4277, 4277, 4277],
        [ 674, 2394, 5129,  ...,  100, 1078, 1439]]), tensor([[   0,    1,    2,  ..., 2078, 2079, 2080],
        [3972, 3751, 3090,  ..., 3246, 3652, 4143]]), tensor([[   0,    0,    0,  ..., 5256, 5256, 5256],
        [ 555,  703, 1147,  ..., 1765, 2086, 2540]])])


RuntimeError: index 5129 is out of bounds for dimension 0 with size 4278

In [16]:
menu_query = """
MATCH (m:Menu)
RETURN ID(m) as menuId, m.year as year, m.week as week
"""


def encode_cyclical_week(week_series):
    week_sin = np.sin(2 * np.pi * week_series / 52.0)
    week_cos = np.cos(2 * np.pi * week_series / 52.0)
    week_tensor = torch.Tensor(list(zip(week_sin, week_cos)))
    return week_tensor


menu_x, menu_mapping = load_node(
    menu_query, 
    index_col='menuId',
    encoders={
        "year": lambda x: torch.Tensor(x.tolist()).view(-1, 1),
        "week": encode_cyclical_week
    }
)

In [17]:
menu_x.shape, len(menu_mapping)

(torch.Size([142, 3]), 142)

In [18]:
recipe_menu_query = """
MATCH (n:Recipe)-[r:IS_PART_OF_MENU]->(m:Menu)
RETURN n.recipieId AS recipeId, ID(m) AS menuId, r.weekDay AS weekDay
"""

def encode_cyclical_weekday(weekday_series):
    # Map weekdays to integers: Monday: 0, Tuesday: 1, ..., Sunday: 6
    weekday_to_int = {
        "Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3,
        "Friday": 4, "Saturday": 5, "Sunday": 6
    }
    weekday_series_int = weekday_series.map(weekday_to_int)
    
    # Apply cyclical encoding
    weekday_sin = np.sin(2 * np.pi * weekday_series_int / 7.0)
    weekday_cos = np.cos(2 * np.pi * weekday_series_int / 7.0)
    weekday_tensor = torch.Tensor(list(zip(weekday_sin, weekday_cos)))
    
    return weekday_tensor



recipe_menu_edge_index, recipe_menu_edge_label = load_edge(
    recipe_menu_query,
    src_index_col='recipeId',
    src_mapping=recipe_mapping,
    dst_index_col='menuId',
    dst_mapping=menu_mapping,
    encoders={'weekDay': encode_cyclical_weekday},
)

In [19]:
recipe_menu_edge_index.shape, recipe_menu_edge_label.shape

(torch.Size([2, 993]), torch.Size([993, 2]))

In [48]:
recipe_ingredient_query = """
MATCH (n:Recipe)-[r:HAS_INGREDIENT]->(i:Ingredient) 
RETURN n.recipieId AS recipeId, ID(i) AS ingredientId
"""

recipe_ingredient_edge_index, recipe_ingredient_edge_label = load_edge(
    recipe_ingredient_query,
    src_index_col='recipeId',
    src_mapping=recipe_mapping,
    dst_index_col='ingredientId',
    dst_mapping=ingredient_mapping,
)

In [49]:
recipe_ingredient_edge_index.shape, recipe_ingredient_edge_label

(torch.Size([2, 7907]), None)

In [74]:
print(recipe_ingredient_edge_index[1].max(), len(ingredient_mapping))
print(recipe_menu_edge_index[1].max(), len(menu_mapping))


(tensor(0), tensor(1792))

In [23]:
if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [24]:
from torch_geometric.data import HeteroData

data = HeteroData()

data['recipe'].x = recipe_x
data["menu"].x = menu_x
data["ingredient"].x = ingredient_x
data["recipe", "has_ingredient", "ingredient"].edge_index = recipe_ingredient_edge_index
data["recipe", "has_ingredient", "ingredient"].edge_attr = torch.ones((recipe_ingredient_edge_index.size(1), 1)) * 1
data["recipe", "is_part_of_menu", "menu"].edge_index = recipe_menu_edge_index
data["recipe", "is_part_of_menu", "menu"].edge_attr = recipe_menu_edge_label
data.num_relations = 2
data.num_nodes = len(recipe_mapping) + len(ingredient_mapping) + len(menu_mapping)

data.to(device)
data

HeteroData(
  num_relations=2,
  num_nodes=2658,
  recipe={ x=[723, 1536] },
  menu={ x=[142, 3] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={
    edge_index=[2, 7907],
    edge_attr=[7907, 1],
  },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 993],
    edge_attr=[993, 2],
  }
)

In [None]:
torch.save(data, "data.pt")

In [25]:
from torch_geometric.transforms import ToUndirected

data = ToUndirected()(data)
data

HeteroData(
  num_relations=2,
  num_nodes=2658,
  recipe={ x=[723, 1536] },
  menu={ x=[142, 3] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={
    edge_index=[2, 7907],
    edge_attr=[7907, 1],
  },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 993],
    edge_attr=[993, 2],
  },
  (ingredient, rev_has_ingredient, recipe)={
    edge_index=[2, 7907],
    edge_attr=[7907, 1],
  },
  (menu, rev_is_part_of_menu, recipe)={
    edge_index=[2, 993],
    edge_attr=[993, 2],
  }
)

In [31]:
# import torch_geometric.transforms as T
# 
# metapaths = [
#     [('recipe', 'menu'), ('menu', 'recipe')],  # MAM
#     [('recipe', 'ingredient'), ('ingredient', 'recipe')],  # MDM
# ]
# data = T.AddMetaPaths(metapaths, drop_orig_edge_types=True)(data)

ImportError: 'SparseTensor' requires 'torch-sparse'

In [99]:
from torch_geometric.transforms import RandomLinkSplit

# 2. Perform a link-level split into training, validation, and test edges.
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[("recipe", "is_part_of_menu", "menu"), ("recipe", "has_ingredient", "ingredient")],
    is_undirected=True,
    split_labels=True, 
    add_negative_train_samples=True,
)
train_data, val_data, test_data = transform(data)

In [63]:
train_data

HeteroData(
  num_relations=2,
  num_nodes=2658,
  recipe={ x=[723, 1536] },
  menu={ x=[142, 3] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={
    edge_index=[2, 6327],
    edge_attr=[6327, 1],
    pos_edge_label=[6327],
    pos_edge_label_index=[2, 6327],
  },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 795],
    edge_attr=[795, 2],
    pos_edge_label=[795],
    pos_edge_label_index=[2, 795],
  },
  (ingredient, rev_has_ingredient, recipe)={
    edge_index=[2, 6327],
    edge_attr=[6327, 1],
  },
  (menu, rev_is_part_of_menu, recipe)={
    edge_index=[2, 795],
    edge_attr=[795, 2],
  }
)

In [109]:
from torch_geometric.nn import GCNConv

class HGNN(torch.nn.Module):
    def __init__(self, in_channels_recipe, in_channels_menu, in_channels_ingredient, out_channels):
        super(HGNN, self).__init__()
        self.conv_recipe_menu = GCNConv(in_channels_recipe, out_channels)
        self.conv_recipe_ingredient = GCNConv(in_channels_recipe, out_channels)

    def forward(self, data):
        x_recipe = data['recipe'].x
        x_menu = data['menu'].x
        x_ingredient = data['ingredient'].x

        edge_index_rm = data['recipe', 'is_part_of_menu', 'menu'].edge_index
        edge_index_ri = data['recipe', 'has_ingredient', 'ingredient'].edge_index

        x_recipe_menu = self.conv_recipe_menu(x_recipe, edge_index_rm)
        x_recipe_ingredient = self.conv_recipe_ingredient(x_recipe, edge_index_ri)

        return F.relu(x_recipe_menu), F.relu(x_menu), F.relu(x_recipe_ingredient)


In [84]:
len(data.edge_types)

In [110]:
model = HGNN(
    in_channels_recipe=data['recipe'].x.size(1),
    in_channels_menu=data['menu'].x.size(1),
    in_channels_ingredient=data['ingredient'].x.size(1),
    out_channels=64
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [103]:
# with torch.no_grad():  # Initialize lazy modules.
#     out = model(data.x_dict, data.edge_index_dict)

In [104]:
def mask_nodes(x, mask_rate=0.15):
    num_nodes, num_features = x.size()
    mask = torch.rand(num_nodes, num_features) < mask_rate
    x_masked = x.clone()
    x_masked[mask] = 0
    return x_masked, mask


In [111]:

for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    # Mask a fraction of node attributes
    x_recipe_masked, mask_recipe = mask_nodes(data['recipe'].x)
    x_menu_masked, mask_menu = mask_nodes(data['menu'].x)
    x_ingredient_masked, mask_ingredient = mask_nodes(data['ingredient'].x)

    data['recipe'].x = x_recipe_masked
    data['menu'].x = x_menu_masked
    data['ingredient'].x = x_ingredient_masked

    out_recipe_menu, out_menu, out_recipe_ingredient = model(data)

    # Reconstruction loss
    loss_recipe = F.mse_loss(out_recipe_menu[mask_recipe], data['recipe'].x[mask_recipe])
    loss_menu = F.mse_loss(out_menu[mask_menu], data['menu'].x[mask_menu])
    loss_ingredient = F.mse_loss(out_recipe_ingredient[mask_ingredient], data['ingredient'].x[mask_ingredient])
    
    loss = loss_recipe + loss_menu + loss_ingredient

    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

RuntimeError: index 723 is out of bounds for dimension 0 with size 723

In [93]:
model.encode(train_data.x_dict, train_data.edge_index_dict)

tensor([[-10.2415,   0.5008,  -7.4603,  ...,  -6.9207,  -1.3326,  -3.3603],
        [-10.2391,   0.5022,  -7.4600,  ...,  -6.9198,  -1.3326,  -3.3601],
        [-10.2387,   0.5019,  -7.4575,  ...,  -6.9208,  -1.3332,  -3.3580],
        ...,
        [-10.2428,   0.5010,  -7.4602,  ...,  -6.9220,  -1.3324,  -3.3581],
        [-10.2353,   0.5031,  -7.4569,  ...,  -6.9189,  -1.3329,  -3.3591],
        [-10.2309,   0.5029,  -7.4524,  ...,  -6.9160,  -1.3320,  -3.3554]],
       grad_fn=<AddmmBackward0>)