In [2]:
# Install necessary dependencies (takes a long time)
!pip install torch torch_scatter torch_sparse torch_geometric graphdatascience

^C


In [7]:
import os
import pandas as pd
from graphdatascience import GraphDataScience
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, RGCNConv
from torch_geometric.transforms import RandomNodeSplit
import random
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer


In [8]:
# Set seeds for consistent results
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [9]:
# Get Neo4j DB URI, credentials and name from environment if applicable
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
NEO4J_AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "pleaseletmein"),
)

gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)

In [10]:
def fetch_data(query):
    return gds.run_cypher(query)

In [12]:
def load_node(cypher, index_col, encoders=None, target_encoders=None, **kwargs):
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher)
    df.set_index(index_col, inplace=True)
    # Define node mapping
    mapping = {index: i for i, index in enumerate(df.index.unique())}
    # Define node features
    x = None
    if encoders is not None:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)
        
    y = None
    if target_encoders is not None: 
        ys = [encoder(df[col]) for col, encoder in target_encoders.items()]
        y = torch.cat(ys, dim=-1)

    return x, mapping, y

In [13]:
def load_edge(cypher, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher)
    # Define edge index
    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    edge_index = torch.tensor([src, dst])
    # Define edge features
    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr

In [14]:
def tags_encoder(tags):
    mlb = MultiLabelBinarizer()
    tags_mlb = mlb.fit_transform(tags)
    return torch.Tensor(list(tags_mlb))

from sentence_transformers import SentenceTransformer

class SequenceEncoder:
    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, df):
        x = self.model.encode(df.values, show_progress_bar=True,
                              convert_to_tensor=True, device=self.device)
        return x.cpu()
    
class WeekdayEncoder:
    def __init__(self, sep='|'):
        self.sep = sep

    def __call__(self, df):
        genres = set(col for col in df.values)
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, col in enumerate(df.values):
            for genre in col.split(self.sep):
                x[i, mapping[genre]] = 1
        return x

In [15]:
recipe_query = """
MATCH (a:Recipe)
OPTIONAL MATCH (a)-[:HAS_TAG]->(tag:Tag)
RETURN a.recipieId as recipeId,
       a.openaiEmbeddings as embedding,
       collect(tag.title) AS tags

"""

recipe_x, recipe_mapping, recipe_y = load_node(
    recipe_query, 
    index_col='recipeId', 
    encoders={
        "embedding": lambda x: torch.Tensor(x)
    },
    target_encoders={
        "tags": tags_encoder
    }
)

  "embedding": lambda x: torch.Tensor(x)
  return torch.Tensor(list(tags_mlb))


In [16]:
recipe_x.shape, len(recipe_mapping), recipe_y.shape

(torch.Size([723, 1536]), 723, torch.Size([723, 262]))

In [17]:
ingredient_query = """
MATCH (i:Ingredient)
RETURN ID(i) as ingredientId, i.title as title
"""
ingredient_x, ingredient_mapping, y = load_node(
    ingredient_query, 
    index_col='ingredientId', 
    encoders={
        'title': SequenceEncoder()
    }
)

Batches:   0%|          | 0/57 [00:00<?, ?it/s]

In [18]:
ingredient_x.shape, len(ingredient_mapping)

(torch.Size([1793, 384]), 1793)

In [19]:
menu_query = """
MATCH (m:Menu)
RETURN ID(m) as menuId, m.year as year, m.week as week
"""
menu_x, menu_mapping, y = load_node(
    menu_query, 
    index_col='menuId',
    encoders={
        "year": lambda x: torch.Tensor(x.tolist()).view(-1, 1),
        "week": lambda x: torch.Tensor(x.tolist()).view(-1, 1),
    }
)

In [20]:
menu_x.shape, len(menu_mapping)

(torch.Size([142, 2]), 142)

In [21]:
recipe_menu_query = """
MATCH (n:Recipe)-[r:IS_PART_OF_MENU]->(m:Menu) 
RETURN n.recipieId AS recipeId, ID(m) AS menuId, r.weekDay AS weekDay
"""

recipe_menu_edge_index, recipe_menu_edge_label = load_edge(
    recipe_menu_query,
    src_index_col='recipeId',
    src_mapping=recipe_mapping,
    dst_index_col='menuId',
    dst_mapping=menu_mapping,
    encoders={'weekDay': WeekdayEncoder()},
)

In [22]:
recipe_menu_edge_index.shape, recipe_menu_edge_label.shape

(torch.Size([2, 993]), torch.Size([993, 7]))

In [23]:
recipe_ingredient_query = """
MATCH (n:Recipe)-[r:HAS_INGREDIENT]->(i:Ingredient) 
RETURN n.recipieId AS recipeId, ID(i) AS ingredientId
"""

recipe_ingredient_edge_index, recipe_ingredient_edge_label = load_edge(
    recipe_ingredient_query,
    src_index_col='recipeId',
    src_mapping=recipe_mapping,
    dst_index_col='ingredientId',
    dst_mapping=ingredient_mapping,
)

In [24]:
recipe_ingredient_edge_index.shape, recipe_ingredient_edge_label

(torch.Size([2, 7907]), None)

In [55]:
from torch_geometric.data import HeteroData

data = HeteroData()

data['recipe'].x = recipe_x
data['recipe'].y = recipe_y
data["menu"].x = menu_x
data["ingredient"].x = ingredient_x
data["recipe", "has_ingredient", "ingredient"].edge_index = recipe_ingredient_edge_index
data["recipe", "is_part_of_menu", "menu"].edge_index = recipe_menu_edge_index
data["recipe", "is_part_of_menu", "menu"].edge_attr = recipe_menu_edge_label
data.num_relations = 2
data.num_classes = recipe_y.shape[-1]
data.num_nodes = len(recipe_mapping) + len(ingredient_mapping) + len(menu_mapping)
data

HeteroData(
  num_relations=2,
  num_classes=262,
  num_nodes=2658,
  recipe={
    x=[723, 1536],
    y=[723, 262],
  },
  menu={ x=[142, 2] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={ edge_index=[2, 7907] },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 993],
    edge_attr=[993, 7],
  }
)

In [56]:
from torch_geometric.transforms import ToUndirected

data = ToUndirected()(data)
data

HeteroData(
  num_relations=2,
  num_classes=262,
  num_nodes=2658,
  recipe={
    x=[723, 1536],
    y=[723, 262],
  },
  menu={ x=[142, 2] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={ edge_index=[2, 7907] },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 993],
    edge_attr=[993, 7],
  },
  (ingredient, rev_has_ingredient, recipe)={ edge_index=[2, 7907] },
  (menu, rev_is_part_of_menu, recipe)={
    edge_index=[2, 993],
    edge_attr=[993, 7],
  }
)

In [57]:
from torch_geometric.transforms import RandomLinkSplit

# 2. Perform a link-level split into training, validation, and test edges.
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[("recipe", "is_part_of_menu", "menu"), ("recipe", "has_ingredient", "ingredient")],
    # rev_edge_types=[('movie', 'rev_rates', 'user')],
)
train_data, val_data, test_data = transform(data)

In [58]:
data.metadata()[1]

HeteroData(
  num_relations=2,
  num_classes=262,
  num_nodes=2658,
  recipe={
    x=[723, 1536],
    y=[723, 262],
  },
  menu={ x=[142, 2] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={
    edge_index=[2, 6327],
    edge_label=[6327],
    edge_label_index=[2, 6327],
  },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 795],
    edge_attr=[795, 7],
    edge_label=[795],
    edge_label_index=[2, 795],
  },
  (ingredient, rev_has_ingredient, recipe)={ edge_index=[2, 7907] },
  (menu, rev_is_part_of_menu, recipe)={
    edge_index=[2, 993],
    edge_attr=[993, 7],
  }
)

In [29]:
from torch_geometric.nn import GATConv, Linear, to_hetero

class GAT(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.lin1 = Linear(-1, hidden_channels)
        self.conv2 = GATConv((-1, -1), out_channels, add_self_loops=False)
        self.lin2 = Linear(-1, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index) + self.lin1(x)
        x = x.relu()
        x = self.conv2(x, edge_index) + self.lin2(x)
        return x


In [59]:
model = GAT(hidden_channels=64, out_channels=data.num_classes)
model = to_hetero(model, data.metadata(), aggr='sum')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

In [61]:
with torch.no_grad():  # Initialize lazy modules.
    out = model(data.x_dict, data.edge_index_dict)

In [32]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    # mask = data['paper'].train_mask
    loss = F.cross_entropy(out['recipe'], data['recipe'].y)
    loss.backward()
    optimizer.step()
    return float(loss)

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop
for epoch in range(500):
    model.train()
    optimizer.zero_grad()
    
    out = model(train_data.x_dict, train_data.edge_index_dict)
    loss = criterion(out['recipe'], train_data['recipe'].y.float())
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
    
        # Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
    
        out = model(val_data.x_dict, val_data.edge_index_dict)
        loss = criterion(out["recipe"], val_data["recipe"].y)
        val_loss += loss.item()
        print(f"Validation Loss: {val_loss}")


Epoch 1, Loss: 1.324755311012268
Validation Loss: 21.23717498779297
Epoch 2, Loss: 21.23717498779297
Validation Loss: 3.708242893218994
Epoch 3, Loss: 3.708242893218994
Validation Loss: 2.041651964187622
Epoch 4, Loss: 2.041651964187622
Validation Loss: 3.559126377105713
Epoch 5, Loss: 3.559126377105713
Validation Loss: 3.771883964538574
Epoch 6, Loss: 3.771883964538574
Validation Loss: 3.9242632389068604
Epoch 7, Loss: 3.9242632389068604
Validation Loss: 4.753610134124756
Epoch 8, Loss: 4.753610134124756
Validation Loss: 4.958366394042969
Epoch 9, Loss: 4.958366394042969
Validation Loss: 4.9136962890625
Epoch 10, Loss: 4.9136962890625
Validation Loss: 5.23234748840332
Epoch 11, Loss: 5.23234748840332
Validation Loss: 5.568265914916992
Epoch 12, Loss: 5.568265914916992
Validation Loss: 5.590836048126221
Epoch 13, Loss: 5.590836048126221
Validation Loss: 5.611876010894775
Epoch 14, Loss: 5.611876010894775
Validation Loss: 5.548414707183838
Epoch 15, Loss: 5.548414707183838
Validation Lo

In [70]:

# For evaluation metrics, you can use sklearn's metrics
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Convert the model's output to binary labels
def to_binary_labels(output):
    return (torch.sigmoid(output) > 0.5).int()

# Evaluate on validation set
model.eval()
with torch.no_grad():
  # Assuming val_data is your validation data
    out = model(test_data.x_dict, test_data.edge_index_dict)
    test_loss = criterion(out['recipe'], test_data['recipe'].y.float())
    print(f"Test Loss: {test_loss.item()}")

    # Convert outputs to binary labels for metric calculation
    preds = to_binary_labels(out['recipe'])
    labels = test_data['recipe'].y.int()

    f1 = f1_score(labels.numpy(), preds.numpy(), average='micro')
    precision = precision_score(labels.numpy(), preds.numpy(), average='micro')
    recall = recall_score(labels.numpy(), preds.numpy(), average='micro')
    accuracy = accuracy_score(labels.numpy(), preds.numpy())

    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {accuracy}")

Test Loss: 1.3247435092926025
F1 Score: 0.16283586921333765
Precision: 0.2678381256656017
Recall: 0.1169767441860465
Accuracy: 0.0
