In [2]:
# Install necessary dependencies (takes a long time)
!pip install torch torch_scatter torch_sparse torch_geometric graphdatascience

^C


In [7]:
import os
import pandas as pd
from graphdatascience import GraphDataScience
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, RGCNConv
from torch_geometric.transforms import RandomNodeSplit
import random
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer


In [8]:
# Set seeds for consistent results
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [9]:
# Get Neo4j DB URI, credentials and name from environment if applicable
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
NEO4J_AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "pleaseletmein"),
)

gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)

In [10]:
def fetch_data(query):
    return gds.run_cypher(query)

In [12]:
def load_node(cypher, index_col, encoders=None, target_encoders=None, **kwargs):
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher)
    df.set_index(index_col, inplace=True)
    # Define node mapping
    mapping = {index: i for i, index in enumerate(df.index.unique())}
    # Define node features
    x = None
    if encoders is not None:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)
        
    y = None
    if target_encoders is not None: 
        ys = [encoder(df[col]) for col, encoder in target_encoders.items()]
        y = torch.cat(ys, dim=-1)

    return x, mapping, y

In [13]:
def load_edge(cypher, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher)
    # Define edge index
    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    edge_index = torch.tensor([src, dst])
    # Define edge features
    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr

In [126]:
mlb = MultiLabelBinarizer()

def tags_encoder(tags):
    tags_mlb = mlb.fit_transform(tags)
    return torch.Tensor(list(tags_mlb))

from sentence_transformers import SentenceTransformer

class SequenceEncoder:
    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, df):
        x = self.model.encode(df.values, show_progress_bar=True,
                              convert_to_tensor=True, device=self.device)
        return x.cpu()
    
class WeekdayEncoder:
    def __init__(self, sep='|'):
        self.sep = sep

    def __call__(self, df):
        genres = set(col for col in df.values)
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, col in enumerate(df.values):
            for genre in col.split(self.sep):
                x[i, mapping[genre]] = 1
        return x

In [127]:
recipe_query = """
MATCH (a:Recipe)
OPTIONAL MATCH (a)-[:HAS_TAG]->(tag:Tag)
RETURN a.recipieId as recipeId,
       a.openaiEmbeddings as embedding,
       collect(tag.title) AS tags

"""

recipe_x, recipe_mapping, recipe_y = load_node(
    recipe_query, 
    index_col='recipeId', 
    encoders={
        "embedding": lambda x: torch.Tensor(x)
    },
    target_encoders={
        "tags": tags_encoder
    }
)

  "embedding": lambda x: torch.Tensor(x)


In [84]:
recipe_x.shape, len(recipe_mapping), recipe_y.shape

(torch.Size([723, 1536]), 723, torch.Size([723, 262]))

In [17]:
ingredient_query = """
MATCH (i:Ingredient)
RETURN ID(i) as ingredientId, i.title as title
"""
ingredient_x, ingredient_mapping, y = load_node(
    ingredient_query, 
    index_col='ingredientId', 
    encoders={
        'title': SequenceEncoder()
    }
)

Batches:   0%|          | 0/57 [00:00<?, ?it/s]

In [18]:
ingredient_x.shape, len(ingredient_mapping)

(torch.Size([1793, 384]), 1793)

In [19]:
menu_query = """
MATCH (m:Menu)
RETURN ID(m) as menuId, m.year as year, m.week as week
"""
menu_x, menu_mapping, y = load_node(
    menu_query, 
    index_col='menuId',
    encoders={
        "year": lambda x: torch.Tensor(x.tolist()).view(-1, 1),
        "week": lambda x: torch.Tensor(x.tolist()).view(-1, 1),
    }
)

In [20]:
menu_x.shape, len(menu_mapping)

(torch.Size([142, 2]), 142)

In [89]:
recipe_menu_query = """
MATCH (n:Recipe)-[:HAS_TAG]->(:Tag), (n)-[r:IS_PART_OF_MENU]->(m:Menu)
RETURN DISTINCT n.recipieId AS recipeId, ID(m) AS menuId, r.weekDay AS weekDay
"""

recipe_menu_edge_index, recipe_menu_edge_label = load_edge(
    recipe_menu_query,
    src_index_col='recipeId',
    src_mapping=recipe_mapping,
    dst_index_col='menuId',
    dst_mapping=menu_mapping,
    encoders={'weekDay': WeekdayEncoder()},
)

In [90]:
recipe_menu_edge_index.shape, recipe_menu_edge_label.shape

(torch.Size([2, 698]), torch.Size([698, 7]))

In [91]:
recipe_ingredient_query = """
MATCH (n:Recipe)-[:HAS_TAG]->(:Tag), (n)-[r:HAS_INGREDIENT]->(i:Ingredient) 
RETURN DISTINCT n.recipieId AS recipeId, ID(i) AS ingredientId
"""

recipe_ingredient_edge_index, recipe_ingredient_edge_label = load_edge(
    recipe_ingredient_query,
    src_index_col='recipeId',
    src_mapping=recipe_mapping,
    dst_index_col='ingredientId',
    dst_mapping=ingredient_mapping,
)

In [92]:
recipe_ingredient_edge_index.shape, recipe_ingredient_edge_label

(torch.Size([2, 5921]), None)

In [93]:
from torch_geometric.data import HeteroData

data = HeteroData()

data['recipe'].x = recipe_x
data['recipe'].y = recipe_y
data["menu"].x = menu_x
data["ingredient"].x = ingredient_x
data["recipe", "has_ingredient", "ingredient"].edge_index = recipe_ingredient_edge_index
data["recipe", "is_part_of_menu", "menu"].edge_index = recipe_menu_edge_index
data["recipe", "is_part_of_menu", "menu"].edge_attr = recipe_menu_edge_label
data.num_relations = 2
data.num_classes = recipe_y.shape[-1]
data.num_nodes = len(recipe_mapping) + len(ingredient_mapping) + len(menu_mapping)
data

HeteroData(
  num_relations=2,
  num_classes=262,
  num_nodes=2658,
  recipe={
    x=[723, 1536],
    y=[723, 262],
  },
  menu={ x=[142, 2] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={ edge_index=[2, 5921] },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 698],
    edge_attr=[698, 7],
  }
)

In [94]:
from torch_geometric.transforms import ToUndirected

data = ToUndirected()(data)
data

HeteroData(
  num_relations=2,
  num_classes=262,
  num_nodes=2658,
  recipe={
    x=[723, 1536],
    y=[723, 262],
  },
  menu={ x=[142, 2] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={ edge_index=[2, 5921] },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 698],
    edge_attr=[698, 7],
  },
  (ingredient, rev_has_ingredient, recipe)={ edge_index=[2, 5921] },
  (menu, rev_is_part_of_menu, recipe)={
    edge_index=[2, 698],
    edge_attr=[698, 7],
  }
)

In [95]:
from torch_geometric.transforms import RandomLinkSplit

# 2. Perform a link-level split into training, validation, and test edges.
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[("recipe", "is_part_of_menu", "menu"), ("recipe", "has_ingredient", "ingredient")],
    # rev_edge_types=[('movie', 'rev_rates', 'user')],
)
train_data, val_data, test_data = transform(data)

In [96]:
data.metadata()[1]

[('recipe', 'has_ingredient', 'ingredient'),
 ('recipe', 'is_part_of_menu', 'menu'),
 ('ingredient', 'rev_has_ingredient', 'recipe'),
 ('menu', 'rev_is_part_of_menu', 'recipe')]

In [97]:
from torch_geometric.nn import GATConv, Linear, to_hetero

class GAT(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.lin1 = Linear(-1, hidden_channels)
        self.conv2 = GATConv((-1, -1), out_channels, add_self_loops=False)
        self.lin2 = Linear(-1, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index) + self.lin1(x)
        x = x.relu()
        x = self.conv2(x, edge_index) + self.lin2(x)
        return x


In [98]:
model = GAT(hidden_channels=64, out_channels=data.num_classes)
model = to_hetero(model, data.metadata(), aggr='sum')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

In [61]:
with torch.no_grad():  # Initialize lazy modules.
    out = model(data.x_dict, data.edge_index_dict)

In [99]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop
for epoch in range(500):
    model.train()
    optimizer.zero_grad()
    
    out = model(train_data.x_dict, train_data.edge_index_dict)
    loss = criterion(out['recipe'], train_data['recipe'].y.float())
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
    
        # Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
    
        out = model(val_data.x_dict, val_data.edge_index_dict)
        loss = criterion(out["recipe"], val_data["recipe"].y)
        val_loss += loss.item()
        print(f"Validation Loss: {val_loss}")


Epoch 1, Loss: 76.69147491455078
Validation Loss: 49.7157096862793
Epoch 2, Loss: 49.7157096862793
Validation Loss: 27.945297241210938
Epoch 3, Loss: 27.945297241210938
Validation Loss: 14.334324836730957
Epoch 4, Loss: 14.334324836730957
Validation Loss: 8.50998306274414
Epoch 5, Loss: 8.50998306274414
Validation Loss: 5.360548973083496
Epoch 6, Loss: 5.360548973083496
Validation Loss: 4.900411128997803
Epoch 7, Loss: 4.900411128997803
Validation Loss: 4.559986591339111
Epoch 8, Loss: 4.559986591339111
Validation Loss: 4.417131423950195
Epoch 9, Loss: 4.417131423950195
Validation Loss: 4.963930130004883
Epoch 10, Loss: 4.963930130004883
Validation Loss: 4.875558853149414
Epoch 11, Loss: 4.875558853149414
Validation Loss: 4.920295238494873
Epoch 12, Loss: 4.920295238494873
Validation Loss: 5.022231101989746
Epoch 13, Loss: 5.022231101989746
Validation Loss: 5.022214889526367
Epoch 14, Loss: 5.022214889526367
Validation Loss: 4.886704921722412
Epoch 15, Loss: 4.886704921722412
Validatio

In [100]:

# For evaluation metrics, you can use sklearn's metrics
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Convert the model's output to binary labels
def to_binary_labels(output):
    return (torch.sigmoid(output) > 0.5).int()

# Evaluate on validation set
model.eval()
with torch.no_grad():
  # Assuming val_data is your validation data
    out = model(test_data.x_dict, test_data.edge_index_dict)
    test_loss = criterion(out['recipe'], test_data['recipe'].y.float())
    print(f"Test Loss: {test_loss.item()}")

    # Convert outputs to binary labels for metric calculation
    preds = to_binary_labels(out['recipe'])
    labels = test_data['recipe'].y.int()

    f1 = f1_score(labels.numpy(), preds.numpy(), average='micro')
    precision = precision_score(labels.numpy(), preds.numpy(), average='micro')
    recall = recall_score(labels.numpy(), preds.numpy(), average='micro')
    accuracy = accuracy_score(labels.numpy(), preds.numpy())

    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {accuracy}")

Test Loss: 0.2922382652759552
F1 Score: 0.1896095301125083
Precision: 0.32855504587155965
Recall: 0.13325581395348837
Accuracy: 0.25311203319502074


In [103]:
no_tag_query = """
MATCH (n:Recipe)
WHERE NOT (n)-[:HAS_TAG]->(:Tag)
return n.recipieId as recipeId, n.title as title, n.openaiEmbeddings as embedding
""" 

df = fetch_data(no_tag_query)
df.head()

Unnamed: 0,recipeId,title,embedding
0,a220a04a-c211-461d-bad8-d73ed2d29ea3,Baked feta pasta,"[-0.0077616022899746895, -0.007513688877224922..."
1,cbf082fa-38b1-4395-b3a7-0a0546333861,Blomkålsuppe med kikerter og grønnkål,"[-0.005160760134458542, -0.020936481654644012,..."
2,1dd265fa-d7d0-408f-9ab3-bcf42d66211d,Blomkålsalat med syrlig dressing,"[-0.017495296895503998, -0.019598836079239845,..."
3,42aaa6ff-eaf8-4c1b-8925-55e245a56678,Steinsopprisotto,"[0.0018266895785927773, -0.029227033257484436,..."
4,67902641-b348-45f2-b8dd-bee9a3070c9e,Klassisk pasta carbonara,"[0.0034193696919828653, -0.01366486120969057, ..."


In [105]:
recipe_x, recipe_mapping, _ = load_node(
    no_tag_query, 
    index_col='recipeId', 
    encoders={
        "embedding": lambda x: torch.Tensor(x)
    }
)

  "embedding": lambda x: torch.Tensor(x)


In [107]:
recipe_x.shape, len(recipe_mapping)

(torch.Size([183, 1536]), 183)

In [108]:
recipe_menu_query = """
MATCH (n:Recipe)-[r:IS_PART_OF_MENU]->(m:Menu)
WHERE NOT (n)-[:HAS_TAG]->(:Tag)
RETURN DISTINCT n.recipieId AS recipeId, ID(m) AS menuId, r.weekDay AS weekDay
"""

recipe_menu_edge_index, recipe_menu_edge_label = load_edge(
    recipe_menu_query,
    src_index_col='recipeId',
    src_mapping=recipe_mapping,
    dst_index_col='menuId',
    dst_mapping=menu_mapping,
    encoders={'weekDay': WeekdayEncoder()},
)

In [109]:
recipe_menu_edge_index.shape, recipe_menu_edge_label.shape

(torch.Size([2, 295]), torch.Size([295, 7]))

In [110]:
recipe_ingredient_query = """
MATCH (n)-[r:HAS_INGREDIENT]->(i:Ingredient)
WHERE NOT (n)-[:HAS_TAG]->(:Tag)
RETURN DISTINCT n.recipieId AS recipeId, ID(i) AS ingredientId
"""

recipe_ingredient_edge_index, recipe_ingredient_edge_label = load_edge(
    recipe_ingredient_query,
    src_index_col='recipeId',
    src_mapping=recipe_mapping,
    dst_index_col='ingredientId',
    dst_mapping=ingredient_mapping,
)

In [111]:
recipe_ingredient_edge_index.shape, recipe_ingredient_edge_label

(torch.Size([2, 1986]), None)

In [114]:
from torch_geometric.data import HeteroData

no_tag_data = HeteroData()

no_tag_data['recipe'].x = recipe_x
no_tag_data["menu"].x = menu_x
no_tag_data["ingredient"].x = ingredient_x
no_tag_data["recipe", "has_ingredient", "ingredient"].edge_index = recipe_ingredient_edge_index
no_tag_data["recipe", "is_part_of_menu", "menu"].edge_index = recipe_menu_edge_index
no_tag_data["recipe", "is_part_of_menu", "menu"].edge_attr = recipe_menu_edge_label
no_tag_data.num_relations = 2
no_tag_data.num_nodes = len(recipe_mapping) + len(ingredient_mapping) + len(menu_mapping)
no_tag_data

HeteroData(
  num_relations=2,
  num_nodes=2118,
  recipe={ x=[183, 1536] },
  menu={ x=[142, 2] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={ edge_index=[2, 1986] },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 295],
    edge_attr=[295, 7],
  }
)

In [115]:
from torch_geometric.transforms import ToUndirected

no_tag_data = ToUndirected()(no_tag_data)
no_tag_data

HeteroData(
  num_relations=2,
  num_nodes=2118,
  recipe={ x=[183, 1536] },
  menu={ x=[142, 2] },
  ingredient={ x=[1793, 384] },
  (recipe, has_ingredient, ingredient)={ edge_index=[2, 1986] },
  (recipe, is_part_of_menu, menu)={
    edge_index=[2, 295],
    edge_attr=[295, 7],
  },
  (ingredient, rev_has_ingredient, recipe)={ edge_index=[2, 1986] },
  (menu, rev_is_part_of_menu, recipe)={
    edge_index=[2, 295],
    edge_attr=[295, 7],
  }
)

In [116]:
model.eval()
with torch.no_grad():
    # Assuming test_data is your filtered data without tags
    out = model(no_tag_data.x_dict, no_tag_data.edge_index_dict)
    
    # Convert outputs to probabilities
    probs = torch.sigmoid(out['recipe'])
    
    # Convert probabilities to binary labels (if needed)
    preds = (probs > 0.5).int()

In [138]:
preds.numpy()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [145]:
df = fetch_data(no_tag_query)

In [146]:
print(len(mlb.classes_))
print(len(preds[0]))
print(len(df))

262
262
183


In [148]:
df["tags"] = mlb.inverse_transform(preds.numpy())
df[["title", "tags"]]

Unnamed: 0,title,tags
0,Baked feta pasta,"(Hovedrett, Skalldyr, Tex-Mex)"
1,Blomkålsuppe med kikerter og grønnkål,"(Hovedrett, Skalldyr, Tex-Mex)"
2,Blomkålsalat med syrlig dressing,"(Hovedrett, Skalldyr, Tex-Mex)"
3,Steinsopprisotto,"(Forrett, Gjester, Hovedrett, Skalldyr, Sushi,..."
4,Klassisk pasta carbonara,"(Hovedrett, Kylling, Sommer)"
...,...,...
178,Lam i pita,"(Middagstips,)"
179,Quinoasalat med gresskar,"(Forrett, Gjester, Hovedrett, Skalldyr, Sushi,..."
180,Enkel grønnsakssuppe,"(Hovedrett, Kylling, Sommer)"
181,Middagssalat med kikerter og feta,"(Hovedrett, Skalldyr, Tex-Mex)"
