In [None]:
from gnn import GNNTrainer
from movie_lens_loader import MovieLensLoader
from llm import PromptEncoderOnlyClassifier, VanillaEncoderOnlyClassifier

First we create the MovieLensLoader, which downloads the MovieLens dataset (https://files.grouplens.org/datasets/movielens/ml-latest-small.zip) and prepares it for use with the GNN and the LLM (approximately 30 seconds on the first run).

In [None]:

movie_lens_loader = MovieLensLoader()

Next we initialize the GNNTrainer, which expects the complete dataset so that it can read the dataset schema. The GNNTrainer can later be used to train a model for link prediction.

In [None]:
gnn_trainer = GNNTrainer(movie_lens_loader.data)

We then train and validate the model on the link prediction task. If the model is already trained, we can skip this part.

In [None]:
#gnn_trainer.train_model(movie_lens_loader.gnn_train_data, 10)
#gnn_trainer.validate_model(movie_lens_loader.gnn_val_data)

In [None]:

def __get_embedding(self, row, movie_lens_loader: MovieLensLoader):
    """Attach the user and movie embeddings of one edge to its row.

    Args:
        self: Object exposing ``get_embedding(data, user_id, movie_id)``.
        row: Mapping-like row providing "split", "mappedUserId" and
            "mappedMovieId".
        movie_lens_loader: Loader holding ``gnn_train_data``, ``gnn_val_data``
            and ``gnn_test_data``.

    Returns:
        The same ``row``, modified in place: "user_embedding" and
        "movie_embedding" are set to the ``.detach().tolist()`` form of the
        first two values returned by ``self.get_embedding``.
    """
    split_name = row["split"]
    if split_name == "val":
        graph = movie_lens_loader.gnn_val_data
    elif split_name == "test":
        graph = movie_lens_loader.gnn_test_data
    else:
        # "train" and every unrecognised split value use the training graph.
        graph = movie_lens_loader.gnn_train_data
    mapped_user = row["mappedUserId"]
    mapped_movie = row["mappedMovieId"]
    # get_embedding yields four values; only the first two are kept.
    user_vector, movie_vector, _, _ = self.get_embedding(graph, mapped_user, mapped_movie)
    row["user_embedding"] = user_vector.detach().tolist()
    row["movie_embedding"] = movie_vector.detach().tolist()
    return row
        
# Produce the embeddings for all edges: __get_embedding is applied to every row
# of llm_df (axis=1), with gnn_trainer passed in the function's `self` position.
df = movie_lens_loader.llm_df.apply(lambda row: __get_embedding(gnn_trainer, row, movie_lens_loader), axis = 1)


In [None]:
# Compress the user embeddings of every split to two dimensions with PCA and
# store them, as strings, in the column "pca_2_user_embedding".
# NOTE(review): `PCA` is not imported in any visible cell — presumably
# sklearn.decomposition.PCA; confirm and add it to the import cell.
df["pca_2_user_embedding"] = ""
for split in ["train", "test", "val", "rest"]:
    condition = df["split"] == split
    user_embeddings = list(df.loc[condition, "user_embedding"].values)
    if not user_embeddings:
        # No row carries this split label, so there is nothing to fit or project.
        print(f"{split}: no rows, skipped")
        continue
    # The trainer keeps one PCA object per split (user_pca_train, user_pca_val,
    # user_pca_test, user_pca_rest). A new one is created when none is stored
    # yet or when a recompute is forced.
    pca_attribute = f"user_pca_{split}"
    if not getattr(gnn_trainer, pca_attribute) or gnn_trainer.force_recompute:
        setattr(gnn_trainer, pca_attribute, PCA(n_components=2))  # Reduce to 2 dimensions
    pca = getattr(gnn_trainer, pca_attribute)
    # Fit and project exactly once per split (the debug prints used to refit twice more).
    # NOTE(review): a stored PCA is still refitted here, as before; switch to
    # pca.transform(...) if reusing an already fitted projection is intended.
    projected = pca.fit_transform(user_embeddings)
    print(f"{split}: projected {projected.shape[0]} embeddings to {projected.shape[1]} dimensions")
    # Each 2-d point is stored as its string form, e.g. "[0.1, 0.2]".
    df.loc[condition, "pca_2_user_embedding"] = [str(point) for point in projected.tolist()]

Next we produce the user embedding and movie embedding for every edge in the dataset. These embeddings can then be used by the LLM on the link-prediction task. This step can be skipped if it was already done once.

In [None]:
llm_df = gnn_trainer.get_embeddings(movie_lens_loader)


Next we initialize the vanilla encoder-only classifier. This classifier uses only the natural-language part of the prompt (no graph embeddings) to predict whether the given link exists.

In [None]:
vanilla_encoder_only_classifier = VanillaEncoderOnlyClassifier(movie_lens_loader.llm_df)

Next we generate a vanilla llm dataset and tokenize it for training.

In [None]:
dataset_vanilla = movie_lens_loader.generate_vanilla_dataset(vanilla_encoder_only_classifier.tokenize_function)

Next we train the model on the produced dataset. This can be skipped if the model was already trained once.

In [None]:
vanilla_encoder_only_classifier.train_model_on_data(dataset_vanilla, epochs=3)

Next we initialize the prompt encoder only classifier. This classifier uses the vanilla prompt and the graph embeddings for its link prediction.

In [None]:
prompt_encoder_only_classifier = PromptEncoderOnlyClassifier(movie_lens_loader, gnn_trainer.get_embedding)

We also generate a prompt dataset; this time the prompts additionally include the 2D embeddings of the user and the movie.

In [None]:
dataset_prompt = movie_lens_loader.generate_prompt_embedding_dataset(prompt_encoder_only_classifier.tokenize_function)

We also train this model. This can be skipped if it was already done once.

In [None]:
prompt_encoder_only_classifier.train_model_on_data(dataset_prompt, epochs = 3)

In [None]:
# Draw one tokenized datapoint per classifier (prompt / vanilla) and per label.
# existing=False is used for the samples named "negative"; the "positive" calls
# leave `existing` at the method's default.
# NOTE(review): only the negative prompt sample passes get_embedding_cb — confirm
# that the positive prompt sample does not need it as well.
prompt_negative_sample = movie_lens_loader.sample_prompt_datapoint(existing=False, get_embedding_cb=gnn_trainer.get_embedding, tokenize_function=prompt_encoder_only_classifier.tokenize_function)
prompt_positive_sample = movie_lens_loader.sample_prompt_datapoint(tokenize_function=prompt_encoder_only_classifier.tokenize_function)
vanilla_negative_sample = movie_lens_loader.sample_vanilla_datapoint(existing=False, tokenize_function=vanilla_encoder_only_classifier.tokenize_function)
vanilla_positive_sample = movie_lens_loader.sample_vanilla_datapoint(tokenize_function=vanilla_encoder_only_classifier.tokenize_function)

In [None]:
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer
import pandas as pd
import numpy as np

In [None]:
# Load the tokenizer of the google/bert_uncased_L-2_H-128_A-2 checkpoint with
# model_max_length set to 256.
# NOTE(review): no later visible cell reads `tokenizer` (foo uses self.tokenizer) — confirm it is still needed.
tokenizer = BertTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2", model_max_length=256)

In [None]:
# Hand-written example string: user id, movie title, genre list and a trailing two-number vector.
# NOTE(review): `test` is not referenced by any later visible cell.
test = "user: 0, title: Toy Story (1995), genres: ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'],[0.09566975384950638, 0.1871771365404129]"

# Current State
Here I want to plot the attentions not only between single tokens but also between the embedding part and the non-embedding part of the prompt.

In [None]:
def find_sub_list(sl,l):
    """Return the index at which `sl` first occurs as a contiguous run in `l`.

    Args:
        sl: The sub-sequence to look for.
        l: The sequence to search. Slices of `l` are compared to `sl` with
            `==`, so both should be of the same sequence type (e.g. two lists).

    Returns:
        The start index of the first match, 0 for an empty `sl`, or None when
        `sl` does not occur in `l`.
    """
    sll = len(sl)
    if sll == 0:
        # An empty sub-list matches at the very start (like "abc".find("") == 0).
        # Previously sl[0] raised IndexError here whenever `l` was non-empty.
        return 0
    first = sl[0]
    for ind, element in enumerate(l):
        # Cheap first-element check before comparing the whole window.
        if element == first and l[ind:ind + sll] == sl:
            return ind
    return None

def foo(self: PromptEncoderOnlyClassifier, sample: dict, layer = -1):
    """Plot a token-by-token heatmap of one layer's attention, summed over dim 1.

    Args:
        self: Classifier whose ``model`` and ``tokenizer`` are used.
        sample: Tokenized datapoint with "input_ids" and "attention_mask";
            only its first sequence (index 0) is plotted.
        layer: Index into the per-layer attentions returned by the model
            (default -1, i.e. the last entry).
    """
    self.model.eval()
    with torch.no_grad():
        outputs = self.model(input_ids = sample["input_ids"], attention_mask = sample["attention_mask"], output_attentions=True)
    # One attention tensor per layer.
    # Assumes each is (batch, heads, seq, seq), as documented for Hugging Face models — TODO confirm for self.model.
    attentions = outputs.attentions
    # Sum over dim 1 (presumably the heads) and keep the first sequence, matching
    # input_ids[0] below; .cpu() keeps this working when the model runs on a GPU.
    combined_attention = torch.sum(attentions[layer], dim=1)[0].detach().cpu().numpy()
    # Tokenize the text to get the token labels
    tokens = self.tokenizer.convert_ids_to_tokens(sample['input_ids'][0])
    # Start positions of the two embedding parts of the prompt (None when absent).
    # The movie lookup used to search for the 'user' pattern a second time.
    # NOTE(review): the exact word pieces depend on the prompt template and tokenizer — confirm.
    # TODO: not used yet; intended for the embedding vs. non-embedding attention plot.
    starting_index_user_embeddings = find_sub_list(['user', 'em', '##bed', '##ding'], tokens)
    starting_index_movie_embeddings = find_sub_list(['movie', 'em', '##bed', '##ding'], tokens)

    # Plot the combined attention weights
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(combined_attention, xticklabels=tokens, yticklabels=tokens, cmap='viridis', ax=ax)
    # Show the layer that was actually plotted as a non-negative, 0-based index
    # (the title was hard-coded to "Layer 1" before).
    ax.set_title(f'Combined Attention Weights for Layer {layer % len(attentions)} After Linear Projection')
    ax.set_xlabel('Tokens')
    ax.set_ylabel('Tokens')
    plt.show()
# Plot the attention heatmap for the negative prompt sample, using the default layer (-1).
foo(prompt_encoder_only_classifier, prompt_negative_sample)

In [None]:
# Confusion matrix of the prompt (text + graph embedding) classifier on the prompt dataset.
prompt_encoder_only_classifier.plot_confusion_matrix(dataset=dataset_prompt)

In [None]:
# Confusion matrix of the vanilla (text-only) classifier on the vanilla dataset.
vanilla_encoder_only_classifier.plot_confusion_matrix(dataset=dataset_vanilla)