In [1]:
from graph_representation_generator import GraphRepresentationGenerator
from dataset_manager.kg_manager import (
    MovieLensManager,
    PROMPT_KGE_DIMENSION,
    INPUT_EMBEDS_REPLACE_KGE_DIMENSION,
    ROOT,
)
from llm_manager import (
    PromptBertClassifier,
    VanillaBertClassifier,
    GraphPrompterHFClassifier,
)

In [2]:
# Shared MovieLens manager: later cells read its graph splits
# (`data`, `gnn_*_data`), its dataframes (`llm_df`, `source_df`, `target_df`)
# and call its dataset-building helpers.
kg_manager = MovieLensManager()

In [3]:
# Both generators are built from the same graph object and the same
# train/val/test GNN splits; they differ only in the embedding sizes requested.
gnn_inputs = (
    kg_manager.data,
    kg_manager.gnn_train_data,
    kg_manager.gnn_val_data,
    kg_manager.gnn_test_data,
)

# Generator whose embeddings feed the prompt-based classifier.
graph_representation_generator_prompt = GraphRepresentationGenerator(
    *gnn_inputs,
    kge_dimension=PROMPT_KGE_DIMENSION,
)

# Generator for the graph-prompter-HF classifier; here the hidden width and
# the embedding dimension are set to the same constant.
graph_representation_generator_graph_prompter_hf = GraphRepresentationGenerator(
    *gnn_inputs,
    hidden_channels=INPUT_EMBEDS_REPLACE_KGE_DIMENSION,
    kge_dimension=INPUT_EMBEDS_REPLACE_KGE_DIMENSION,
)

loading pretrained model
Device: 'cuda'
loading pretrained model
Device: 'cuda'


In [4]:
# Look up previously saved embeddings for both variants first.
prompt_embeddings = graph_representation_generator_prompt.get_saved_embeddings(
    "prompt"
)
graph_prompter_hf_embeddings = (
    graph_representation_generator_graph_prompter_hf.get_saved_embeddings(
        "graph_prompter_hf"
    )
)

# A missing cache (None) means the embeddings are generated now, and the
# matching `save` flag is switched on for the append call below.
save_prompt = prompt_embeddings is None
if save_prompt:
    prompt_embeddings = graph_representation_generator_prompt.generate_embeddings(
        kg_manager.llm_df
    )

save_graph_prompter_hf = graph_prompter_hf_embeddings is None
if save_graph_prompter_hf:
    graph_prompter_hf_embeddings = (
        graph_representation_generator_graph_prompter_hf.generate_embeddings(
            kg_manager.llm_df
        )
    )

# Hand both embedding sets to the manager; `save` is True only for freshly
# generated embeddings.
kg_manager.append_prompt_graph_embeddings(prompt_embeddings, save=save_prompt)
kg_manager.append_graph_prompter_hf_graph_embeddings(
    graph_prompter_hf_embeddings, save=save_graph_prompter_hf
)


In [5]:
# Per-variant directories under ROOT; each is passed as `root_path` to the
# corresponding classifier constructed in a later cell.
VANILLA_ROOT = f"{ROOT}/llm/vanilla"
PROMPT_ROOT = f"{ROOT}/llm/prompt"
INPUT_EMBEDS_REPLACE_ROOT = f"{ROOT}/llm/graph_prompter_hf"

In [6]:
# Keyword arguments common to all three classifiers.
# NOTE(review): the meaning of false_ratio=-1 is defined in llm_manager — confirm.
shared_classifier_kwargs = {"false_ratio": -1}

# Baseline classifier: built directly from the manager's dataframes.
vanilla_bert_classifier = VanillaBertClassifier(
    kg_manager.llm_df,
    kg_manager.source_df,
    kg_manager.target_df,
    root_path=VANILLA_ROOT,
    **shared_classifier_kwargs,
)

# Prompt variant: receives the manager itself plus the prompt generator's
# embedding lookup, with an explicit model_max_length.
prompt_bert_classifier = PromptBertClassifier(
    kg_manager,
    graph_representation_generator_prompt.get_embedding,
    root_path=PROMPT_ROOT,
    model_max_length=512,
    **shared_classifier_kwargs,
)

# Graph-prompter-HF variant: same shape of arguments, using its own generator.
graph_prompter_hf_bert_classifier = GraphPrompterHFClassifier(
    kg_manager,
    graph_representation_generator_graph_prompter_hf.get_embedding,
    root_path=INPUT_EMBEDS_REPLACE_ROOT,
    **shared_classifier_kwargs,
)


In [7]:
# One dataset per classifier variant, each tokenized with that classifier's
# own tokenize function.
dataset_vanilla = kg_manager.generate_vanilla_dataset(
    vanilla_bert_classifier.tokenize_function
)
dataset_prompt = kg_manager.generate_prompt_embedding_dataset(
    prompt_bert_classifier.tokenize_function
)

# The graph-prompter-HF dataset builder additionally takes the tokenizer's
# separator and padding tokens.
graph_prompter_hf_tokenizer = graph_prompter_hf_bert_classifier.tokenizer
dataset_graph_prompter_hf = kg_manager.generate_graph_prompter_hf_embedding_dataset(
    graph_prompter_hf_tokenizer.sep_token,
    graph_prompter_hf_tokenizer.pad_token,
    graph_prompter_hf_bert_classifier.tokenize_function,
)

In [9]:
# The hidden-state position is derived from the number of token-type ranges in
# the first training example of the graph-prompter-HF dataset.
token_type_range_count = len(
    dataset_graph_prompter_hf["train"][0]["token_type_ranges"]
)
first_position = token_type_range_count + 3

# The range covers exactly one value (first_position); the loop form is kept so
# that more positions can be swept by widening the upper bound.
for pos in range(first_position, first_position + 1):
    # Forward pass of the prompt classifier, keeping hidden states and logits.
    prompt_df = prompt_bert_classifier.forward_dataset_and_save_outputs(
        dataset_prompt,
        kg_manager.get_prompt_tokens_as_df,
        hidden_state_position=pos,
        load_fields=["hidden_states", "logits"],
        force_recompute=True,
        epochs=1,
    )

    # Same forward pass for the graph-prompter-HF classifier.
    # NOTE(review): this uses get_vanilla_tokens_as_df, not a
    # graph-prompter-specific helper — confirm that is intended.
    graph_prompter_hf_df = (
        graph_prompter_hf_bert_classifier.forward_dataset_and_save_outputs(
            dataset_graph_prompter_hf,
            kg_manager.get_vanilla_tokens_as_df,
            hidden_state_position=pos,
            load_fields=["hidden_states", "logits"],
            force_recompute=True,
            epochs=1,
        )
    )

    # Merge both result frames under their prefixes and write one dataset per
    # position to disk.
    dataset = kg_manager.generate_huggingface_dataset(
        [prompt_df, graph_prompter_hf_df],
        ["prompt", "graph_prompter_hf"],
        add_tokens=False,
    )
    dataset.save_to_disk(f"./data/dataset_hidden_states_{pos}.hf")


train Forward Epoch 1 from 1


ValueError: attempt to get argmax of an empty sequence

In [8]:
# Standalone forward pass of the graph-prompter-HF classifier (no explicit
# hidden_state_position), keeping hidden states and logits.
graph_prompter_hf_forward_options = {
    "epochs": 1,
    "force_recompute": True,
    "load_fields": ["hidden_states", "logits"],
}
graph_prompter_hf_df = (
    graph_prompter_hf_bert_classifier.forward_dataset_and_save_outputs(
        dataset_graph_prompter_hf,
        kg_manager.get_vanilla_tokens_as_df,
        **graph_prompter_hf_forward_options,
    )
)

train Forward Epoch 1 from 1


ValueError: attempt to get argmax of an empty sequence

In [16]:
# Display the current `dataset` as the cell output.
# NOTE(review): `dataset` is assigned in more than one cell and the execution
# counts are out of order, so what is shown depends on which cell ran last.
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt_hidden_states', 'prompt_hidden_states_original_shape', 'prompt_logits', 'prompt_predictions', 'split', 'graph_prompter_hf_hidden_states', 'graph_prompter_hf_logits', 'graph_prompter_hf_hidden_states_original_shape', 'graph_prompter_hf_predictions'],
        num_rows: 56469
    })
    val: Dataset({
        features: ['prompt_hidden_states', 'prompt_hidden_states_original_shape', 'prompt_logits', 'prompt_predictions', 'split', 'graph_prompter_hf_hidden_states', 'graph_prompter_hf_logits', 'graph_prompter_hf_hidden_states_original_shape', 'graph_prompter_hf_predictions'],
        num_rows: 34284
    })
    test: Dataset({
        features: ['prompt_hidden_states', 'prompt_hidden_states_original_shape', 'prompt_logits', 'prompt_predictions', 'split', 'graph_prompter_hf_hidden_states', 'graph_prompter_hf_logits', 'graph_prompter_hf_hidden_states_original_shape', 'graph_prompter_hf_predictions'],
        num_rows: 34284
    })
})

In [8]:
# Forward pass of the baseline classifier; graph embeddings are explicitly
# excluded, and hidden states plus logits are kept.
vanilla_forward_options = {
    "epochs": 1,
    "include_graph_embeddings": False,
    "force_recompute": True,
    "load_fields": ["hidden_states", "logits"],
}
vanilla_df = vanilla_bert_classifier.forward_dataset_and_save_outputs(
    dataset_vanilla,
    kg_manager.get_vanilla_tokens_as_df,
    **vanilla_forward_options,
)

train Forward Epoch 1 from 1
tensor([[-1.5371,  1.3131],
        [-0.9730,  0.6566],
        [-1.6749,  1.4050],
        [-1.3884,  1.0126],
        [-2.0894,  1.5958],
        [-1.6421,  1.3254],
        [-0.4306,  0.2153],
        [-1.2743,  1.0361],
        [-1.6583,  1.4387],
        [-1.6693,  1.3125],
        [-1.6230,  1.3089],
        [-1.3386,  1.0710],
        [-1.5201,  1.2354],
        [-1.5028,  1.1586],
        [-1.5802,  1.2815],
        [-1.7154,  1.3710],
        [-1.6931,  1.4269],
        [-1.3479,  1.0593],
        [-1.5286,  1.2707],
        [-1.6458,  1.3827],
        [-1.6411,  1.4311],
        [-1.0584,  0.7812],
        [-0.2711, -0.1110],
        [-0.4654,  0.0812],
        [ 0.2574, -0.4983],
        [ 0.0412, -0.3482],
        [-0.8761,  0.5034],
        [-0.2106, -0.2186],
        [-0.5903,  0.2960],
        [-0.5305,  0.1971],
        [-1.0252,  0.7675],
        [-1.0387,  0.7474],
        [-1.5022,  1.1573],
        [-1.3784,  1.0648],
        [-2.1076,  

ValueError: attempt to get argmax of an empty sequence

In [10]:
# Pair each result frame with the prefix used for its columns; dict insertion
# order keeps frames and prefixes aligned.
frames_by_prefix = {
    "vanilla": vanilla_df,
    "prompt": prompt_df,
    "graph_prompter_hf": graph_prompter_hf_df,
}
dataset = kg_manager.generate_huggingface_dataset(
    list(frames_by_prefix.values()),
    list(frames_by_prefix.keys()),
    add_tokens=False,
)

In [11]:
# Display the combined dataset (vanilla, prompt and graph_prompter_hf columns)
# as the cell output.
dataset

DatasetDict({
    train: Dataset({
        features: ['vanilla_hidden_states_original_shape', 'vanilla_predictions', 'vanilla_hidden_states', 'split', 'vanilla_logits', 'prompt_hidden_states', 'prompt_hidden_states_original_shape', 'prompt_logits', 'prompt_predictions', 'graph_prompter_hf_hidden_states', 'graph_prompter_hf_logits', 'graph_prompter_hf_hidden_states_original_shape', 'graph_prompter_hf_predictions'],
        num_rows: 56469
    })
    val: Dataset({
        features: ['vanilla_hidden_states_original_shape', 'vanilla_predictions', 'vanilla_hidden_states', 'split', 'vanilla_logits', 'prompt_hidden_states', 'prompt_hidden_states_original_shape', 'prompt_logits', 'prompt_predictions', 'graph_prompter_hf_hidden_states', 'graph_prompter_hf_logits', 'graph_prompter_hf_hidden_states_original_shape', 'graph_prompter_hf_predictions'],
        num_rows: 34284
    })
    test: Dataset({
        features: ['vanilla_hidden_states_original_shape', 'vanilla_predictions', 'vanilla_hidden_

In [12]:
# Write the combined dataset to a local directory.
hidden_states_dataset_path = "./data/dataset_hidden_states_0.hf"
dataset.save_to_disk(hidden_states_dataset_path)
# Optional upload, left disabled:
# dataset.push_to_hub("AhmadPython/MovieLens_KGE")

Saving the dataset (0/7 shards):   0%|          | 0/56469 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/34284 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/34284 [00:00<?, ? examples/s]