In [1]:
from llm_manager import (
    SequenceClassifierOutputOverRanges,
    EmbeddingBasedClassifier,
    ClassifierBase,
    ID2LABEL,
    LABEL2ID,
    MODEL_NAME,
)
from dataset_manager import ROOT, MovieLensManager, INPUT_EMBEDS_REPLACE_KGE_DIMENSION
from graph_representation_generator import GraphRepresentationGenerator
from typing import Optional, Union, Tuple
import os

import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import BertForSequenceClassification

# Customize Input Embeds Replace Classifier
In our first experiments we noticed that replacing the placeholders in the input embeddings with the KGEs breaks the gradient propagation all the way back. To keep things consistent, we freeze this entire step so that only the attention heads are trained.

In [2]:
class InputEmbedsReplaceFrozenBertForSequenceClassification(
    BertForSequenceClassification
):
    """BERT sequence classifier that injects graph embeddings into frozen input embeddings.

    The input embeddings are produced by ``self.bert.embeddings`` without gradient
    tracking, two positions per sequence are overwritten with the supplied graph
    embeddings (KGEs), and the result is fed to ``self.bert`` via ``inputs_embeds``.
    """

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        graph_embeddings: Optional[torch.Tensor] = None,
        semantic_positional_encoding: Optional[torch.Tensor] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputOverRanges]:
        r"""
        Run the classifier on (optionally KGE-augmented) input embeddings.

        input_ids (`torch.Tensor`, *optional*):
            Token ids. Only used to compute the input embeddings when `inputs_embeds` is not given.
        inputs_embeds (`torch.Tensor`, *optional*):
            Precomputed input embeddings. If given, `input_ids` is ignored and no freezing is applied here.
        attention_mask (`torch.Tensor`, *optional*):
            2-D mask; its row sums are used to locate the placeholder positions, so it is required for the
            graph embeddings to be inserted. Presumably right-padded with ones for real tokens - TODO confirm.
        graph_embeddings (`torch.Tensor`, *optional*):
            Embeddings written into the two placeholder positions of every sequence. Assumed to be of shape
            `(batch_size, 2, config.hidden_size)` - TODO confirm against the dataset.
        semantic_positional_encoding (`torch.Tensor`, *optional*):
            Not used in the computation; passed through unchanged to the returned output object.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:
            If `return_dict` is falsy, a tuple `(loss, logits, *extra)` (without `loss` when no labels are
            given), where `extra` is `outputs[2:]` of the BERT call. Otherwise a
            `SequenceClassifierOutputOverRanges` with loss, logits, hidden states, attentions and the
            untouched `semantic_positional_encoding`.

        Raises:
            ValueError: if neither `input_ids` nor `inputs_embeds` is provided.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        if inputs_embeds is None:
            if input_ids is None:
                # Fail early with a clear message instead of an obscure error inside the embedding layer.
                raise ValueError(
                    "You have to specify either input_ids or inputs_embeds"
                )
            # CHANGES: we freeze the gradients when producing input embeddings and detach them
            # before passing them on, just to be safe.
            with torch.no_grad():
                inputs_embeds = self.bert.embeddings(input_ids).detach()
            assert isinstance(inputs_embeds, torch.Tensor)
        # NOTE: without an attention_mask the placeholder positions cannot be derived,
        # so graph_embeddings are left out in that case.
        if (
            graph_embeddings is not None
            and len(graph_embeddings) > 0
            and attention_mask is not None
        ):
            # Index of the last attended token of every sequence, shape (batch,).
            last_token_index = attention_mask.to(self.device).sum(dim=1) - 1
            # Two target positions per sequence: 3 and 1 tokens before the last attended token
            # (the last positions between the sep tokens), shape (batch, 2).
            placeholder_positions = last_token_index.unsqueeze(1).repeat(
                (1, 2)
            ) - torch.tensor([3, 1], device=self.device)
            # Repeat along the hidden dimension so the indices can be used in scatter.
            mask = placeholder_positions.unsqueeze(2).repeat(
                (1, 1, self.config.hidden_size)
            )
            # Replace the input embeds at the placeholder positions with the KGEs.
            # scatter requires src and target to share a dtype, so align the KGEs first.
            inputs_embeds = inputs_embeds.to(self.device).scatter(
                1,
                mask,
                graph_embeddings.to(device=self.device, dtype=inputs_embeds.dtype),
            )
        outputs = self.bert(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )  # feed forward the input embeds to the attention model

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            # Infer the problem type once from the label count and label dtype.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputOverRanges(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            semantic_positional_encoding=semantic_positional_encoding,
        )

Next we define the new Classifier the same way the original was defined, with the exception of the model we are loading, which is the new frozen model.

In [3]:
class InputEmbedsReplaceFrozenClassifier(EmbeddingBasedClassifier):
    """Embedding-based classifier backed by the frozen input-embeds-replace BERT model."""

    def __init__(
        self,
        kge_manager,
        get_embedding_cb,
        root_path,
        model_name=MODEL_NAME,
        model_max_length=256,
        false_ratio=1.0,
        force_recompute=False,
    ) -> None:
        """Load the frozen model and hand everything over to the base classifier.

        The weights come from ``{root_path}/training/best`` when that path exists and
        ``force_recompute`` is false; otherwise the pretrained ``model_name`` checkpoint
        is loaded. All constructor arguments are forwarded unchanged to
        ``EmbeddingBasedClassifier.__init__`` together with the loaded model.
        """
        training_path = f"{root_path}/training"
        model_path = f"{training_path}/best"

        # Prefer an already trained checkpoint unless a recompute is forced.
        reuse_trained = os.path.exists(model_path) and not force_recompute
        checkpoint = model_path if reuse_trained else model_name
        model = InputEmbedsReplaceFrozenBertForSequenceClassification.from_pretrained(
            checkpoint,
            num_labels=2,
            id2label=ID2LABEL,
            label2id=LABEL2ID,
        )
        assert isinstance(model, BertForSequenceClassification)

        super().__init__(
            kge_manager,
            get_embedding_cb,
            root_path,
            model,
            model_name,
            model_max_length,
            false_ratio,
            force_recompute,
        )

    def plot_training_loss_and_accuracy(self):
        """Plot training loss and accuracy under the "Input Embeds Replace" label."""
        self._plot_training_loss_and_accuracy("Input Embeds Replace")

Now comes the whole training procedure (see training_models.ipynb)

In [4]:
# Training hyper-parameters used by the cells below.
EPOCHS = 20
BATCH_SIZE = 256

kg_manager = MovieLensManager()

# Graph representation generator sized for the input-embeds-replace setting.
graph_representation_generator_input_embeds_replace = GraphRepresentationGenerator(
    kg_manager.data,
    kg_manager.gnn_train_data,
    kg_manager.gnn_val_data,
    kg_manager.gnn_test_data,
    hidden_channels=INPUT_EMBEDS_REPLACE_KGE_DIMENSION,
    kge_dimension=INPUT_EMBEDS_REPLACE_KGE_DIMENSION,
)

# Reuse saved embeddings when available; otherwise generate them and mark them for saving.
input_embeds_replace_embeddings = (
    graph_representation_generator_input_embeds_replace.get_saved_embeddings(
        "input_embeds_replace"
    )
)
save = input_embeds_replace_embeddings is None
if save:
    input_embeds_replace_embeddings = (
        graph_representation_generator_input_embeds_replace.generate_embeddings(
            kg_manager.llm_df
        )
    )

kg_manager.append_input_embeds_replace_graph_embeddings(
    input_embeds_replace_embeddings, save=save
)

loading pretrained model
Device: 'cuda'


In [5]:
# Root directory for everything the frozen input-embeds-replace classifier reads and writes.
INPUT_EMBEDS_REPLACE_FROZEN_ROOT = f"{ROOT}/llm/input_embeds_replace_frozen"
# Make sure the dir exists; exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(INPUT_EMBEDS_REPLACE_FROZEN_ROOT, exist_ok=True)

# Classifier used for training (default false_ratio).
input_embeds_replace_frozen_bert_classifier = InputEmbedsReplaceFrozenClassifier(
    kg_manager,
    graph_representation_generator_input_embeds_replace.get_embedding,
    root_path=INPUT_EMBEDS_REPLACE_FROZEN_ROOT,
)
# Build the dataset with the classifier's tokenizer; force_recompute=True always rebuilds it.
dataset_input_embeds_replace = (
    kg_manager.generate_input_embeds_replace_embedding_dataset(
        input_embeds_replace_frozen_bert_classifier.tokenizer.sep_token,
        input_embeds_replace_frozen_bert_classifier.tokenizer.pad_token,
        input_embeds_replace_frozen_bert_classifier.tokenize_function,
        force_recompute=True,
    )
)
input_embeds_replace_frozen_bert_classifier.train_model_on_data(
    dataset_input_embeds_replace, epochs=EPOCHS, batch_size=BATCH_SIZE
)

# Re-create the classifier after training. We don't have to generate new non-existing
# edges now, because they were already generated in the beginning.
input_embeds_replace_frozen_bert_classifier = InputEmbedsReplaceFrozenClassifier(
    kg_manager,
    graph_representation_generator_input_embeds_replace.get_embedding,
    root_path=INPUT_EMBEDS_REPLACE_FROZEN_ROOT,
    false_ratio=-1.0,  # init with false_ratio of -1 so no new false edges are produced on the fly
)
# Forward the dataset through the classifier once and keep the resulting dataframe.
input_embeds_replace_frozen_df = (
    input_embeds_replace_frozen_bert_classifier.forward_dataset_and_save_outputs(
        dataset_input_embeds_replace,
        kg_manager.get_vanilla_tokens_as_df,
        epochs=1,
        batch_size=BATCH_SIZE,
        force_recompute=False,
    )
)

Some weights of InputEmbedsReplaceFrozenBertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/56469 [00:00<?, ? examples/s]

Map:   0%|          | 0/34284 [00:00<?, ? examples/s]

Map:   0%|          | 0/34284 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/56469 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/34284 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/34284 [00:00<?, ? examples/s]

  0%|          | 0/4420 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.6929, 'grad_norm': 0.4582357704639435, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.05}
{'loss': 0.6937, 'grad_norm': 0.40128108859062195, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.09}
{'loss': 0.6922, 'grad_norm': 0.41317835450172424, 'learning_rate': 3e-06, 'epoch': 0.14}
{'loss': 0.6912, 'grad_norm': 0.3376091420650482, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.18}
{'loss': 0.6907, 'grad_norm': 0.27966317534446716, 'learning_rate': 5e-06, 'epoch': 0.23}
{'loss': 0.6898, 'grad_norm': 0.653462827205658, 'learning_rate': 6e-06, 'epoch': 0.27}
{'loss': 0.6895, 'grad_norm': 0.29140666127204895, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.32}
{'loss': 0.6892, 'grad_norm': 0.597112774848938, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.36}
{'loss': 0.6838, 'grad_norm': 0.5333534479141235, 'learning_rate': 9e-06, 'epoch': 0.41}
{'loss': 0.686, 'grad_norm': 0.45640063285827637, 'learning_rate': 1e-05, 'epoch': 0.45}
{'loss': 0.6845, 'grad_nor

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.5134672522544861, 'eval_accuracy': 0.7898144907245362, 'eval_runtime': 30.5805, 'eval_samples_per_second': 1121.106, 'eval_steps_per_second': 4.382, 'epoch': 1.0}
{'loss': 0.5402, 'grad_norm': 0.8751072287559509, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.04}
{'loss': 0.5279, 'grad_norm': 0.8203177452087402, 'learning_rate': 2.4e-05, 'epoch': 1.09}
{'loss': 0.5142, 'grad_norm': 0.7455928921699524, 'learning_rate': 2.5e-05, 'epoch': 1.13}
{'loss': 0.5069, 'grad_norm': 0.7852235436439514, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.18}
{'loss': 0.4934, 'grad_norm': 0.8675952553749084, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.22}
{'loss': 0.4865, 'grad_norm': 0.7708192467689514, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.27}
{'loss': 0.4826, 'grad_norm': 0.7010336518287659, 'learning_rate': 2.9e-05, 'epoch': 1.31}
{'loss': 0.4763, 'grad_norm': 0.6503818035125732, 'learning_rate': 3e-05, 'epoch': 1.36}
{'loss': 0.4645, 'grad_norm': 0.763

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.39485082030296326, 'eval_accuracy': 0.8309707152024268, 'eval_runtime': 30.5287, 'eval_samples_per_second': 1123.009, 'eval_steps_per_second': 4.389, 'epoch': 2.0}
{'loss': 0.4251, 'grad_norm': 0.9167351126670837, 'learning_rate': 4.5e-05, 'epoch': 2.04}
{'loss': 0.4098, 'grad_norm': 0.7703631520271301, 'learning_rate': 4.600000000000001e-05, 'epoch': 2.08}
{'loss': 0.4155, 'grad_norm': 0.8481701016426086, 'learning_rate': 4.7e-05, 'epoch': 2.13}
{'loss': 0.3988, 'grad_norm': 0.8063255548477173, 'learning_rate': 4.8e-05, 'epoch': 2.17}
{'loss': 0.396, 'grad_norm': 0.6711715459823608, 'learning_rate': 4.9e-05, 'epoch': 2.22}
{'loss': 0.3949, 'grad_norm': 0.8976175785064697, 'learning_rate': 5e-05, 'epoch': 2.26}
{'loss': 0.3952, 'grad_norm': 1.729543685913086, 'learning_rate': 4.987244897959184e-05, 'epoch': 2.31}
{'loss': 0.4157, 'grad_norm': 0.8729920983314514, 'learning_rate': 4.974489795918368e-05, 'epoch': 2.35}
{'loss': 0.4061, 'grad_norm': 0.6452743411064148, 'lea

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.3448346257209778, 'eval_accuracy': 0.855442772138607, 'eval_runtime': 31.1751, 'eval_samples_per_second': 1099.725, 'eval_steps_per_second': 4.298, 'epoch': 3.0}
{'loss': 0.3721, 'grad_norm': 0.9145734906196594, 'learning_rate': 4.783163265306123e-05, 'epoch': 3.03}
{'loss': 0.3676, 'grad_norm': 0.725443959236145, 'learning_rate': 4.7704081632653066e-05, 'epoch': 3.08}
{'loss': 0.3926, 'grad_norm': 0.7349307537078857, 'learning_rate': 4.7576530612244904e-05, 'epoch': 3.12}
{'loss': 0.3521, 'grad_norm': 0.7026703953742981, 'learning_rate': 4.744897959183674e-05, 'epoch': 3.17}
{'loss': 0.3618, 'grad_norm': 0.7656220197677612, 'learning_rate': 4.732142857142857e-05, 'epoch': 3.21}
{'loss': 0.372, 'grad_norm': 0.6938208937644958, 'learning_rate': 4.719387755102041e-05, 'epoch': 3.26}
{'loss': 0.3655, 'grad_norm': 0.697308361530304, 'learning_rate': 4.706632653061225e-05, 'epoch': 3.3}
{'loss': 0.3702, 'grad_norm': 0.843809962272644, 'learning_rate': 4.6938775510204086e-05,

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.32922351360321045, 'eval_accuracy': 0.8586804340217011, 'eval_runtime': 30.6781, 'eval_samples_per_second': 1117.539, 'eval_steps_per_second': 4.368, 'epoch': 4.0}
{'loss': 0.3462, 'grad_norm': 0.8350036144256592, 'learning_rate': 4.502551020408164e-05, 'epoch': 4.03}
{'loss': 0.3423, 'grad_norm': 0.7518599629402161, 'learning_rate': 4.4897959183673474e-05, 'epoch': 4.07}
{'loss': 0.3834, 'grad_norm': 0.8655911684036255, 'learning_rate': 4.477040816326531e-05, 'epoch': 4.12}
{'loss': 0.3587, 'grad_norm': 0.8432542681694031, 'learning_rate': 4.464285714285715e-05, 'epoch': 4.16}
{'loss': 0.3465, 'grad_norm': 0.7233604788780212, 'learning_rate': 4.451530612244898e-05, 'epoch': 4.21}
{'loss': 0.3334, 'grad_norm': 0.674627959728241, 'learning_rate': 4.438775510204082e-05, 'epoch': 4.25}
{'loss': 0.3504, 'grad_norm': 0.8079975247383118, 'learning_rate': 4.4260204081632656e-05, 'epoch': 4.3}
{'loss': 0.3384, 'grad_norm': 0.6328087449073792, 'learning_rate': 4.4132653061224493

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.3201815187931061, 'eval_accuracy': 0.8648640765371602, 'eval_runtime': 30.7996, 'eval_samples_per_second': 1113.131, 'eval_steps_per_second': 4.351, 'epoch': 5.0}
{'loss': 0.3582, 'grad_norm': 0.715366542339325, 'learning_rate': 4.2219387755102045e-05, 'epoch': 5.02}
{'loss': 0.3533, 'grad_norm': 0.5962766408920288, 'learning_rate': 4.209183673469388e-05, 'epoch': 5.07}
{'loss': 0.3535, 'grad_norm': 0.8110225796699524, 'learning_rate': 4.196428571428572e-05, 'epoch': 5.11}
{'loss': 0.3323, 'grad_norm': 0.8143725991249084, 'learning_rate': 4.183673469387756e-05, 'epoch': 5.16}
{'loss': 0.3298, 'grad_norm': 0.6367934346199036, 'learning_rate': 4.170918367346939e-05, 'epoch': 5.2}
{'loss': 0.3313, 'grad_norm': 0.7643365859985352, 'learning_rate': 4.1581632653061226e-05, 'epoch': 5.25}
{'loss': 0.3521, 'grad_norm': 0.7419686317443848, 'learning_rate': 4.1454081632653064e-05, 'epoch': 5.29}
{'loss': 0.3548, 'grad_norm': 0.7939806580543518, 'learning_rate': 4.13265306122449e-

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.3125484585762024, 'eval_accuracy': 0.8687434371718586, 'eval_runtime': 30.8241, 'eval_samples_per_second': 1112.248, 'eval_steps_per_second': 4.347, 'epoch': 6.0}
{'loss': 0.332, 'grad_norm': 0.8464667797088623, 'learning_rate': 3.9413265306122446e-05, 'epoch': 6.02}
{'loss': 0.3334, 'grad_norm': 0.7309756875038147, 'learning_rate': 3.928571428571429e-05, 'epoch': 6.06}
{'loss': 0.3411, 'grad_norm': 0.8790808916091919, 'learning_rate': 3.915816326530613e-05, 'epoch': 6.11}
{'loss': 0.3328, 'grad_norm': 0.9831762313842773, 'learning_rate': 3.9030612244897965e-05, 'epoch': 6.15}
{'loss': 0.3346, 'grad_norm': 0.6677863597869873, 'learning_rate': 3.8903061224489796e-05, 'epoch': 6.2}
{'loss': 0.3291, 'grad_norm': 0.730955958366394, 'learning_rate': 3.8775510204081634e-05, 'epoch': 6.24}
{'loss': 0.3472, 'grad_norm': 0.819309413433075, 'learning_rate': 3.864795918367347e-05, 'epoch': 6.29}
{'loss': 0.3358, 'grad_norm': 0.7697454690933228, 'learning_rate': 3.852040816326531e-

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.3008391261100769, 'eval_accuracy': 0.8749270796873178, 'eval_runtime': 31.1781, 'eval_samples_per_second': 1099.618, 'eval_steps_per_second': 4.298, 'epoch': 7.0}
{'loss': 0.3271, 'grad_norm': 0.7256224155426025, 'learning_rate': 3.6607142857142853e-05, 'epoch': 7.01}
{'loss': 0.3314, 'grad_norm': 0.7637787461280823, 'learning_rate': 3.64795918367347e-05, 'epoch': 7.06}
{'loss': 0.3311, 'grad_norm': 0.6798861026763916, 'learning_rate': 3.6352040816326536e-05, 'epoch': 7.1}
{'loss': 0.3146, 'grad_norm': 0.8280966281890869, 'learning_rate': 3.622448979591837e-05, 'epoch': 7.15}
{'loss': 0.3143, 'grad_norm': 0.7171550989151001, 'learning_rate': 3.609693877551021e-05, 'epoch': 7.19}
{'loss': 0.3451, 'grad_norm': 0.627884566783905, 'learning_rate': 3.596938775510204e-05, 'epoch': 7.24}
{'loss': 0.3268, 'grad_norm': 0.6408852934837341, 'learning_rate': 3.584183673469388e-05, 'epoch': 7.29}
{'loss': 0.3299, 'grad_norm': 0.6742938160896301, 'learning_rate': 3.571428571428572e-0

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.29197609424591064, 'eval_accuracy': 0.878835608447089, 'eval_runtime': 30.8424, 'eval_samples_per_second': 1111.585, 'eval_steps_per_second': 4.345, 'epoch': 8.0}
{'loss': 0.3204, 'grad_norm': 0.8711879849433899, 'learning_rate': 3.380102040816326e-05, 'epoch': 8.01}
{'loss': 0.3212, 'grad_norm': 0.9114750623703003, 'learning_rate': 3.36734693877551e-05, 'epoch': 8.05}
{'loss': 0.3466, 'grad_norm': 0.6983364224433899, 'learning_rate': 3.354591836734694e-05, 'epoch': 8.1}
{'loss': 0.3183, 'grad_norm': 0.6994694471359253, 'learning_rate': 3.341836734693878e-05, 'epoch': 8.14}
{'loss': 0.3208, 'grad_norm': 0.7548505067825317, 'learning_rate': 3.329081632653062e-05, 'epoch': 8.19}
{'loss': 0.3336, 'grad_norm': 0.7596806287765503, 'learning_rate': 3.316326530612245e-05, 'epoch': 8.24}
{'loss': 0.3175, 'grad_norm': 0.8499863147735596, 'learning_rate': 3.303571428571429e-05, 'epoch': 8.28}
{'loss': 0.3278, 'grad_norm': 0.698805570602417, 'learning_rate': 3.2908163265306125e-05

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.29590144753456116, 'eval_accuracy': 0.8774647065686617, 'eval_runtime': 30.2923, 'eval_samples_per_second': 1131.775, 'eval_steps_per_second': 4.424, 'epoch': 9.0}
{'loss': 0.3145, 'grad_norm': 0.7110249996185303, 'learning_rate': 3.0994897959183676e-05, 'epoch': 9.0}
{'loss': 0.3121, 'grad_norm': 0.74888676404953, 'learning_rate': 3.086734693877551e-05, 'epoch': 9.05}
{'loss': 0.2992, 'grad_norm': 0.783839225769043, 'learning_rate': 3.073979591836735e-05, 'epoch': 9.1}
{'loss': 0.3104, 'grad_norm': 0.7438682913780212, 'learning_rate': 3.061224489795919e-05, 'epoch': 9.14}
{'loss': 0.3343, 'grad_norm': 1.352871060371399, 'learning_rate': 3.0484693877551023e-05, 'epoch': 9.19}
{'loss': 0.335, 'grad_norm': 0.9227076172828674, 'learning_rate': 3.0357142857142857e-05, 'epoch': 9.23}
{'loss': 0.3267, 'grad_norm': 0.775842010974884, 'learning_rate': 3.0229591836734695e-05, 'epoch': 9.28}
{'loss': 0.3164, 'grad_norm': 0.7836957573890686, 'learning_rate': 3.0102040816326533e-05

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.2878313660621643, 'eval_accuracy': 0.880294014700735, 'eval_runtime': 30.9438, 'eval_samples_per_second': 1107.946, 'eval_steps_per_second': 4.33, 'epoch': 10.0}
{'loss': 0.324, 'grad_norm': 0.8487284183502197, 'learning_rate': 2.8061224489795918e-05, 'epoch': 10.05}
{'loss': 0.3234, 'grad_norm': 0.764045000076294, 'learning_rate': 2.7933673469387756e-05, 'epoch': 10.09}
{'loss': 0.2987, 'grad_norm': 0.7852975726127625, 'learning_rate': 2.7806122448979593e-05, 'epoch': 10.14}
{'loss': 0.3061, 'grad_norm': 0.7608056664466858, 'learning_rate': 2.767857142857143e-05, 'epoch': 10.18}
{'loss': 0.2931, 'grad_norm': 0.724790096282959, 'learning_rate': 2.7551020408163265e-05, 'epoch': 10.23}
{'loss': 0.3211, 'grad_norm': 0.7681241631507874, 'learning_rate': 2.7423469387755103e-05, 'epoch': 10.27}
{'loss': 0.312, 'grad_norm': 0.6648139953613281, 'learning_rate': 2.729591836734694e-05, 'epoch': 10.32}
{'loss': 0.2945, 'grad_norm': 0.6836796998977661, 'learning_rate': 2.7168367346

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.2889189124107361, 'eval_accuracy': 0.8805856959514642, 'eval_runtime': 30.4473, 'eval_samples_per_second': 1126.013, 'eval_steps_per_second': 4.401, 'epoch': 11.0}
{'loss': 0.316, 'grad_norm': 0.8163294196128845, 'learning_rate': 2.5255102040816326e-05, 'epoch': 11.04}
{'loss': 0.2926, 'grad_norm': 0.7280205488204956, 'learning_rate': 2.5127551020408164e-05, 'epoch': 11.09}
{'loss': 0.3066, 'grad_norm': 0.9762079119682312, 'learning_rate': 2.5e-05, 'epoch': 11.13}
{'loss': 0.3192, 'grad_norm': 1.0050466060638428, 'learning_rate': 2.487244897959184e-05, 'epoch': 11.18}
{'loss': 0.3227, 'grad_norm': 0.900336742401123, 'learning_rate': 2.4744897959183673e-05, 'epoch': 11.22}
{'loss': 0.3102, 'grad_norm': 0.8947376012802124, 'learning_rate': 2.461734693877551e-05, 'epoch': 11.27}
{'loss': 0.3196, 'grad_norm': 0.7671115398406982, 'learning_rate': 2.448979591836735e-05, 'epoch': 11.31}
{'loss': 0.3149, 'grad_norm': 0.686499834060669, 'learning_rate': 2.4362244897959186e-05, '

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.28472280502319336, 'eval_accuracy': 0.8822191109555477, 'eval_runtime': 30.3801, 'eval_samples_per_second': 1128.503, 'eval_steps_per_second': 4.411, 'epoch': 12.0}
{'loss': 0.2999, 'grad_norm': 1.0480611324310303, 'learning_rate': 2.2448979591836737e-05, 'epoch': 12.04}
{'loss': 0.3133, 'grad_norm': 0.8168999552726746, 'learning_rate': 2.2321428571428575e-05, 'epoch': 12.08}
{'loss': 0.2961, 'grad_norm': 0.8464174866676331, 'learning_rate': 2.219387755102041e-05, 'epoch': 12.13}
{'loss': 0.3265, 'grad_norm': 0.7923605442047119, 'learning_rate': 2.2066326530612247e-05, 'epoch': 12.17}
{'loss': 0.3007, 'grad_norm': 0.7791452407836914, 'learning_rate': 2.193877551020408e-05, 'epoch': 12.22}
{'loss': 0.3147, 'grad_norm': 0.7774122953414917, 'learning_rate': 2.181122448979592e-05, 'epoch': 12.26}
{'loss': 0.3172, 'grad_norm': 1.0869982242584229, 'learning_rate': 2.1683673469387756e-05, 'epoch': 12.31}
{'loss': 0.312, 'grad_norm': 0.8831064701080322, 'learning_rate': 2.15561

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.2822956144809723, 'eval_accuracy': 0.8827733053319332, 'eval_runtime': 30.4009, 'eval_samples_per_second': 1127.728, 'eval_steps_per_second': 4.408, 'epoch': 13.0}
{'loss': 0.3086, 'grad_norm': 0.7135851383209229, 'learning_rate': 1.9642857142857145e-05, 'epoch': 13.03}
{'loss': 0.2954, 'grad_norm': 0.9137508869171143, 'learning_rate': 1.9515306122448983e-05, 'epoch': 13.08}
{'loss': 0.3204, 'grad_norm': 0.8552921414375305, 'learning_rate': 1.9387755102040817e-05, 'epoch': 13.12}
{'loss': 0.2847, 'grad_norm': 0.994759738445282, 'learning_rate': 1.9260204081632655e-05, 'epoch': 13.17}
{'loss': 0.303, 'grad_norm': 0.7316272854804993, 'learning_rate': 1.913265306122449e-05, 'epoch': 13.21}
{'loss': 0.3158, 'grad_norm': 1.142510175704956, 'learning_rate': 1.9005102040816326e-05, 'epoch': 13.26}
{'loss': 0.3221, 'grad_norm': 0.7474606037139893, 'learning_rate': 1.8877551020408164e-05, 'epoch': 13.3}
{'loss': 0.3164, 'grad_norm': 0.8416677713394165, 'learning_rate': 1.8750000

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.2784067988395691, 'eval_accuracy': 0.883590012833975, 'eval_runtime': 30.5623, 'eval_samples_per_second': 1121.774, 'eval_steps_per_second': 4.384, 'epoch': 14.0}
{'loss': 0.3085, 'grad_norm': 0.9444997906684875, 'learning_rate': 1.683673469387755e-05, 'epoch': 14.03}
{'loss': 0.3029, 'grad_norm': 0.734022855758667, 'learning_rate': 1.670918367346939e-05, 'epoch': 14.07}
{'loss': 0.3062, 'grad_norm': 0.7859597206115723, 'learning_rate': 1.6581632653061225e-05, 'epoch': 14.12}
{'loss': 0.3237, 'grad_norm': 0.9414042234420776, 'learning_rate': 1.6454081632653062e-05, 'epoch': 14.16}
{'loss': 0.2921, 'grad_norm': 0.8176825642585754, 'learning_rate': 1.6326530612244897e-05, 'epoch': 14.21}
{'loss': 0.3072, 'grad_norm': 0.9385035037994385, 'learning_rate': 1.6198979591836734e-05, 'epoch': 14.25}
{'loss': 0.3137, 'grad_norm': 0.7723339796066284, 'learning_rate': 1.6071428571428572e-05, 'epoch': 14.3}
{'loss': 0.2845, 'grad_norm': 0.822453498840332, 'learning_rate': 1.59438775

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.27679625153541565, 'eval_accuracy': 0.8847275697118189, 'eval_runtime': 30.8775, 'eval_samples_per_second': 1110.322, 'eval_steps_per_second': 4.34, 'epoch': 15.0}
{'loss': 0.2871, 'grad_norm': 0.9733253717422485, 'learning_rate': 1.4030612244897959e-05, 'epoch': 15.02}
{'loss': 0.3201, 'grad_norm': 1.0517566204071045, 'learning_rate': 1.3903061224489797e-05, 'epoch': 15.07}
{'loss': 0.2968, 'grad_norm': 0.7327494621276855, 'learning_rate': 1.3775510204081633e-05, 'epoch': 15.11}
{'loss': 0.3, 'grad_norm': 0.8263207077980042, 'learning_rate': 1.364795918367347e-05, 'epoch': 15.16}
{'loss': 0.3011, 'grad_norm': 0.8814637660980225, 'learning_rate': 1.3520408163265308e-05, 'epoch': 15.2}
{'loss': 0.3134, 'grad_norm': 0.7117173075675964, 'learning_rate': 1.3392857142857144e-05, 'epoch': 15.25}
{'loss': 0.3067, 'grad_norm': 0.8434823155403137, 'learning_rate': 1.3265306122448982e-05, 'epoch': 15.29}
{'loss': 0.3108, 'grad_norm': 0.6617300510406494, 'learning_rate': 1.3137755

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.2783898115158081, 'eval_accuracy': 0.8844067203360167, 'eval_runtime': 31.0569, 'eval_samples_per_second': 1103.911, 'eval_steps_per_second': 4.315, 'epoch': 16.0}
{'loss': 0.3195, 'grad_norm': 0.8947851657867432, 'learning_rate': 1.1224489795918369e-05, 'epoch': 16.02}
{'loss': 0.2861, 'grad_norm': 0.7487950921058655, 'learning_rate': 1.1096938775510205e-05, 'epoch': 16.06}
{'loss': 0.3075, 'grad_norm': 0.8315370678901672, 'learning_rate': 1.096938775510204e-05, 'epoch': 16.11}
{'loss': 0.3013, 'grad_norm': 0.8027757406234741, 'learning_rate': 1.0841836734693878e-05, 'epoch': 16.15}
{'loss': 0.2971, 'grad_norm': 0.8171247839927673, 'learning_rate': 1.0714285714285714e-05, 'epoch': 16.2}
{'loss': 0.3002, 'grad_norm': 0.8738830089569092, 'learning_rate': 1.0586734693877552e-05, 'epoch': 16.24}
{'loss': 0.3032, 'grad_norm': 0.7531570196151733, 'learning_rate': 1.045918367346939e-05, 'epoch': 16.29}
{'loss': 0.3124, 'grad_norm': 0.7500296235084534, 'learning_rate': 1.03316

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.2763293385505676, 'eval_accuracy': 0.8854567728386419, 'eval_runtime': 30.6132, 'eval_samples_per_second': 1119.911, 'eval_steps_per_second': 4.377, 'epoch': 17.0}
{'loss': 0.294, 'grad_norm': 0.6928896903991699, 'learning_rate': 8.418367346938775e-06, 'epoch': 17.01}
{'loss': 0.3039, 'grad_norm': 0.7257115244865417, 'learning_rate': 8.290816326530612e-06, 'epoch': 17.06}
{'loss': 0.3071, 'grad_norm': 0.8468036651611328, 'learning_rate': 8.163265306122448e-06, 'epoch': 17.1}
{'loss': 0.3026, 'grad_norm': 0.7924095392227173, 'learning_rate': 8.035714285714286e-06, 'epoch': 17.15}
{'loss': 0.2985, 'grad_norm': 0.8559116721153259, 'learning_rate': 7.908163265306124e-06, 'epoch': 17.19}
{'loss': 0.2986, 'grad_norm': 0.9285740256309509, 'learning_rate': 7.78061224489796e-06, 'epoch': 17.24}
{'loss': 0.3054, 'grad_norm': 0.6626217365264893, 'learning_rate': 7.653061224489797e-06, 'epoch': 17.29}
{'loss': 0.2888, 'grad_norm': 0.8077366948127747, 'learning_rate': 7.525510204081

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.2785985469818115, 'eval_accuracy': 0.8845233928363084, 'eval_runtime': 30.2353, 'eval_samples_per_second': 1133.905, 'eval_steps_per_second': 4.432, 'epoch': 18.0}
{'loss': 0.2903, 'grad_norm': 0.8356746435165405, 'learning_rate': 5.612244897959184e-06, 'epoch': 18.01}
{'loss': 0.3091, 'grad_norm': 0.7528868317604065, 'learning_rate': 5.48469387755102e-06, 'epoch': 18.05}
{'loss': 0.3084, 'grad_norm': 0.9889609813690186, 'learning_rate': 5.357142857142857e-06, 'epoch': 18.1}
{'loss': 0.3155, 'grad_norm': 0.8081106543540955, 'learning_rate': 5.229591836734695e-06, 'epoch': 18.14}
{'loss': 0.2939, 'grad_norm': 0.7267455458641052, 'learning_rate': 5.102040816326531e-06, 'epoch': 18.19}
{'loss': 0.2953, 'grad_norm': 0.7956850528717041, 'learning_rate': 4.9744897959183674e-06, 'epoch': 18.24}
{'loss': 0.3071, 'grad_norm': 0.6891839504241943, 'learning_rate': 4.846938775510204e-06, 'epoch': 18.28}
{'loss': 0.3026, 'grad_norm': 0.7114583849906921, 'learning_rate': 4.7193877551

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.27347999811172485, 'eval_accuracy': 0.8857192859642982, 'eval_runtime': 30.174, 'eval_samples_per_second': 1136.208, 'eval_steps_per_second': 4.441, 'epoch': 19.0}
{'loss': 0.3083, 'grad_norm': 0.7870933413505554, 'learning_rate': 2.806122448979592e-06, 'epoch': 19.0}
{'loss': 0.3131, 'grad_norm': 0.7621570825576782, 'learning_rate': 2.6785714285714285e-06, 'epoch': 19.05}
{'loss': 0.298, 'grad_norm': 0.9138600826263428, 'learning_rate': 2.5510204081632653e-06, 'epoch': 19.1}
{'loss': 0.2795, 'grad_norm': 0.8356846570968628, 'learning_rate': 2.423469387755102e-06, 'epoch': 19.14}
{'loss': 0.2872, 'grad_norm': 0.7734460830688477, 'learning_rate': 2.295918367346939e-06, 'epoch': 19.19}
{'loss': 0.3182, 'grad_norm': 0.8826451897621155, 'learning_rate': 2.1683673469387757e-06, 'epoch': 19.23}
{'loss': 0.302, 'grad_norm': 0.7545251846313477, 'learning_rate': 2.040816326530612e-06, 'epoch': 19.28}
{'loss': 0.2917, 'grad_norm': 0.9485779404640198, 'learning_rate': 1.9132653061

  0%|          | 0/134 [00:00<?, ?it/s]

{'eval_loss': 0.27520960569381714, 'eval_accuracy': 0.8859234628398086, 'eval_runtime': 30.26, 'eval_samples_per_second': 1132.981, 'eval_steps_per_second': 4.428, 'epoch': 20.0}
{'train_runtime': 9341.9345, 'train_samples_per_second': 120.894, 'train_steps_per_second': 0.473, 'train_loss': 0.3462586749732764, 'epoch': 20.0}
./data/llm/input_embeds_replace_frozen/attentions.npy ./data/llm/input_embeds_replace_frozen/hidden_states.npy ./data/llm/input_embeds_replace_frozen/tokens.csv
train Forward Epoch 1 from 1




test Forward Epoch 1 from 1
val Forward Epoch 1 from 1


# Post-Training
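After training, we load the attentions and hidden states saved by the earlier experiments (vanilla, prompt and input-embeds-replace) and append them, together with the outputs of the frozen model trained above, as new fields of the dataset.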

In [6]:
# Reload the forward-pass outputs that the earlier experiments stored on disk.
# The paths are visited in order, so each name is bound to its own directory.
vanilla_df, prompt_df, input_embeds_replace_df = map(
    ClassifierBase.read_forward_dataset,
    (
        "./data/llm/vanilla",
        "./data/llm/prompt",
        "./data/llm/input_embeds_replace",
    ),
)

In [7]:
# Display the manager's `llm_df` table as a sanity check before merging:
# its rows carry source_id/target_id, the prompt text, labels, the split,
# and the input_embeds_replace source/target embeddings.
kg_manager.llm_df

Unnamed: 0,source_id,target_id,id_x,id_y,prompt_feature_title,prompt_feature_genres,labels,split,prompt,gnn_feature_(no genres listed),...,gnn_feature_IMAX,gnn_feature_Musical,gnn_feature_Mystery,gnn_feature_Romance,gnn_feature_Sci-Fi,gnn_feature_Thriller,gnn_feature_War,gnn_feature_Western,input_embeds_replace_source_embedding,input_embeds_replace_target_embedding
0,0,5,0.0,5.0,Heat (1995),"['Action', 'Crime', 'Thriller']",1,train,"0[SEP]5[SEP]Heat (1995)[SEP]['Action', 'Crime'...",,...,,,,,,,,,"[0.6693655252456665, 0.4152011275291443, -0.31...","[0.5493870973587036, -0.5692430734634399, -0.6..."
1,0,89,0.0,89.0,Bottle Rocket (1996),"['Adventure', 'Comedy', 'Crime', 'Romance']",1,train,0[SEP]89[SEP]Bottle Rocket (1996)[SEP]['Advent...,,...,,,,,,,,,"[0.6030164957046509, 0.37829291820526123, -0.3...","[0.08589676022529602, -0.4689454734325409, 0.0..."
2,0,97,0.0,97.0,Braveheart (1995),"['Action', 'Drama', 'War']",1,train,"0[SEP]97[SEP]Braveheart (1995)[SEP]['Action', ...",,...,,,,,,,,,"[0.6387443542480469, 0.3706628084182739, -0.32...","[0.04940161854028702, -0.47019216418266296, -0..."
3,0,184,0.0,184.0,Billy Madison (1995),['Comedy'],1,train,0[SEP]184[SEP]Billy Madison (1995)[SEP]['Comedy'],,...,,,,,,,,,"[0.49271589517593384, 0.4562503397464752, -0.2...","[0.6636490821838379, -0.3091672956943512, 0.01..."
4,0,224,0.0,224.0,Star Wars: Episode IV - A New Hope (1977),"['Action', 'Adventure', 'Sci-Fi']",1,train,0[SEP]224[SEP]Star Wars: Episode IV - A New Ho...,,...,,,,,,,,,"[0.6498588919639587, 0.43191879987716675, -0.3...","[-0.08320911973714828, -0.060460180044174194, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125032,79,8846,,,Lovesick (2014),"['Comedy', 'Romance']",0,val,"79[SEP]8846[SEP]Lovesick (2014)[SEP]['Comedy',...",0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,"[-0.5735506415367126, -0.044633716344833374, -...","[0.47228026390075684, -0.911340594291687, 0.00..."
125033,300,2595,,,Dersu Uzala (1975),"['Adventure', 'Drama']",0,val,300[SEP]2595[SEP]Dersu Uzala (1975)[SEP]['Adve...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.006543345749378204, -0.3807082176208496, 0....","[0.36955565214157104, -0.21707147359848022, -0..."
125034,321,59,,,Lawnmower Man 2: Beyond Cyberspace (1996),"['Action', 'Sci-Fi', 'Thriller']",0,val,321[SEP]59[SEP]Lawnmower Man 2: Beyond Cybersp...,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,"[-0.3852957785129547, 0.02625414729118347, -0....","[0.38845473527908325, -0.27163609862327576, -0..."
125035,388,2639,,,All the Vermeers in New York (1990),"['Comedy', 'Drama', 'Romance']",0,val,388[SEP]2639[SEP]All the Vermeers in New York ...,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,"[0.26095038652420044, 0.08665543794631958, -0....","[1.6701771020889282, -0.6786400079727173, 0.87..."


In [8]:
# Merge the per-model forward outputs into one Hugging Face dataset.
# Each label is used as the column prefix for the matching frame
# (e.g. `vanilla_attentions`, `prompt_hidden_states`), so it must be the bare
# model name. The previous fourth label, "input_embeds_replace_frozen_df",
# leaked the variable-name suffix into the column names.
# NOTE(review): consumers that already read the old
# `input_embeds_replace_frozen_df_*` columns must switch to the new prefix.
dataset = kg_manager.generate_huggingface_dataset(
    [vanilla_df, prompt_df, input_embeds_replace_df, input_embeds_replace_frozen_df],
    ["vanilla", "prompt", "input_embeds_replace", "input_embeds_replace_frozen"],
)

In [9]:
# Show the resulting DatasetDict (its splits and feature names) to verify
# that the per-model attention / hidden-state columns were appended.
dataset

DatasetDict({
    train: Dataset({
        features: ['source_id', 'target_id', 'id_x', 'id_y', 'prompt_feature_title', 'prompt_feature_genres', 'labels', 'split', 'prompt', 'gnn_feature_(no genres listed)', 'gnn_feature_Action', 'gnn_feature_Adventure', 'gnn_feature_Animation', 'gnn_feature_Children', 'gnn_feature_Comedy', 'gnn_feature_Crime', 'gnn_feature_Documentary', 'gnn_feature_Drama', 'gnn_feature_Fantasy', 'gnn_feature_Film-Noir', 'gnn_feature_Horror', 'gnn_feature_IMAX', 'gnn_feature_Musical', 'gnn_feature_Mystery', 'gnn_feature_Romance', 'gnn_feature_Sci-Fi', 'gnn_feature_Thriller', 'gnn_feature_War', 'gnn_feature_Western', 'input_embeds_replace_source_embedding', 'input_embeds_replace_target_embedding', 'vanilla_attentions', 'vanilla_hidden_states', 'vanilla_attentions_original_shape', 'vanilla_hidden_states_original_shape', 'prompt_attentions', 'prompt_hidden_states', 'prompt_attentions_original_shape', 'prompt_hidden_states_original_shape', 'input_embeds_replace_attentions

In [10]:
# Persist the merged dataset locally under ./data/dataset.hf.
dataset.save_to_disk("./data/dataset.hf")

Saving the dataset (0/10 shards):   0%|          | 0/56469 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/34284 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/34284 [00:00<?, ? examples/s]

In [11]:
# Upload the merged dataset to the Hugging Face Hub dataset repository
# "AhmadPython/MovieLens_KGE".
# NOTE(review): presumably requires a logged-in Hub account with write access — confirm.
dataset.push_to_hub("AhmadPython/MovieLens_KGE")

Uploading the dataset shards:   0%|          | 0/10 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/AhmadPython/MovieLens_KGE/commit/70c96f660429150e55b6437d59ee39da76891de7', commit_message='Upload dataset', commit_description='', oid='70c96f660429150e55b6437d59ee39da76891de7', pr_url=None, pr_revision=None, pr_num=None)