# LM-based models inference

> Inference for LM-based models

- title-block-banner: true

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

os.chdir("..")
from typing import Any, Dict

import pytorch_lightning as pl
from embeddings.config.lightning_config import LightningAdvancedConfig
from embeddings.defaults import DATASET_PATH, RESULTS_PATH
from embeddings.model.lightning_module.text_classification import (
    TextClassificationModule,
)
from embeddings.pipeline.hf_preprocessing_pipeline import (
    HuggingFacePreprocessingPipeline,
)
from embeddings.pipeline.lightning_classification import LightningClassificationPipeline
from embeddings.task.lightning_task.text_classification import TextClassificationTask
from embeddings.utils.utils import build_output_path

In [None]:
# Identifiers of the pretrained embedding model and of the dataset used
# throughout this notebook (presumably Hugging Face Hub names -- confirm).
embedding_name_or_path = "hf-internal-testing/tiny-albert"
dataset_name = "clarin-pl/polemo2-official"

# Paths built from the model/dataset pair: `dataset_path` is derived from
# DATASET_PATH and is passed to `preprocess_data`; `output_path` is derived from
# RESULTS_PATH and is used as the pipeline output and checkpoint location.
dataset_path = build_output_path(DATASET_PATH, embedding_name_or_path, dataset_name)
output_path = build_output_path(RESULTS_PATH, embedding_name_or_path, dataset_name)

### Preprocess and downsample data

In [None]:
def preprocess_data(path: str) -> Dict[str, Any]:
    """Run the preprocessing pipeline and describe the persisted dataset.

    The pipeline is built for the notebook-level ``dataset_name``, restricted
    to the "hotels" and "medicine" domains for every split, downsampled, and
    persisted under ``path``.

    Args:
        path: Location handed to the pipeline as ``persist_path`` and echoed
            back as ``dataset_name_or_path``.

    Returns:
        Keyword arguments for a downstream pipeline: the dataset location,
        the input column names and the target column name.
    """
    # The same two domains are requested for train, dev and test.
    loader_options = {
        "train_domains": ["hotels", "medicine"],
        "dev_domains": ["hotels", "medicine"],
        "test_domains": ["hotels", "medicine"],
        "text_cfg": "text",
    }

    preprocessing = HuggingFacePreprocessingPipeline(
        dataset_name=dataset_name,
        load_dataset_kwargs=loader_options,
        persist_path=path,
        sample_missing_splits=None,
        ignore_test_subset=False,
        # presumably (train, dev, test) fractions -- confirm against the pipeline docs
        downsample_splits=(0.01, 0.01, 0.05),
        seed=441,
    )
    preprocessing.run()

    dataset_description: Dict[str, Any] = {
        "dataset_name_or_path": path,
        "input_column_name": ["text"],
        "target_column_name": "target",
    }
    return dataset_description


# Preprocess the data once and keep the returned dataset arguments; they are
# unpacked into the classification pipeline constructor later on.
dataset_kwargs = preprocess_data(dataset_path)

### Train simple downsampled pipeline

In [None]:
# Configuration for a short demonstration run on the downsampled data.
config = LightningAdvancedConfig(
    # NOTE(review): 0 presumably means no encoder layers are fine-tuned -- confirm.
    finetune_last_n_layers=0,
    # Train for a single epoch with deterministic mode requested.
    task_train_kwargs={"max_epochs": 1, "deterministic": True,},
    # Optimisation settings: AdamW with a scheduler and 100 warmup steps.
    task_model_kwargs={
        "learning_rate": 5e-4,
        "train_batch_size": 32,
        "eval_batch_size": 32,
        "use_scheduler": True,
        "optimizer": "AdamW",
        "adam_epsilon": 1e-8,
        "warmup_steps": 100,
        "weight_decay": 0.0,
    },
    datamodule_kwargs={"max_seq_length": 64,},
    # Early stopping watches "val/Loss" (lower is better). With max_epochs=1
    # the patience of 3 is presumably never exhausted.
    early_stopping_kwargs={"monitor": "val/Loss", "mode": "min", "patience": 3,},
    # The remaining option groups are left empty.
    tokenizer_kwargs={},
    batch_encoding_kwargs={},
    dataloader_kwargs={},
    model_config_kwargs={},
)

In [None]:
# Build the classification pipeline on CPU. `dataset_kwargs` supplies the
# dataset location plus the input/target column names.
pipeline = LightningClassificationPipeline(
    embedding_name_or_path=embedding_name_or_path,
    output_path=output_path,
    config=config,
    devices="auto",
    accelerator="cpu",
    **dataset_kwargs
)
# Run the pipeline; the checkpoint loaded later in the notebook is read from
# under `output_path`.
result = pipeline.run()

### Load model from checkpoint automatically generated with Trainer

In [None]:
# Location of the "last.ckpt" file inside the "checkpoints" directory of the
# pipeline output path.
ckpt_path = output_path / "checkpoints" / "last.ckpt"
# A bare expression as the last line of a cell displays its value in the notebook.
ckpt_path

In [None]:
# Restore the whole task from the checkpoint file, reusing the same output path.
task_from_ckpt = TextClassificationTask.from_checkpoint(
    checkpoint_path=ckpt_path, output_path=output_path,
)

#### Alternatively, we can load the model itself (the `LightningModule`) directly

In [None]:
# Load the Lightning module directly from the checkpoint; the path is converted
# to a plain string before being handed to `load_from_checkpoint`.
model_from_ckpt = TextClassificationModule.load_from_checkpoint(str(ckpt_path))

A warning appears when loading the model; however, it was verified that the loaded weights are identical to the weights that were saved. The warning occurs because, when the `model_state_dict` keys are loaded from the cached Hugging Face model, some of them (`cls.(...)`) do not match the keys in the `state_dict` of the saved model weights.

https://github.com/CLARIN-PL/embeddings/issues/225

### Use task from checkpoint for predictions

`return_names` needs to be set to `False` because it uses the `datamodule` to retrieve the names, and the datamodule has not been attached to the `Trainer` in the `LightningTask`, since we have not fitted it yet.

In [None]:
# Reuse the test split prepared by the trained pipeline's datamodule.
test_dataloader = pipeline.datamodule.test_dataloader()
# The task restored from the checkpoint has no datamodule attached to its
# Trainer, so class names cannot be looked up yet: disable `return_names`
# explicitly instead of relying on the default.
preds = task_from_ckpt.predict(test_dataloader, return_names=False)
preds

Alternatively, we can manually assign the `datamodule` to the `Trainer` in the `LightningTask`, after which `return_names=True` can be used.

In [None]:
# Attach the trained pipeline's datamodule to the restored task's Trainer so
# that predictions can be returned together with names.
task_from_ckpt.trainer.datamodule = pipeline.datamodule
preds_with_names = task_from_ckpt.predict(test_dataloader, return_names=True)
preds_with_names

We can also use the previously loaded Lightning model (`LightningModule`) outside of the task and get the predictions. To do this, we also need to initialize a `Trainer`.

In [None]:
# A standalone Trainer rooted at the output path (passed as a string); every
# other Trainer option is left at its default.
trainer = pl.Trainer(default_root_dir=str(output_path))
# Predict with the module loaded directly from the checkpoint, on the same
# test dataloader used above.
preds_from_model = trainer.predict(model_from_ckpt, dataloaders=test_dataloader)
preds_from_model