### Validation functions

In [None]:
def _validate_load_data(data_file_path: str) -> bool:
    """Check that a data file can be loaded with ``datasets.load_dataset``.

    Validation is skipped (a warning is printed and ``True`` is returned) when
    ``data_file_path`` is a directory, or when its extension is not one of
    ``json``, ``jsonl``, ``parquet``, ``csv`` or ``tsv``. The extension is
    matched case-insensitively, so ``data.JSON`` is validated like
    ``data.json``.

    Args:
        data_file_path: Path of the data file (or directory) to check.

    Returns:
        ``True`` when the file loads successfully or validation is skipped.

    Raises:
        ValueError: If the loader raises while reading the file. The
            underlying exception is attached as ``__cause__``.
    """
    from pathlib import Path

    data_path = Path(data_file_path)
    if data_path.is_dir():
        print(
            f"WARNING! Directory, {data_file_path} is passed as input. Skipping validation!"
        )
        return True

    # File extension -> keyword arguments forwarded to ``load_dataset``.
    format_load_kwargs = {
        "json": {"path": "json"},
        "parquet": {"path": "parquet"},
        "csv": {"path": "csv"},
        "tsv": {"path": "csv", "sep": "\t"},
        "jsonl": {"path": "json"},
    }

    # ``suffix`` includes the leading dot; drop it and ignore letter case.
    load_kwargs = format_load_kwargs.get(data_path.suffix[1:].lower())
    if load_kwargs is None:
        print("WARNING! Unable to find the file format. Skipping validation.")
        return True

    # Imported only when a loader is actually needed, so the skip paths above
    # neither pay for nor depend on the ``datasets`` package. Kept outside the
    # ``try`` so a missing package surfaces as ImportError, not as a data error.
    from datasets.load import load_dataset

    try:
        load_dataset(
            data_files={"train": data_file_path}, split="train", **load_kwargs
        )
    except Exception as err:
        # Chain the cause so the user can see why the dataset failed to load.
        raise ValueError(
            "WARNING! Error while loading the dataset.\n"
            "Submitting the finetune job with this dataset is going to fail the run. Please fix the dataset and submit again."
        ) from err
    return True


def _validate_finetune_data(
    train_file_path: str, validation_file_path: str, test_file_path: str
):
    """Validate the train, validation and test data files selected by the user.

    Each path has any ``azureml:`` marker removed and is then passed to
    ``_validate_load_data``. The files are checked in the order train,
    validation, test, and a progress line naming the split is printed before
    each check.

    Args:
        train_file_path: Path of the training data.
        validation_file_path: Path of the validation data.
        test_file_path: Path of the test data.

    Raises:
        ValueError: Propagated from ``_validate_load_data`` when a file
            cannot be loaded.
    """
    labelled_paths = (
        ("train", train_file_path),
        ("validation", validation_file_path),
        ("test", test_file_path),
    )
    for split_name, file_path in labelled_paths:
        # Name the split actually being checked so a failure can be attributed.
        print(f"Validating {split_name} file path")
        _validate_load_data(file_path.replace("azureml:", ""))

In [1]:
from azure.ai.ml.entities import AmlCompute
from azure.ai.ml.constants._compute import ComputeType
from azure.ai.ml import MLClient


def _validate_finetune_compute(
    compute_name: str, finetune_nodes: str, ws_ml_client: MLClient
):
    """Validate that the compute exists and can scale to the requested nodes.

    Only compute instances and AmlCompute clusters are checked for capacity;
    any other compute type passes once it has been found.

    Args:
        compute_name: Name of the compute target to look up in the workspace.
        finetune_nodes: Number of nodes requested for finetuning; converted
            with ``int()`` before comparison.
        ws_ml_client: Client used to fetch the compute by name.

    Raises:
        ValueError: If the compute cannot be fetched (the underlying error is
            attached as ``__cause__``), if more than one node is requested on
            a compute instance, or if the cluster's ``max_instances`` is
            smaller than the requested node count.
    """
    # check if compute exists
    try:
        compute = ws_ml_client.compute.get(compute_name)
    except Exception as err:
        # Keep the original failure attached; it may be an auth or network
        # problem rather than a missing compute.
        raise ValueError(f"Couldn't find compute with name: {compute_name}") from err

    # A compute instance is a single machine, so it can serve at most one node.
    if compute.type == ComputeType.COMPUTEINSTANCE and int(finetune_nodes) > 1:
        raise ValueError(
            f"Finetune nodes requested: {finetune_nodes}. Max nodes compute, {compute_name} can scale to: 1. "
            f"Please create a compute cluster with the max_instances parameter set to at least {finetune_nodes} while creating the compute."
        )

    # A cluster can scale only up to its configured max_instances.
    if compute.type == ComputeType.AMLCOMPUTE and compute.max_instances < int(
        finetune_nodes
    ):
        raise ValueError(
            f"Finetune nodes requested: {finetune_nodes}. Max nodes compute, {compute_name} can scale to: {compute.max_instances}. "
            f"Please set the max_instances parameter to at least {finetune_nodes} while creating the compute."
        )

In [None]:
from azure.ai.ml.entities import PipelineJob
from typing import Dict, Any


def _get_param_value(pipeline_job_dict: Dict[str, Any], param_name: str):
    """Return the value of ``param_name`` for the first job in the pipeline.

    The value set in the job's ``inputs`` wins when it is present and not
    ``None``; otherwise the ``default`` declared under the job's component
    inputs is returned.

    Args:
        pipeline_job_dict: Dictionary form of the pipeline job; must contain a
            non-empty ``"jobs"`` mapping.
        param_name: Name of the input parameter to look up.

    Raises:
        KeyError: If the parameter is neither overridden nor declared on the
            component, or an expected key is missing.
    """
    all_jobs = pipeline_job_dict["jobs"]
    # Only the first job in the mapping is consulted.
    first_job = all_jobs[next(iter(all_jobs))]

    user_value = first_job["inputs"].get(param_name)
    if user_value is None:
        # Not customized by the user: fall back to the component's default.
        return first_job["component"]["inputs"][param_name]["default"]
    return user_value


def validate_pipeline(ft_pipeline: PipelineJob, ws_ml_client: MLClient):
    """Validate the compute and data settings of a user-created pipeline.

    The pipeline is converted to a dictionary, then two checks run in order:
    the finetune compute is validated against the requested node count, and
    the train/validation/test data paths are validated.

    Args:
        ft_pipeline: The pipeline job to validate.
        ws_ml_client: Client passed through to the compute validation.
    """
    job_spec = ft_pipeline._to_dict()

    def read_param(name: str):
        # Resolve a parameter from the pipeline dictionary.
        return _get_param_value(job_spec, name)

    # Compute check: the target must exist and support the requested nodes.
    _validate_finetune_compute(
        compute_name=read_param("compute_finetune"),
        finetune_nodes=read_param("num_nodes_finetune"),
        ws_ml_client=ws_ml_client,
    )

    # Data check: each data input carries its location under the "path" key.
    train_path, validation_path, test_path = (
        read_param(input_name)["path"]
        for input_name in ("train_file_path", "validation_file_path", "test_file_path")
    )
    _validate_finetune_data(
        train_file_path=train_path,
        validation_file_path=validation_path,
        test_file_path=test_path,
    )