In [1]:
# Prerequisites

# %pip install --upgrade azure-ai-ml
# %pip install --upgrade azure-identity
# %pip install --upgrade datasets==2.9.0
# %pip install py7zr

In [13]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml.entities import AmlCompute
import time

# Authenticate: start with the default credential chain and switch to an
# interactive browser login when a management-plane token cannot be fetched.
try:
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception:
    credential = InteractiveBrowserCredential()

# Connect to the workspace: a local config file is tried first; if that fails,
# the error is printed and the placeholder values below are used instead.
workspace_ml_client = None
try:
    workspace_ml_client = MLClient.from_config(credential=credential)
    subscription_id = workspace_ml_client.subscription_id
    resource_group = workspace_ml_client.resource_group_name
    workspace = workspace_ml_client.workspace_name
except Exception as config_error:
    print(config_error)
    # Enter details of your AML workspace
    subscription_id = "<SUBSCRIPTION_ID>"
    resource_group = "<RESOURCE_GROUP>"
    workspace = "<AML_WORKSPACE_NAME>"
    workspace_ml_client = MLClient(
        credential=credential,
        subscription_id=subscription_id,
        resource_group_name=resource_group,
        workspace_name=workspace,
    )

# Registry that the NeMo model and pipeline component are fetched from.
# Replace with your own registry name if needed (e.g. "azureml").
nemo_registry = "nvidia-ai"

# Registry where models are present.
model_registry = "azureml-preview-test1"

# One client per registry; both reuse the credential, subscription and
# resource group resolved for the workspace.
nemo_registry_ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    registry_name=nemo_registry,
)
model_registry_ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    registry_name=model_registry,
)

# Last expression of the cell: displays the NeMo registry client.
nemo_registry_ml_client

Found the config file in: /config.json


MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7fd1c959c100>,
         subscription_id=d4d34678-c0d7-4d69-a257-366e3cb4a7d8,
         resource_group_name=registry-builtin-nvidia-eastus,
         workspace_name=None)

## Creating Computes

In [14]:
# Name of the GPU cluster to use. If a compute with this name can be fetched from the
# workspace it is reused; otherwise a new cluster is created under the same name.
# The model will only run on an A100 instance.

compute_cluster = "nemo-westus"
try:
    compute = workspace_ml_client.compute.get(compute_cluster)
    print(f"GPU compute '{compute_cluster}' found.")
except Exception as ex:
    # NOTE(review): every failure of the lookup above (not only "not found") ends up
    # here and triggers creation of a new cluster - consider narrowing the exception.
    print(f"GPU compute '{compute_cluster}' not found. Creating new one.")
    compute = AmlCompute(
        name=compute_cluster,
        size="STANDARD_ND96AMSR_A100_V4",
        max_instances=2,  # For multi-node training set this to an integer value greater than 1
    )
    # Wait on the returned poller before moving on; `compute` stays the local
    # AmlCompute definition, whose `size` is read further down.
    workspace_ml_client.compute.begin_create_or_update(compute).wait()

# Current Unix time in whole seconds, as a string, for names and versions that need to be unique.
timestamp = str(int(time.time()))


# Number of GPUs in a single node of the selected compute's VM size.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
gpus_per_node = 1  # default value, kept when the VM size cannot be looked up
size_matched = False
target_size = compute.size.lower()
ws_computes = workspace_ml_client.compute.list_sizes()
for ws_compute in ws_computes:
    if ws_compute.name.lower() == target_size:
        size_matched = True
        # A missing GPU count is treated like zero so the check below cannot fail on None.
        gpus_per_node = ws_compute.gpus or 0
        print(f"Number of GPUs in compute {ws_compute.name} are {ws_compute.gpus}")

# The count is only "found" when the VM size was actually matched AND it reports GPUs;
# the default of 1 must not be mistaken for a successful lookup.
gpu_count_found = size_matched and gpus_per_node > 0
if not size_matched:
    print(
        f"VM size {compute.size} was not found in the workspace size list; "
        f"keeping the default of {gpus_per_node} GPU(s) per node."
    )
elif not gpu_count_found:
    print(f"No GPUs found in compute. Number of GPUs in compute {compute.size} 0.")

GPU compute 'nemo-westus' found.
Number of GPUs in compute Standard_NC96ads_A100_v4 are 4


## Input Data for Evaluation

In [15]:
# Download the dataset with the Hugging Face `datasets` library: https://pypi.org/project/datasets/
import os
from datasets import load_dataset, get_dataset_split_names
dataset_dir = "pubmed-dataset"
dataset_name = "pubmed_qa"
# Create the download directory; exist_ok makes this a no-op when it is already
# there and avoids the separate check-then-create step.
os.makedirs(dataset_dir, exist_ok=True)

# Split to download.
# NOTE(review): 'train' is presumably the only split published for the
# 'pqa_labeled' configuration - confirm against the dataset card.
split = "train"
dataset = load_dataset(dataset_name, "pqa_labeled", split=split)

# Save the split to the download directory as a JSON Lines file. This stays the
# last expression of the cell, so its return value is shown as the cell output.
dataset.to_json(os.path.join(dataset_dir, f"{split}.jsonl"))


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 27.73ba/s]


2201997

In [5]:
# Display the loaded Dataset's summary (feature names and number of rows).
dataset

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 1000
})

In [6]:
# Load the first rows of ./pubmed-dataset/train.jsonl (the file written by the
# download cell) into a pandas DataFrame.
import pandas as pd

pd.set_option(
    "display.max_colwidth", 0
)  # set the max column width to 0 to display the full text

# Number of records kept for the evaluation sample.
num_eval_rows = 10
df = pd.read_json("./pubmed-dataset/train.jsonl", lines=True).head(num_eval_rows)

In [7]:
def form_question(obj):
    """Build the yes/no/maybe prompt text for one question-answering record.

    Args:
        obj: Mapping-like record (for example a dict or a pandas Series row)
            with a ``'question'`` entry and a ``'context'`` mapping whose
            ``'contexts'`` entry is a sequence of passages.

    Returns:
        str: ``"QUESTION: ..."`` on the first line, then ``"CONTEXT: "``
        followed by every passage terminated by a newline, then the fixed
        ``"TARGET: ..."`` instruction (which ends with a trailing space).
    """
    # Iterate over the passages themselves. Taking the loop bound from the
    # parallel 'labels' list would drop passages (or raise IndexError) whenever
    # the two lists differ in length, and would require 'labels' to exist.
    passages = obj['context']['contexts']
    context_block = "".join(f"{passage}\n" for passage in passages)
    return (
        f"QUESTION: {obj['question']}\n"
        f"CONTEXT: {context_block}"
        "TARGET: the answer to the question given the context is (yes|no|maybe): "
    )

# Turn every loaded record into its prompt string, in row order.
result = [form_question(record) for _, record in df.iterrows()]

# Single-column frame of prompts; the column is named 'input'.
data = pd.DataFrame({'input': result})
data

Unnamed: 0,input
0,"QUESTION: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?\nCONTEXT: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.\nThe following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leaves were stained with the mitochondrial dye MitoTracker Red CMXRos and examined. Mitochondrial dynamics were delineated into four categories (M1-M4) based on characteristics including distribution, motility, and membrane potential (ΔΨm). A TUNEL assay showed fragmented nDNA in a gradient over these mitochondrial stages. Chloroplasts and transvacuolar strands were also examined using live cell imaging. The possible importance of mitochondrial permeability transition pore (PTP) formation during PCD was indirectly examined via in vivo cyclosporine A (CsA) treatment. This treatment resulted in lace plant leaves with a significantly lower number of perforations compared to controls, and that displayed mitochondrial dynamics similar to that of non-PCD cells.\nTARGET: the answer to the question given the context is (yes|no|maybe):"
1,"QUESTION: Landolt C and snellen e acuity: differences in strabismus amblyopia?\nCONTEXT: Assessment of visual acuity depends on the optotypes used for measurement. The ability to recognize different optotypes differs even if their critical details appear under the same visual angle. Since optotypes are evaluated on individuals with good visual acuity and without eye disorders, differences in the lower visual acuity range cannot be excluded. In this study, visual acuity measured with the Snellen E was compared to the Landolt C acuity.\n100 patients (age 8 - 90 years, median 60.5 years) with various eye disorders, among them 39 with amblyopia due to strabismus, and 13 healthy volunteers were tested. Charts with the Snellen E and the Landolt C (Precision Vision) which mimic the ETDRS charts were used to assess visual acuity. Three out of 5 optotypes per line had to be correctly identified, while wrong answers were monitored. In the group of patients, the eyes with the lower visual acuity, and the right eyes of the healthy subjects, were evaluated.\nDifferences between Landolt C acuity (LR) and Snellen E acuity (SE) were small. The mean decimal values for LR and SE were 0.25 and 0.29 in the entire group and 0.14 and 0.16 for the eyes with strabismus amblyopia. The mean difference between LR and SE was 0.55 lines in the entire group and 0.55 lines for the eyes with strabismus amblyopia, with higher values of SE in both groups. The results of the other groups were similar with only small differences between LR and SE.\nTARGET: the answer to the question given the context is (yes|no|maybe):"
2,"QUESTION: Syncope during bathing in infants, a pediatric form of water-induced urticaria?\nCONTEXT: Apparent life-threatening events in infants are a difficult and frequent problem in pediatric practice. The prognosis is uncertain because of risk of sudden infant death syndrome.\nEight infants aged 2 to 15 months were admitted during a period of 6 years; they suffered from similar maladies in the bath: on immersion, they became pale, hypotonic, still and unreactive; recovery took a few seconds after withdrawal from the bath and stimulation. Two diagnoses were initially considered: seizure or gastroesophageal reflux but this was doubtful. The hypothesis of an equivalent of aquagenic urticaria was then considered; as for patients with this disease, each infant's family contained members suffering from dermographism, maladies or eruption after exposure to water or sun. All six infants had dermographism. We found an increase in blood histamine levels after a trial bath in the two infants tested. The evolution of these ""aquagenic maladies"" was favourable after a few weeks without baths. After a 2-7 year follow-up, three out of seven infants continue to suffer from troubles associated with sun or water.\nTARGET: the answer to the question given the context is (yes|no|maybe):"
3,"QUESTION: Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through?\nCONTEXT: The transanal endorectal pull-through (TERPT) is becoming the most popular procedure in the treatment of Hirschsprung disease (HD), but overstretching of the anal sphincters remains a critical issue that may impact the continence. This study examined the long-term outcome of TERPT versus conventional transabdominal (ABD) pull-through for HD.\nRecords of 41 patients more than 3 years old who underwent a pull-through for HD (TERPT, n = 20; ABD, n = 21) were reviewed, and their families were thoroughly interviewed and scored via a 15-item post-pull-through long-term outcome questionnaire. Patients were operated on between the years 1995 and 2003. During this time, our group transitioned from the ABD to the TERPT technique. Total scoring ranged from 0 to 40: 0 to 10, excellent; 11 to 20 good; 21 to 30 fair; 31 to 40 poor. A 2-tailed Student t test, analysis of covariance, as well as logistic and linear regression were used to analyze the collected data with confidence interval higher than 95%.\nOverall scores were similar. However, continence score was significantly better in the ABD group, and the stool pattern score was better in the TERPT group. A significant difference in age at interview between the 2 groups was noted; we therefore reanalyzed the data controlling for age, and this showed that age did not significantly affect the long-term scoring outcome between groups.\nTARGET: the answer to the question given the context is (yes|no|maybe):"
4,"QUESTION: Can tailored interventions increase mammography use among HMO women?\nCONTEXT: Telephone counseling and tailored print communications have emerged as promising methods for promoting mammography screening. However, there has been little research testing, within the same randomized field trial, of the efficacy of these two methods compared to a high-quality usual care system for enhancing screening. This study addressed the question: Compared to usual care, is tailored telephone counseling more effective than tailored print materials for promoting mammography screening?\nThree-year randomized field trial.\nOne thousand ninety-nine women aged 50 and older recruited from a health maintenance organization in North Carolina.\nWomen were randomized to 1 of 3 groups: (1) usual care, (2) tailored print communications, and (3) tailored telephone counseling.\nAdherence to mammography screening based on self-reports obtained during 1995, 1996, and 1997.\nCompared to usual care alone, telephone counseling promoted a significantly higher proportion of women having mammograms on schedule (71% vs 61%) than did tailored print (67% vs 61%) but only after the first year of intervention (during 1996). Furthermore, compared to usual care, telephone counseling was more effective than tailored print materials at promoting being on schedule with screening during 1996 and 1997 among women who were off-schedule during the previous year.\nTARGET: the answer to the question given the context is (yes|no|maybe):"
5,"QUESTION: Double balloon enteroscopy: is it efficacious and safe in a community setting?\nCONTEXT: From March 2007 to January 2011, 88 DBE procedures were performed on 66 patients. Indications included evaluation anemia/gastrointestinal bleed, small bowel IBD and dilation of strictures. Video-capsule endoscopy (VCE) was used prior to DBE in 43 of the 66 patients prior to DBE evaluation.\nThe mean age was 62 years. Thirty-two patients were female, 15 were African-American; 44 antegrade and 44 retrograde DBEs were performed. The mean time per antegrade DBE was 107.4±30.0 minutes with a distance of 318.4±152.9 cm reached past the pylorus. The mean time per lower DBE was 100.7±27.3 minutes with 168.9±109.1 cm meters past the ileocecal valve reached. Endoscopic therapy in the form of electrocautery to ablate bleeding sources was performed in 20 patients (30.3%), biopsy in 17 patients (25.8%) and dilation of Crohn's-related small bowel strictures in 4 (6.1%). 43 VCEs with pathology noted were performed prior to DBE, with findings endoscopically confirmed in 32 cases (74.4%). In 3 cases the DBE showed findings not noted on VCE.\nTARGET: the answer to the question given the context is (yes|no|maybe):"
6,"QUESTION: 30-Day and 1-year mortality in emergency general surgery laparotomies: an area of concern and need for improvement?\nCONTEXT: Emergency surgery is associated with poorer outcomes and higher mortality with recent studies suggesting the 30-day mortality to be 14-15%. The aim of this study was to analyse the 30-day mortality, age-related 30-day mortality and 1-year mortality following emergency laparotomy. We hope this will encourage prospective data collection, improvement of care and initiate strategies to establish best practice in this area.\nThis was a retrospective study of patients who underwent emergency laparotomy from June 2010 to May 2012. The primary end point of the study was 30-day mortality, age-related 30-day mortality and 1-year all-cause mortality.\n477 laparotomies were performed in 446 patients. 57% were aged<70 and 43% aged>70 years. 30-day mortality was 12, 4% in those aged<70 years and 22% in those>70 years (p<0.001). 1-year mortality was 25, 15% in those aged under 70 years and 38% in those aged>70 years (p<0.001).\nTARGET: the answer to the question given the context is (yes|no|maybe):"
7,"QUESTION: Is adjustment for reporting heterogeneity necessary in sleep disorders?\nCONTEXT: Anchoring vignettes are brief texts describing a hypothetical character who illustrates a certain fixed level of a trait under evaluation. This research uses vignettes to elucidate factors associated with sleep disorders in adult Japanese before and after adjustment for reporting heterogeneity in self-reports. This study also evaluates the need for adjusting for reporting heterogeneity in the management of sleep and energy related problems in Japan.\nWe investigated a dataset of 1002 respondents aged 18 years and over from the Japanese World Health Survey, which collected information through face-to-face interview from 2002 to 2003. The ordered probit model and the Compound Hierarchical Ordered Probit (CHOPIT) model, which incorporated anchoring vignettes, were employed to estimate and compare associations of sleep and energy with socio-demographic and life-style factors before and after adjustment for differences in response category cut-points for each individual.\nThe prevalence of self-reported problems with sleep and energy was 53 %. Without correction of cut-point shifts, age, sex, and the number of comorbidities were significantly associated with a greater severity of sleep-related problems. After correction, age, the number of comorbidities, and regular exercise were significantly associated with a greater severity of sleep-related problems; sex was no longer a significant factor. Compared to the ordered probit model, the CHOPIT model provided two changes with a subtle difference in the magnitude of regression coefficients after correction for reporting heterogeneity.\nTARGET: the answer to the question given the context is (yes|no|maybe):"
8,"QUESTION: Do mutations causing low HDL-C promote increased carotid intima-media thickness?\nCONTEXT: Although observational data support an inverse relationship between high-density lipoprotein (HDL) cholesterol and coronary heart disease (CHD), genetic HDL deficiency states often do not correlate with premature CHD.\nCarotid intima-media thickness (cIMT) measurements were obtained in cases comprising 10 different mutations in LCAT, ABCA1 and APOA1 to further evaluate the relationship between low HDL resulting from genetic variation and early atherosclerosis.\nIn a 1:2 case-control study of sex and age-related (+/-5 y) subjects (n=114), cIMT was nearly identical between cases (0.66+/-0.17 cm) and controls (0.65+/-0.18 cm) despite significantly lower HDL cholesterol (0.67 vs. 1.58 mmol/l) and apolipoprotein A-I levels (96.7 vs. 151.4 mg/dl) (P<0.05)\nTARGET: the answer to the question given the context is (yes|no|maybe):"
9,"QUESTION: A short stay or 23-hour ward in a general and academic children's hospital: are they effective?\nCONTEXT: We evaluated the usefulness of a short stay or 23-hour ward in a pediatric unit of a large teaching hospital, Westmead Hospital, and an academic Children's hospital, The New Children's Hospital, to determine if they are a useful addition to the emergency service.\nThis is a descriptive comparison of prospectively collected data on all children admitted to the short stay ward at Westmead Hospital (WH) during 1994 and the short stay ward at the New Children's Hospital (NCH) during 1997-98. These hospitals service an identical demographic area with the latter (NCH) a tertiary referral center. The following outcome measures were used: length of stay, appropriateness of stay, rate of admission to an in-hospital bed, and rate of unscheduled visits within 72 hours of discharge. Adverse events were reported and patient follow-up was attempted at 48 hours after discharge in all cases.\nThe short stay ward accounted for 10.3% (Westmead Hospital) and 14.7% (New Children's Hospital) of admissions, with 56% medical in nature, 30% surgical, and the remainder procedural or psychological. Admission patterns were similar, with asthma, gastroenteritis, convulsion, pneumonia, and simple surgical conditions accounting for most short stay ward admissions. The short stay ward increased hospital efficiency with an average length of stay of 17.5 hours (Westmead Hospital) compared to 20.5 hours (New Children's Hospital). The users of the short stay ward were children of young age less than 2 years, with stay greater than 23 hours reported in only 1% of all admissions to the short stay ward. The rate of patient admission to an in-hospital bed was low, (4% [Westmead Hospital] compared to 6% [New Children's Hospital]), with the number of unscheduled visits within 72 hours of short stay ward discharge less than 1%. 
There were no adverse events reported at either short stay ward, with parental satisfaction high. The short stay ward was developed through reallocation of resources from within the hospital to the short stay ward. This resulted in estimated savings of $1/2 million (Westmead Hospital) to $2.3 million (New Children's Hospital) to the hospital, due to more efficient bed usage.\nTARGET: the answer to the question given the context is (yes|no|maybe):"


In [8]:
# Fraction of the prompts to write out. With 1 every row is kept, but sample()
# still shuffles the row order.
frac = 1
# Fixed seed so the sampled rows and their order are the same on every run.
sample_seed = 0
evaluation_dataset = "./pubmedqa-dataset/test/test_frac.jsonl"
# Create the folder that holds the file, derived from the path itself so the
# two cannot drift apart. The pipeline cell passes "./pubmedqa-dataset/test/"
# as its dataset_path.
os.makedirs(os.path.dirname(evaluation_dataset), exist_ok=True)
data.sample(frac=frac, random_state=sample_seed).to_json(
    evaluation_dataset, orient="records", lines=True
)

## Loading model from Registry

In [16]:
model_name = "Nemotron-3-8B-Base-4k"
# "latest" is resolved through the registry's label; any other value is passed
# as an explicit model version.
model_version = "latest"

# Honour model_version instead of always fetching the "latest" label.
if model_version == "latest":
    nemo_model_object = nemo_registry_ml_client.models.get(
        model_name, label=model_version
    )
else:
    nemo_model_object = nemo_registry_ml_client.models.get(
        model_name, version=model_version
    )

# Prompt

In [17]:
# Presumably the path of the Q&A prompt template file.
# NOTE(review): in the visible cells this is only referenced by the commented-out
# `prompt_schema` input of the pipeline - confirm whether it is still needed.
prompt_path = "prompts/prompt-qna.txt"


## Submitting Evaluation Pipeline

In [20]:
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes

# Look up the pipeline component labelled "latest" in the NeMo registry.
# (Alternative component: name="nemo_prediction_base", label="latest".)
pipeline_component_func = nemo_registry_ml_client.components.get(
    name="nemo_qna_evaluation",
    label="latest",
)

# JSON text handed to the component's `openai_config_params` input; written as
# adjacent literals, one key per line, which concatenate to a single string.
openai_params = (
    '{"type":"azure_open_ai",'
    '"model_name":"gpt-35-turbo",'
    '"deployment_name":"gpt-35-turbo",'
    '"questions":"input",'
    '"contexts":"input"}'
)



# Define the pipeline job: a single step that calls the fetched evaluation component.
# Returns a dict exposing the step's `evaluation_result` output as the pipeline output
# of the same name.
# NOTE(review): the `mlflow_model` parameter is declared but never referenced in the
# body; the model is taken from `nemo_model_object` instead - confirm it is still needed.
@pipeline()
def evaluation_pipeline(mlflow_model):
    evaluation_job = pipeline_component_func(
        # Folder input pointing at the local test-data directory.
        dataset_path=Input(type=AssetTypes.URI_FOLDER, path="./pubmedqa-dataset/test/"),
        # Triton-model input addressed by the id of the model fetched from the registry.
        model_path=Input(type=AssetTypes.TRITON_MODEL, path=f"{nemo_model_object.id}"),
        openai_config_params=openai_params,
        # prompt_schema=Input(type=AssetTypes.URI_FILE, path=prompt_path), # Not required for text-gen task
        # prediction_column_name="output",
    )
    return {"evaluation_result": evaluation_job.outputs.evaluation_result}

In [22]:
# Experiment under which the job is submitted, and a list recording what was submitted.
experiment_name = "nemo-qna-eval-pipeline"
pipeline_jobs = []

# NOTE(review): evaluation_pipeline declares an `mlflow_model` parameter, but no
# argument is passed here - confirm this is intended.
pipeline_object = evaluation_pipeline()

# don't reuse cached results from previous jobs
pipeline_object.settings.force_rerun = True
# use the GPU cluster named in `compute_cluster` as the pipeline's default compute
pipeline_object.settings.default_compute = compute_cluster
pipeline_job = workspace_ml_client.jobs.create_or_update(
    pipeline_object, experiment_name=experiment_name
)
# record model_name and the submitted job's name as key-value pairs in a dictionary
pipeline_jobs.append({"model_name": model_name, "job_name": pipeline_job.name})
# stream the job's logs (waits for the pipeline job to complete)
workspace_ml_client.jobs.stream(pipeline_job.name)

RunId: helpful_salt_g3w0n26ltf
Web View: https://ml.azure.com/runs/helpful_salt_g3w0n26ltf?wsid=/subscriptions/72c03bf3-4e69-41af-9532-dfcdc3eefef4/resourcegroups/nvidia/workspaces/nvidia-eus

Streaming logs/azureml/executionlogs.txt

[2023-11-15 12:05:17Z] Submitting 1 runs, first five are: 6306a868:f7ebc092-9058-4590-afce-04d70408657e
