This notebook is for developing the code for extracting metadata from the Prisma incident report (PIR) data. The notebook:

* Loads the full set of PIRs, both labeled and unlabeled
* Separates the labeled from the unlabeled PIRs
* Defines a function that takes as input a prompt template, a set of labeled PIRs, and an unlabeled PIR to create a few-shot learning (FSL) prompt
* Defines a function that takes as input a prompt and an LLM, and returns the model's output
* Defines a function that parses the model output to produce structured metadata
* Saves the resulting structured metadata.

This notebook is for developing that code; once the code is stable, it should be moved into one or more .py scripts so that it can be run on the Cluster without keeping a Jupyter session open.

In [3]:
# Imports
#from utils.load_llm_model import prepare_to_load_model, load_model
#import os
import random
import re

import pandas as pd
from langchain import PromptTemplate

ModuleNotFoundError: No module named 'langchain'

In [None]:
# Load the full set of PIRs (labeled and unlabeled together)
#
# The result is a pandas DataFrame with one row per PIR. Only three columns
# are read from the workbook: 'Text' (the PIR itself), 'Near_Miss' and 'Any_Harm'.

PIRs_full = pd.read_excel(
    "PIRS_FULL.xlsx",
    usecols=["Near_Miss", "Any_Harm", "Text"],
)


In [None]:
# Load the labeled PIRs
#
# The result is a pandas DataFrame with four columns: 'Text' (the PIR itself)
# plus one column per label category: 'Risk_Challenge', 'Actions_Strategies'
# and 'Facilitators'.

PIRs_labeled = pd.read_excel(
    "PIRS_LABELED.xlsx",
    usecols=["Text", "Risk_Challenge", "Actions_Strategies", "Facilitators"],
)


In [None]:
# Separate labeled from unlabeled PIRs
#
# Any PIR whose Text also appears in the labeled set is removed from the full
# set, leaving only the PIRs that still need labels.

# Outer-join on Text; the indicator column `_merge` records which input(s)
# each row came from.
df = PIRs_full.merge(PIRs_labeled, on=["Text"], how="outer", indicator=True)

# "left_only" rows exist in the full set but have no match in the labeled set.
df2 = df[df["_merge"] == "left_only"].drop(columns="_merge")

# Keep only Text and Actions_Strategies; the latter is empty (NaN) for these
# rows because they had no labeled counterpart.
PIRs_unlabeled = df2.drop(
    columns=["Risk_Challenge", "Facilitators", "Near_Miss", "Any_Harm"]
)

In [None]:
# Read the prompt template text to a string.
# The encoding is passed explicitly so the template is decoded the same way on
# every machine instead of depending on the platform's locale default.
# NOTE(review): assumes PIR_Prompt_AS.txt is saved as UTF-8 -- confirm.
with open("PIR_Prompt_AS.txt", "r", encoding="utf-8") as file:
    template = file.read()

# Now the variable "template" contains the contents of PIR_Prompt_AS.txt as a string

In [None]:
# Helper function: create FSL prompt
def create_fsl_prompt(labeled_pirs, unlabeled_pir, template=template, n_examples=5, seed=None):
    """
    Build one few-shot learning (FSL) prompt for the actions/strategies category.

    A random subset of the labeled PIRs is rendered as worked examples and
    substituted, together with the text of the PIR to be labeled, into the
    prompt template.

    Args:
        labeled_pirs (pd.DataFrame): Labeled PIRs. Must contain the columns
            'Text' and 'Actions_Strategies'.
        unlabeled_pir (str | pd.Series | dict): The PIR to label, given either
            as its raw text or as a row/mapping that has a 'Text' entry.
        template (str): Prompt template with the placeholders {examples} and
            {text}. Defaults to the template loaded earlier in the notebook.
        n_examples (int): Number of labeled PIRs to include as examples. If
            fewer labeled PIRs are available, all of them are used.
        seed (int | None): Seed for the example selection. With None, the
            global `random` state is used, so `random.seed(...)` still applies.

    Returns:
        str: The fully formatted prompt, ready to be passed to a model.
    """
    # Use the caller's data (not notebook globals) so the function is reusable.
    examples_df = labeled_pirs[['Text', 'Actions_Strategies']]

    # random.sample needs a sequence, so sample row positions rather than the
    # DataFrame itself; clamp so small labeled sets do not raise ValueError.
    k = min(n_examples, len(examples_df))
    rng = random if seed is None else random.Random(seed)
    positions = rng.sample(range(len(examples_df)), k)
    sampled = examples_df.iloc[positions]

    # Render each example as plain text.
    # NOTE(review): the example layout the template expects is not visible
    # here -- adjust the two labels below if the template needs another format.
    examples_text = "\n\n".join(
        f"Text: {text}\nActions_Strategies: {actions}"
        for text, actions in zip(sampled['Text'], sampled['Actions_Strategies'])
    )

    # Accept either the raw PIR text or a row/mapping carrying it under 'Text'.
    if isinstance(unlabeled_pir, str):
        pir_text = unlabeled_pir
    else:
        pir_text = str(unlabeled_pir['Text'])

    prompt_template = PromptTemplate(template=template,
                                     input_variables=["examples", "text"])

    # Both placeholders must be supplied in a single format() call; formatting
    # with only one of them raises KeyError.
    return prompt_template.format(examples=examples_text, text=pir_text)
    

In [None]:

def get_model_output(model, prompt: str):
    """
    Return a model's output given a particular prompt.

    Not implemented yet. The function is defined as a real stub (rather than
    left half commented out, which made this cell fail with an
    IndentationError) and raises so that an accidental call fails loudly
    instead of silently returning None.

    Args:
        model: The loaded LLM to query.
        prompt (str): The prompt to send to the model.

    Raises:
        NotImplementedError: Always, until the function is implemented.
    """
    # TODO: call the model with `prompt` and return its text output.
    raise NotImplementedError("get_model_output is not implemented yet")


In [None]:
# Leading list marker on a line: a bullet (-, *, or the Unicode bullet) or a
# number followed by "." or ")", followed by whitespace or end of line.
_LIST_MARKER_RE = re.compile(r"^(?:[-*\u2022]|\d+[.)])(?:\s+|$)")


def parse_model_output(pir, category, output):
    """
    Parse a model's text output in order to extract structured metadata from it.

    NOTE(review): the model's output format is not fixed anywhere in this
    notebook. This parser assumes one label per line, optionally prefixed by a
    bullet or a list number -- confirm against real model output.

    Args:
        pir (str): The text of the PIR that the model was asked to generate metadata about.
        category (str): The category of metadata the model was asked to generate (e.g. 'risk_challenge')
        output (str): The model's text output. None or an empty string yields no labels.

    Returns:
        dict: {"text":pir, category:list_of_labels}

    Raises:
        ValueError: If category is "text", which would overwrite the PIR entry.
    """
    if category == "text":
        raise ValueError('category must not be "text"; that key holds the PIR itself')

    labels = []
    for raw_line in (output or "").splitlines():
        # Drop surrounding whitespace, then any leading list marker.
        label = _LIST_MARKER_RE.sub("", raw_line.strip()).strip()
        if label:  # skip blank lines and lines that were only a marker
            labels.append(label)

    return {"text": pir, category: labels}