# Label Set Harmonisation 

> This module provides functions to harmonise cell type labels between different label sets using AI-assisted matching. It uses an OpenAI GPT model, accessed through the OpenAI API, to associate labels from an existing set with those from a predicted set, based on the model's natural language processing capabilities and internal knowledge.


In [1]:
#| hide
from nbdev.showdoc import *

In [2]:
#| default_exp harmonise

In [3]:
#| export
def match_cell_labels(existing_labels_set:set, # A set of existing cell type labels
                      predicted_labels_set:set, # A set of predicted cell type labels
                      openai_api_key:str=None  # The API key for OpenAI. If not provided, it will be taken from the environment variable 'OPENAI_API_KEY'
                      ) -> dict: # A dictionary representing the JSON object with matched labels, or `None` if the reply is not valid JSON.
    
    """
    Match cell type labels from two sets using OpenAI's GPT-4 model.

    The predicted labels are presented to the model as the "first set" and the
    existing labels as the "second set"; the model is asked to associate each
    label of the first set with labels of the second set and to answer with a
    JSON object.

    Raises `ValueError` when no non-empty API key is available, either from
    `openai_api_key` or from the `OPENAI_API_KEY` environment variable.
    If the model's reply cannot be parsed as JSON, a message is printed and
    `None` is returned.
    """
    
    import json
    import os

    # Use the provided API key or fall back to the environment variable.
    # An empty string counts as "missing": it can never authenticate, so fail
    # here with a clear message rather than later inside the API client.
    api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
    if not api_key:
        raise ValueError("An OpenAI API key must be provided either as an argument or as an environment variable 'OPENAI_API_KEY'.")

    # Imported only after the key has been validated, so a missing key is
    # always reported as the ValueError above.
    from openai import OpenAI

    # Construct the prompt for the OpenAI model
    prompt = f"First set of labels: {predicted_labels_set}. Second set of labels: {existing_labels_set}. " \
             "Associate each label in the first set with labels in the second set, based on your knowledge of cell type similarity, " \
             "as accurately as possible. Return answer as JSON object"

    # Initialize the OpenAI client with the API key
    client = OpenAI(api_key=api_key)

    # Create a completion request to the OpenAI API, asking for a JSON object reply
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content":"As an expert Cell Biologist with extensive knowledge in comparing and relating various cell classification types, you will be presented with two lists of cell type labels. Your objective is to accurately match each label from the first list with its most suitable counterpart in the second list. It is important to note that multiple labels from the first list may correspond to a single label in the second list, reflecting differences in annotation resolution. Your responses should demonstrate the depth of your analytical and reasoning skills, underpinned by your comprehensive scientific understanding and the insights you've acquired from thorough research in this field. Please submit your answers in the form of a JSON object."},
            {"role": "user", "content": prompt}
        ],
        response_format={"type": "json_object"}
    )

    # Text content of the first (and only requested) choice
    reply_text = completion.choices[0].message.content

    # Parse the response as JSON. `str()` turns a missing (`None`) content into
    # the text "None", which is not valid JSON and therefore takes the fallback
    # path below.
    try:
        return json.loads(str(reply_text))
    except json.JSONDecodeError:
        print("The message is not in JSON format.")
        return None

#| export
def map_old_labels_to_new(old_labels: list, # A list of old labels that need to be mapped to new labels
                          label_mapping: dict # A dictionary where keys are old labels and values are new labels
                          ) -> list: # Returns a list of new labels corresponding to each old label in `old_labels`.
    
    """
    Translate every entry of `old_labels` into its new label using `label_mapping`.

    A mapping value may be a single label or a list of labels. When it is a
    list, the last element of the list is the one used; an empty list provides
    no translation. Old labels without a translation become "unknown".
    """
    lookup = {}

    # Flatten the mapping so that every old label points at exactly one new label
    for source_label, target in label_mapping.items():
        if not isinstance(target, list):
            lookup[source_label] = target
        elif target:
            # Several candidates were given: the last one is kept
            lookup[source_label] = target[-1]

    # Translate the input labels, falling back to "unknown" for unmapped ones
    return [lookup[label] if label in lookup else "unknown" for label in old_labels]

In [4]:
#| export
def map_labels_to_categories(label_list: list, # A list of labels that need to be categorized
                             label_dict: dict # A dictionary where keys are categories and values are lists of labels belonging to those categories
                             ) -> list: # Returns a list of categories corresponding to each label in `label_list`.
    
    """
    Look up the category of every label in `label_list` using `label_dict`.

    `label_dict` maps a category to either a list of member labels or a single
    member label. If a label is listed under several categories, the category
    that comes last in `label_dict` is used. Labels that belong to no category
    are reported as "unknown".
    """
    category_of = {}

    # Invert the category -> members dictionary into member -> category
    for category, members in label_dict.items():
        # Treat a single (non-list) member as a one-element group
        group = members if isinstance(members, list) else (members,)
        for member in group:
            category_of[member] = category

    # Resolve each requested label, defaulting to "unknown"
    return list(map(lambda label: category_of.get(label, "unknown"), label_list))

In [5]:
#| hide
# Run nbdev's export step; presumably this writes the `#| export` cells above
# to the module named by the `default_exp` directive (`harmonise`) — confirm
# against the project's nbdev settings.
import nbdev; nbdev.nbdev_export()