In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import os

import seaborn as sns
import plotly.express as px
from tqdm import tqdm

from openai import AzureOpenAI
from openai import OpenAI

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

In [2]:
# Load keys
from dotenv import load_dotenv

load_dotenv()

from openai_utils import *

In [3]:
# Directory setup
data_dir = "../data/"
output_dir = "../results/general_knowledge/"

os.makedirs(output_dir, exist_ok=True)

## Generate Questions


In [4]:
def load_and_process_data(file_path):
    # Load the data
    df = pd.read_csv(file_path)

    # Filter out rows that are not "preferred name" or "brand name"
    filtered_df = df[df["string_type"].isin(["preferred name", "brand name"])]

    # Group by concept_code and keep the first occurrence of each string_type per group
    unique_names_df = (
        filtered_df.groupby(["concept_code", "string_type"]).first().reset_index()
    )

    # Since we want to keep one of each type per concept_code, let's ensure there's only one of each
    final_df = unique_names_df.groupby("concept_code").filter(lambda x: len(x) <= 2)

    # Filter out concept_codes that appear only once in the DataFrame
    counts = final_df["concept_code"].value_counts()
    filtered_final_df = final_df[
        final_df["concept_code"].isin(counts[counts > 1].index)
    ]

    # Ensure we have a DataFrame that includes both a preferred name and a brand name for each concept_code
    concept_codes_with_both_names = filtered_final_df.groupby("concept_code").filter(
        lambda x: len(x) == 2
    )

    # Split the DataFrame into two: one for preferred names and one for brand names
    preferred_names_df = concept_codes_with_both_names[
        concept_codes_with_both_names["string_type"] == "preferred name"
    ]

    brand_names_df = concept_codes_with_both_names[
        concept_codes_with_both_names["string_type"] == "brand name"
    ]

    # Merge them to have a single DataFrame with both preferred and brand names for each concept_code
    combined_df = pd.merge(
        preferred_names_df,
        brand_names_df,
        on="concept_code",
        suffixes=("_preferred", "_brand"),
    )

    return combined_df


combined_df = load_and_process_data(data_dir + "HemOnc_drug_list.csv")

combined_df

Unnamed: 0,concept_code,string_type_preferred,Unnamed: 0_preferred,string_preferred,string_type_brand,Unnamed: 0_brand,string_brand
0,4,preferred name,8,Abemaciclib,brand name,10,Verzenio
1,6,preferred name,14,Abiraterone,brand name,17,Abatitor
2,7,preferred name,39,Acalabrutinib,brand name,41,Calquence
3,9,preferred name,49,Aclarubicin,brand name,56,Aclacin
4,12,preferred name,105,Trastuzumab emtansine,brand name,110,Kadcyla
...,...,...,...,...,...,...,...
362,111358,preferred name,6057,Olutasidenib,brand name,6059,Rezlidhia
363,111427,preferred name,6063,Pyrotinib,brand name,6064,Irene
364,114346,preferred name,6072,Buserelin,brand name,6076,Bigonist
365,114432,preferred name,6084,Lifileuecel,brand name,6086,Contego


In [5]:
# Save the combined DataFrame
combined_df.to_csv(data_dir + "combined_df.csv", index=False)

In [6]:
all_preferred_names = combined_df["string_preferred"].unique()
all_brand_names = combined_df["string_brand"].unique()

In [7]:
# Function to generate three random incorrect answers
def get_incorrect_answers(correct_answer, all_answers):
    incorrect_answers = np.random.choice(
        all_answers[all_answers != correct_answer], size=3, replace=False
    )
    return incorrect_answers.tolist()


def generate_variations(
    all_answers,
    correct_answer,
    question_template,
    concept_code,
    variation_id,
    num_variations=5,
):
    incorrect_answers = get_incorrect_answers(correct_answer, all_answers)
    options = [correct_answer] + incorrect_answers
    np.random.shuffle(options)  # Shuffle all options
    return {
        "concept_code": concept_code,
        "variation_id": variation_id,
        "question": question_template,
        "options": options,
        "correct_answer": correct_answer,
    }

In [8]:
# Generate questions and answers
questions_answers = []
num_variations = 5

for index, row in combined_df.iterrows():
    concept_code = row["concept_code"]
    preferred_name = row["string_preferred"]
    brand_name = row["string_brand"]

    for variation_id in range(1, 6):  # Generate 5 variations
        # Brand name question variations
        question_brand_template = (
            f"Which of the following is the brand name for the drug {preferred_name}?"
        )
        variation_brand = generate_variations(
            all_brand_names,
            brand_name,
            question_brand_template,
            concept_code,
            variation_id,
            num_variations,
        )
        questions_answers.append(variation_brand)

        # Generic name question variations
        question_generic_template = (
            f"Which of the following is the generic name for the drug {brand_name}?"
        )
        variation_generic = generate_variations(
            all_preferred_names,
            preferred_name,
            question_generic_template,
            concept_code,
            variation_id,
        )
        questions_answers.append(variation_generic)

# Preview the first few questions and answers
questions_answers[:10]

[{'concept_code': 4,
  'variation_id': 1,
  'question': 'Which of the following is the brand name for the drug Abemaciclib?',
  'options': ['TheraCys', 'Rydapt', 'Verzenio', 'Osimert'],
  'correct_answer': 'Verzenio'},
 {'concept_code': 4,
  'variation_id': 1,
  'question': 'Which of the following is the generic name for the drug Verzenio?',
  'options': ['Floxuridine',
   'Gemtuzumab ozogamicin',
   'Abemaciclib',
   'Paclitaxel'],
  'correct_answer': 'Abemaciclib'},
 {'concept_code': 4,
  'variation_id': 2,
  'question': 'Which of the following is the brand name for the drug Abemaciclib?',
  'options': ['Verzenio', 'Provenge', 'Ovastat', 'Amelie'],
  'correct_answer': 'Verzenio'},
 {'concept_code': 4,
  'variation_id': 2,
  'question': 'Which of the following is the generic name for the drug Verzenio?',
  'options': ['Abemaciclib',
   'Ribociclib',
   'Idecabtagene vicleucel',
   'Decitabine'],
  'correct_answer': 'Abemaciclib'},
 {'concept_code': 4,
  'variation_id': 3,
  'question'

In [9]:
# Create a DataFrame from the questions_answers list
qa_df = pd.DataFrame(questions_answers)

# Include variation_id in your qa_brand_df and qa_preferred_df before merging
qa_brand_df = qa_df[
    qa_df["question"].str.contains("brand name for the drug")
].reset_index(drop=True)

qa_preferred_df = qa_df[
    qa_df["question"].str.contains("generic name for the drug")
].reset_index(drop=True)

In [10]:
# Create a new DataFrame that repeats each row of combined_df for each variation
expanded_combined_df = pd.DataFrame(
    np.repeat(combined_df.values, num_variations, axis=0), columns=combined_df.columns
)

# Add a variation_id column to expanded_combined_df
expanded_combined_df["variation_id"] = np.tile(
    np.arange(1, num_variations + 1), len(combined_df)
)

expanded_combined_df

Unnamed: 0,concept_code,string_type_preferred,Unnamed: 0_preferred,string_preferred,string_type_brand,Unnamed: 0_brand,string_brand,variation_id
0,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,1
1,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,2
2,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,3
3,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,4
4,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,5
...,...,...,...,...,...,...,...,...
1830,114494,preferred name,6094,Tabelecleucel,brand name,6097,Ebvallo,1
1831,114494,preferred name,6094,Tabelecleucel,brand name,6097,Ebvallo,2
1832,114494,preferred name,6094,Tabelecleucel,brand name,6097,Ebvallo,3
1833,114494,preferred name,6094,Tabelecleucel,brand name,6097,Ebvallo,4


In [11]:
# Merging with combined_df, ensuring variation_id is kept
filtered_final_df_with_qa = pd.merge(
    expanded_combined_df,
    qa_brand_df,
    on=["concept_code", "variation_id"],
    how="left",
    suffixes=("_brand", ""),
)

filtered_final_df_with_qa = pd.merge(
    filtered_final_df_with_qa,
    qa_preferred_df,
    on=["concept_code", "variation_id"],
    how="left",
    suffixes=("_brand", "_preferred"),
)

filtered_final_df_with_qa

Unnamed: 0,concept_code,string_type_preferred,Unnamed: 0_preferred,string_preferred,string_type_brand,Unnamed: 0_brand,string_brand,variation_id,question_brand,options_brand,correct_answer_brand,question_preferred,options_preferred,correct_answer_preferred
0,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,1,Which of the following is the brand name for the drug Abemaciclib?,"[TheraCys, Rydapt, Verzenio, Osimert]",Verzenio,Which of the following is the generic name for the drug Verzenio?,"[Floxuridine, Gemtuzumab ozogamicin, Abemaciclib, Paclitaxel]",Abemaciclib
1,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,2,Which of the following is the brand name for the drug Abemaciclib?,"[Verzenio, Provenge, Ovastat, Amelie]",Verzenio,Which of the following is the generic name for the drug Verzenio?,"[Abemaciclib, Ribociclib, Idecabtagene vicleucel, Decitabine]",Abemaciclib
2,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,3,Which of the following is the brand name for the drug Abemaciclib?,"[Pegasys, Nexavar, Nitrol, Verzenio]",Verzenio,Which of the following is the generic name for the drug Verzenio?,"[Trimetrexate, Abemaciclib, Idecabtagene vicleucel, Dabrafenib]",Abemaciclib
3,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,4,Which of the following is the brand name for the drug Abemaciclib?,"[Pemazyre, Verzenio, Bosulif, CA ATRA]",Verzenio,Which of the following is the generic name for the drug Verzenio?,"[Bortezomib, Fotemustine, Abemaciclib, Avapritinib]",Abemaciclib
4,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,5,Which of the following is the brand name for the drug Abemaciclib?,"[Verzenio, Ibrunib, Muphoran, Breyanzi]",Verzenio,Which of the following is the generic name for the drug Verzenio?,"[Valrubicin, Entrectinib, Abemaciclib, Pertuzumab]",Abemaciclib
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1830,114494,preferred name,6094,Tabelecleucel,brand name,6097,Ebvallo,1,Which of the following is the brand name for the drug Tabelecleucel?,"[Omnitarg, Ukoniq, Inrebic, Ebvallo]",Ebvallo,Which of the following is the generic name for the drug Ebvallo?,"[Tabelecleucel, Oxaliplatin, Clofarabine, Colchicine]",Tabelecleucel
1831,114494,preferred name,6094,Tabelecleucel,brand name,6097,Ebvallo,2,Which of the following is the brand name for the drug Tabelecleucel?,"[Abitrexate, Iclusig, Ebvallo, Hansizhuang]",Ebvallo,Which of the following is the generic name for the drug Ebvallo?,"[Valrubicin, Tabelecleucel, Leuprolide, Daratumumab and hyaluronidase]",Tabelecleucel
1832,114494,preferred name,6094,Tabelecleucel,brand name,6097,Ebvallo,3,Which of the following is the brand name for the drug Tabelecleucel?,"[Conmana, Krazati, Nilevar, Ebvallo]",Ebvallo,Which of the following is the generic name for the drug Ebvallo?,"[Trimetrexate, Tabelecleucel, Pegylated liposomal doxorubicin, Ixazomib]",Tabelecleucel
1833,114494,preferred name,6094,Tabelecleucel,brand name,6097,Ebvallo,4,Which of the following is the brand name for the drug Tabelecleucel?,"[Ebvallo, Iclusig, Valstar, Actimmune]",Ebvallo,Which of the following is the generic name for the drug Ebvallo?,"[Bismuth subsalicylate, Octreotide, Tivozanib, Tabelecleucel]",Tabelecleucel


In [12]:
def generate_name_questions(row):
    questions = {}

    for type in ["preferred", "brand"]:
        question_type_key = f"question_{type}"
        options_type_key = f"options_{type}"

        question = row[question_type_key]
        options = row[options_type_key]

        # Check if options is already a list, if not attempt to interpret it as such
        if not isinstance(options, list):
            options_list = options.split(",") if isinstance(options, str) else []
        else:
            options_list = options

        options_formatted = ", ".join(
            [f"{i + 1}: {opt.strip()}" for i, opt in enumerate(options_list)]
        )

        final_question = f"{question}\nOptions: {options_formatted}\nAnswer: {row['correct_answer_' + type]}"
        questions[type] = final_question

    return questions["preferred"], questions["brand"]


# Apply the updated function and create new columns for preferred and brand questions
filtered_final_df_with_qa[["final_preferred_question", "final_brand_question"]] = (
    filtered_final_df_with_qa.apply(
        lambda row: generate_name_questions(row), axis=1, result_type="expand"
    )
)

# preview the final DataFrame
print(
    filtered_final_df_with_qa[
        ["final_preferred_question", "final_brand_question"]
    ].head()
)

                                                                                                                                                     final_preferred_question  \
0    Which of the following is the generic name for the drug Verzenio?\nOptions: 1: Floxuridine, 2: Gemtuzumab ozogamicin, 3: Abemaciclib, 4: Paclitaxel\nAnswer: Abemaciclib   
1    Which of the following is the generic name for the drug Verzenio?\nOptions: 1: Abemaciclib, 2: Ribociclib, 3: Idecabtagene vicleucel, 4: Decitabine\nAnswer: Abemaciclib   
2  Which of the following is the generic name for the drug Verzenio?\nOptions: 1: Trimetrexate, 2: Abemaciclib, 3: Idecabtagene vicleucel, 4: Dabrafenib\nAnswer: Abemaciclib   
3              Which of the following is the generic name for the drug Verzenio?\nOptions: 1: Bortezomib, 2: Fotemustine, 3: Abemaciclib, 4: Avapritinib\nAnswer: Abemaciclib   
4               Which of the following is the generic name for the drug Verzenio?\nOptions: 1: Valrubicin, 2: Entre

## GPT Inference


In [13]:
SYS_PROMPT = "You are an AI assistant answering multiple choice questions. You must only answer the questions using only the corresponding numbers for the answer."
SERVICE = "openai"
ENGINE = "gpt-3.5-turbo-0613"
# ENGINE = "gpt-4-turbo"
temperatures = [0.0, 0.5, 1.0, 2.0]
DEBUG = False

if DEBUG:
    filtered_final_df_with_qa = filtered_final_df_with_qa.head()

In [14]:
# Dict for storing DataFrames for each temperature
temperature_dfs = {}

for temp in temperatures:
    # Lists for storing inference results for the current temperature
    inferred_preferred_names = []
    inferred_brand_names = []

    # Get inference for each row in the DataFrame
    for index, row in tqdm(
        filtered_final_df_with_qa.iterrows(),
        total=filtered_final_df_with_qa.shape[0],
        desc=f"Temperature {temp}",
    ):
        # Handle preferred name question
        preferred_question = row["final_preferred_question"]
        response_preferred = get_chat_completion(
            user_prompt=preferred_question,
            system_prompt=SYS_PROMPT,
            service=SERVICE,
            engine=ENGINE,
            temperature=temp,
            max_tokens=1,
            full_response=True,
        )
        inferred_preferred_names.append(response_preferred)

        # Handle brand name question
        brand_question = row["final_brand_question"]
        response_brand = get_chat_completion(
            user_prompt=brand_question,
            system_prompt=SYS_PROMPT,
            service=SERVICE,
            engine=ENGINE,
            temperature=temp,
            max_tokens=1,
            full_response=True,
        )
        inferred_brand_names.append(response_brand)

    # Create a new DataFrame for the current temperature
    temp_df = filtered_final_df_with_qa.copy()
    temp_df["inferred_preferred_name"] = inferred_preferred_names
    temp_df["inferred_brand_name"] = inferred_brand_names

    # Apply digit extraction based on the SERVICE
    if SERVICE == "azure":
        temp_df["inferred_preferred_answer"] = [
            extract_azure_top_digit(i) for i in temp_df["inferred_preferred_name"]
        ]
        temp_df["inferred_brand_answer"] = [
            extract_azure_top_digit(i) for i in temp_df["inferred_brand_name"]
        ]
    else:
        temp_df["inferred_preferred_answer"] = [
            extract_openai_top_digit(i) for i in temp_df["inferred_preferred_name"]
        ]
        temp_df["inferred_brand_answer"] = [
            extract_openai_top_digit(i) for i in temp_df["inferred_brand_name"]
        ]

    # Store the DataFrame in the dictionary with a key representing the temperature
    temperature_dfs[f"temp_{str(temp).replace('.', '_')}"] = temp_df

# Print keys in the dictionary
print(temperature_dfs.keys())

Temperature 0.0: 100%|██████████| 1835/1835 [30:25<00:00,  1.00it/s]
Temperature 0.5: 100%|██████████| 1835/1835 [31:58<00:00,  1.05s/it]  
Temperature 1.0: 100%|██████████| 1835/1835 [1:23:08<00:00,  2.72s/it]
Temperature 2.0: 100%|██████████| 1835/1835 [8:48:30<00:00, 17.28s/it]  

dict_keys(['temp_0_0', 'temp_0_5', 'temp_1_0', 'temp_2_0'])





## Evaluate Performance


In [15]:
def calculate_accuracy(correct_answer, inferred_answer, options):
    if (
        pd.isna(inferred_answer) or inferred_answer == ""
    ):  # Check if 'inferred_answer' is empty or NaN
        return "N/A"  # Return "N/A" for not available data
    else:
        try:
            inferred_answer = int(inferred_answer)
            answer_position = options.index(correct_answer) + 1  # 1-indexed
            return "Correct" if inferred_answer == answer_position else "Incorrect"
        except ValueError:
            return "N/A"


temperature_accuracy_summaries = {}  # Store accuracy summaries for each temperature

for temp in temperatures:
    # Assuming temperature_dfs contains the DataFrames for each temperature
    temp_key = f"temp_{str(temp).replace('.', '_')}"
    temp_df = temperature_dfs[temp_key]

    # Calculate value counts for the current temperature DataFrame
    preferred_name_counts = temp_df["inferred_preferred_answer"].value_counts()
    brand_name_counts = temp_df["inferred_brand_answer"].value_counts()

    # Display the value counts for current temperature
    print(f"Temperature {temp}:")
    print("Preferred Name Value Counts:")
    print(preferred_name_counts)
    print("\nBrand Name Value Counts:")
    print(brand_name_counts)

    # Calculate accuracy
    temp_df["brand_accuracy"] = temp_df.apply(
        lambda row: calculate_accuracy(
            row["correct_answer_brand"],
            row["inferred_brand_answer"],
            row["options_brand"],
        ),
        axis=1,
    )

    temp_df["preferred_accuracy"] = temp_df.apply(
        lambda row: calculate_accuracy(
            row["correct_answer_preferred"],
            row["inferred_preferred_answer"],
            row["options_preferred"],
        ),
        axis=1,
    )

    # Creating a summary DataFrame for brand and preferred accuracies for the current temperature
    accuracy_summary = {
        "Temperature": temp,
        "Type": ["Brand", "Preferred"],
        "Correct": [
            (temp_df["brand_accuracy"] == "Correct").sum(),
            (temp_df["preferred_accuracy"] == "Correct").sum(),
        ],
        "Incorrect": [
            (temp_df["brand_accuracy"] == "Incorrect").sum(),
            (temp_df["preferred_accuracy"] == "Incorrect").sum(),
        ],
        "Not Available": [
            (temp_df["brand_accuracy"] == "N/A").sum(),
            (temp_df["preferred_accuracy"] == "N/A").sum(),
        ],
    }

    accuracy_summary_df = pd.DataFrame(accuracy_summary)
    # Store the summary DataFrame in the dictionary with a key representing the temperature
    temperature_accuracy_summaries[temp_key] = accuracy_summary_df

    # Display or store the accuracy summary DataFrame for current temperature
    print(f"\nAccuracy Summary for Temperature {temp}:")
    print(accuracy_summary_df)

Temperature 0.0:
Preferred Name Value Counts:
inferred_preferred_answer
     690
2    398
4    384
3    297
1     66
Name: count, dtype: int64

Brand Name Value Counts:
inferred_brand_answer
     948
2    359
4    272
3    226
1     30
Name: count, dtype: int64

Accuracy Summary for Temperature 0.0:
   Temperature       Type  Correct  Incorrect  Not Available
0          0.0      Brand      887          0            948
1          0.0  Preferred     1145          0            690
Temperature 0.5:
Preferred Name Value Counts:
inferred_preferred_answer
     691
2    394
4    381
3    301
1     68
Name: count, dtype: int64

Brand Name Value Counts:
inferred_brand_answer
     943
2    360
4    271
3    229
1     32
Name: count, dtype: int64

Accuracy Summary for Temperature 0.5:
   Temperature       Type  Correct  Incorrect  Not Available
0          0.5      Brand      891          1            943
1          0.5  Preferred     1144          0            691
Temperature 1.0:
Preferred Name 

## Save results


In [16]:
for temp_key, temp_df in temperature_dfs.items():
    # Save the temperature-specific DataFrame
    temp_df_filename = os.path.join(output_dir, ENGINE, f"{temp_key}_df.csv")
    if not os.path.exists(os.path.join(output_dir, ENGINE)):
        os.makedirs(os.path.join(output_dir, ENGINE))
    temp_df.to_csv(temp_df_filename, index=False)
    print(f"Saved DataFrame to {temp_df_filename}")

    # Save the accuracy summary DataFrame
    summary_df = temperature_accuracy_summaries[temp_key]
    if not os.path.exists(os.path.join(output_dir, ENGINE)):
        os.makedirs(os.path.join(output_dir, ENGINE))
    summary_df_filename = os.path.join(output_dir, ENGINE, f"{temp_key}_summary.csv")
    summary_df.to_csv(summary_df_filename, index=False)
    print(f"Saved Accuracy Summary to {summary_df_filename}")

Saved DataFrame to ../results/general_knowledge/gpt-3.5-turbo-0613/temp_0_0_df.csv
Saved Accuracy Summary to ../results/general_knowledge/gpt-3.5-turbo-0613/temp_0_0_summary.csv
Saved DataFrame to ../results/general_knowledge/gpt-3.5-turbo-0613/temp_0_5_df.csv
Saved Accuracy Summary to ../results/general_knowledge/gpt-3.5-turbo-0613/temp_0_5_summary.csv
Saved DataFrame to ../results/general_knowledge/gpt-3.5-turbo-0613/temp_1_0_df.csv
Saved Accuracy Summary to ../results/general_knowledge/gpt-3.5-turbo-0613/temp_1_0_summary.csv
Saved DataFrame to ../results/general_knowledge/gpt-3.5-turbo-0613/temp_2_0_df.csv
Saved Accuracy Summary to ../results/general_knowledge/gpt-3.5-turbo-0613/temp_2_0_summary.csv


In [17]:
# check the output of example temperature accuracy summary
temperature_accuracy_summaries["temp_0_0"]

Unnamed: 0,Temperature,Type,Correct,Incorrect,Not Available
0,0.0,Brand,887,0,948
1,0.0,Preferred,1145,0,690
