In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import os

import seaborn as sns
import plotly.express as px
from tqdm import tqdm

from openai import AzureOpenAI
from openai import OpenAI

from openai_utils import *

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

In [2]:
# Load keys
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
# Directory setup
data_dir = "../data/"
output_dir = "results/general_knowledge/"

os.makedirs(output_dir, exist_ok=True)

In [4]:
def load_and_process_data(file_path):
    # Load the data
    df = pd.read_csv(file_path)

    # Filter out rows that are not "preferred name" or "brand name"
    filtered_df = df[df["string_type"].isin(["preferred name", "brand name"])]

    # Group by concept_code and keep the first occurrence of each string_type per group
    unique_names_df = (
        filtered_df.groupby(["concept_code", "string_type"]).first().reset_index()
    )

    # Since we want to keep one of each type per concept_code, let's ensure there's only one of each
    final_df = unique_names_df.groupby("concept_code").filter(lambda x: len(x) <= 2)

    # Filter out concept_codes that appear only once in the DataFrame
    counts = final_df["concept_code"].value_counts()
    filtered_final_df = final_df[
        final_df["concept_code"].isin(counts[counts > 1].index)
    ]

    # Ensure we have a DataFrame that includes both a preferred name and a brand name for each concept_code
    concept_codes_with_both_names = filtered_final_df.groupby("concept_code").filter(
        lambda x: len(x) == 2
    )

    # Split the DataFrame into two: one for preferred names and one for brand names
    preferred_names_df = concept_codes_with_both_names[
        concept_codes_with_both_names["string_type"] == "preferred name"
    ]

    brand_names_df = concept_codes_with_both_names[
        concept_codes_with_both_names["string_type"] == "brand name"
    ]

    # Merge them to have a single DataFrame with both preferred and brand names for each concept_code
    combined_df = pd.merge(
        preferred_names_df,
        brand_names_df,
        on="concept_code",
        suffixes=("_preferred", "_brand"),
    )

    return combined_df


combined_df = load_and_process_data(data_dir + "HemOnc_drug_list.csv")

combined_df

Unnamed: 0,concept_code,string_type_preferred,Unnamed: 0_preferred,string_preferred,string_type_brand,Unnamed: 0_brand,string_brand
0,4,preferred name,8,Abemaciclib,brand name,10,Verzenio
1,6,preferred name,14,Abiraterone,brand name,17,Abatitor
2,7,preferred name,39,Acalabrutinib,brand name,41,Calquence
3,9,preferred name,49,Aclarubicin,brand name,56,Aclacin
4,12,preferred name,105,Trastuzumab emtansine,brand name,110,Kadcyla
...,...,...,...,...,...,...,...
362,111358,preferred name,6057,Olutasidenib,brand name,6059,Rezlidhia
363,111427,preferred name,6063,Pyrotinib,brand name,6064,Irene
364,114346,preferred name,6072,Buserelin,brand name,6076,Bigonist
365,114432,preferred name,6084,Lifileuecel,brand name,6086,Contego


In [5]:
def process_questions(df, stems, question_categories):
    for stem in stems:
        for question_category in question_categories:
            question_col_name = f"{question_category}_question" if stem == 'NULL' else f"stem_{question_category}_question"
            question_format = f"Which of the following two drugs is more {question_category}:" if stem == 'NULL' else f"{stem} Which of the following drugs is more {question_category}:"
            options_format = "1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide" if stem == 'NULL' else "1: {string_preferred}, 2: {string_brand}"

            df[question_col_name] = df.apply(
                lambda row: f"{question_format}\nOptions: {options_format}\nAnswer: ",
                axis=1
            )

    question_mappings = {}
    for stem in stems:
        for question_category in question_categories:
            question_col_name = f"stem_{question_category}_question" if stem else f"{question_category}_question"
            question_mappings[question_col_name] = dict(zip(df['concept_code'], df[question_col_name]))

    # Map the questions to the original DataFrame rows based on concept_code
    for question_col_name, mapping in question_mappings.items():
        df[question_col_name] = df['concept_code'].map(mapping)

    return df

In [6]:
def perform_inference(df, stems, question_categories, get_chat_completion, extract_top_digit, debug=False):
    df_dedup = df.drop_duplicates(subset='concept_code', keep='first')

    if debug:
        df_dedup = df_dedup.head(10)

    for stem in stems:
        for question_category in question_categories:
            question_col_name = f"{question_category}_question" if stem == 'NULL' else f"stem_{question_category}_question"
            inferred_col_name = f"inferred_{question_category}" if stem == 'NULL' else f"stem_inferred_{question_category}"
            answer_col_name = f"{question_category}_answer" if stem == 'NULL' else f"stem_{question_category}_answer"

            inferred_answers = []
            for question in tqdm(df_dedup[question_col_name], desc=f"Inference for {question_col_name}"):
                response = get_chat_completion(user_prompt=question, temperature=0, max_tokens=1)
                inferred_answers.append(response)

            df_dedup[inferred_col_name] = inferred_answers
            df_dedup[answer_col_name] = [extract_top_digit(i) for i in df_dedup[inferred_col_name]]

    return df_dedup


In [7]:
comparison_stems = ['NULL', 'The following drugs are indicated for the same condition, you can only choose one of the drugs.']
question_categories = ['effective', 'safe', 'associated with side effects', 'likely to make me sick'] # Which of the following two drugs is more {question_category}:"

In [12]:
def perform_inference_with_temperatures(df, stems, question_categories, get_chat_completion, extract_openai_top_digit, debug=False):
    df_dedup = df.drop_duplicates(subset='concept_code', keep='first')

    if debug:
        df_dedup = df_dedup.head(10)

    for temperature in [0, 1, 2]:
        for stem in stems:
            for question_category in question_categories:
                question_col_name = f"{question_category}_question" if stem == 'NULL' else f"stem_{question_category}_question"
                inferred_col_name = f"inferred_{question_category}_temp_{temperature}" if stem == 'NULL' else f"stem_inferred_{question_category}_temp_{temperature}"
                answer_col_name = f"{question_category}_answer_temp_{temperature}" if stem == 'NULL' else f"stem_{question_category}_answer_temp_{temperature}"

                inferred_answers = []
                for question in tqdm(df_dedup[question_col_name], desc=f"Inference for {question_col_name} with temperature {temperature}"):
                    response = get_chat_completion(user_prompt=question, temperature=temperature, max_tokens=1)
                    inferred_answers.append(response)
                df_dedup[inferred_col_name] = inferred_answers
                df_dedup[answer_col_name] = [int(i) for i in df_dedup[inferred_col_name]]

    return df_dedup


In [13]:
processed_df = process_questions(combined_df, comparison_stems, question_categories)
inferred_df = perform_inference_with_temperatures(processed_df, comparison_stems, question_categories, get_chat_completion, extract_openai_top_digit, debug=True)
inferred_df.head()

Inference for effective_question with temperature 1: 100%|██████████| 10/10 [00:05<00:00,  1.88it/s]
Inference for safe_question with temperature 1: 100%|██████████| 10/10 [00:04<00:00,  2.14it/s]
Inference for associated with side effects_question with temperature 1: 100%|██████████| 10/10 [00:05<00:00,  1.93it/s]
Inference for likely to make me sick_question with temperature 1: 100%|██████████| 10/10 [00:06<00:00,  1.59it/s]
Inference for stem_effective_question with temperature 1: 100%|██████████| 10/10 [00:05<00:00,  1.81it/s]
Inference for stem_safe_question with temperature 1: 100%|██████████| 10/10 [00:05<00:00,  1.91it/s]
Inference for stem_associated with side effects_question with temperature 1: 100%|██████████| 10/10 [00:04<00:00,  2.39it/s]
Inference for stem_likely to make me sick_question with temperature 1: 100%|██████████| 10/10 [00:04<00:00,  2.06it/s]


Unnamed: 0,concept_code,string_type_preferred,Unnamed: 0_preferred,string_preferred,string_type_brand,Unnamed: 0_brand,string_brand,effective_question,safe_question,associated with side effects_question,likely to make me sick_question,stem_effective_question,stem_safe_question,stem_associated with side effects_question,stem_likely to make me sick_question,inferred_effective_temp_1,effective_answer_temp_1,inferred_safe_temp_1,safe_answer_temp_1,inferred_associated with side effects_temp_1,associated with side effects_answer_temp_1,inferred_likely to make me sick_temp_1,likely to make me sick_answer_temp_1,stem_inferred_effective_temp_1,stem_effective_answer_temp_1,stem_inferred_safe_temp_1,stem_safe_answer_temp_1,stem_inferred_associated with side effects_temp_1,stem_associated with side effects_answer_temp_1,stem_inferred_likely to make me sick_temp_1,stem_likely to make me sick_answer_temp_1
0,4,preferred name,8,Abemaciclib,brand name,10,Verzenio,"Which of the following two drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:",1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2
1,6,preferred name,14,Abiraterone,brand name,17,Abatitor,"Which of the following two drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:",1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2
2,7,preferred name,39,Acalabrutinib,brand name,41,Calquence,"Which of the following two drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:",1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2
3,9,preferred name,49,Aclarubicin,brand name,56,Aclacin,"Which of the following two drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:",1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2
4,12,preferred name,105,Trastuzumab emtansine,brand name,110,Kadcyla,"Which of the following two drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","Which of the following two drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}, 3: Thadlidomide\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more effective:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more safe:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more associated with side effects:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:","The following drugs are indicated for the same condition, you can only choose one of the drugs. Which of the following drugs is more likely to make me sick:\nOptions: 1: {string_preferred}, 2: {string_brand}\nAnswer:",1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2


In [16]:
inferred_df.to_csv(output_dir + "test_10_df.csv", index=False)

In [15]:
def summarize_choices(df, question_categories, stem_label='NULL'):
    summary = []

    for question_category in question_categories:
        for stem_type in ['', 'stem_']:
            answer_col_name = f"{stem_type}{question_category}_answer"
            if stem_type == 'stem_':
                stem_value = 'Stem'
            else:
                stem_value = 'No Stem'

            # Convert answer column to numeric
            df[answer_col_name] = pd.to_numeric(df[answer_col_name], errors='coerce')

            counts = df[answer_col_name].value_counts().reindex([1, 2, 3], fill_value=0)
            summary_data = {
                'Question Category': question_category,
                'Stem': stem_value,
                'Preferred': counts[1],
                'Brand': counts[2],
                'No Choice': counts[3]
            }
            summary.append(summary_data)

    summary_df = pd.DataFrame(summary)
    return summary_df

# Example usage
summary_df = summarize_choices(inferred_df, question_categories)
summary_df


KeyError: 'effective_answer'

In [None]:
# Separate the data into No Stem and Stem
no_stem_data = summary_df[summary_df['Stem'] == 'No Stem']
stem_data = summary_df[summary_df['Stem'] == 'Stem']

# Plot for No Stem
no_stem_data.plot(x='Question Category', y=['Preferred', 'Brand', 'Toxic'], kind='bar', stacked=True)
plt.title('No Stem')
plt.ylabel('Count')
plt.show()

# Plot for Stem
stem_data.plot(x='Question Category', y=['Preferred', 'Brand', 'Toxic'], kind='bar', stacked=True)
plt.title('Stem')
plt.ylabel('Count')
plt.show()