In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict, Any
import os
import json
from typing import Union, Optional

import seaborn as sns
import plotly.express as px
from tqdm import tqdm

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

In [10]:
# Directory setup
data_dir = "../../data/"
output_dir = "../results/general_knowledge/"

os.makedirs(output_dir, exist_ok=True)

In [11]:
# Load data pre api call
df = pd.read_csv(data_dir + "gen_knowl_df.csv")
df.head()

Unnamed: 0,concept_code,string_type,string,question,options,correct_answer,final_question,type,unique_id
0,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Tibsovo', 'Supprelin', 'Erlocip', 'Verzenio']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Tibsovo, 2: Supprelin, 3: Erlocip, 4: Verzenio\nAnswer:",brand,1
1,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Xospata', 'Pemazyre', 'Artinizona', 'Verzenio']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Xospata, 2: Pemazyre, 3: Artinizona, 4: Verzenio\nAnswer:",brand,2
2,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Contergan', 'Copiktra', 'Turalio', 'Verzenio']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Contergan, 2: Copiktra, 3: Turalio, 4: Verzenio\nAnswer:",brand,3
3,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Cosmegen', 'Verzenio', 'Didox', 'Xospata']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Cosmegen, 2: Verzenio, 3: Didox, 4: Xospata\nAnswer:",brand,4
4,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Alecensa', 'Contego', 'Verzenio', 'Biocarb']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Alecensa, 2: Contego, 3: Verzenio, 4: Biocarb\nAnswer:",brand,5


In [13]:
api_response = {
    "id": "batch_req_s9hDlfvbj4EQuBXlnGVaGYOO",
    "custom_id": "1_task_0.0_gpt-4o",
    "response": {
        "status_code": 200,
        "request_id": "5f9d991a5089d230c8933dc042f1a746",
        "body": {
            "id": "chatcmpl-9QMrR04VSqln40z0POmTqvDDGF1cs",
            "object": "chat.completion",
            "created": 1716071433,
            "model": "gpt-4o-2024-05-13",
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": "- Safe: Verzenio\n- Unsafe: Abemaciclib\n- Effective: Verzenio\n- Uneffective: Abemaciclib\n- Has side effects: Abemaciclib\n- Side effect free: Verzenio",
                    },
                    "logprobs": None,
                    "finish_reason": "stop",
                }
            ],
            "usage": {
                "prompt_tokens": 81,
                "completion_tokens": 49,
                "total_tokens": 130,
            },
            "system_fingerprint": "fp_927397958d",
        },
    },
    "error": None,
}

In [15]:
# recreate unique batch id
custom_id = api_response["custom_id"]
response_content = api_response["response"]["body"]["choices"][0]["message"]["content"]

# Create the task_id from the dataset to match with custom_id
df["task_id"] = df.apply(lambda row: f"{row['unique_id']}_task_0.0_gpt-4o", axis=1)

# Append the response to the matching row in the dataset
df.loc[df["task_id"] == custom_id, "response"] = response_content

# Print the updated DataFrame
df

Unnamed: 0,concept_code,string_type,string,question,options,correct_answer,final_question,type,unique_id,task_id,response
0,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Tibsovo', 'Supprelin', 'Erlocip', 'Verzenio']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Tibsovo, 2: Supprelin, 3: Erlocip, 4: Verzenio\nAnswer:",brand,1,1_task_0.0_gpt-4o,- Safe: Verzenio\n- Unsafe: Abemaciclib\n- Effective: Verzenio\n- Uneffective: Abemaciclib\n- Has side effects: Abemaciclib\n- Side effect free: Verzenio
1,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Xospata', 'Pemazyre', 'Artinizona', 'Verzenio']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Xospata, 2: Pemazyre, 3: Artinizona, 4: Verzenio\nAnswer:",brand,2,2_task_0.0_gpt-4o,
2,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Contergan', 'Copiktra', 'Turalio', 'Verzenio']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Contergan, 2: Copiktra, 3: Turalio, 4: Verzenio\nAnswer:",brand,3,3_task_0.0_gpt-4o,
3,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Cosmegen', 'Verzenio', 'Didox', 'Xospata']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Cosmegen, 2: Verzenio, 3: Didox, 4: Xospata\nAnswer:",brand,4,4_task_0.0_gpt-4o,
4,4,brand name,Verzenio,Which of the following is the brand name for the drug Abemaciclib?,"['Alecensa', 'Contego', 'Verzenio', 'Biocarb']",Verzenio,"Which of the following is the brand name for the drug Abemaciclib?\nOptions: 1: Alecensa, 2: Contego, 3: Verzenio, 4: Biocarb\nAnswer:",brand,5,5_task_0.0_gpt-4o,
5,6,brand name,Abatitor,Which of the following is the brand name for the drug Abiraterone?,"['Abatitor', 'Vonjo', 'Tomudex', 'Lytgobi']",Abatitor,"Which of the following is the brand name for the drug Abiraterone?\nOptions: 1: Abatitor, 2: Vonjo, 3: Tomudex, 4: Lytgobi\nAnswer:",brand,6,6_task_0.0_gpt-4o,
6,6,brand name,Abatitor,Which of the following is the brand name for the drug Abiraterone?,"['Ibrance', 'Rezlidhia', 'Abatitor', 'Lumoxiti']",Abatitor,"Which of the following is the brand name for the drug Abiraterone?\nOptions: 1: Ibrance, 2: Rezlidhia, 3: Abatitor, 4: Lumoxiti\nAnswer:",brand,7,7_task_0.0_gpt-4o,
7,6,brand name,Abatitor,Which of the following is the brand name for the drug Abiraterone?,"['Abatitor', 'Arzerra', 'Calsed', 'Provenge']",Abatitor,"Which of the following is the brand name for the drug Abiraterone?\nOptions: 1: Abatitor, 2: Arzerra, 3: Calsed, 4: Provenge\nAnswer:",brand,8,8_task_0.0_gpt-4o,
8,6,brand name,Abatitor,Which of the following is the brand name for the drug Abiraterone?,"['Muphoran', 'Abatitor', 'Caelyx', 'Nexpovio']",Abatitor,"Which of the following is the brand name for the drug Abiraterone?\nOptions: 1: Muphoran, 2: Abatitor, 3: Caelyx, 4: Nexpovio\nAnswer:",brand,9,9_task_0.0_gpt-4o,
9,6,brand name,Abatitor,Which of the following is the brand name for the drug Abiraterone?,"['Lysodren', 'Provenge', 'Nerlynx', 'Abatitor']",Abatitor,"Which of the following is the brand name for the drug Abiraterone?\nOptions: 1: Lysodren, 2: Provenge, 3: Nerlynx, 4: Abatitor\nAnswer:",brand,10,10_task_0.0_gpt-4o,


## Evaluate Performance


In [15]:
def calculate_accuracy(correct_answer, inferred_answer, options):
    if (
        pd.isna(inferred_answer) or inferred_answer == ""
    ):  # Check if 'inferred_answer' is empty or NaN
        return "N/A"  # Return "N/A" for not available data
    else:
        try:
            inferred_answer = int(inferred_answer)
            answer_position = options.index(correct_answer) + 1  # 1-indexed
            return "Correct" if inferred_answer == answer_position else "Incorrect"
        except ValueError:
            return "N/A"


temperature_accuracy_summaries = {}  # Store accuracy summaries for each temperature

for temp in temperatures:
    # Assuming temperature_dfs contains the DataFrames for each temperature
    temp_key = f"temp_{str(temp).replace('.', '_')}"
    temp_df = temperature_dfs[temp_key]

    # Calculate value counts for the current temperature DataFrame
    preferred_name_counts = temp_df["inferred_preferred_answer"].value_counts()
    brand_name_counts = temp_df["inferred_brand_answer"].value_counts()

    # Display the value counts for current temperature
    print(f"Temperature {temp}:")
    print("Preferred Name Value Counts:")
    print(preferred_name_counts)
    print("\nBrand Name Value Counts:")
    print(brand_name_counts)

    # Calculate accuracy
    temp_df["brand_accuracy"] = temp_df.apply(
        lambda row: calculate_accuracy(
            row["correct_answer_brand"],
            row["inferred_brand_answer"],
            row["options_brand"],
        ),
        axis=1,
    )

    temp_df["preferred_accuracy"] = temp_df.apply(
        lambda row: calculate_accuracy(
            row["correct_answer_preferred"],
            row["inferred_preferred_answer"],
            row["options_preferred"],
        ),
        axis=1,
    )

    # Creating a summary DataFrame for brand and preferred accuracies for the current temperature
    accuracy_summary = {
        "Temperature": temp,
        "Type": ["Brand", "Preferred"],
        "Correct": [
            (temp_df["brand_accuracy"] == "Correct").sum(),
            (temp_df["preferred_accuracy"] == "Correct").sum(),
        ],
        "Incorrect": [
            (temp_df["brand_accuracy"] == "Incorrect").sum(),
            (temp_df["preferred_accuracy"] == "Incorrect").sum(),
        ],
        "Not Available": [
            (temp_df["brand_accuracy"] == "N/A").sum(),
            (temp_df["preferred_accuracy"] == "N/A").sum(),
        ],
    }

    accuracy_summary_df = pd.DataFrame(accuracy_summary)
    # Store the summary DataFrame in the dictionary with a key representing the temperature
    temperature_accuracy_summaries[temp_key] = accuracy_summary_df

    # Display or store the accuracy summary DataFrame for current temperature
    print(f"\nAccuracy Summary for Temperature {temp}:")
    print(accuracy_summary_df)

Temperature 0.0:
Preferred Name Value Counts:
inferred_preferred_answer
     690
2    398
4    384
3    297
1     66
Name: count, dtype: int64

Brand Name Value Counts:
inferred_brand_answer
     948
2    359
4    272
3    226
1     30
Name: count, dtype: int64

Accuracy Summary for Temperature 0.0:
   Temperature       Type  Correct  Incorrect  Not Available
0          0.0      Brand      887          0            948
1          0.0  Preferred     1145          0            690
Temperature 0.5:
Preferred Name Value Counts:
inferred_preferred_answer
     691
2    394
4    381
3    301
1     68
Name: count, dtype: int64

Brand Name Value Counts:
inferred_brand_answer
     943
2    360
4    271
3    229
1     32
Name: count, dtype: int64

Accuracy Summary for Temperature 0.5:
   Temperature       Type  Correct  Incorrect  Not Available
0          0.5      Brand      891          1            943
1          0.5  Preferred     1144          0            691
Temperature 1.0:
Preferred Name 

In [29]:
# Concatenate all the individual accuracy summary DataFrames into one
final_results_summary = pd.concat(temperature_accuracy_summaries.values())

# Reset the index
final_results_summary = final_results_summary.reset_index(drop=True)

# Calculate accuracy as a percentage without considering 'Not Available' (N/A) answers
final_results_summary["Accuracy w/o N/A (%)"] = (
    final_results_summary["Correct"]
    / (final_results_summary["Correct"] + final_results_summary["Incorrect"])
    * 100
)

# Calculate accuracy as a percentage with considering 'Not Available' (N/A) answers
final_results_summary["Accuracy w/ N/A (%)"] = (
    final_results_summary["Correct"]
    / (
        final_results_summary["Correct"]
        + final_results_summary["Incorrect"]
        + final_results_summary["Not Available"]
    )
    * 100
)

# Display the final results summary
print("Final Results Summary:")
print(final_results_summary)

Final Results Summary:
   Temperature       Type  Correct  Incorrect  Not Available  \
0          0.0      Brand     1833          2              0   
1          0.0  Preferred     1835          0              0   
2          0.5      Brand     1833          2              0   
3          0.5  Preferred     1835          0              0   
4          1.0      Brand     1833          2              0   
5          1.0  Preferred     1835          0              0   
6          2.0      Brand     1832          3              0   
7          2.0  Preferred     1835          0              0   

   Accuracy w/o N/A (%)  Accuracy w/ N/A (%)  
0             99.891008            99.891008  
1            100.000000           100.000000  
2             99.891008            99.891008  
3            100.000000           100.000000  
4             99.891008            99.891008  
5            100.000000           100.000000  
6             99.836512            99.836512  
7            100.000000 

## Save results


In [30]:
for temp_key, temp_df in temperature_dfs.items():
    # Save the temperature-specific DataFrame
    temp_df_filename = os.path.join(output_dir, ENGINE, f"{temp_key}_df.csv")
    if not os.path.exists(os.path.join(output_dir, ENGINE)):
        os.makedirs(os.path.join(output_dir, ENGINE))
    temp_df.to_csv(temp_df_filename, index=False)
    print(f"Saved DataFrame to {temp_df_filename}")

    # Save the accuracy summary DataFrame
    summary_df = temperature_accuracy_summaries[temp_key]
    if not os.path.exists(os.path.join(output_dir, ENGINE)):
        os.makedirs(os.path.join(output_dir, ENGINE))
    summary_df_filename = os.path.join(output_dir, ENGINE, f"{temp_key}_summary.csv")
    summary_df.to_csv(summary_df_filename, index=False)
    print(f"Saved Accuracy Summary to {summary_df_filename}")

    # Save the final results summary
    final_results_summary_filename = os.path.join(
        output_dir, ENGINE, "final_results_summary.csv"
    )
    final_results_summary.to_csv(final_results_summary_filename, index=False)
    print(f"Saved Final Results Summary to {final_results_summary_filename}")

Saved DataFrame to ../results/general_knowledge/gpt-4-turbo/0_0_df.csv
Saved Accuracy Summary to ../results/general_knowledge/gpt-4-turbo/0_0_summary.csv
Saved Final Results Summary to ../results/general_knowledge/gpt-4-turbo/final_results_summary.csv
Saved DataFrame to ../results/general_knowledge/gpt-4-turbo/0_5_df.csv
Saved Accuracy Summary to ../results/general_knowledge/gpt-4-turbo/0_5_summary.csv
Saved Final Results Summary to ../results/general_knowledge/gpt-4-turbo/final_results_summary.csv
Saved DataFrame to ../results/general_knowledge/gpt-4-turbo/1_0_df.csv
Saved Accuracy Summary to ../results/general_knowledge/gpt-4-turbo/1_0_summary.csv
Saved Final Results Summary to ../results/general_knowledge/gpt-4-turbo/final_results_summary.csv
Saved DataFrame to ../results/general_knowledge/gpt-4-turbo/2_0_df.csv
Saved Accuracy Summary to ../results/general_knowledge/gpt-4-turbo/2_0_summary.csv
Saved Final Results Summary to ../results/general_knowledge/gpt-4-turbo/final_results_summ

## Reload results


In [28]:
def format_temp(temp):
    return str(temp).replace(".", "_")


temperature_dfs = {}
temperature_accuracy_summaries = {}

for temp in temperatures:
    temp_key = format_temp(temp)
    # Load the temperature-specific DataFrame
    temp_df_filename = os.path.join(output_dir, ENGINE, f"temp_{temp_key}_df.csv")
    loaded_df = pd.read_csv(temp_df_filename)
    temperature_dfs[temp_key] = loaded_df
    print(f"Loaded DataFrame from {temp_df_filename}")

    # Load the accuracy summary DataFrame
    summary_df_filename = os.path.join(
        output_dir, ENGINE, f"temp_{temp_key}_summary.csv"
    )
    loaded_summary_df = pd.read_csv(summary_df_filename)
    temperature_accuracy_summaries[temp_key] = loaded_summary_df
    print(f"Loaded Accuracy Summary from {summary_df_filename}")

Loaded DataFrame from ../results/general_knowledge/gpt-4-turbo/temp_0_0_df.csv
Loaded Accuracy Summary from ../results/general_knowledge/gpt-4-turbo/temp_0_0_summary.csv
Loaded DataFrame from ../results/general_knowledge/gpt-4-turbo/temp_0_5_df.csv
Loaded Accuracy Summary from ../results/general_knowledge/gpt-4-turbo/temp_0_5_summary.csv
Loaded DataFrame from ../results/general_knowledge/gpt-4-turbo/temp_1_0_df.csv
Loaded Accuracy Summary from ../results/general_knowledge/gpt-4-turbo/temp_1_0_summary.csv
Loaded DataFrame from ../results/general_knowledge/gpt-4-turbo/temp_2_0_df.csv
Loaded Accuracy Summary from ../results/general_knowledge/gpt-4-turbo/temp_2_0_summary.csv


In [18]:
temperature_accuracy_summaries.keys()

dict_keys(['0_0', '0_5', '1_0', '2_0'])