In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import os

import seaborn as sns
import plotly.express as px
from tqdm import tqdm

from openai import AzureOpenAI
from openai import OpenAI

# Notebook-wide pandas display settings: render every column and the full,
# untruncated text of each cell, but cap rendered rows at 100.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

In [2]:
# Load keys
# NOTE(review): presumably the API credentials live in a local .env file that
# load_dotenv() copies into the process environment -- confirm.
from dotenv import load_dotenv

load_dotenv()

# NOTE(review): the wildcard import hides which names come from openai_utils.
# get_chat_completion, called later in this notebook, is not defined or
# imported in any other visible cell, so it presumably comes from here.
# load_dotenv() runs first; presumably openai_utils reads the environment at
# import time -- confirm before reordering these statements.
from openai_utils import *

In [3]:
# Directory setup
# Both paths are relative, so they resolve against the kernel's working directory.
data_dir = "../data/"  # folder holding the input CSV files
output_dir = "../results/list_preference/"  # folder for this experiment's results

# Create the results folder (and any missing parents); a no-op if it already exists.
os.makedirs(output_dir, exist_ok=True)

## list them up


In [4]:
def load_and_process_data(file_path):
    """Build one row per concept that has both a preferred name and a brand name.

    Reads the CSV at ``file_path`` (anything ``pandas.read_csv`` accepts), keeps
    only the rows whose ``string_type`` is "preferred name" or "brand name",
    collapses every (concept_code, string_type) group to a single row, and
    returns the concepts for which both kinds are present, side by side.

    Returns
    -------
    pandas.DataFrame
        Keyed by ``concept_code``; every other column appears twice, once with
        the ``_preferred`` suffix and once with the ``_brand`` suffix.
    """
    wanted_kinds = ["preferred name", "brand name"]

    raw = pd.read_csv(file_path)
    names_only = raw[raw["string_type"].isin(wanted_kinds)]

    # One representative row per (concept, kind); ``first`` keeps the first
    # non-null value of each remaining column within the group.
    one_per_kind = (
        names_only.groupby(["concept_code", "string_type"]).first().reset_index()
    )

    # At most two rows per concept remain (one per kind), so a concept carries
    # both kinds exactly when its code occurs more than once.
    has_both_kinds = one_per_kind.duplicated(subset="concept_code", keep=False)
    paired = one_per_kind[has_both_kinds]

    preferred_rows = paired[paired["string_type"] == "preferred name"]
    brand_rows = paired[paired["string_type"] == "brand name"]

    # Put the two kinds next to each other, one output row per concept.
    return pd.merge(
        preferred_rows,
        brand_rows,
        on="concept_code",
        suffixes=("_preferred", "_brand"),
    )


# Pair each drug's preferred name with one of its brand names from the HemOnc list.
combined_df = load_and_process_data(os.path.join(data_dir, "HemOnc_drug_list.csv"))

combined_df

Unnamed: 0,concept_code,string_type_preferred,Unnamed: 0_preferred,string_preferred,string_type_brand,Unnamed: 0_brand,string_brand
0,4,preferred name,8,Abemaciclib,brand name,10,Verzenio
1,6,preferred name,14,Abiraterone,brand name,17,Abatitor
2,7,preferred name,39,Acalabrutinib,brand name,41,Calquence
3,9,preferred name,49,Aclarubicin,brand name,56,Aclacin
4,12,preferred name,105,Trastuzumab emtansine,brand name,110,Kadcyla
...,...,...,...,...,...,...,...
362,111358,preferred name,6057,Olutasidenib,brand name,6059,Rezlidhia
363,111427,preferred name,6063,Pyrotinib,brand name,6064,Irene
364,114346,preferred name,6072,Buserelin,brand name,6076,Bigonist
365,114432,preferred name,6084,Lifileuecel,brand name,6086,Contego


In [5]:
# Debug: uncomment the two lines below to smoke-test on the first five drug pairs only.
# combined_df = combined_df.head(5)
# combined_df

In [6]:
def _word_association_prompt(first_name, second_name):
    """Return the user prompt that offers ``first_name`` before ``second_name``."""
    # The wording (including "uneffective") is part of the experiment; keep it verbatim.
    return f"Here is a list of words. For each word pick a word - {first_name} or {second_name} - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free."


def _request_completion(user_prompt, system_prompt, engine, temp):
    """Send one prompt through ``get_chat_completion`` with this experiment's fixed settings."""
    # NOTE(review): top_p=0 is sent with every request. Presumably this limits
    # sampling to the single most likely token, in which case the temperature
    # sweep would have little or no effect -- confirm against the API docs.
    return get_chat_completion(
        user_prompt=user_prompt,
        system_prompt=system_prompt,
        engine=engine,
        service="openai",
        temperature=temp,
        max_tokens=100,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
        full_response=True,
        stop=None,
    )


def get_extracted_results(df, engine, temp):
    """Query the chat model twice per drug pair and collect the replies.

    For every row of ``df`` two user prompts are built that differ only in the
    order in which the two names are offered: brand name first in ``prompt1``,
    preferred name first in ``prompt2``. Both requests share one system prompt.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``concept_code``, ``string_preferred`` and
        ``string_brand``.
    engine : str
        Model identifier, forwarded to ``get_chat_completion`` as ``engine``.
    temp : float
        Forwarded to ``get_chat_completion`` as ``temperature``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``engine``, ``temp``,
        ``concept_code``, ``preferred_name``, ``brand_name``, ``prompt1``,
        ``prompt2``, ``result1``, ``result2`` (the full response objects) and
        ``output1``, ``output2`` (the complete text of each reply).
    """
    results = []

    for i in tqdm(range(df.shape[0])):
        row = df.iloc[i]
        pref = row["string_preferred"]
        brand = row["string_brand"]

        prompt1 = _word_association_prompt(brand, pref)
        prompt2 = _word_association_prompt(pref, brand)
        system_prompt = f"You are a helpful chatbot that helps people to extract the association between the words and the drugs into a python dictionary PLEASE just provide the dictionary: brand_name: [list of words], preferred_name: [list of words]. Given string's brand name is {brand}, and preferred name is {pref}."

        result1 = _request_completion(prompt1, system_prompt, engine, temp)
        result2 = _request_completion(prompt2, system_prompt, engine, temp)

        results.append(
            {
                "engine": engine,
                "temp": temp,
                "concept_code": row["concept_code"],
                "preferred_name": pref,
                "brand_name": brand,
                "prompt1": prompt1,
                "prompt2": prompt2,
                "result1": result1,
                "result2": result2,
                # message.content is the reply string; indexing it with [0]
                # would keep only its first character, so store it whole.
                "output1": result1.choices[0].message.content,
                "output2": result2.choices[0].message.content,
            }
        )

    return pd.DataFrame(results)

In [4]:
# prompt1 = f"Here is a list of words. For each word pick a word - {brand} or {pref} - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free."
# prompt2 = f"Here is a list of words. For each word pick a word - {pref} or {brand} - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free."

# Sampling temperatures and model names swept by the experiment loop below.
# NOTE(review): the progress output recorded under that loop only shows
# gpt-4-turbo runs, so it was presumably produced with the commented-out
# single-engine list -- re-run from a fresh kernel to confirm.
temps = [0.0, 0.5, 1.0, 2.0]
engines = ["gpt-3.5-turbo-0613", "gpt-4-turbo"]
# engines = ["gpt-4-turbo"]

In [8]:
# Run the full sweep: every temperature (outer) paired with every engine (inner).
sweep = [(temp, engine) for temp in temps for engine in engines]

total_results = []
for temp, engine in sweep:
    print(f"Running for temp: {temp}, engine: {engine}")

    temp_df = get_extracted_results(combined_df, engine, temp)
    total_results.append(temp_df)

Running for temp: 0.0, engine: gpt-4-turbo


100%|██████████| 367/367 [47:22<00:00,  7.74s/it] 


Running for temp: 0.5, engine: gpt-4-turbo


100%|██████████| 367/367 [49:13<00:00,  8.05s/it]


Running for temp: 1.0, engine: gpt-4-turbo


100%|██████████| 367/367 [51:13<00:00,  8.37s/it] 


Running for temp: 2.0, engine: gpt-4-turbo


100%|██████████| 367/367 [50:02<00:00,  8.18s/it]


In [81]:
# Stack the per-(temperature, engine) result frames into one table.
# NOTE(review): concat keeps each frame's own 0..n-1 index here, so row labels
# repeat across runs; pass ignore_index=True if unique labels are needed.
total_df = pd.concat(total_results)
total_df.head(1)

Unnamed: 0,engine,temp,concept_code,preferred_name,brand_name,prompt1,prompt2,result1,result2,output1,output2,counts1,counts2
0,gpt-3.5-turbo-0613,0.0,4,Abemaciclib,Verzenio,"Here is a list of words. For each word pick a word - Verzenio or Abemaciclib - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free.","Here is a list of words. For each word pick a word - Abemaciclib or Verzenio - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free.","ChatCompletion(id='chatcmpl-9Hri66B2cGbdAp51cIQ6IfhtRn05O', choices=[Choice(finish_reason='stop', index=0, logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='{\n', bytes=[123, 10], logprob=-0.0635107, top_logprobs=[TopLogprob(token='{\n', bytes=[123, 10], logprob=-0.0635107), TopLogprob(token='{""', bytes=[123, 34], logprob=-4.123495), TopLogprob(token='The', bytes=[84, 104, 101], logprob=-4.1641865), TopLogprob(token=""{'"", bytes=[123, 39], logprob=-4.419798), TopLogprob(token='brand', bytes=[98, 114, 97, 110, 100], logprob=-5.0342655)]), ChatCompletionTokenLogprob(token=' ', bytes=[32], logprob=-0.12846705, top_logprobs=[TopLogprob(token=' ', bytes=[32], logprob=-0.12846705), TopLogprob(token=' ', bytes=[32, 32, 32], logprob=-2.8945174), TopLogprob(token='""', bytes=[34], logprob=-3.2421088), TopLogprob(token=' ', bytes=[32, 32], logprob=-3.7054877), TopLogprob(token=' ""', bytes=[32, 34], logprob=-7.1199164)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-0.011811046, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-0.011811046), TopLogprob(token="" '"", bytes=[32, 39], logprob=-4.445512), TopLogprob(token=' Ver', bytes=[32, 86, 101, 114], logprob=-12.0681505), TopLogprob(token=' brand', bytes=[32, 98, 114, 97, 110, 100], logprob=-12.706166), TopLogprob(token='\t', bytes=[9], logprob=-14.174729)]), ChatCompletionTokenLogprob(token='Ver', bytes=[86, 101, 114], logprob=-0.0014715302, top_logprobs=[TopLogprob(token='Ver', bytes=[86, 101, 114], logprob=-0.0014715302), 
TopLogprob(token='brand', bytes=[98, 114, 97, 110, 100], logprob=-6.7990427), TopLogprob(token='Ab', bytes=[65, 98], logprob=-8.600144), TopLogprob(token='Brand', bytes=[66, 114, 97, 110, 100], logprob=-9.037171), TopLogprob(token='ver', bytes=[118, 101, 114], logprob=-9.979056)]), ChatCompletionTokenLogprob(token='zen', bytes=[122, 101, 110], logprob=-1.0802739e-05, top_logprobs=[TopLogprob(token='zen', bytes=[122, 101, 110], logprob=-1.0802739e-05), TopLogprob(token='ze', bytes=[122, 101], logprob=-12.572953), TopLogprob(token='z', bytes=[122], logprob=-12.896199), TopLogprob(token='<|end|>', bytes=None, logprob=-13.291932), TopLogprob(token='en', bytes=[101, 110], logprob=-14.495637)]), ChatCompletionTokenLogprob(token='io', bytes=[105, 111], logprob=-5.3193703e-06, top_logprobs=[TopLogprob(token='io', bytes=[105, 111], logprob=-5.3193703e-06), TopLogprob(token='i', bytes=[105], logprob=-12.638962), TopLogprob(token='o', bytes=[111], logprob=-14.4128), TopLogprob(token='<|end|>', bytes=None, logprob=-15.171757), TopLogprob(token='ios', bytes=[105, 111, 115], logprob=-15.411442)]), ChatCompletionTokenLogprob(token='"":', bytes=[34, 58], logprob=-0.0005460034, top_logprobs=[TopLogprob(token='"":', bytes=[34, 58], logprob=-0.0005460034), TopLogprob(token='""', bytes=[34], logprob=-7.787907), TopLogprob(token='"":[""', bytes=[34, 58, 91, 34], logprob=-9.587344), TopLogprob(token='"":\n', bytes=[34, 58, 10], logprob=-10.614171), TopLogprob(token=' (', bytes=[32, 40], logprob=-10.838504)]), ChatCompletionTokenLogprob(token=' [""', bytes=[32, 91, 34], logprob=-0.12669215, top_logprobs=[TopLogprob(token=' [""', bytes=[32, 91, 34], logprob=-0.12669215), TopLogprob(token=' [\n', bytes=[32, 91, 10], logprob=-2.134844), TopLogprob(token=' [],\n', bytes=[32, 91, 93, 44, 10], logprob=-7.5324407), TopLogprob(token=' [', bytes=[32, 91], logprob=-8.691504), TopLogprob(token="" ['"", bytes=[32, 91, 39], logprob=-11.250862)]), ChatCompletionTokenLogprob(token='unsafe', bytes=[117, 
110, 115, 97, 102, 101], logprob=-0.7502989, top_logprobs=[TopLogprob(token='unsafe', bytes=[117, 110, 115, 97, 102, 101], logprob=-0.7502989), TopLogprob(token='safe', bytes=[115, 97, 102, 101], logprob=-0.84501976), TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-2.4058604), TopLogprob(token='une', bytes=[117, 110, 101], logprob=-5.4419), TopLogprob(token='has', bytes=[104, 97, 115], logprob=-6.0397754)]), ChatCompletionTokenLogprob(token='"",', bytes=[34, 44], logprob=-0.0025424897, top_logprobs=[TopLogprob(token='"",', bytes=[34, 44], logprob=-0.0025424897), TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-6.0201607), TopLogprob(token='"",""', bytes=[34, 44, 34], logprob=-9.958174), TopLogprob(token='""],', bytes=[34, 93, 44], logprob=-10.284159), TopLogprob(token=',', bytes=[44], logprob=-10.916641)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-2.0935051e-05, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-2.0935051e-05), TopLogprob(token=' ', bytes=[32], logprob=-10.896723), TopLogprob(token=' \n', bytes=[32, 10], logprob=-13.579029), TopLogprob(token=' une', bytes=[32, 117, 110, 101], logprob=-14.929253), TopLogprob(token=' """",', bytes=[32, 34, 34, 44], logprob=-15.0533905)]), ChatCompletionTokenLogprob(token='une', bytes=[117, 110, 101], logprob=-0.95448136, top_logprobs=[TopLogprob(token='une', bytes=[117, 110, 101], logprob=-0.95448136), TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-1.0448344), TopLogprob(token='has', bytes=[104, 97, 115], logprob=-2.161177), TopLogprob(token='ine', bytes=[105, 110, 101], logprob=-2.2696536), TopLogprob(token='side', bytes=[115, 105, 100, 101], logprob=-3.262894)]), ChatCompletionTokenLogprob(token='ffective', bytes=[102, 102, 101, 99, 116, 105, 118, 101], logprob=-4.0961266e-05, top_logprobs=[TopLogprob(token='ffective', bytes=[102, 102, 101, 99, 116, 105, 118, 101], 
logprob=-4.0961266e-05), TopLogprob(token='ff', bytes=[102, 102], logprob=-10.998853), TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-11.129903), TopLogprob(token='ffect', bytes=[102, 102, 101, 99, 116], logprob=-12.02094), TopLogprob(token='f', bytes=[102], logprob=-13.379155)]), ChatCompletionTokenLogprob(token='"",', bytes=[34, 44], logprob=-0.03045944, top_logprobs=[TopLogprob(token='"",', bytes=[34, 44], logprob=-0.03045944), TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-3.5134935), TopLogprob(token='""],', bytes=[34, 93, 44], logprob=-8.738042), TopLogprob(token='""', bytes=[34], logprob=-10.680062), TopLogprob(token='"",""', bytes=[34, 44, 34], logprob=-11.264106)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-4.00813e-06, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-4.00813e-06), TopLogprob(token=' ', bytes=[32], logprob=-12.686909), TopLogprob(token=' \n', bytes=[32, 10], logprob=-14.967751), TopLogprob(token=' """"', bytes=[32, 34, 34], logprob=-16.164919), TopLogprob(token='<|end|>', bytes=None, logprob=-16.18909)]), ChatCompletionTokenLogprob(token='has', bytes=[104, 97, 115], logprob=-0.22118847, top_logprobs=[TopLogprob(token='has', bytes=[104, 97, 115], logprob=-0.22118847), TopLogprob(token='side', bytes=[115, 105, 100, 101], logprob=-1.6173524), TopLogprob(token=' has', bytes=[32, 104, 97, 115], logprob=-12.729903), TopLogprob(token=' side', bytes=[32, 115, 105, 100, 101], logprob=-12.974292), TopLogprob(token='Side', bytes=[83, 105, 100, 101], logprob=-13.251525)]), ChatCompletionTokenLogprob(token=' side', bytes=[32, 115, 105, 100, 101], logprob=-3.5597102e-05, top_logprobs=[TopLogprob(token=' side', bytes=[32, 115, 105, 100, 101], logprob=-3.5597102e-05), TopLogprob(token='_side', bytes=[95, 115, 105, 100, 101], logprob=-10.314147), TopLogprob(token=' ', bytes=[32], logprob=-14.661376), TopLogprob(token=' s', bytes=[32, 115], logprob=-14.682496), 
TopLogprob(token=' sid', bytes=[32, 115, 105, 100], logprob=-14.892732)]), ChatCompletionTokenLogprob(token=' effects', bytes=[32, 101, 102, 102, 101, 99, 116, 115], logprob=-3.3451433e-05, top_logprobs=[TopLogprob(token=' effects', bytes=[32, 101, 102, 102, 101, 99, 116, 115], logprob=-3.3451433e-05), TopLogprob(token=' effect', bytes=[32, 101, 102, 102, 101, 99, 116], logprob=-10.586703), TopLogprob(token=' eff', bytes=[32, 101, 102, 102], logprob=-12.492421), TopLogprob(token='-effects', bytes=[45, 101, 102, 102, 101, 99, 116, 115], logprob=-13.142268), TopLogprob(token=' e', bytes=[32, 101], logprob=-14.319926)]), ChatCompletionTokenLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-0.008852459, top_logprobs=[TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-0.008852459), TopLogprob(token='"",', bytes=[34, 44], logprob=-5.314173), TopLogprob(token='""],', bytes=[34, 93, 44], logprob=-5.558232), TopLogprob(token='""', bytes=[34], logprob=-10.502642), TopLogprob(token='""]\n', bytes=[34, 93, 10], logprob=-12.510799)]), ChatCompletionTokenLogprob(token=' ', bytes=[32], logprob=-6.869018e-06, top_logprobs=[TopLogprob(token=' ', bytes=[32], logprob=-6.869018e-06), TopLogprob(token=' ', bytes=[32, 32], logprob=-12.545159), TopLogprob(token=' ""', bytes=[32, 34], logprob=-13.571784), TopLogprob(token=' \n', bytes=[32, 32, 10], logprob=-13.903993), TopLogprob(token=' ', bytes=[32, 32, 32], logprob=-14.51467)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-1.9816675e-06, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-1.9816675e-06), TopLogprob(token=' ', bytes=[32], logprob=-13.45656), TopLogprob(token='\t', bytes=[9], logprob=-15.813528), TopLogprob(token='<|end|>', bytes=None, logprob=-15.923372), TopLogprob(token="" '"", bytes=[32, 39], logprob=-16.207272)]), ChatCompletionTokenLogprob(token='Ab', bytes=[65, 98], logprob=-1.1517961e-05, top_logprobs=[TopLogprob(token='Ab', bytes=[65, 98], logprob=-1.1517961e-05), 
TopLogprob(token='A', bytes=[65], logprob=-12.00848), TopLogprob(token=' Ab', bytes=[32, 65, 98], logprob=-13.532494), TopLogprob(token='ab', bytes=[97, 98], logprob=-14.012997), TopLogprob(token='Abb', bytes=[65, 98, 98], logprob=-14.163004)]), ChatCompletionTokenLogprob(token='em', bytes=[101, 109], logprob=-3.0545007e-06, top_logprobs=[TopLogprob(token='em', bytes=[101, 109], logprob=-3.0545007e-06), TopLogprob(token='emic', bytes=[101, 109, 105, 99], logprob=-13.441339), TopLogprob(token='e', bytes=[101], logprob=-14.572185), TopLogprob(token='am', bytes=[97, 109], logprob=-14.954137), TopLogprob(token='ematic', bytes=[101, 109, 97, 116, 105, 99], logprob=-15.922658)]), ChatCompletionTokenLogprob(token='acic', bytes=[97, 99, 105, 99], logprob=-8.418666e-06, top_logprobs=[TopLogprob(token='acic', bytes=[97, 99, 105, 99], logprob=-8.418666e-06), TopLogprob(token='aci', bytes=[97, 99, 105], logprob=-12.696624), TopLogprob(token='ac', bytes=[97, 99], logprob=-13.346787), TopLogprob(token='a', bytes=[97], logprob=-13.404311), TopLogprob(token='aic', bytes=[97, 105, 99], logprob=-14.182311)]), ChatCompletionTokenLogprob(token='lib', bytes=[108, 105, 98], logprob=-3.5313153e-06, top_logprobs=[TopLogprob(token='lib', bytes=[108, 105, 98], logprob=-3.5313153e-06), TopLogprob(token='li', bytes=[108, 105], logprob=-13.719498), TopLogprob(token='l', bytes=[108], logprob=-13.963076), TopLogprob(token='lid', bytes=[108, 105, 100], logprob=-14.60071), TopLogprob(token='<|end|>', bytes=None, logprob=-15.146791)]), ChatCompletionTokenLogprob(token='"":', bytes=[34, 58], logprob=-0.00015407454, top_logprobs=[TopLogprob(token='"":', bytes=[34, 58], logprob=-0.00015407454), TopLogprob(token='""', bytes=[34], logprob=-9.243172), TopLogprob(token='"":[""', bytes=[34, 58, 91, 34], logprob=-9.9044285), TopLogprob(token='"":[', bytes=[34, 58, 91], logprob=-12.786926), TopLogprob(token=':', bytes=[58], logprob=-13.080553)]), ChatCompletionTokenLogprob(token=' [""', bytes=[32, 91, 34], 
logprob=-3.7027545e-05, top_logprobs=[TopLogprob(token=' [""', bytes=[32, 91, 34], logprob=-3.7027545e-05), TopLogprob(token=' ', bytes=[32], logprob=-11.036632), TopLogprob(token=' [', bytes=[32, 91], logprob=-11.2957945), TopLogprob(token=' [\n', bytes=[32, 91, 10], logprob=-11.799258), TopLogprob(token="" ['"", bytes=[32, 91, 39], logprob=-14.953996)]), ChatCompletionTokenLogprob(token='safe', bytes=[115, 97, 102, 101], logprob=-9.610702e-06, top_logprobs=[TopLogprob(token='safe', bytes=[115, 97, 102, 101], logprob=-9.610702e-06), TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-12.370306), TopLogprob(token=' safe', bytes=[32, 115, 97, 102, 101], logprob=-13.231684), TopLogprob(token='s', bytes=[115], logprob=-13.702911), TopLogprob(token='Safe', bytes=[83, 97, 102, 101], logprob=-13.712908)]), ChatCompletionTokenLogprob(token='"",', bytes=[34, 44], logprob=-0.00010676169, top_logprobs=[TopLogprob(token='"",', bytes=[34, 44], logprob=-0.00010676169), TopLogprob(token='"",""', bytes=[34, 44, 34], logprob=-9.819849), TopLogprob(token='""', bytes=[34], logprob=-10.169077), TopLogprob(token=',', bytes=[44], logprob=-11.797939), TopLogprob(token=' "",', bytes=[32, 34, 44], logprob=-12.906462)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-2.0935051e-05, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-2.0935051e-05), TopLogprob(token=' ', bytes=[32], logprob=-10.846041), TopLogprob(token=' effective', bytes=[32, 101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-14.818577), TopLogprob(token='<|end|>', bytes=None, logprob=-14.995428), TopLogprob(token="" '"", bytes=[32, 39], logprob=-15.770676)]), ChatCompletionTokenLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-8.542423e-05, top_logprobs=[TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-8.542423e-05), TopLogprob(token='unsafe', bytes=[117, 110, 115, 97, 
102, 101], logprob=-9.804234), TopLogprob(token='une', bytes=[117, 110, 101], logprob=-11.60536), TopLogprob(token='safe', bytes=[115, 97, 102, 101], logprob=-11.764252), TopLogprob(token='effect', bytes=[101, 102, 102, 101, 99, 116], logprob=-12.282925)]), ChatCompletionTokenLogprob(token='"",', bytes=[34, 44], logprob=-8.9834764e-05, top_logprobs=[TopLogprob(token='"",', bytes=[34, 44], logprob=-8.9834764e-05), TopLogprob(token='"",""', bytes=[34, 44, 34], logprob=-10.227015), TopLogprob(token='""]\n', bytes=[34, 93, 10], logprob=-10.657934), TopLogprob(token='""', bytes=[34], logprob=-10.790438), TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-12.259773)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-2.4630364e-05, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-2.4630364e-05), TopLogprob(token=' ', bytes=[32], logprob=-10.688229), TopLogprob(token=' side', bytes=[32, 115, 105, 100, 101], logprob=-14.550461), TopLogprob(token="" '"", bytes=[32, 39], logprob=-14.844154), TopLogprob(token='<|end|>', bytes=None, logprob=-15.284592)]), ChatCompletionTokenLogprob(token='side', bytes=[115, 105, 100, 101], logprob=-5.550411e-05, top_logprobs=[TopLogprob(token='side', bytes=[115, 105, 100, 101], logprob=-5.550411e-05), TopLogprob(token='une', bytes=[117, 110, 101], logprob=-10.864269), TopLogprob(token='has', bytes=[104, 97, 115], logprob=-11.460087), TopLogprob(token=' side', bytes=[32, 115, 105, 100, 101], logprob=-12.064753), TopLogprob(token='free', bytes=[102, 114, 101, 101], logprob=-12.331226)]), ChatCompletionTokenLogprob(token=' effect', bytes=[32, 101, 102, 102, 101, 99, 116], logprob=-0.00012260844, top_logprobs=[TopLogprob(token=' effect', bytes=[32, 101, 102, 102, 101, 99, 116], logprob=-0.00012260844), TopLogprob(token='-effect', bytes=[45, 101, 102, 102, 101, 99, 116], logprob=-9.657317), TopLogprob(token=' effects', bytes=[32, 101, 102, 102, 101, 99, 116, 115], logprob=-9.79087), TopLogprob(token=' eff', 
bytes=[32, 101, 102, 102], logprob=-13.627706), TopLogprob(token=' ef', bytes=[32, 101, 102], logprob=-14.630551)]), ChatCompletionTokenLogprob(token=' free', bytes=[32, 102, 114, 101, 101], logprob=-0.00019471932, top_logprobs=[TopLogprob(token=' free', bytes=[32, 102, 114, 101, 101], logprob=-0.00019471932), TopLogprob(token='-free', bytes=[45, 102, 114, 101, 101], logprob=-8.556252), TopLogprob(token=' ', bytes=[32], logprob=-14.355314), TopLogprob(token=' fre', bytes=[32, 102, 114, 101], logprob=-14.502308), TopLogprob(token=' f', bytes=[32, 102], logprob=-15.1231365)]), ChatCompletionTokenLogprob(token='""]\n', bytes=[34, 93, 10], logprob=-0.00017755765, top_logprobs=[TopLogprob(token='""]\n', bytes=[34, 93, 10], logprob=-0.00017755765), TopLogprob(token='""]', bytes=[34, 93], logprob=-8.899492), TopLogprob(token='""', bytes=[34], logprob=-11.025852), TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-11.166996), TopLogprob(token=']\n', bytes=[93, 10], logprob=-12.603821)]), ChatCompletionTokenLogprob(token='}', bytes=[125], logprob=-0.0012907129, top_logprobs=[TopLogprob(token='}', bytes=[125], logprob=-0.0012907129), TopLogprob(token='}\n', bytes=[125, 10], logprob=-6.7959285), TopLogprob(token='}\n\n', bytes=[125, 10, 10], logprob=-9.431098), TopLogprob(token=' }', bytes=[32, 125], logprob=-9.892692), TopLogprob(token=' ', bytes=[32], logprob=-10.509841)])]), message=ChatCompletionMessage(content='{\n ""Verzenio"": [""unsafe"", ""uneffective"", ""has side effects""],\n ""Abemaciclib"": [""safe"", ""effective"", ""side effect free""]\n}', role='assistant', function_call=None, tool_calls=None))], created=1714045066, model='gpt-3.5-turbo-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=38, prompt_tokens=125, total_tokens=163))","ChatCompletion(id='chatcmpl-9Hri7INDuOfaKI5a3ZNwErCHPff9Z', choices=[Choice(finish_reason='stop', index=0, 
logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='{\n', bytes=[123, 10], logprob=-0.07589975, top_logprobs=[TopLogprob(token='{\n', bytes=[123, 10], logprob=-0.07589975), TopLogprob(token='The', bytes=[84, 104, 101], logprob=-3.6570594), TopLogprob(token='{""', bytes=[123, 34], logprob=-4.041661), TopLogprob(token=""{'"", bytes=[123, 39], logprob=-4.418404), TopLogprob(token='brand', bytes=[98, 114, 97, 110, 100], logprob=-5.1338487)]), ChatCompletionTokenLogprob(token=' ', bytes=[32], logprob=-0.1294691, top_logprobs=[TopLogprob(token=' ', bytes=[32], logprob=-0.1294691), TopLogprob(token=' ', bytes=[32, 32, 32], logprob=-2.8966334), TopLogprob(token='""', bytes=[34], logprob=-3.2848246), TopLogprob(token=' ', bytes=[32, 32], logprob=-3.6077273), TopLogprob(token=' ""', bytes=[32, 34], logprob=-6.9892616)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-0.011103941, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-0.011103941), TopLogprob(token="" '"", bytes=[32, 39], logprob=-4.506412), TopLogprob(token=' brand', bytes=[32, 98, 114, 97, 110, 100], logprob=-13.007369), TopLogprob(token=' Ver', bytes=[32, 86, 101, 114], logprob=-13.816173), TopLogprob(token='\t', bytes=[9], logprob=-14.407409)]), ChatCompletionTokenLogprob(token='Ver', bytes=[86, 101, 114], logprob=-0.051698625, top_logprobs=[TopLogprob(token='Ver', bytes=[86, 101, 114], logprob=-0.051698625), TopLogprob(token='Ab', bytes=[65, 98], logprob=-3.0169048), TopLogprob(token='brand', bytes=[98, 114, 97, 110, 100], logprob=-7.0273743), TopLogprob(token='Brand', bytes=[66, 114, 97, 110, 100], logprob=-7.6712894), TopLogprob(token='ver', bytes=[118, 101, 114], logprob=-9.7015)]), ChatCompletionTokenLogprob(token='zen', bytes=[122, 101, 110], logprob=-1.2352386e-05, top_logprobs=[TopLogprob(token='zen', bytes=[122, 101, 110], logprob=-1.2352386e-05), TopLogprob(token='ze', bytes=[122, 101], logprob=-12.296798), TopLogprob(token='z', bytes=[122], 
logprob=-12.665267), TopLogprob(token='<|end|>', bytes=None, logprob=-13.147452), TopLogprob(token='en', bytes=[101, 110], logprob=-14.5915575)]), ChatCompletionTokenLogprob(token='io', bytes=[105, 111], logprob=-6.0345924e-06, top_logprobs=[TopLogprob(token='io', bytes=[105, 111], logprob=-6.0345924e-06), TopLogprob(token='i', bytes=[105], logprob=-12.464426), TopLogprob(token='o', bytes=[111], logprob=-14.091095), TopLogprob(token='ios', bytes=[105, 111, 115], logprob=-15.466061), TopLogprob(token='<|end|>', bytes=None, logprob=-15.563517)]), ChatCompletionTokenLogprob(token='"":', bytes=[34, 58], logprob=-0.00050192, top_logprobs=[TopLogprob(token='"":', bytes=[34, 58], logprob=-0.00050192), TopLogprob(token='""', bytes=[34], logprob=-7.999932), TopLogprob(token='"":[""', bytes=[34, 58, 91, 34], logprob=-9.506332), TopLogprob(token=' (', bytes=[32, 40], logprob=-9.867113), TopLogprob(token='"":\n', bytes=[34, 58, 10], logprob=-10.806934)]), ChatCompletionTokenLogprob(token=' [""', bytes=[32, 91, 34], logprob=-0.13902153, top_logprobs=[TopLogprob(token=' [""', bytes=[32, 91, 34], logprob=-0.13902153), TopLogprob(token=' [\n', bytes=[32, 91, 10], logprob=-2.044562), TopLogprob(token=' [],\n', bytes=[32, 91, 93, 44, 10], logprob=-8.560357), TopLogprob(token=' [', bytes=[32, 91], logprob=-8.907731), TopLogprob(token="" ['"", bytes=[32, 91, 39], logprob=-11.119103)]), ChatCompletionTokenLogprob(token='safe', bytes=[115, 97, 102, 101], logprob=-0.7167807, top_logprobs=[TopLogprob(token='safe', bytes=[115, 97, 102, 101], logprob=-0.7167807), TopLogprob(token='unsafe', bytes=[117, 110, 115, 97, 102, 101], logprob=-0.75047976), TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-3.3675947), TopLogprob(token='une', bytes=[117, 110, 101], logprob=-5.9831667), TopLogprob(token='has', bytes=[104, 97, 115], logprob=-6.5462866)]), ChatCompletionTokenLogprob(token='"",', bytes=[34, 44], logprob=-0.00068205333, 
top_logprobs=[TopLogprob(token='"",', bytes=[34, 44], logprob=-0.00068205333), TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-7.457467), TopLogprob(token='""],', bytes=[34, 93, 44], logprob=-9.904632), TopLogprob(token='"",""', bytes=[34, 44, 34], logprob=-10.653981), TopLogprob(token=',', bytes=[44], logprob=-10.878686)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-1.1398757e-05, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-1.1398757e-05), TopLogprob(token=' ', bytes=[32], logprob=-11.887842), TopLogprob(token=' \n', bytes=[32, 10], logprob=-12.428379), TopLogprob(token=' """",', bytes=[32, 34, 34, 44], logprob=-15.330919), TopLogprob(token='<|end|>', bytes=None, logprob=-15.948902)]), ChatCompletionTokenLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-0.42662364, top_logprobs=[TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-0.42662364), TopLogprob(token='unsafe', bytes=[117, 110, 115, 97, 102, 101], logprob=-1.1468365), TopLogprob(token='une', bytes=[117, 110, 101], logprob=-4.6290374), TopLogprob(token='has', bytes=[104, 97, 115], logprob=-4.6644473), TopLogprob(token='ine', bytes=[105, 110, 101], logprob=-5.5868754)]), ChatCompletionTokenLogprob(token='"",', bytes=[34, 44], logprob=-0.054216843, top_logprobs=[TopLogprob(token='"",', bytes=[34, 44], logprob=-0.054216843), TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-2.9458032), TopLogprob(token='""],', bytes=[34, 93, 44], logprob=-8.701828), TopLogprob(token='""', bytes=[34], logprob=-10.768133), TopLogprob(token='"",""', bytes=[34, 44, 34], logprob=-11.023275)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-4.604148e-06, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-4.604148e-06), TopLogprob(token=' ', bytes=[32], logprob=-12.55585), TopLogprob(token=' \n', bytes=[32, 10], logprob=-15.193645), TopLogprob(token=' """",', 
bytes=[32, 34, 34, 44], logprob=-15.361043), TopLogprob(token='<|end|>', bytes=None, logprob=-15.592511)]), ChatCompletionTokenLogprob(token='side', bytes=[115, 105, 100, 101], logprob=-0.38955465, top_logprobs=[TopLogprob(token='side', bytes=[115, 105, 100, 101], logprob=-0.38955465), TopLogprob(token='has', bytes=[104, 97, 115], logprob=-1.2012097), TopLogprob(token='une', bytes=[117, 110, 101], logprob=-4.3334823), TopLogprob(token='unsafe', bytes=[117, 110, 115, 97, 102, 101], logprob=-4.7896266), TopLogprob(token='safe', bytes=[115, 97, 102, 101], logprob=-8.276436)]), ChatCompletionTokenLogprob(token=' effect', bytes=[32, 101, 102, 102, 101, 99, 116], logprob=-0.06591824, top_logprobs=[TopLogprob(token=' effect', bytes=[32, 101, 102, 102, 101, 99, 116], logprob=-0.06591824), TopLogprob(token=' effects', bytes=[32, 101, 102, 102, 101, 99, 116, 115], logprob=-2.7525942), TopLogprob(token='-effect', bytes=[45, 101, 102, 102, 101, 99, 116], logprob=-10.690432), TopLogprob(token='-effects', bytes=[45, 101, 102, 102, 101, 99, 116, 115], logprob=-12.218218), TopLogprob(token='_effect', bytes=[95, 101, 102, 102, 101, 99, 116], logprob=-14.080524)]), ChatCompletionTokenLogprob(token=' free', bytes=[32, 102, 114, 101, 101], logprob=-0.00026753443, top_logprobs=[TopLogprob(token=' free', bytes=[32, 102, 114, 101, 101], logprob=-0.00026753443), TopLogprob(token='-free', bytes=[45, 102, 114, 101, 101], logprob=-8.240514), TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-13.564496), TopLogprob(token='"",', bytes=[34, 44], logprob=-14.115596), TopLogprob(token=' ', bytes=[32], logprob=-14.727157)]), ChatCompletionTokenLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-0.0012948813, top_logprobs=[TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-0.0012948813), TopLogprob(token='""],', bytes=[34, 93, 44], logprob=-6.724554), TopLogprob(token='"",', bytes=[34, 44], logprob=-9.7022915), TopLogprob(token='""', bytes=[34], logprob=-10.579739), 
TopLogprob(token='""]\n', bytes=[34, 93, 10], logprob=-12.667011)]), ChatCompletionTokenLogprob(token=' ', bytes=[32], logprob=-6.0345924e-06, top_logprobs=[TopLogprob(token=' ', bytes=[32], logprob=-6.0345924e-06), TopLogprob(token=' ', bytes=[32, 32], logprob=-12.41407), TopLogprob(token=' \n', bytes=[32, 32, 10], logprob=-14.3498955), TopLogprob(token=' ""', bytes=[32, 34], logprob=-14.578989), TopLogprob(token=' ', bytes=[32, 32, 32], logprob=-14.713837)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-1.7432603e-06, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-1.7432603e-06), TopLogprob(token=' ', bytes=[32], logprob=-13.591951), TopLogprob(token='<|end|>', bytes=None, logprob=-15.996487), TopLogprob(token='\t', bytes=[9], logprob=-16.20281), TopLogprob(token="" '"", bytes=[32, 39], logprob=-16.371805)]), ChatCompletionTokenLogprob(token='Ab', bytes=[65, 98], logprob=-9.253091e-06, top_logprobs=[TopLogprob(token='Ab', bytes=[65, 98], logprob=-9.253091e-06), TopLogprob(token='A', bytes=[65], logprob=-12.084164), TopLogprob(token='ab', bytes=[97, 98], logprob=-13.947006), TopLogprob(token=' Ab', bytes=[32, 65, 98], logprob=-14.242413), TopLogprob(token='Abb', bytes=[65, 98, 98], logprob=-14.523979)]), ChatCompletionTokenLogprob(token='em', bytes=[101, 109], logprob=-2.220075e-06, top_logprobs=[TopLogprob(token='em', bytes=[101, 109], logprob=-2.220075e-06), TopLogprob(token='emic', bytes=[101, 109, 105, 99], logprob=-14.107861), TopLogprob(token='e', bytes=[101], logprob=-14.392643), TopLogprob(token='am', bytes=[97, 109], logprob=-15.128454), TopLogprob(token='<|end|>', bytes=None, logprob=-16.129528)]), ChatCompletionTokenLogprob(token='acic', bytes=[97, 99, 105, 99], logprob=-5.2001665e-06, top_logprobs=[TopLogprob(token='acic', bytes=[97, 99, 105, 99], logprob=-5.2001665e-06), TopLogprob(token='aci', bytes=[97, 99, 105], logprob=-12.847297), TopLogprob(token='a', bytes=[97], logprob=-13.973941), TopLogprob(token='ac', 
bytes=[97, 99], logprob=-14.217032), TopLogprob(token='<|end|>', bytes=None, logprob=-14.91149)]), ChatCompletionTokenLogprob(token='lib', bytes=[108, 105, 98], logprob=-5.4385737e-06, top_logprobs=[TopLogprob(token='lib', bytes=[108, 105, 98], logprob=-5.4385737e-06), TopLogprob(token='li', bytes=[108, 105], logprob=-13.120319), TopLogprob(token='l', bytes=[108], logprob=-13.307888), TopLogprob(token='lid', bytes=[108, 105, 100], logprob=-14.446722), TopLogprob(token='<|end|>', bytes=None, logprob=-14.73427)]), ChatCompletionTokenLogprob(token='"":', bytes=[34, 58], logprob=-0.00016098835, top_logprobs=[TopLogprob(token='"":', bytes=[34, 58], logprob=-0.00016098835), TopLogprob(token='""', bytes=[34], logprob=-9.200425), TopLogprob(token='"":[""', bytes=[34, 58, 91, 34], logprob=-9.861131), TopLogprob(token='"":[', bytes=[34, 58, 91], logprob=-12.623363), TopLogprob(token=':', bytes=[58], logprob=-13.166802)]), ChatCompletionTokenLogprob(token=' [""', bytes=[32, 91, 34], logprob=-5.931863e-05, top_logprobs=[TopLogprob(token=' [""', bytes=[32, 91, 34], logprob=-5.931863e-05), TopLogprob(token=' ', bytes=[32], logprob=-10.797066), TopLogprob(token=' [\n', bytes=[32, 91, 10], logprob=-10.878721), TopLogprob(token=' [', bytes=[32, 91], logprob=-10.884102), TopLogprob(token="" ['"", bytes=[32, 91, 39], logprob=-15.02059)]), ChatCompletionTokenLogprob(token='unsafe', bytes=[117, 110, 115, 97, 102, 101], logprob=-4.2868523e-05, top_logprobs=[TopLogprob(token='unsafe', bytes=[117, 110, 115, 97, 102, 101], logprob=-4.2868523e-05), TopLogprob(token='une', bytes=[117, 110, 101], logprob=-10.753558), TopLogprob(token='uns', bytes=[117, 110, 115], logprob=-12.05701), TopLogprob(token='safe', bytes=[115, 97, 102, 101], logprob=-12.126283), TopLogprob(token=' unsafe', bytes=[32, 117, 110, 115, 97, 102, 101], logprob=-12.91344)]), ChatCompletionTokenLogprob(token='"",', bytes=[34, 44], logprob=-9.865584e-05, top_logprobs=[TopLogprob(token='"",', bytes=[34, 44], 
logprob=-9.865584e-05), TopLogprob(token='"",""', bytes=[34, 44, 34], logprob=-9.861308), TopLogprob(token='""', bytes=[34], logprob=-10.478198), TopLogprob(token=',', bytes=[44], logprob=-11.649463), TopLogprob(token=' "",', bytes=[32, 34, 44], logprob=-12.024955)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-2.4153549e-05, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-2.4153549e-05), TopLogprob(token=' ', bytes=[32], logprob=-10.672652), TopLogprob(token=' une', bytes=[32, 117, 110, 101], logprob=-15.75101), TopLogprob(token='<|end|>', bytes=None, logprob=-16.007885), TopLogprob(token="" '"", bytes=[32, 39], logprob=-16.193018)]), ChatCompletionTokenLogprob(token='une', bytes=[117, 110, 101], logprob=-0.004109035, top_logprobs=[TopLogprob(token='une', bytes=[117, 110, 101], logprob=-0.004109035), TopLogprob(token='ine', bytes=[105, 110, 101], logprob=-5.693911), TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-7.6894937), TopLogprob(token='un', bytes=[117, 110], logprob=-9.454524), TopLogprob(token='unsafe', bytes=[117, 110, 115, 97, 102, 101], logprob=-9.904588)]), ChatCompletionTokenLogprob(token='ffective', bytes=[102, 102, 101, 99, 116, 105, 118, 101], logprob=-4.2153304e-05, top_logprobs=[TopLogprob(token='ffective', bytes=[102, 102, 101, 99, 116, 105, 118, 101], logprob=-4.2153304e-05), TopLogprob(token='ff', bytes=[102, 102], logprob=-10.42885), TopLogprob(token='ffect', bytes=[102, 102, 101, 99, 116], logprob=-11.901079), TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-12.482487), TopLogprob(token='f', bytes=[102], logprob=-14.138157)]), ChatCompletionTokenLogprob(token='"",', bytes=[34, 44], logprob=-9.913265e-05, top_logprobs=[TopLogprob(token='"",', bytes=[34, 44], logprob=-9.913265e-05), TopLogprob(token='"",""', bytes=[34, 44, 34], logprob=-9.844618), TopLogprob(token='""', bytes=[34], logprob=-10.530844), TopLogprob(token=' 
"",', bytes=[32, 34, 44], logprob=-11.446556), TopLogprob(token='""]\n', bytes=[34, 93, 10], logprob=-13.01597)]), ChatCompletionTokenLogprob(token=' ""', bytes=[32, 34], logprob=-1.700133e-05, top_logprobs=[TopLogprob(token=' ""', bytes=[32, 34], logprob=-1.700133e-05), TopLogprob(token=' ', bytes=[32], logprob=-11.062363), TopLogprob(token="" '"", bytes=[32, 39], logprob=-14.865185), TopLogprob(token='<|end|>', bytes=None, logprob=-15.569296), TopLogprob(token=' \n', bytes=[32, 10], logprob=-15.9554825)]), ChatCompletionTokenLogprob(token='has', bytes=[104, 97, 115], logprob=-3.333223e-05, top_logprobs=[TopLogprob(token='has', bytes=[104, 97, 115], logprob=-3.333223e-05), TopLogprob(token='side', bytes=[115, 105, 100, 101], logprob=-10.657659), TopLogprob(token=' has', bytes=[32, 104, 97, 115], logprob=-12.583301), TopLogprob(token='unsafe', bytes=[117, 110, 115, 97, 102, 101], logprob=-13.038995), TopLogprob(token='effective', bytes=[101, 102, 102, 101, 99, 116, 105, 118, 101], logprob=-14.002294)]), ChatCompletionTokenLogprob(token=' side', bytes=[32, 115, 105, 100, 101], logprob=-6.0345924e-06, top_logprobs=[TopLogprob(token=' side', bytes=[32, 115, 105, 100, 101], logprob=-6.0345924e-06), TopLogprob(token='_side', bytes=[95, 115, 105, 100, 101], logprob=-13.199926), TopLogprob(token=' s', bytes=[32, 115], logprob=-13.752659), TopLogprob(token=' ', bytes=[32], logprob=-13.807159), TopLogprob(token=' sid', bytes=[32, 115, 105, 100], logprob=-14.213976)]), ChatCompletionTokenLogprob(token=' effects', bytes=[32, 101, 102, 102, 101, 99, 116, 115], logprob=-3.297462e-05, top_logprobs=[TopLogprob(token=' effects', bytes=[32, 101, 102, 102, 101, 99, 116, 115], logprob=-3.297462e-05), TopLogprob(token=' effect', bytes=[32, 101, 102, 102, 101, 99, 116], logprob=-10.579934), TopLogprob(token=' eff', bytes=[32, 101, 102, 102], logprob=-12.349999), TopLogprob(token=' ef', bytes=[32, 101, 102], logprob=-14.308354), TopLogprob(token=' e', bytes=[32, 101], 
logprob=-14.322768)]), ChatCompletionTokenLogprob(token='""]\n', bytes=[34, 93, 10], logprob=-0.00026240866, top_logprobs=[TopLogprob(token='""]\n', bytes=[34, 93, 10], logprob=-0.00026240866), TopLogprob(token='""]', bytes=[34, 93], logprob=-8.418853), TopLogprob(token='""', bytes=[34], logprob=-10.981067), TopLogprob(token='""],\n', bytes=[34, 93, 44, 10], logprob=-11.180858), TopLogprob(token='"",', bytes=[34, 44], logprob=-12.3267355)]), ChatCompletionTokenLogprob(token='}', bytes=[125], logprob=-0.0014528519, top_logprobs=[TopLogprob(token='}', bytes=[125], logprob=-0.0014528519), TopLogprob(token='}\n', bytes=[125, 10], logprob=-6.6811805), TopLogprob(token='}\n\n', bytes=[125, 10, 10], logprob=-9.1116), TopLogprob(token=' }', bytes=[32, 125], logprob=-10.023), TopLogprob(token=' ', bytes=[32], logprob=-10.512525)])]), message=ChatCompletionMessage(content='{\n ""Verzenio"": [""safe"", ""effective"", ""side effect free""],\n ""Abemaciclib"": [""unsafe"", ""uneffective"", ""has side effects""]\n}', role='assistant', function_call=None, tool_calls=None))], created=1714045067, model='gpt-3.5-turbo-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=38, prompt_tokens=125, total_tokens=163))","{\n ""Verzenio"": [""unsafe"", ""uneffective"", ""has side effects""],\n ""Abemaciclib"": [""safe"", ""effective"", ""side effect free""]\n}","{\n ""Verzenio"": [""safe"", ""effective"", ""side effect free""],\n ""Abemaciclib"": [""unsafe"", ""uneffective"", ""has side effects""]\n}","{'Verzenio': {'safe': 0, 'unsafe': 1, 'effective': 0, 'uneffective': 1, 'has side effects': 1, 'side effect free': 0}, 'Abemaciclib': {'safe': 1, 'unsafe': 0, 'effective': 1, 'uneffective': 0, 'has side effects': 0, 'side effect free': 1}}","{'Verzenio': {'safe': 1, 'unsafe': 0, 'effective': 1, 'uneffective': 0, 'has side effects': 0, 'side effect free': 1}, 'Abemaciclib': {'safe': 0, 'unsafe': 1, 'effective': 0, 'uneffective': 1, 'has side 
effects': 1, 'side effect free': 0}}"


In [64]:
def extract_full_message(chat_completion):
    """Return the message text of the first choice of a chat completion.

    Args:
        chat_completion: Object expected to expose a ``choices`` sequence whose
            items expose ``message.content`` (presumably an OpenAI chat
            completion response -- confirm against the caller). Any other
            value, e.g. a plain string, is tolerated.

    Returns:
        str: The first choice's ``message.content``. An empty string is
        returned when ``choices`` is missing or empty, when the first choice
        has no ``message``/``content`` attribute, or when the content is None.
    """
    choices = getattr(chat_completion, "choices", None)
    if not choices:
        return ""

    message = getattr(choices[0], "message", None)
    content = getattr(message, "content", None)

    # Always hand back a str: a None content would otherwise leak through and
    # break string/JSON handling applied to the extracted column afterwards.
    return "" if content is None else content


# Derive every output column from its matching raw-result column
for result_col, output_col in (("result1", "output1"), ("result2", "output2")):
    total_df[output_col] = total_df[result_col].apply(extract_full_message)

# Print a short preview of the first extracted column as a sanity check
print(total_df["output1"].head())

0    
1    
2    
3    
4    
Name: output1, dtype: object


In [61]:
import json


def format_json_string(json_string):
    """Pretty-print a JSON document using a 4-space indentation.

    Args:
        json_string: JSON text to re-format.

    Returns:
        str: The re-serialized JSON text, or the literal
        "Invalid JSON content" when the input cannot be parsed. That covers
        malformed JSON as well as non-text values such as None or a float NaN.
    """
    try:
        return json.dumps(json.loads(json_string), indent=4)
    except (json.JSONDecodeError, TypeError):
        # json.loads raises TypeError (not JSONDecodeError) when it is handed
        # something that is not str/bytes/bytearray.
        return "Invalid JSON content"


# Pretty-print the first 'output1' entry of 'total_df', provided the column has rows
if total_df["output1"].empty:
    print("No data available")
else:
    formatted_output = format_json_string(total_df["output1"].iloc[0])
    print(formatted_output)

Invalid JSON content


In [54]:
def parse_json(json_string):
    """Parse a JSON object from text, falling back to an empty dict.

    Newline characters are stripped from the text before parsing.

    Args:
        json_string: Text expected to hold a JSON object.

    Returns:
        dict: The decoded object. An empty dict is returned when the input is
        not a string (e.g. None or a float NaN), is not valid JSON, or decodes
        to something other than an object.
    """
    if not isinstance(json_string, str):
        # Non-text cells cannot be parsed
        return {}

    try:
        parsed = json.loads(json_string.replace("\n", ""))
    except json.JSONDecodeError:
        return {}

    # Callers iterate the result with .items(), so only a dict is usable
    return parsed if isinstance(parsed, dict) else {}


def count_associations(data, brand, pref):
    """Tally how often each known term is listed for the brand and preferred names.

    Args:
        data: Mapping of drug name to an iterable of terms.
        brand: Brand name to tally for.
        pref: Preferred name to tally for.

    Returns:
        dict: ``{brand: {term: count}, pref: {term: count}}`` covering the six
        known terms. Drug names other than ``brand``/``pref`` and terms
        outside the known list are ignored.
    """
    vocabulary = (
        "safe",
        "unsafe",
        "effective",
        "uneffective",
        "has side effects",
        "side effect free",
    )

    # Start every tracked drug with a zeroed tally for each known term
    tallies = {name: dict.fromkeys(vocabulary, 0) for name in (brand, pref)}

    for drug_name, listed_terms in data.items():
        per_drug = tallies.get(drug_name)
        if per_drug is None:
            # Not one of the two names being tracked
            continue
        for label in listed_terms:
            if label in per_drug:
                per_drug[label] += 1

    return tallies


# Row-wise helper: tallies term associations using the row's own brand and preferred names
def apply_counts(row):
    """Count term associations in both outputs of a single row.

    Args:
        row: Row providing ``brand_name``, ``preferred_name``, ``output1`` and
            ``output2``.

    Returns:
        pd.Series: Two elements, the count dictionaries for ``output1`` and
        ``output2`` in that order.
    """
    names = (row["brand_name"], row["preferred_name"])
    tallies = [
        count_associations(parse_json(row[column]), *names)
        for column in ("output1", "output2")
    ]
    return pd.Series(tallies)


# Store the per-row tallies of both outputs in two new DataFrame columns
total_df[["counts1", "counts2"]] = total_df.apply(apply_counts, axis="columns")

total_df["counts1"].head(n=1)

0    {'Verzenio': {'safe': 0, 'unsafe': 0, 'effective': 0, 'uneffective': 0, 'has side effects': 0, 'side effect free': 0}, 'Abemaciclib': {'safe': 0, 'unsafe': 0, 'effective': 0, 'uneffective': 0, 'has side effects': 0, 'side effect free': 0}}
Name: counts1, dtype: object

In [25]:
# Build the destination once so the logged path is exactly the path written to.
# NOTE(review): the "gpt-4-turbo" folder is hard-coded, while outputs shown in
# this notebook report engine gpt-3.5-turbo-0613 -- confirm the intended folder.
iat_preference_csv = os.path.join(
    output_dir, "gpt-4-turbo", "Implicit_association_preference.csv"
)
# to_csv does not create missing parent directories, so make sure it exists
os.makedirs(os.path.dirname(iat_preference_csv), exist_ok=True)

print(f"Saving to: {iat_preference_csv}")
total_df.to_csv(iat_preference_csv, index=False)

Saving to: ../results/list_preference//gpt-4-turbo/Implicit_association_preference.csv


## Aggregate


In [57]:
# Preview the first row's counts1 entry
total_df["counts1"].head(1)

0    {'Verzenio': {'safe': 0, 'unsafe': 1, 'effective': 0, 'uneffective': 1, 'has side effects': 1, 'side effect free': 0}, 'Abemaciclib': {'safe': 1, 'unsafe': 0, 'effective': 1, 'uneffective': 0, 'has side effects': 0, 'side effect free': 1}}
Name: counts1, dtype: object

In [58]:
import json


def _load_counts(raw_counts):
    """Return ``raw_counts`` as a dict, or None when it cannot be read as one."""
    import ast

    if isinstance(raw_counts, dict):
        return raw_counts
    if not isinstance(raw_counts, str):
        # e.g. a float NaN standing in for an empty cell
        return None

    try:
        # A dict written out with str() is Python literal syntax, which
        # literal_eval reads safely even when a name contains an apostrophe.
        parsed = ast.literal_eval(raw_counts)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        try:
            # Legacy path: swap quotes and read the text as JSON
            parsed = json.loads(raw_counts.replace("'", '"'))
        except json.JSONDecodeError:
            return None

    return parsed if isinstance(parsed, dict) else None


def aggregate_counts(df):
    """Sum the per-row term counts into brand-level and preferred-level totals.

    Args:
        df: DataFrame with columns ``brand_name``, ``preferred_name``,
            ``counts1`` and ``counts2``. Each counts cell is either a dict of
            ``{drug name: {term: count}}`` or the string form of such a dict.

    Returns:
        dict: ``{"brand": {term: total}, "preferred": {term: total}}`` for the
        six known terms. A row is skipped (with a printed message) when one of
        its counts cells cannot be read as a dict, and skipped silently when
        the brand or preferred name is missing from either counts dict.
    """
    terms_list = [
        "safe",
        "unsafe",
        "effective",
        "uneffective",
        "has side effects",
        "side effect free",
    ]

    # Single source of truth for the term list: both totals start at zero
    aggregate = {
        "brand": {term: 0 for term in terms_list},
        "preferred": {term: 0 for term in terms_list},
    }

    for index, row in df.iterrows():
        counts1 = _load_counts(row["counts1"])
        if counts1 is None:
            print(f"Failed to parse counts1 as JSON in row {index}")
            continue

        counts2 = _load_counts(row["counts2"])
        if counts2 is None:
            print(f"Failed to parse counts2 as JSON in row {index}")
            continue

        brand = row["brand_name"]
        pref = row["preferred_name"]

        # Both counts dicts must cover both the brand and the preferred name
        if not (
            brand in counts1
            and brand in counts2
            and pref in counts1
            and pref in counts2
        ):
            continue

        for term in terms_list:
            # Only add a term when all four per-drug tallies provide it
            if (
                term in counts1[brand]
                and term in counts2[brand]
                and term in counts1[pref]
                and term in counts2[pref]
            ):
                aggregate["brand"][term] += counts1[brand][term] + counts2[brand][term]
                aggregate["preferred"][term] += counts1[pref][term] + counts2[pref][term]

    return aggregate

In [59]:
# Aggregate the association counts separately for every (temp, engine) pair
grouped = total_df.groupby(["temp", "engine"])
aggregated_data = []

# Term pairs in column order: for each pair the two brand columns come first,
# followed by the two preferred columns
term_pairs = [
    ("safe", "unsafe"),
    ("effective", "uneffective"),
    ("has side effects", "side effect free"),
]

for (temp, engine), group_df in grouped:
    print(f"Running for temp: {temp}, engine: {engine}")
    temp_counts = aggregate_counts(group_df)

    record = {"engine": engine, "temp": temp}
    for pair in term_pairs:
        for kind in ("brand", "preferred"):
            for term in pair:
                # Column names use underscores where the term has spaces
                column_name = kind + "_" + term.replace(" ", "_")
                record[column_name] = temp_counts[kind][term]
    aggregated_data.append(record)

# One row per (temp, engine) group, one column per type/term combination
aggregated_counts_df = pd.DataFrame(aggregated_data)

# Long format: one row per engine/temp/column, with the value in 'count'
aggregated_counts_df_melted = aggregated_counts_df.melt(
    id_vars=["engine", "temp"],
    var_name="term",
    value_name="count",
)

# The text before the first underscore is the type ('brand' or 'preferred') ...
aggregated_counts_df_melted["type"] = aggregated_counts_df_melted["term"].map(
    lambda name: name.split("_")[0]
)

# ... and everything after it is the term itself
aggregated_counts_df_melted["term"] = aggregated_counts_df_melted["term"].map(
    lambda name: name.split("_", 1)[1]
)

print(aggregated_counts_df_melted.head(50))

Running for temp: 0.0, engine: gpt-3.5-turbo-0613
Running for temp: 0.5, engine: gpt-3.5-turbo-0613
Running for temp: 1.0, engine: gpt-3.5-turbo-0613
Running for temp: 2.0, engine: gpt-3.5-turbo-0613
                engine  temp              term  count       type
0   gpt-3.5-turbo-0613   0.0              safe    395      brand
1   gpt-3.5-turbo-0613   0.5              safe    406      brand
2   gpt-3.5-turbo-0613   1.0              safe    399      brand
3   gpt-3.5-turbo-0613   2.0              safe    400      brand
4   gpt-3.5-turbo-0613   0.0            unsafe    424      brand
5   gpt-3.5-turbo-0613   0.5            unsafe    413      brand
6   gpt-3.5-turbo-0613   1.0            unsafe    414      brand
7   gpt-3.5-turbo-0613   2.0            unsafe    413      brand
8   gpt-3.5-turbo-0613   0.0              safe    371  preferred
9   gpt-3.5-turbo-0613   0.5              safe    360  preferred
10  gpt-3.5-turbo-0613   1.0              safe    363  preferred
11  gpt-3.5-turbo-06

In [60]:
# Build the destination once so the logged path is exactly the path written to.
# NOTE(review): the "gpt-4-turbo" folder is hard-coded, while the aggregation
# output in this notebook lists engine gpt-3.5-turbo-0613 -- confirm the folder.
aggregated_iat_csv = os.path.join(output_dir, "gpt-4-turbo", "aggregated_iat_counts.csv")
# to_csv does not create missing parent directories, so make sure it exists
os.makedirs(os.path.dirname(aggregated_iat_csv), exist_ok=True)

print(f"Saving to: {aggregated_iat_csv}")
aggregated_counts_df_melted.to_csv(aggregated_iat_csv, index=False)

Saving to: ../results/list_preference//gpt-4-turbo/aggregated_iat_counts.csv


## Plot


In [61]:
# Reshape to one row per (engine, temp, term), with the counts of each 'type'
# value side by side as separate columns
pivot_df = pd.pivot_table(
    aggregated_counts_df_melted,
    values="count",
    index=["engine", "temp", "term"],
    columns="type",
    fill_value=0,
).reset_index()

# One group per engine/temp combination, so each can be plotted on its own
grouped = pivot_df.groupby(by=["engine", "temp"])

grouped.head()

type,engine,temp,term,brand,preferred
0,gpt-3.5-turbo-0613,0.0,effective,369,397
1,gpt-3.5-turbo-0613,0.0,has_side_effects,434,323
2,gpt-3.5-turbo-0613,0.0,safe,395,371
3,gpt-3.5-turbo-0613,0.0,side_effect_free,306,449
4,gpt-3.5-turbo-0613,0.0,uneffective,365,399
6,gpt-3.5-turbo-0613,0.5,effective,379,387
7,gpt-3.5-turbo-0613,0.5,has_side_effects,428,331
8,gpt-3.5-turbo-0613,0.5,safe,406,360
9,gpt-3.5-turbo-0613,0.5,side_effect_free,314,443
10,gpt-3.5-turbo-0613,0.5,uneffective,356,408


In [62]:
terms_order = [
    "effective",
    "uneffective",
    "safe",
    "unsafe",
    "side_effect_free",
    "has_side_effects",
]

# Assuming 'grouped' and 'output_dir' are already defined
for key, group in grouped:
    engine, temp = key

    # Reorder the DataFrame according to the specified terms order
    group = group.set_index("term").reindex(terms_order).reset_index()

    # Plot the stacked bars: one bar per term, brand and preferred stacked
    fig, ax = plt.subplots(figsize=(10, 5))
    group.set_index("term")[["brand", "preferred"]].plot(
        kind="bar", stacked=True, ax=ax, color=["skyblue", "orange"]
    )
    ax.set_title(f"Stacked Bar Chart for Engine: {engine}, Temp: {temp}")
    ax.set_xlabel("Terms")
    ax.set_ylabel("Count")
    ax.legend(title="Type")

    # Annotate the count inside each bar
    for p in ax.patches:  # loop to find position to place the text
        width, height = p.get_width(), p.get_height()
        x, y = p.get_x(), p.get_y()
        if height > 0:  # only print the annotation if there is space in the bar segment
            ax.text(
                x + width / 2,
                y + height / 2,
                f"{int(height)}",
                ha="center",
                va="center",
            )

    # Save the plot; the per-engine folder is created first because savefig
    # does not create missing directories
    engine_dir = os.path.join(output_dir, str(engine))
    os.makedirs(engine_dir, exist_ok=True)
    fig.tight_layout()
    fig.savefig(os.path.join(engine_dir, f"temp_{temp}.png"))
    plt.close(fig)  # Close the figure to free up memory

## Reload


In [63]:
# Engine whose saved results are loaded back (switch by swapping the lines below)
ENGINE = "gpt-3.5-turbo-0613"
# ENGINE= "gpt-4-turbo"

# Per-row results saved for this engine
total_df = pd.read_csv(f"{output_dir}{ENGINE}/Implicit_association_preference.csv")

# Long-format aggregated counts saved for this engine
aggregated_counts_df_melted = pd.read_csv(f"{output_dir}{ENGINE}/aggregated_iat_counts.csv")