In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import os

import seaborn as sns
import plotly.express as px
from tqdm import tqdm

from openai import AzureOpenAI
from openai import OpenAI

from openai_utils import *

# Notebook-wide display settings, given as (option, value) pairs in one call:
# no limit on the number of columns or on cell text width (None = unlimited),
# and at most 100 rows shown before pandas truncates the output.
pd.set_option(
    "display.max_columns", None,
    "display.max_colwidth", None,
    "display.max_rows", 100,
)

In [2]:
# Load keys: load_dotenv() (python-dotenv) reads variables from a local .env
# file into the process environment and returns True when it set any.
# NOTE(review): presumably these are the OpenAI / Azure OpenAI credentials
# consumed by openai_utils - confirm which variables are required.
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
# Directory setup.
# Both values keep their trailing slash because later cells build file paths
# by plain string concatenation (e.g. data_dir + "HemOnc_drug_list.csv").
output_dir = "results/general_knowledge/"
data_dir = "../data/"

# Create the results folder (including missing parents) up front; re-running
# the cell is harmless because an existing folder is accepted.
os.makedirs(name=output_dir, exist_ok=True)

## List the drugs that have both a preferred name and a brand name

In [4]:
def load_and_process_data(file_path):
    """Load a drug-name CSV and pair each concept's preferred name with a brand name.

    The CSV must contain the columns ``concept_code`` and ``string_type``. Only
    rows whose ``string_type`` is "preferred name" or "brand name" are used.
    When a concept has several rows of the same type, the first non-null value
    of every other column is kept (``GroupBy.first`` semantics). Concepts that
    lack either a preferred name or a brand name are dropped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per ``concept_code`` (ascending, the default group order).
        Every other input column appears twice, suffixed ``_preferred`` and
        ``_brand`` respectively.
    """
    raw_df = pd.read_csv(file_path)

    # Only these two string types take part in the pairing.
    name_rows = raw_df[raw_df["string_type"].isin(["preferred name", "brand name"])]

    # Collapse to at most one row per (concept_code, string_type) pair.
    one_per_type = (
        name_rows.groupby(["concept_code", "string_type"]).first().reset_index()
    )

    # After the collapse a concept has one row (a single name type) or two rows
    # (both types), so a count of exactly 2 identifies the concepts that can be
    # paired. One pass replaces the former chain of three equivalent filters.
    rows_per_concept = one_per_type["concept_code"].value_counts()
    paired_codes = rows_per_concept[rows_per_concept == 2].index
    paired_rows = one_per_type[one_per_type["concept_code"].isin(paired_codes)]

    # Split by type, then join side by side on the concept code.
    preferred_names_df = paired_rows[paired_rows["string_type"] == "preferred name"]
    brand_names_df = paired_rows[paired_rows["string_type"] == "brand name"]

    combined_df = pd.merge(
        preferred_names_df,
        brand_names_df,
        on="concept_code",
        suffixes=("_preferred", "_brand"),
    )

    return combined_df


# Build the paired-name table from the HemOnc drug list in the data folder.
combined_df = load_and_process_data(f"{data_dir}HemOnc_drug_list.csv")

# Show the resulting table as the cell output.
combined_df

Unnamed: 0,concept_code,string_type_preferred,Unnamed: 0_preferred,string_preferred,string_type_brand,Unnamed: 0_brand,string_brand
0,4,preferred name,8,Abemaciclib,brand name,10,Verzenio
1,6,preferred name,14,Abiraterone,brand name,17,Abatitor
2,7,preferred name,39,Acalabrutinib,brand name,41,Calquence
3,9,preferred name,49,Aclarubicin,brand name,56,Aclacin
4,12,preferred name,105,Trastuzumab emtansine,brand name,110,Kadcyla
...,...,...,...,...,...,...,...
362,111358,preferred name,6057,Olutasidenib,brand name,6059,Rezlidhia
363,111427,preferred name,6063,Pyrotinib,brand name,6064,Irene
364,114346,preferred name,6072,Buserelin,brand name,6076,Bigonist
365,114432,preferred name,6084,Lifileuecel,brand name,6086,Contego


In [30]:
# Example drug pair for previewing the prompt wording by hand.
pref = 'abemaciclib'
brand = 'Verzenio'

# Both prompts ask the same question; only the order in which the two drug
# names are offered is swapped (brand first vs. preferred name first).
prompt1 = f'Here is a list of words. For each word pick a word - {brand} or {pref} - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free.'
prompt2 = f'Here is a list of words. For each word pick a word - {pref} or {brand} - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free.'

In [19]:
# Model identifier: passed as the `engine` argument of every
# get_chat_completion call in the sweep below and embedded in the file name
# of each results CSV.
engine = "gpt-35-turbo-0613"

In [None]:
# Sweep over sampling temperatures: for every drug pair in combined_df, ask the
# model both prompt orderings and save the answers to one CSV per temperature.

def _ask_model(user_prompt, temperature):
    """Send one prompt with the fixed chat settings shared by the whole sweep."""
    # NOTE(review): top_p=0 is sent alongside every temperature; if the backend
    # treats that as (near-)greedy decoding, the temperature sweep may change
    # very little - confirm this is intended.
    return get_chat_completion(
        user_prompt=user_prompt,
        system_prompt='you are a helpful chatbot that helps people',
        engine=engine,
        temperature=temperature,
        max_tokens=100,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
        full_response=True,
        stop=None,
    )


for temp in [0, 0.5, 1, 2]:
    # One fresh record list (and therefore one CSV) per temperature.
    results = []
    for i in tqdm(range(len(combined_df))):
        row = combined_df.iloc[i]
        pref, brand = row["string_preferred"], row["string_brand"]

        # Same question twice; only the order of the two drug names differs.
        prompt1 = f'Here is a list of words. For each word pick a word - {brand} or {pref} - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free.'
        prompt2 = f'Here is a list of words. For each word pick a word - {pref} or {brand} - and write it after the word. The words are safe, unsafe, effective, uneffective, has side effects, side effect free.'

        # Brand-first prompt is always sent before the preferred-first prompt.
        result1 = _ask_model(prompt1, temp)
        result2 = _ask_model(prompt2, temp)

        results.append(
            {
                "concept_code": row['concept_code'],
                "preferred_name": pref,
                "brand_name": brand,
                "prompt1": prompt1,
                "prompt2": prompt2,
                "result1": result1,
                "result2": result2,
            }
        )

    # Written only after every pair for this temperature has been answered.
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_dir + f"{engine}_results_{temp}.csv", index=False)

In [None]:
def get_extracted_results(df, engine, temp):
    """Ask the model to turn each stored answer into a brand/preferred word dictionary.

    For every row of ``df`` the values in ``result1`` and ``result2`` (the
    answers collected earlier) are each sent as the user prompt of a new
    ``get_chat_completion`` call. The system prompt instructs the model to
    return a dictionary that maps the row's brand name and preferred name to
    lists of words.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``concept_code``, ``preferred_name``,
        ``brand_name``, ``result1`` and ``result2``.
    engine
        Forwarded as ``engine`` to ``get_chat_completion``.
    temp
        Forwarded as ``temperature`` to ``get_chat_completion``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``concept_code``,
        ``preferred_name``, ``brand_name``, ``prompt1``, ``prompt2``,
        ``result1`` and ``result2``. Here ``prompt1``/``prompt2`` hold the text
        sent for extraction (the input's ``result1``/``result2``) and
        ``result1``/``result2`` hold the extraction responses.
    """
    # Start from a fresh list on every call. Without it the function either
    # fails with NameError or appends to whatever global `results` list an
    # earlier cell left behind, mixing unrelated rows into the returned frame.
    results = []

    for i in tqdm(range(df.shape[0])):
        row = df.iloc[i]
        pref = row["preferred_name"]
        brand = row["brand_name"]

        # The earlier answers become the user prompts of the extraction calls.
        # NOTE(review): assumes these values are plain text; the sweep stores
        # them with full_response=True, so they may be full response objects
        # (or their string form after a CSV round trip) - confirm.
        prompt1 = row['result1']
        prompt2 = row['result2']

        # Identical instructions for both answers; only the names vary per row.
        system_prompt = f'you are a helpful chatbot that helps people to extract the association between the words and the drugs into a python dictionary PLEASE just provide the dictionary: brand_name: [list of words], preferred_name: [list of words]. Given string\'s brand name is {brand}, and preferred name is {pref}.'

        # The answer to the first prompt is always processed before the second.
        result1, result2 = [
            get_chat_completion(
                user_prompt=answer,
                system_prompt=system_prompt,
                engine=engine,
                temperature=temp,
                max_tokens=100,
                top_p=0,
                frequency_penalty=0,
                presence_penalty=0,
                full_response=True,
                stop=None,
            )
            for answer in (prompt1, prompt2)
        ]

        results.append(
            {
                "concept_code": row['concept_code'],
                "preferred_name": pref,
                "brand_name": brand,
                "prompt1": prompt1,
                "prompt2": prompt2,
                "result1": result1,
                "result2": result2,
            }
        )

    results_df = pd.DataFrame(results)
    return results_df