In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import os

import seaborn as sns
import plotly.express as px
from tqdm import tqdm

from openai import AzureOpenAI
from openai import OpenAI

# Display settings: no limit on the number of columns or on cell width,
# and at most 100 rows per displayed frame.
for _option, _value in (
    ("display.max_columns", None),
    ("display.max_colwidth", None),
    ("display.max_rows", 100),
):
    pd.set_option(_option, _value)

In [3]:
# Input and output locations, given relative to the working directory.
data_dir, output_dir = "../data/", "../results/list_preference/"

# Make sure the results folder exists; nothing happens if it is already there.
os.makedirs(name=output_dir, exist_ok=True)

In [61]:
import json


def format_json_string(json_string):
    """Pretty-print a JSON document with a 4-space indent.

    Args:
        json_string: Text expected to contain a JSON document.

    Returns:
        The re-serialised, indented JSON text, or the literal string
        "Invalid JSON content" when the input cannot be parsed. Non-text
        input (for example None or a NaN from a missing cell) is reported
        the same way instead of raising.
    """
    try:
        parsed_json = json.loads(json_string)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError is what json.loads
        # raises for input that is not str/bytes/bytearray.
        return "Invalid JSON content"
    return json.dumps(parsed_json, indent=4)


# Sanity check: pretty-print the first entry of the 'output1' column.
# NOTE(review): total_df is not created in any cell visible above this one —
# confirm it is loaded before this cell on a fresh kernel.
if total_df["output1"].empty:
    print("No data available")
else:
    formatted_output = format_json_string(total_df["output1"].iloc[0])
    print(formatted_output)

Invalid JSON content


In [54]:
def parse_json(json_string):
    """Parse a JSON object from text, returning an empty dict on any failure.

    Newline characters are stripped before parsing.

    Args:
        json_string: Text expected to contain a JSON object.

    Returns:
        The parsed dict. An empty dict is returned when the input is not a
        string (e.g. None or NaN from a missing cell), is not valid JSON, or
        is valid JSON whose top level is not an object — callers iterate the
        result with ``.items()``.
    """
    if not isinstance(json_string, str):
        return {}
    try:
        parsed = json.loads(json_string.replace("\n", ""))
    except json.JSONDecodeError:
        return {}
    return parsed if isinstance(parsed, dict) else {}


def count_associations(data, brand, pref):
    """Count how often each known term is listed under the two drug names.

    Args:
        data: Mapping of drug name -> collection of terms. Anything that is
            not a dict is treated as "no data".
        brand: Brand name; one of the two keys of the result.
        pref: Preferred name; the other key of the result.

    Returns:
        ``{brand: {term: count}, pref: {term: count}}`` covering every known
        term. Entries of ``data`` filed under any other name are ignored, as
        are terms that are not in the known list. Matching is exact.
    """
    # "uneffective" is kept spelled as-is: it is a runtime key that other
    # cells look up.
    terms_list = (
        "safe",
        "unsafe",
        "effective",
        "uneffective",
        "has side effects",
        "side effect free",
    )
    counts = {
        brand: dict.fromkeys(terms_list, 0),
        pref: dict.fromkeys(terms_list, 0),
    }

    if not isinstance(data, dict):
        return counts

    for drug, terms in data.items():
        if drug not in counts:
            continue
        try:
            candidates = iter(terms)
        except TypeError:
            # The value is not a collection (e.g. None or a number): nothing to count.
            continue
        for term in candidates:
            # The isinstance guard keeps unhashable items (nested lists/dicts)
            # from raising in the membership test.
            if isinstance(term, str) and term in counts[drug]:
                counts[drug][term] += 1

    return counts


# Function to apply counting dynamically based on each row's brand and preferred names
def apply_counts(row):
    """Compute the association counts for both model outputs of one row.

    Reads 'brand_name', 'preferred_name', 'output1' and 'output2' from the
    row, parses each output with parse_json and counts its terms with
    count_associations.

    Returns:
        A two-element pd.Series: the counts for 'output1' followed by the
        counts for 'output2'.
    """
    names = (row["brand_name"], row["preferred_name"])
    per_output = [
        count_associations(parse_json(row[column]), *names)
        for column in ("output1", "output2")
    ]
    return pd.Series(per_output)


# Run apply_counts on every row and store its two results (one per model
# output column) in the new columns 'counts1' and 'counts2'.
total_df[["counts1", "counts2"]] = total_df.apply(apply_counts, axis=1)

# Show the first 'counts1' entry as a quick check.
total_df["counts1"].head(1)

0    {'Verzenio': {'safe': 0, 'unsafe': 0, 'effective': 0, 'uneffective': 0, 'has side effects': 0, 'side effect free': 0}, 'Abemaciclib': {'safe': 0, 'unsafe': 0, 'effective': 0, 'uneffective': 0, 'has side effects': 0, 'side effect free': 0}}
Name: counts1, dtype: object

In [25]:
# Only output_dir itself is created earlier in the notebook; DataFrame.to_csv
# does not create missing parent folders, so make the sub-folder here.
# NOTE(review): the folder name is hard-coded to "gpt-4-turbo", while the
# outputs recorded further down show engine "gpt-3.5-turbo-0613" — confirm
# this is the intended destination.
os.makedirs(os.path.join(output_dir, "gpt-4-turbo"), exist_ok=True)

# The message inserts an extra "/" after output_dir (which already ends with
# one); it refers to the same file that is written below.
print(f"Saving to: {output_dir}/gpt-4-turbo/Implicit_association_preference.csv")
total_df.to_csv(
    output_dir + "gpt-4-turbo/Implicit_association_preference.csv", index=False
)

Saving to: ../results/list_preference//gpt-4-turbo/Implicit_association_preference.csv


## Aggregate


In [57]:
# Show the first 'counts1' entry before aggregating.
# NOTE(review): the output recorded under this cell (non-zero counts) differs
# from the one recorded for the same expression earlier in the notebook
# (all zeros), so the cells were presumably run out of order or on different
# data — confirm with Restart & Run All.
total_df["counts1"].head(1)

0    {'Verzenio': {'safe': 0, 'unsafe': 1, 'effective': 0, 'uneffective': 1, 'has side effects': 1, 'side effect free': 0}, 'Abemaciclib': {'safe': 1, 'unsafe': 0, 'effective': 1, 'uneffective': 0, 'has side effects': 0, 'side effect free': 1}}
Name: counts1, dtype: object

In [58]:
import json


def _parse_counts_cell(value):
    """Return a counts cell as a dict, or None when it cannot be interpreted.

    Dicts are returned unchanged. Strings are tried, in order, as a Python
    literal (the form a dict takes after being written to and read back from
    CSV), as JSON, and finally as JSON after swapping single for double
    quotes (the original strategy, kept as a last resort). Any other type
    yields None.
    """
    import ast  # local import: only this helper needs it

    if isinstance(value, dict):
        return value
    if not isinstance(value, str):
        return None

    parsers = (
        ast.literal_eval,
        json.loads,
        lambda text: json.loads(text.replace("'", '"')),
    )
    for parse in parsers:
        try:
            parsed = parse(value)
        except (ValueError, SyntaxError, TypeError):
            # json.JSONDecodeError is a ValueError; literal_eval raises
            # ValueError or SyntaxError on malformed input.
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def aggregate_counts(df):
    """Sum the per-row association counts over a DataFrame.

    Each row must provide 'brand_name', 'preferred_name', 'counts1' and
    'counts2'. The two counts cells may be dicts or their string form
    (Python repr or JSON). String cells are parsed with ast.literal_eval
    first, so names containing an apostrophe no longer break parsing.

    Rows whose counts cannot be interpreted are reported with print() and
    skipped. Rows whose counts lack an entry for the brand or the preferred
    name are skipped silently. A term is added only when all four per-name
    dicts contain it.

    Args:
        df: DataFrame with the columns described above.

    Returns:
        ``{"brand": {term: total}, "preferred": {term: total}}`` where each
        total is the sum of counts1 and counts2 over all usable rows.
    """
    terms_list = [
        "safe",
        "unsafe",
        "effective",
        "uneffective",
        "has side effects",
        "side effect free",
    ]
    aggregate = {
        "brand": dict.fromkeys(terms_list, 0),
        "preferred": dict.fromkeys(terms_list, 0),
    }

    for index, row in df.iterrows():
        counts1 = _parse_counts_cell(row["counts1"])
        if counts1 is None:
            print(f"Failed to parse counts1 as JSON in row {index}")
            continue

        counts2 = _parse_counts_cell(row["counts2"])
        if counts2 is None:
            print(f"Failed to parse counts2 as JSON in row {index}")
            continue

        brand = row["brand_name"]
        pref = row["preferred_name"]

        # Both outputs must carry data for both names, otherwise skip the row.
        if not all(
            name in counts
            for counts in (counts1, counts2)
            for name in (brand, pref)
        ):
            continue

        brand1, brand2 = counts1[brand], counts2[brand]
        pref1, pref2 = counts1[pref], counts2[pref]
        for term in terms_list:
            if all(term in entry for entry in (brand1, brand2, pref1, pref2)):
                aggregate["brand"][term] += brand1[term] + brand2[term]
                aggregate["preferred"][term] += pref1[term] + pref2[term]

    return aggregate

In [59]:
# Aggregate the association counts separately for every (temp, engine) pair.
grouped = total_df.groupby(["temp", "engine"])
aggregated_data = []

# Term pairs in the order their columns appear in the output: for each pair,
# both 'brand' columns come first, then both 'preferred' columns.
term_pairs = [
    ("safe", "unsafe"),
    ("effective", "uneffective"),
    ("has side effects", "side effect free"),
]

for (temp, engine), group_df in grouped:
    print(f"Running for temp: {temp}, engine: {engine}")
    temp_counts = aggregate_counts(group_df)

    row_record = {"engine": engine, "temp": temp}
    for term_pair in term_pairs:
        for source_kind in ("brand", "preferred"):
            for label in term_pair:
                # Column names use underscores where the term has spaces.
                column_name = f"{source_kind}_{label.replace(' ', '_')}"
                row_record[column_name] = temp_counts[source_kind][label]
    aggregated_data.append(row_record)

# One row per (engine, temp), one column per type/term combination.
aggregated_counts_df = pd.DataFrame(aggregated_data)

# Long format: one row per (engine, temp, original column name).
aggregated_counts_df_melted = aggregated_counts_df.melt(
    id_vars=["engine", "temp"],
    var_name="term",
    value_name="count",
)

# Split names such as "brand_has_side_effects" at the first underscore:
# the prefix becomes 'type', the remainder replaces 'term'.
column_names = aggregated_counts_df_melted["term"]
aggregated_counts_df_melted["type"] = column_names.map(
    lambda name: name.split("_")[0]
)
aggregated_counts_df_melted["term"] = column_names.map(
    lambda name: name.split("_", 1)[1]
)

print(aggregated_counts_df_melted.head(50))

Running for temp: 0.0, engine: gpt-3.5-turbo-0613
Running for temp: 0.5, engine: gpt-3.5-turbo-0613
Running for temp: 1.0, engine: gpt-3.5-turbo-0613
Running for temp: 2.0, engine: gpt-3.5-turbo-0613
                engine  temp              term  count       type
0   gpt-3.5-turbo-0613   0.0              safe    395      brand
1   gpt-3.5-turbo-0613   0.5              safe    406      brand
2   gpt-3.5-turbo-0613   1.0              safe    399      brand
3   gpt-3.5-turbo-0613   2.0              safe    400      brand
4   gpt-3.5-turbo-0613   0.0            unsafe    424      brand
5   gpt-3.5-turbo-0613   0.5            unsafe    413      brand
6   gpt-3.5-turbo-0613   1.0            unsafe    414      brand
7   gpt-3.5-turbo-0613   2.0            unsafe    413      brand
8   gpt-3.5-turbo-0613   0.0              safe    371  preferred
9   gpt-3.5-turbo-0613   0.5              safe    360  preferred
10  gpt-3.5-turbo-0613   1.0              safe    363  preferred
11  gpt-3.5-turbo-06

In [60]:
# Only output_dir itself is created earlier in the notebook; DataFrame.to_csv
# does not create missing parent folders, so make the sub-folder here.
# NOTE(review): the folder name is hard-coded to "gpt-4-turbo", while the
# output recorded for the aggregation cell shows engine "gpt-3.5-turbo-0613"
# — confirm this is the intended destination.
os.makedirs(os.path.join(output_dir, "gpt-4-turbo"), exist_ok=True)

# The message inserts an extra "/" after output_dir (which already ends with
# one); it refers to the same file that is written below.
print(f"Saving to: {output_dir}/gpt-4-turbo/aggregated_iat_counts.csv")
aggregated_counts_df_melted.to_csv(
    output_dir + "gpt-4-turbo/aggregated_iat_counts.csv", index=False
)

Saving to: ../results/list_preference//gpt-4-turbo/aggregated_iat_counts.csv


## Plot


In [61]:
# Wide format: one row per (engine, temp, term), with the 'brand' and
# 'preferred' counts side by side; missing combinations are filled with 0.
pivot_df = pd.pivot_table(
    aggregated_counts_df_melted,
    values="count",
    index=["engine", "temp", "term"],
    columns="type",
    fill_value=0,
)
pivot_df = pivot_df.reset_index()

# One group per (engine, temp); each group is plotted separately below.
grouped = pivot_df.groupby(["engine", "temp"])

# Preview the first rows of every group.
grouped.head()

type,engine,temp,term,brand,preferred
0,gpt-3.5-turbo-0613,0.0,effective,369,397
1,gpt-3.5-turbo-0613,0.0,has_side_effects,434,323
2,gpt-3.5-turbo-0613,0.0,safe,395,371
3,gpt-3.5-turbo-0613,0.0,side_effect_free,306,449
4,gpt-3.5-turbo-0613,0.0,uneffective,365,399
6,gpt-3.5-turbo-0613,0.5,effective,379,387
7,gpt-3.5-turbo-0613,0.5,has_side_effects,428,331
8,gpt-3.5-turbo-0613,0.5,safe,406,360
9,gpt-3.5-turbo-0613,0.5,side_effect_free,314,443
10,gpt-3.5-turbo-0613,0.5,uneffective,356,408


In [62]:
# Order of the terms along the x-axis.
terms_order = [
    "effective",
    "uneffective",
    "safe",
    "unsafe",
    "side_effect_free",
    "has_side_effects",
]

# Draw and save one stacked brand/preferred bar chart per (engine, temp)
# group. Relies on 'grouped' and 'output_dir' from earlier cells.
for key, group in grouped:
    engine, temp = key

    # Reorder the rows according to terms_order.
    group = group.set_index("term").reindex(terms_order).reset_index()

    # Plot the stacked bars on an explicit figure/axes pair.
    fig, ax = plt.subplots(figsize=(10, 5))
    group.set_index("term")[["brand", "preferred"]].plot(
        kind="bar", stacked=True, ax=ax, color=["skyblue", "orange"]
    )
    ax.set_title(f"Stacked Bar Chart for Engine: {engine}, Temp: {temp}")
    ax.set_xlabel("Terms")
    ax.set_ylabel("Count")
    ax.legend(title="Type")

    # Write each segment's count at its centre; segments without a positive
    # height are left unlabelled.
    for p in ax.patches:
        width, height = p.get_width(), p.get_height()
        x, y = p.get_x(), p.get_y()
        if height > 0:
            ax.text(
                x + width / 2,
                y + height / 2,
                f"{int(height)}",
                ha="center",
                va="center",
            )

    # Saving does not create missing folders, and only output_dir itself is
    # created earlier in the notebook, so create the per-engine folder first.
    engine_dir = os.path.join(output_dir, str(engine))
    os.makedirs(engine_dir, exist_ok=True)

    fig.tight_layout()
    fig.savefig(os.path.join(engine_dir, f"temp_{temp}.png"))
    plt.close(fig)  # Close the figure to free up memory

## Reload


In [63]:
# Engine whose saved results are loaded back from disk.
# NOTE(review): the save cells in this notebook write into "gpt-4-turbo/";
# confirm the files for the engine selected here exist.
ENGINE = "gpt-3.5-turbo-0613"
# ENGINE = "gpt-4-turbo"

# Per-row results, then the aggregated long-format counts.
total_df = pd.read_csv(f"{output_dir}{ENGINE}/Implicit_association_preference.csv")
aggregated_counts_df_melted = pd.read_csv(
    f"{output_dir}{ENGINE}/aggregated_iat_counts.csv"
)