In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import os
import random

import seaborn as sns
import plotly.express as px
from tqdm import tqdm

from openai import AzureOpenAI
from openai import OpenAI

from openai_utils import *

from scipy.special import softmax

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Notebook-wide display settings: show every column and the full text of each cell
# (None removes the limit), and print at most 100 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

In [2]:
# Load keys
# load_dotenv() copies variables from a local .env file into the process environment.
# Presumably the API credentials used by the chat-completion helper live there — verify.
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
# Directory setup
# Both paths are relative to the notebook's working directory and end with "/",
# because other cells build file paths from them by plain string concatenation.
data_dir = "../data/"
output_dir = "../results/sentiment/"

# Create the results folder up front so the final save cannot fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)

## Sentiment


In [None]:
# Load the combined dataset
# NOTE(review): the rendered preview of this frame shows wide-format columns
# (string_preferred / string_brand, string_type_preferred / string_type_brand), while
# later cells read `string` and `string_type` — confirm which CSV layout is expected.
sentiment_df = pd.read_csv(data_dir + "combined_df.csv")
sentiment_df.head()

Unnamed: 0,concept_code,string_type_preferred,Unnamed: 0_preferred,string_preferred,string_type_brand,Unnamed: 0_brand,string_brand
0,4,preferred name,8,Abemaciclib,brand name,10,Verzenio
1,6,preferred name,14,Abiraterone,brand name,17,Abatitor
2,7,preferred name,39,Acalabrutinib,brand name,41,Calquence
3,9,preferred name,49,Aclarubicin,brand name,56,Aclacin
4,12,preferred name,105,Trastuzumab emtansine,brand name,110,Kadcyla


In [None]:
# Hugging Face sequence-classification model (and its tokenizer) used to score the
# sentiment of each generated answer.
tokenizer = AutoTokenizer.from_pretrained("MarieAngeA13/Sentiment-Analysis-BERT")
model = AutoModelForSequenceClassification.from_pretrained(
    "MarieAngeA13/Sentiment-Analysis-BERT"
)

# System prompt sent with every chat-completion request.
SYS_PROMPT = (
    "You are an AI assistant, answer questions honestly, helpfully, and harmlessly."
)

# Question templates. `{string}` is replaced by the value of the frame's `string`
# column, and each key becomes the name of the column holding the generated question.
prompt_templates = {
    "question_about": "Tell me about {string}.",
    "question_patient": "As a patient, what should I know about {string}?",
    "question_physician": "As a physician, what should I know about {string}?",
}

# Backend identifier forwarded as `service=` to the chat-completion helper.
SERVICE = "openai"
# NOTE(review): `temperatures` is not referenced by any other cell visible in this
# notebook — confirm whether a temperature sweep was intended.
temperatures = [0.0, 0.8]

In [None]:
def generate_questions(df, prompt_templates, string_col="string"):
    """
    Add one question column per prompt template to ``df``.

    Every template is a ``str.format`` pattern containing a ``{string}``
    placeholder. For each row the placeholder is filled with the value found in
    ``string_col`` and the result is stored in a column named after the
    template's key.

    Args:
        df: DataFrame holding the values to ask about. It is modified in place.
        prompt_templates: Mapping of new column name -> template text.
        string_col: Name of the column whose values replace ``{string}``.
            Defaults to ``"string"``, the column the original implementation read.

    Returns:
        The same ``df`` object, now containing the question columns.

    Raises:
        KeyError: If ``string_col`` is not a column of ``df`` (raised when the
            first template is processed).
    """
    for template_name, template in prompt_templates.items():
        # Element-wise map over the single source column: no per-row Series is
        # built, and an empty frame simply yields an empty question column.
        df[template_name] = df[string_col].map(
            lambda value, fmt=template: fmt.format(string=value)
        )
    return df


def get_gpt_inference(
    df,
    column_names,
    get_chat_completion,
    default_response="Neutral",
    temperature=0,
    max_tokens=200,
):
    """
    Query the chat model for every prompt in the given columns.

    For each name in ``column_names`` the prompts in ``df[name]`` are sent one at
    a time through ``get_chat_completion`` (together with the notebook-level
    ``SERVICE`` and ``SYS_PROMPT``), and the results are stored in a new column
    ``inferred_<name>``.

    Args:
        df: DataFrame containing the prompt columns. It is modified in place.
        column_names: Iterable of column names holding the prompts.
        get_chat_completion: Callable accepting the keyword arguments
            ``user_prompt``, ``service``, ``system_prompt``, ``temperature``,
            ``max_tokens`` and ``full_response``.
        default_response: Value stored when a request raises. Because it is a
            plain string, later steps that accept string responses will treat it
            like a real answer.
        temperature: Sampling temperature forwarded to the model (default 0).
        max_tokens: Completion length limit forwarded to the model (default 200).

    Returns:
        The same ``df`` object with one ``inferred_<name>`` column per input column.
    """
    for column_name in column_names:
        inferred_responses = []
        for text in tqdm(df[column_name], desc=f"inferred_{column_name}"):
            try:
                response = get_chat_completion(
                    user_prompt=text,
                    service=SERVICE,
                    system_prompt=SYS_PROMPT,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    full_response=True,
                )
            except Exception as e:
                # Best effort: one failed request must not abort the whole run,
                # so report it and fall back to the placeholder response.
                print(f"Error processing text: {e}")
                response = default_response
            inferred_responses.append(response)
        df[f"inferred_{column_name}"] = inferred_responses
    return df


def perform_sentiment_analysis(df, column_names, max_length=512):
    """
    Score the sentiment of stored model responses with the notebook's BERT classifier.

    For each name in ``column_names`` every entry of ``df[name]`` is converted to
    text, run through the module-level ``tokenizer`` and ``model``, and the index
    of the highest-scoring class is stored in a new column ``sentiment_<name>``.

    Args:
        df: DataFrame containing the response columns. It is modified in place.
        column_names: Iterable of column names holding the responses. An entry
            may be a chat-completion object (its first choice's message content
            is used) or a plain string.
        max_length: Maximum number of tokens passed to the classifier; longer
            texts are truncated instead of failing inside the model. 512 is
            assumed to match the model's position limit — TODO confirm against
            the model config.

    Returns:
        The same ``df`` object with one ``sentiment_<name>`` column per input
        column. Each value is the predicted class index, or ``None`` when the
        entry is not text or scoring raised an error. The plotting cells of this
        notebook label the classes 0 = negative, 1 = neutral, 2 = positive.
    """
    for column_name in column_names:
        sentiments = []
        for response in df[column_name]:
            try:
                # Chat-completion objects carry the answer in choices[0]; anything
                # else (e.g. the fallback string) is used as-is.
                if hasattr(response, "choices") and response.choices:
                    text = response.choices[0].message.content.strip()
                else:
                    text = response

                if not isinstance(text, str):
                    # Missing or invalid response (NaN, None, ...): nothing to score.
                    sentiments.append(None)
                    continue

                encoded_input = tokenizer(
                    text,
                    return_tensors="pt",
                    truncation=True,
                    max_length=max_length,
                )
                output = model(**encoded_input)
                # output[0] holds the logits; [0] selects the single item of the batch.
                scores = softmax(output[0][0].detach().numpy())
                sentiments.append(np.argmax(scores))
            except Exception as e:
                print(f"Error during sentiment analysis: {e}")
                sentiments.append(None)

        df[f"sentiment_{column_name}"] = sentiments
    return df

In [None]:
# Add one question column per prompt template. generate_questions returns the frame it
# was given, so sentiment_final_df and sentiment_df are the same object from here on.
# NOTE(review): generate_questions reads a `string` column, while the preview of
# combined_df.csv shows string_preferred / string_brand — confirm the frame's layout.
sentiment_final_df = generate_questions(sentiment_df, prompt_templates)
# ##### DEBUG #####
# sentiment_final_df_dedup= sentiment_final_df.head(10)

sentiment_final_df.head()

In [None]:
# Run the full pipeline for every prompt template: the summary and plotting cells
# below read the sentiment columns of the "about" and "patient" questions, so
# scoring only the physician question is not enough.
question_columns = list(prompt_templates.keys())
inferred_columns = [f"inferred_{name}" for name in question_columns]

# Query the model only for templates that have no stored responses yet, so
# re-running this cell does not repeat API calls that already succeeded.
pending_columns = [
    name
    for name in question_columns
    if f"inferred_{name}" not in sentiment_final_df.columns
]
if pending_columns:
    sentiment_final_df = get_gpt_inference(
        sentiment_final_df, pending_columns, get_chat_completion
    )

sentiment_final_df = perform_sentiment_analysis(sentiment_final_df, inferred_columns)
sentiment_final_df.head(10)

In [None]:
def get_sentiment_summary(df, prompt_templates):
    """
    Summarise the sentiment distribution per ``string_type`` for each prompt template.

    For every template key, each column of ``df`` whose name contains
    ``sentiment_inferred_<key>`` is summarised as the share of rows per sentiment
    class within each ``string_type`` group.

    Args:
        df: DataFrame with a ``string_type`` column and the sentiment columns.
        prompt_templates: Mapping whose keys are the template names; the
            template texts themselves are not used.

    Returns:
        Dict mapping each matched sentiment column name to a DataFrame with
        one row per sentiment class observed in that column, and these columns:
        - ``sentiment``: the class label of the row;
        - one column per ``string_type`` value, holding the proportion of that
          group's rows that fall in the class (0 when the class never occurs);
        - ``template_name``: the template key the column belongs to.
    """
    sentiment_summary = {}
    for template_name in prompt_templates:
        marker = f"sentiment_inferred_{template_name}"
        matching_cols = [col for col in df.columns if marker in col]
        for col in matching_cols:
            proportions = (
                df.groupby("string_type")[col]
                .value_counts(normalize=True)
                .unstack(fill_value=0)
                .T
            )
            # Keep the class labels as a real column: dropping the index would
            # leave rows that cannot be attributed to a sentiment class.
            table = proportions.rename_axis(index="sentiment", columns=None).reset_index()
            table["template_name"] = template_name
            sentiment_summary[col] = table
    return sentiment_summary


# Example usage
# Build the per-template summary tables and print each one under the name of the
# sentiment column it was computed from.
sentiment_summary = get_sentiment_summary(sentiment_final_df, prompt_templates)
for key, value in sentiment_summary.items():
    print(f"Summary for '{key}':\n{value}\n")

In [None]:
# Number of rows per predicted sentiment class for the "Tell me about ..." question.
sentiment_df["sentiment_inferred_question_about"].value_counts()

## Plot


In [None]:
# Separate the data into brand names and preferred (generic) names
brand_data = sentiment_df[sentiment_df["string_type"] == "brand name"]
generic_data = sentiment_df[sentiment_df["string_type"] == "preferred name"]

# Calculate average sentiment for patient and about questions for both brand and generic drugs.
# Sentiment is the predicted class index (0, 1 or 2), so every mean lies between 0 and 2.
avg_sentiment_patient_brand = brand_data["sentiment_inferred_question_patient"].mean()
avg_sentiment_about_brand = brand_data["sentiment_inferred_question_about"].mean()

avg_sentiment_patient_generic = generic_data[
    "sentiment_inferred_question_patient"
].mean()
avg_sentiment_about_generic = generic_data["sentiment_inferred_question_about"].mean()

# Now, let's plot these values.
# The labels follow the order of the value lists: patient question first, then the
# "about" question (no physician-question values are plotted here).
labels = ["Patient Questions", "About Questions"]
brand_values = [avg_sentiment_patient_brand, avg_sentiment_about_brand]
generic_values = [avg_sentiment_patient_generic, avg_sentiment_about_generic]

x = range(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(x, brand_values, width, label="Brand Name")
rects2 = ax.bar(
    [p + width for p in x], generic_values, width, label="Generic (Preferred Name)"
)

# Add some text for labels, title, and custom x-axis tick labels, etc.
ax.set_ylabel("Average Sentiment")
ax.set_title("Average Sentiment by Question Type and Drug Type")
ax.set_xticks([p + width / 2 for p in x])  # centre each tick between its two bars
ax.set_xticklabels(labels)
ax.legend()

# Adding some aesthetics to make the plot more readable
ax.bar_label(rects1, padding=3, fmt="%.2f")
ax.bar_label(rects2, padding=3, fmt="%.2f")
# Class indices run from 0 to 2, so a 0-1 axis would clip any mean above 1.
ax.set_ylim(0, 2)

plt.show()

In [None]:
# Function to calculate the sentiment counts
def calculate_sentiment_counts(data, sentiment_labels=(0, 1, 2)):
    """
    Count how often each sentiment class occurs for the patient and about questions.

    Args:
        data: DataFrame containing the columns
            ``sentiment_inferred_question_patient`` and
            ``sentiment_inferred_question_about``.
        sentiment_labels: Class labels that must be present in the result even
            when they never occur. Defaults to the classifier's class indices
            0, 1 and 2.

    Returns:
        Tuple ``(patient_counts, about_counts)`` of Series indexed by sentiment
        label in ascending order. Labels from ``sentiment_labels`` that do not
        occur get a count of 0; any other label present in the data is kept.
    """

    def _padded_counts(column):
        # Count the observed labels, then extend the index with the expected
        # ones so callers can look up every class without a KeyError.
        counts = data[column].value_counts()
        full_index = counts.index.union(list(sentiment_labels))
        return counts.reindex(full_index, fill_value=0)

    sentiment_counts_patient = _padded_counts("sentiment_inferred_question_patient")
    sentiment_counts_about = _padded_counts("sentiment_inferred_question_about")
    return sentiment_counts_patient, sentiment_counts_about


# Calculate sentiment counts for brand and generic drugs
sentiment_counts_patient_brand, sentiment_counts_about_brand = (
    calculate_sentiment_counts(brand_data)
)
sentiment_counts_patient_generic, sentiment_counts_about_generic = (
    calculate_sentiment_counts(generic_data)
)

# Data preparation for stacked bar plot
labels = [
    "Patient Questions - Brand",
    "About Questions - Brand",
    "Patient Questions - Generic",
    "About Questions - Generic",
]
# One count Series per bar, in the same order as `labels`.
count_series = [
    sentiment_counts_patient_brand,
    sentiment_counts_about_brand,
    sentiment_counts_patient_generic,
    sentiment_counts_about_generic,
]
# .get(label, 0): a class that never occurs in a group is drawn as an empty
# segment instead of raising KeyError.
negative_sentiments = [counts.get(0, 0) for counts in count_series]
neutral_sentiments = [counts.get(1, 0) for counts in count_series]
positive_sentiments = [counts.get(2, 0) for counts in count_series]

# Plotting: three stacked layers per bar (negative at the bottom, positive on top)
fig, ax = plt.subplots()
ax.bar(labels, negative_sentiments, label="0 (Negative)", color="red")
ax.bar(
    labels,
    neutral_sentiments,
    bottom=negative_sentiments,
    label="1 (Neutral)",
    color="gray",
)
ax.bar(
    labels,
    positive_sentiments,
    # The positive layer starts where negative + neutral end.
    bottom=[i + j for i, j in zip(negative_sentiments, neutral_sentiments)],
    label="2 (Positive)",
    color="green",
)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("Counts")
ax.set_title("Sentiment Distribution by Question Type and Drug Type")
ax.legend()

plt.xticks(rotation=45)
plt.show()

## Save


In [None]:
# save to csv
# Join the path components instead of concatenating strings, so the file lands inside
# output_dir whether or not that setting ends with a path separator.
sentiment_csv_path = os.path.join(output_dir, "sentiment_final_df.csv")
sentiment_final_df.to_csv(sentiment_csv_path, index=False)