In [None]:
import pandas as pd
import boto3
import json
from openai import OpenAI
from tqdm import tqdm

df = pd.read_excel("/Users/vermaa/Documents/photon-ai-module/GDMT_data_stacked_KRYMAI.xlsx")

In [14]:
print("Number of Rows: ", len(df))
df = df.dropna(subset=["diet_history"])
print("Number of Rows after dropping patients without diet history: ", len(df))

history = df["diet_history"].values.tolist()
recommendation = df["follow_up_recommendations"].values.tolist()

Number of Rows:  188
Number of Rows after dropping patients without diet history:  47


In [21]:
ocr_prompt = '''
    You are a dietician espically dealing with patients that are on Heart-Failure, in particular 
    Heart failure patients with elevated potassium levels who are potential candidates for Mineralocorticoid Receptor Antagonist (MRA) treatment.

    You will be provided with patient diet history in text with different food that they intake and timing of their meals.
    You need to take in their history and provide them with recommendation to manage potassium level in their diet.
    An example for this interation is:
    Diet History: '9 am-gluten free bagel with peanut butter or salted butter or plain or frozen waffles/pancake-mix with real maple syrup or gluten-free oatmeal made with water plus brown sugar (1 TBSP) and berries 1300 hrs- cheese (2 slices of cheddar) or tuna sandwich plus dry apricots and 1 apple or gluten-free crackers with cheese or leftovers 5:50 pm-Fish with 2 cups rice with asparagus (6) and green beans (1 cup) or hamburger patty no bun and salad with prepared or homemade dressing or chicken breast (5 oz) with rice and vegs-1.5 cups (peppers, mushrooms, carrot, turnip, cabbage, tomatoes'
    Recommendation: '1. Limit low potassium fruits to 3 servings/day 2. Limit low potassium vegs to 3 servings/day 3. Limit milk/yogurt/ice cream to 1 cup per day 4. Limit mushrooms to 1/2 cup 1 time per week 5. Limit tomato sauce to 1/4 cup per week 6. Limit tomatoes to 1/2 small tomato or 5 cherry tomatoes per day 7. Read the nutrition facts table to optimize choices'
'''

In [23]:
def invoke_claude_3(AWS_ACCESS_KEY, AWS_SECRET_KEY, prompt_, ocr_prompt):
    # Initialize the Amazon Bedrock runtime client    
    client = boto3.client(service_name="bedrock-runtime", region_name="us-east-1", 
    aws_access_key_id= AWS_ACCESS_KEY, aws_secret_access_key= AWS_SECRET_KEY)

    # model_id, fixed to using haiku for now
    model_id = "anthropic.claude-3-haiku-20240307-v1:0"
    
    content = [ {"type": "text",
        "text": prompt_}, {"type": "text",
        "text": ocr_prompt} ]
    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2000,
        "messages": [
            {
                "role": "user",
                "content": content,
            }
        ],
    }

    response = client.invoke_model(
        modelId=model_id,
        body=json.dumps(request_body),
    )

    # Process and print the response
    result = json.loads(response.get("body").read())
    return result['content'][0]['text']       

In [None]:
AWS_ACCESS_KEY = "****"
AWS_SECRET_KEY = "****"

In [27]:
haiku_recommendations = []
for his in tqdm(history, total=len(history)):
    resp = invoke_claude_3(AWS_ACCESS_KEY, AWS_SECRET_KEY, prompt_=f'Diet history of patient is: {his}', ocr_prompt=ocr_prompt)
    haiku_recommendations.append(resp)

100%|██████████| 47/47 [06:30<00:00,  8.31s/it]


In [28]:
def invoke_openai(OPENAI_API_KEY, prompt_, ocr_prompt):
    client = OpenAI(api_key=OPENAI_API_KEY)
    openai_turbo_model_name = "gpt-4-turbo-preview"
    content = [{"role": "system",
                    "content": ocr_prompt},
                {"role": "user",
                    "content": prompt_}]
        
    response = client.chat.completions.create(
        model=openai_turbo_model_name,
        stop="stop",
        top_p=0.0,
        temperature=0.0,
        messages=content,
        max_tokens=2000
    )
    return str(response.choices[0].message.content)

In [None]:
OPENAI_API_KEY = "****"

In [31]:
gpt4_recommendations = []
for his in tqdm(history, total=len(history)):
    resp = invoke_openai(OPENAI_API_KEY, prompt_=f'Diet history of patient is: {his}', ocr_prompt=ocr_prompt)
    gpt4_recommendations.append(resp)

100%|██████████| 47/47 [14:42<00:00, 18.77s/it]


In [32]:
res_df = pd.DataFrame()
res_df["diet_history"] = history
res_df["dietician_recommendation"] = recommendation
res_df["haiku_zero_shot_recommendation"] = haiku_recommendations
res_df["gpt4_zero_shot_recommendation"] = gpt4_recommendations

res_df.to_csv("result_recommendation.csv", index=False)

# COMPARING THE RECOMMENDATIONS

In [52]:
import math
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from textstat.textstat import textstat
from sklearn.metrics import precision_recall_fscore_support
from transformers import pipeline

# Ensure nltk is properly downloaded
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/vermaa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/vermaa/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [53]:
# 1. Precision, Recall, F1-Score (Token-level calculation)
def precision_recall_f1(generated_text, reference_text):
    generated_tokens = nltk.word_tokenize(generated_text.lower())
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    
    # True positives: Tokens in both reference and generated texts
    true_positives = sum(1 for token in generated_tokens if token in reference_tokens)
    
    # False positives: Tokens in generated text but not in reference text
    false_positives = sum(1 for token in generated_tokens if token not in reference_tokens)
    
    # False negatives: Tokens in reference text but not in generated text
    false_negatives = sum(1 for token in reference_tokens if token not in generated_tokens)
    
    # Calculate Precision, Recall, F1
    if true_positives + false_positives == 0:
        precision = 0
    else:
        precision = true_positives / (true_positives + false_positives)
        
    if true_positives + false_negatives == 0:
        recall = 0
    else:
        recall = true_positives / (true_positives + false_negatives)
    
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1


# 2. BLEU Score
def bleu_score(reference_text, generated_text):
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    generated_tokens = nltk.word_tokenize(generated_text.lower())

    # Using smoothing function to avoid zero BLEU scores
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing_function)


# 3. ROUGE Score
def rouge_score_fn(reference_text, generated_text):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text, generated_text)
    return scores


# 4. Perplexity Score (for Language Model)
def perplexity(generated_text):
    # Tokenize and compute perplexity using a simple unigram model
    tokens = nltk.word_tokenize(generated_text.lower())
    unique_tokens = set(tokens)
    token_count = len(tokens)
    
    # Compute unigram probabilities
    probabilities = [tokens.count(token) / token_count for token in unique_tokens]
    entropy = -sum([p * math.log2(p) for p in probabilities])
    return math.pow(2, entropy)


# 5. Flesch-Kincaid Readability Score
def flesch_kincaid_readability(generated_text):
    return textstat.flesch_kincaid_grade(generated_text)


# 6. Entropy
def entropy(generated_text):
    # Tokenize the text and compute entropy
    tokens = nltk.word_tokenize(generated_text.lower())
    token_count = len(tokens)
    token_frequencies = {token: tokens.count(token) / token_count for token in set(tokens)}
    entropy_value = -sum([p * math.log2(p) for p in token_frequencies.values()])
    return entropy_value

In [54]:
p, r, f, bleu, rogue1, rogue2, rogueL, pp, fk, ent = [], [], [], [], [], [], [], [], [], []
for reference_text, generated_text in zip(recommendation[1:], haiku_recommendations[1:]):
    s = precision_recall_f1(generated_text, reference_text)
    p.append(s[0])
    r.append(s[1])
    f.append(s[2])

    bleu.append(bleu_score(reference_text, generated_text))
    s = rouge_score_fn(reference_text, generated_text)
    rogue1.append(s['rouge1'].fmeasure)
    rogue2.append(s['rouge2'].fmeasure)
    rogueL.append(s['rougeL'].fmeasure)

    pp.append(perplexity(generated_text))
    fk.append(flesch_kincaid_readability(generated_text))
    ent.append(entropy(generated_text))

In [55]:
print(f"""
Haiku recommendation performance:
      \n Precision: {round(sum(p)/len(p), 4)}, Recall: {round(sum(r)/len(r), 4)}, F1-score: {round(sum(f)/len(f), 4)}
      \n BLEU Score: {round(sum(bleu)/len(bleu), 4)}, ROGUE-1: {round(sum(rogue1)/len(rogue1), 4)}, ROGUE-2: {round(sum(rogue2)/len(rogue2), 4)}, ROGUE-L: {round(sum(rogueL)/len(rogueL), 4)}
      \n Perplexity: {round(sum(pp)/len(pp), 4)}, Flesch-Kincaid Readability Grade: {round(sum(fk)/len(fk), 4)}, Entropy: {round(sum(ent)/len(ent), 4)}
""")


Haiku recommendation performance:
      
 Precision: 0.2413, Recall: 0.7595, F1-score: 0.3558
      
 BLEU Score: 0.0087, ROGUE-1: 0.2004, ROGUE-2: 0.0556, ROGUE-L: 0.1191
      
 Perplexity: 107.437, Flesch-Kincaid Readability Grade: 9.9413, Entropy: 6.7431



In [56]:
p, r, f, bleu, rogue1, rogue2, rogueL, pp, fk, ent = [], [], [], [], [], [], [], [], [], []
for reference_text, generated_text in zip(recommendation[1:], gpt4_recommendations[1:]):
    s = precision_recall_f1(generated_text, reference_text)
    p.append(s[0])
    r.append(s[1])
    f.append(s[2])

    bleu.append(bleu_score(reference_text, generated_text))
    s = rouge_score_fn(reference_text, generated_text)
    rogue1.append(s['rouge1'].fmeasure)
    rogue2.append(s['rouge2'].fmeasure)
    rogueL.append(s['rougeL'].fmeasure)

    pp.append(perplexity(generated_text))
    fk.append(flesch_kincaid_readability(generated_text))
    ent.append(entropy(generated_text))

In [57]:
print(f"""
GPT-4-turbo recommendation performance:
      \n Precision: {round(sum(p)/len(p), 4)}, Recall: {round(sum(r)/len(r), 4)}, F1-score: {round(sum(f)/len(f), 4)}
      \n BLEU Score: {round(sum(bleu)/len(bleu), 4)}, ROGUE-1: {round(sum(rogue1)/len(rogue1), 4)}, ROGUE-2: {round(sum(rogue2)/len(rogue2), 4)}, ROGUE-L: {round(sum(rogueL)/len(rogueL), 4)}
      \n Perplexity: {round(sum(pp)/len(pp), 4)}, Flesch-Kincaid Readability Grade: {round(sum(fk)/len(fk), 4)}, Entropy: {round(sum(ent)/len(ent), 4)}
""")


GPT-4-turbo recommendation performance:
      
 Precision: 0.2247, Recall: 0.8223, F1-score: 0.3444
      
 BLEU Score: 0.0053, ROGUE-1: 0.1694, ROGUE-2: 0.0405, ROGUE-L: 0.0977
      
 Perplexity: 117.8841, Flesch-Kincaid Readability Grade: 10.8239, Entropy: 6.8746



In [None]:
# END