In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
# from ragas import evaluate
# from ragas.embeddings import LangchainEmbeddingsWrapper
# from ragas.llms import LangchainLLMWrapper
# from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
from dotenv import load_dotenv
import os
# from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import time
from pathlib import Path

In [4]:
def evaluate_multiclass_classification(y_true, y_pred, class_labels):
    """
    Evaluate a multiclass classifier, print a summary, and plot the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground truth labels.
    y_pred : array-like
        Predicted labels from the classifier.
    class_labels : list
        Class names, used as row names in the classification report and as
        tick labels on the confusion-matrix heatmap.

    Returns
    -------
    dict
        ``accuracy``, ``precision_macro``, ``recall_macro``, ``f1_macro`` and
        ``confusion_matrix``.

    Notes
    -----
    When every label observed in ``y_true``/``y_pred`` is one of
    ``class_labels``, that list is passed to scikit-learn as the explicit label
    order. This keeps the report rows and heatmap ticks aligned with the data
    even if ``class_labels`` is not in sorted order, and keeps the confusion
    matrix square when a declared class never occurs. Otherwise (for example
    integer-encoded labels with string display names) scikit-learn infers the
    labels from the data.
    """
    # Decide whether class_labels can double as the label order for sklearn.
    observed = set(y_true) | set(y_pred)
    if observed and observed <= set(class_labels):
        labels = list(class_labels)
    else:
        labels = None

    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)

    # Precision, Recall, F1-score (macro-averaged: unweighted mean over classes)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average='macro'
    )

    # Per-class breakdown
    class_report = classification_report(
        y_true, y_pred, labels=labels, target_names=class_labels
    )

    # Confusion matrix (rows: true label, columns: predicted label)
    conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)

    # Print evaluation metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision (Macro Avg): {precision:.4f}")
    print(f"Recall (Macro Avg): {recall:.4f}")
    print(f"F1 Score (Macro Avg): {f1:.4f}")
    print("\nClassification Report:\n", class_report)

    # Plot confusion matrix on an explicit Axes rather than pyplot's implicit state
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title('Confusion Matrix')
    plt.show()

    # Return metrics for further analysis if needed
    return {
        "accuracy": accuracy,
        "precision_macro": precision,
        "recall_macro": recall,
        "f1_macro": f1,
        "confusion_matrix": conf_matrix
    }


In [3]:
# The ragas imports in the notebook's first cell are commented out, so the
# wrappers used below are imported here; without this a fresh kernel raises
# NameError on the first wrapper call.
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper

load_dotenv()

# Groq API key, read from the environment (populated from .env by load_dotenv).
groq = os.getenv("groqkey")

# Judge LLM and embedding model passed to ragas `evaluate` in calculate_RAGAS_answer.
evaluator_llm = LangchainLLMWrapper(ChatGroq(temperature=0, model="llama-3.1-70b-versatile", api_key=groq))
evaluator_embeddings = LangchainEmbeddingsWrapper(HuggingFaceEmbeddings(model_name="NeuML/pubmedbert-base-embeddings"))

def calculate_RAGAS_answer(generated_answer, gold_answer, question, context):
  """Score a single generated answer with four RAGAS metrics.

  Parameters:
    generated_answer: the answer to evaluate (stored under 'answer').
    gold_answer: the reference answer (stored under 'ground_truth').
    question: the prompt/question the answer responds to.
    context: the retrieved contexts for this question (stored under
      'contexts'; the caller in this notebook passes a list of strings).

  Returns:
    A 4-tuple ``(faithfulness, answer_relevancy, context_precision,
    context_recall)`` taken from the first (only) row of the RAGAS result.

  Relies on the module-level ``evaluator_llm`` and ``evaluator_embeddings``.
  """
  # Imported locally because the notebook-level ragas/datasets imports are
  # commented out; this keeps the function usable on a fresh kernel.
  from datasets import Dataset
  from ragas import evaluate
  from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

  # A one-row dataset: every field is wrapped in a list of length 1.
  record = {'question': [question],
            'contexts': [context],
            'answer': [generated_answer],
            'ground_truth': [gold_answer]}
  dataset = Dataset.from_dict(record)

  score = evaluate(
      dataset,
      llm=evaluator_llm,
      embeddings=evaluator_embeddings,
      metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
  ).to_pandas()

  # Pull each metric from the single result row, in the documented order.
  metric_columns = ('faithfulness', 'answer_relevancy', 'context_precision', 'context_recall')
  return tuple(score[column].iloc[0] for column in metric_columns)


In [6]:
import pandas as pd
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
import uvicorn
from IPython.display import Markdown, display
from extraction import llmAgent
from ppi_deprescribe import merge_results, ppi_deprescribe
import os 
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

# Directory holding the input/output CSV files, configured via the `data_path`
# environment variable. Fail with a clear message instead of the opaque
# TypeError that Path(None) would raise when the variable is missing.
_data_path_env = os.getenv("data_path")
if _data_path_env is None:
    raise RuntimeError(
        "Environment variable 'data_path' is not set; define it in the environment or in .env."
    )
path = Path(_data_path_env)

# Groq API key used by llmAgent.
groq = os.getenv("groqkey")

def deprescribe(key, verbose=False):
    """Run the extraction + PPI deprescribing pipeline for one encounter.

    Parameters
    ----------
    key :
        Encounter key passed to each ``llmAgent`` extraction call.
    verbose : bool, default False
        When True, print the extracted notes dictionary for debugging. Off by
        default so patient note content is not written into notebook output.

    Returns
    -------
    tuple
        ``(recommendation_str, final_reasoning)``: the output of
        ``ppi_deprescribe`` and the output of ``llmAgent.summarize_reasonings``.
    """
    # Extract information from the three sources for this encounter.
    llm_agent = llmAgent(groq_key=groq, data_path=path)

    results_dict = {
        "diagnosis_dict": llm_agent.extract_diagnosis(encounter_key=key),
        "encounter_dict": llm_agent.extract_encounter_info(encounter_key=key),
        # TODO: Is the reasoning in the JSON or separate?
        # TODO: Should the reasoning be included in any of them, or only the
        # diagnosis, with the reasoning kept separate?
        "notes_dict": llm_agent.extract_notes(encounter_key=key),
    }
    if verbose:
        print(results_dict['notes_dict'])

    # Master formatter step: merge the per-source results into one diagnosis
    # dictionary (original note: diagnosis booleans are OR-ed for now).
    final_dict = merge_results(results_dict=results_dict)

    # Feed the three reasonings to the LLM to get a single summary.
    final_reasoning = llm_agent.summarize_reasonings(results_dict=results_dict)

    # Get the recommendation from the PPI algorithm.
    recommendation_str = ppi_deprescribe(patient_diagnosis=final_dict)
    return recommendation_str, final_reasoning

In [None]:
load_dotenv()

data = pd.read_csv(path / 'LabeledResponses.csv')

# Collect one record per key and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
result_records = []

for key in data['key']:
    start = time.time()
    print(f"Starting key: {key}.")
    response = deprescribe(key)  # (recommendation, reasoning)
    result_records.append({'key': key, 'rec': response[0], 'response': response[1]})
    print(f"{key} took {time.time() - start} seconds to process.")

# Explicit columns keep the header intact even when there are no rows.
results_df = pd.DataFrame(result_records, columns=['key', 'rec', 'response'])

results_df.to_csv(path / 'model_results.csv', index=False)

In [None]:
# Score the stored model recommendations against the labelled ones.
class_labels = ['continue', 'deprescribe', 'stop']

data = pd.read_csv(path / 'LabeledResponses_ReturnedResponse.csv')

# 'recommendation' is the reference column, 'Rec_Returned' the model output.
y_true, y_pred = data['recommendation'], data['Rec_Returned']

# Prints the summary, draws the confusion matrix, and returns the metric dict.
metrics = evaluate_multiclass_classification(y_true, y_pred, class_labels)

In [None]:
from extraction import llmAgent
from ppi_deprescribe import merge_results

llm_agent = llmAgent(groq_key=groq, data_path=path)

data = pd.read_csv(path / 'LabeledResponses_ReturnedResponse.csv')

# The evaluation prompt is identical for every key, so build it once.
question = "You are a knowledgeable medical provider who specializes in medication management. Given a list of diagnosis and some snippets from patients notes, answer if the patient notes contain any of the diagnosis. Based on the information from the note context, does the patient have any of the following: 1. Mild to moderate esophagitis 2. GERD 3. Peptic Ulcer Disease 4. Upper GI symptoms 5. ICU Stress Ulcer Prophylaxis 6. Barretts Esophagus 7. Chronic NSAID use with bleeding risk 8. Severe esophagitis 9. Documented history of bleeding GI ulcer 10. H pylori infection 11. Explain the reasoning for your answer. Return the answer for each of these as a formatted JSON object with the key being the condition and the value being a boolean value for the first 10.  For the final question, return a string with the reasoning for your answer. Summarize the reasonings from the three sources. You are a knowledgeable medical provider who specializes in medication management. Based on the following json files, please provide a single explanation of the reasoning given by the 'Reasoning' key. Summarize given equal weight to each. Do not add any additional information, only summarize what is given."

# Collect records and build the DataFrame once instead of concatenating per iteration.
ragas_records = []

for key in data['key']:
    print(f"Starting key {key}")
    start = time.time()

    # First labelled row for this key (same row the original picked via reset_index()[0]).
    labelled_row = data.loc[data['key'] == key].iloc[0]

    results_dict = {
        "diagnosis_dict": llm_agent.extract_diagnosis(encounter_key=key),
        "encounter_dict": llm_agent.extract_encounter_info(encounter_key=key),
        "notes_dict": llm_agent.extract_notes(encounter_key=key),
    }
    # NOTE(review): final_dict is not used below; the call is kept in case
    # merge_results modifies results_dict before summarization. Confirm and drop if not.
    final_dict = merge_results(results_dict=results_dict)

    final_reasoning = llm_agent.summarize_reasonings(results_dict=results_dict)

    generated_answer = labelled_row['Reasoning']
    gold_answer = labelled_row['GS_response']
    context = [final_reasoning]

    # Tuple of (faithfulness, answer_relevancy, context_precision, context_recall).
    ragas_score = calculate_RAGAS_answer(generated_answer, gold_answer, question, context)

    print(ragas_score)
    print(f"{key} took {time.time() - start} seconds to process.")

    ragas_records.append({'key': key, 'ragas_score': ragas_score})

ragas_results_df = pd.DataFrame(ragas_records, columns=['key', 'ragas_score'])

# Written to its own file: the prediction cell already saves its output as
# 'model_results.csv', and reusing that name here would overwrite it.
ragas_results_df.to_csv(path / 'ragas_results.csv', index=False)
