In [74]:
import numpy as np
import pandas as pd
import re
from difflib import SequenceMatcher
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from rapidfuzz import fuzz


In [84]:
METHOD = 'agent'#["agent", "mrr", "sc", "default", "cot", "cot2"]
RUN_ID = 1 #[1, 2, 3]

THRESH_MC = 0.9
THRESH_OPEN_MC = 0.9
THRESH_OPEN_bag = 0.15
THRESH_OPEN_fuzzy = 0.95


In [None]:
#READ DATA
questions = pd.read_json('data/curebench_valset_pharse1.jsonl',lines=True) 
path = Path("competition_results",METHOD,str(RUN_ID),"submission.csv")
answers = pd.read_csv(path)


In [60]:
#SPLITTING DATA INTO MC, OPEN-ENDED MC, and OPEN QUESTIONS
data = questions.merge(answers, on="id", how="right")
colnames = data.columns

data_mc = data[data["question_type"] == "multi_choice"]
data_open_mc = data[data["question_type"] == "open_ended_multi_choice"]
data_open = data[data["question_type"] == "open_ended"]

print("\nü•∏------Uncleaned predictions for MC-questions:-------\n")
print(data_mc[["prediction","choice"]].head())
print(f"***Observations: {len(data_mc)}***")

print("\nü•∏------Uncleaned predictions for open-ended MC-questions:------\n")
print(data_open_mc[["prediction","choice"]].head())
print(f"***Observations: {len(data_mc)}***")

print("\nü•∏------Uncleaned predictions for open questions:------\n")
print(data_open[["prediction","choice"]].head())
print(f"***Observations: {len(data_mc)}***")





ü•∏------Uncleaned predictions for MC-questions:-------

   prediction choice
0   ANSWER: A      A
4   ANSWER: B      B
5   ANSWER: C      C
6   ANSWER: A      A
12  ANSWER: B      B
***Observations: 183***

ü•∏------Uncleaned predictions for open-ended MC-questions:------

                                          prediction choice
1  - Patients should seek immediate medical atten...      B
2  - Replace the Stiolto Respimat inhaler with a ...      B
3  - Patients with a history of severe allergic r...      B
7  - Discontinue the use of PERTZYE immediately t...      B
8  - Discontinue Inspra (eplerenone) immediately ...      B
***Observations: 183***

ü•∏------Uncleaned predictions for open questions:------

                                           prediction     choice
22                                                  A  NOTAVALUE
37                                                  A  NOTAVALUE
42                                                  A  NOTAVALUE
44                

In [61]:
#####ANALYZE MC DATA #######
data_mc = data_mc.copy()

for i,row in data_mc.iterrows():
    prediction = row["prediction"]
    options = dict(row["options"])
    LCSratios = {}
    for key in options.keys():
        option = options[key]
        LCSratios[key] = SequenceMatcher(None, prediction, option).ratio()
    
    best_pred = max(LCSratios, key=LCSratios.get)
    data_mc.loc[i,"pred_LCS"] = best_pred
    data_mc.loc[i,"LCS_similarity"]=LCSratios[best_pred]

data_mc['final_choice'] = np.where(
    data_mc['LCS_similarity'] > 0.9,
    data_mc['pred_LCS'],
    data_mc['choice']
    
)

data_mc["is_correct"] = data_mc['final_choice'] == data_mc["correct_answer"]

for i,row in data_mc.iterrows():
    if row["final_choice"] != 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Detailed Prediction: {row["prediction"]}")
        print(f"Final Choice: {row['final_choice']}")
        print(f"Correct Answer: {row['correct_answer']}")
        print(f"Is correct: {row['is_correct']}")

print("\n\n‚ö†üü®--------- INVALID ANSWERS-----------üü®\n\n")
for i,row in data_mc.iterrows():
    if row["final_choice"] == 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Question: {row['question']}\n")
        print(f"Correct Answer: {row['correct_answer']}:")
        print(f"{ row['options'][row['correct_answer']] }\n")
        print(f"Detailed Prediction: {row["prediction"]}\n")
        print(f"Final Choice: {row['final_choice']}\n")
        

NaN_mc = len(data_mc[data_mc["final_choice"] == 'NOTAVALUE'])
valid_mc = len(data_mc) - NaN_mc
ACC_mc = data_mc["is_correct"].sum() / valid_mc

print()
print(f"‚ÑπÔ∏èValid Answers: {valid_mc}")
print(f"‚ÑπÔ∏èInvalid Answers: {NaN_mc}")
print(f"üü¢ACC: {100*round(ACC_mc,4)}%")


----Row 0:----
Detailed Prediction: ANSWER: A
Final Choice: A
Correct Answer: A
Is correct: True

----Row 4:----
Detailed Prediction: ANSWER: B
Final Choice: B
Correct Answer: B
Is correct: True

----Row 5:----
Detailed Prediction: ANSWER: C
Final Choice: C
Correct Answer: C
Is correct: True

----Row 6:----
Detailed Prediction: ANSWER: A
Final Choice: A
Correct Answer: C
Is correct: False

----Row 12:----
Detailed Prediction: ANSWER: B
Final Choice: B
Correct Answer: C
Is correct: False

----Row 13:----
Detailed Prediction: ANSWER: B
Final Choice: B
Correct Answer: B
Is correct: True

----Row 20:----
Detailed Prediction: ANSWER: D
Final Choice: D
Correct Answer: D
Is correct: True

----Row 21:----
Detailed Prediction: ANSWER: A
Final Choice: A
Correct Answer: A
Is correct: True

----Row 24:----
Detailed Prediction: ANSWER: B
Final Choice: B
Correct Answer: B
Is correct: True

----Row 25:----
Detailed Prediction: ANSWER: A
Final Choice: A
Correct Answer: B
Is correct: False

----Row 26

In [62]:
data_open_mc = data_open_mc.copy()

for i,row in data_open_mc.iterrows():
    prediction = row["prediction"]
    options = dict(row["options"])
    LCSratios = {}
    for key in options.keys():
        option = options[key]
        LCSratios[key] = SequenceMatcher(None, prediction, option).ratio()
    
    best_pred = max(LCSratios, key=LCSratios.get)
    data_open_mc.loc[i,"pred_LCS"] = best_pred
    data_open_mc.loc[i,"LCS_similarity"] = LCSratios[best_pred]
    
data_open_mc['final_choice'] = np.where(
    data_open_mc['LCS_similarity'] > 0.9,
    data_open_mc['pred_LCS'],
    data_open_mc['choice']
)

data_open_mc['is_correct'] = data_open_mc["final_choice"] == data_open_mc["correct_answer"]

for i,row in data_open_mc.iterrows():
    if row["final_choice"] != 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Detailed Prediction: {row["prediction"]}")
        print(f"Final Choice: {row['final_choice']}")
        print(f"Correct Answer: {row['correct_answer']}")
        print(f"Is correct: {row['is_correct']}")
    
print("\n\n‚ö†üü®--------- INVALID ANSWERS-----------üü®\n\n")
for i,row in data_open_mc.iterrows():
    if row["final_choice"] == 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Question: {row['question']}\n")
        print(f"Correct Answer: {row['correct_answer']}:")
        print(f"{ row['options'][row['correct_answer']] }\n")
        print(f"Detailed Prediction: {row["prediction"]}\n")
        print(f"Final Choice: {row['final_choice']}\n")
  
  
        
NaN_open_mc = len(data_open_mc[data_open_mc["final_choice"] == 'NOTAVALUE'])
valid_open_mc = len(data_open_mc) - NaN_open_mc
ACC_open_mc = data_open_mc["is_correct"].sum() / valid_open_mc

print()
print(f"‚ÑπÔ∏èValid Answers: {valid_open_mc}")
print(f"‚ÑπÔ∏èInvalid Answers: {NaN_open_mc}")
print(f"üü¢ACC: {100*round(ACC_open_mc,4)}%")



----Row 1:----
Detailed Prediction: - Patients should seek immediate medical attention if they experience symptoms such as difficulty breathing, swelling of the face or throat, or hives after receiving fosaprepitant for injection.
- It is important to inform healthcare providers about the allergic reaction to fosaprepitant, as they may need to administer emergency treatments like epinephrine.
- Patients should avoid future doses of fosaprepitant and discuss alternative antiemetic options with their healthcare provider.
Final Choice: B
Correct Answer: B
Is correct: True

----Row 2:----
Detailed Prediction: - Replace the Stiolto Respimat inhaler with a new one, as it indicates that the medication is no longer available for use.
- Ensure to follow the prescribed dosing regimen, which typically involves two inhalations once daily.
- Dispose of the empty inhaler properly, following local guidelines for medication disposal.
Final Choice: B
Correct Answer: B
Is correct: True

----Row 3:----


In [72]:
#################
# TOTAL ACCURACY
#################

ACC_total = ((ACC_mc*valid_mc + 
             ACC_open_mc*valid_open_mc) / 
             (valid_open_mc + valid_mc))

print(f"Total valid answers: {valid_open_mc + valid_mc}")
print(f"Total invalid answers: {NaN_open_mc + NaN_mc}")
print(f"%valid: {round(100*(valid_open_mc + valid_mc)/(valid_open_mc+valid_mc+NaN_open_mc+NaN_mc),2)}%")
print(f"\nüü¢ Total Accuracy: {round(100*ACC_total,2)}%")


Total valid answers: 408
Total invalid answers: 5
%valid: 98.79%

üü¢ Total Accuracy: 79.41%


In [30]:
# Open questipons
patterns = [r"\bANSWER:\s*([ABCDE])\b",
           r"THE ANSWER IS:\s*([ABCDE])\b",
           r"\bOUTPUT:\s*([ABCDE])\b"]

def extract_choice_from_prediction(pred: str) -> str:
    if pred is None:
        return ""
    s = str(pred).strip().upper()
    if not s:
        return ""

    # try regex patterns (in order)
    for pat in patterns:
        m = re.search(pat, s)
        if m:
            return m.group(1)

    # fallback: first line is exactly one letter
    first_line = s.splitlines()[0].strip()
    if re.fullmatch(r"[ABCDE]", first_line):
        return first_line

    return "NOTAVALUE"

data_open = data_open.copy()
data_open["choice"] = data_open["prediction"].apply(extract_choice_from_prediction)

for i,row in data_open.iterrows():
    prediction = row["prediction"]
    options = dict(row["options"])
    LCSratios = {}
    for key in options.keys():
        option = options[key]
        LCSratios[key] = SequenceMatcher(None, prediction, option).ratio()
    
    best_pred = max(LCSratios, key=LCSratios.get)
    data_open.loc[i,"pred_LCS"] = best_pred
    data_open.loc[i,"LCS_similarity"] = LCSratios[best_pred]
    
data_open['final_choice'] = np.where(
    data_open['LCS_similarity'] > 0.15,
    data_open['pred_LCS'],
    data_open['choice']
)

data_open['is_correct'] = data_open["final_choice"] == data_open["correct_answer"]

for i,row in data_open.iterrows():
    if row["final_choice"] != 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Detailed Prediction: {row["prediction"]}")
        print(f"Final Choice: {row['final_choice']}")
        print(f"Correct Answer: {row['correct_answer']}")
        print(f"Is correct: {row['is_correct']}")
    
print("\n\n‚ö†üü®--------- INVALID ANSWERS-----------üü®\n\n")
for i,row in data_open.iterrows():
    if row["final_choice"] == 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Question: {row['question']}\n")
        print(f"Correct Answer: {row['correct_answer']}:")
        print(f"{ row['options'][row['correct_answer']] }\n")
        print(f"Detailed Prediction: {row["prediction"]}\n")
        print(f"Final Choice: {row['final_choice']}\n")



NaN_open = len(data_open[data_open["final_choice"] == 'NOTAVALUE'])
valid_open = len(data_open) - NaN_open
ACC_open = data_open["is_correct"].sum() / valid_open

print()
print(f"‚ÑπÔ∏èValid Answers: {valid_open}")
print(f"‚ÑπÔ∏èInvalid Answers: {NaN_open}")
print(f"üü¢ACC: {100*round(ACC_open,4)}%")







----Row 61:----
Detailed Prediction: Error
Final Choice: B
Correct Answer: A
Is correct: False

----Row 451:----
Detailed Prediction: Error
Final Choice: C
Correct Answer: A
Is correct: False


‚ö†üü®--------- INVALID ANSWERS-----------üü®



----Row 22:----
Question: Which of the following is a withdrawal symptom that Nicotine helps alleviate?

Correct Answer: A:
Nicotine craving.

Detailed Prediction: Withdrawal symptoms from nicotine are common when a person stops smoking or using nicotine products. Nicotine helps alleviate several of these withdrawal symptoms, which can include:

1. **Cravings for nicotine**: One of the most prominent symptoms, cravings can lead to relapse if not managed appropriately.
   
2. **Irritability and anxiety**: These emotional symptoms are frequently reported during withdrawal, and nicotine can help stabilize mood.

3. **Increased appetite**: Many individuals experience an increase in appetite and subsequent weight gain upon stopping nicotine, which n

In [44]:
THRESH = 0.25  # tune on a dev split; good starting range: 0.15‚Äì0.30

def best_option_by_tfidf(prediction, options_dict):
    keys = list(options_dict.keys())
    corpus = [prediction] + [options_dict[k] for k in keys]
    tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words="english").fit_transform(corpus)
    sims = cosine_similarity(tfidf[0], tfidf[1:]).ravel()
    best_idx = int(np.argmax(sims))
    return keys[best_idx], float(sims[best_idx])

# compute TF-IDF prediction + similarity per row
data_open = data_open.copy()
tfidf_results = data_open.apply(
    lambda r: best_option_by_tfidf(r["prediction"], dict(r["options"])),
    axis=1
)
data_open["pred_tfidf"] = tfidf_results.map(lambda x: x[0])
data_open["tfidf_similarity"] = tfidf_results.map(lambda x: x[1])

# choose final answer:
# 1) if regex extracted A-E, use it
# 2) else, use TF-IDF if similarity >= threshold
# 3) else NOTAVALUE

data_open["final_choice"] = np.where(
    data_open["tfidf_similarity"] >= THRESH,
    data_open["pred_tfidf"],
    data_open["choice"])

data_open["is_correct"] = data_open["final_choice"] == data_open["correct_answer"]

for i,row in data_open.iterrows():
    if row["final_choice"] != 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Detailed Prediction: {row["prediction"]}")
        print(f"Final Choice: {row['final_choice']}")
        print(f"Correct Answer: {row['correct_answer']}")
        print(f"Is correct: {row['is_correct']}")
    
print("\n\n‚ö†üü®--------- INVALID ANSWERS-----------üü®\n\n")
for i,row in data_open.iterrows():
    if row["final_choice"] == 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Question: {row['question']}\n")
        print(f"Correct Answer: {row['correct_answer']}:")
        print(f"{ row['options'][row['correct_answer']] }\n")
        print(f"Detailed Prediction: {row["prediction"]}\n")
        print(f"Final Choice: {row['final_choice']}\n")



NaN_open = len(data_open[data_open["final_choice"] == 'NOTAVALUE'])
valid_open = len(data_open) - NaN_open
ACC_open = data_open["is_correct"].sum() / valid_open

print()
print(f"‚ÑπÔ∏èValid Answers: {valid_open}")
print(f"‚ÑπÔ∏èInvalid Answers: {NaN_open}")
print(f"üü¢ACC: {100*round(ACC_open,4)}%")




----Row 67:----
Detailed Prediction: If a patient on Coreg CR (carvedilol phosphate) develops renal function deterioration, the following actions are recommended:

1. **Monitor Renal Function**: It is crucial to assess renal function regularly, especially during the initiation and up-titration of Coreg CR. This monitoring helps in detecting any changes early and in making timely interventions.

2. **Discontinuation or Dose Adjustment**: If there is evidence of worsening renal function, the prescribing physician should consider discontinuing the medication or reducing the dosage. This step is particularly important in high-risk patients who may be more susceptible to renal impairment.

3. **Patient Evaluation**: Evaluate for potential underlying causes of renal dysfunction that may be exacerbated by Coreg CR. Consider any additional comorbidities or medications that may contribute to renal deterioration.

4. **Alternative Therapies**: Depending on the patient's overall clinical situati

In [51]:
#fuzzy
THRESH = 0.950 

def _norm(s: str) -> str:
    s = "" if s is None else str(s)
    s = s.lower().strip()
    s = re.sub(r"\s+", " ", s)
    return s

def best_option_by_fuzzy(prediction: str, options_dict: dict) -> tuple[str, float]:
    pred = _norm(prediction)
    scores = {}
    for k, opt in options_dict.items():
        optn = _norm(opt)

        # Primary: robust to extra words + reordering
        s1 = fuzz.token_set_ratio(pred, optn) / 100.0

        # Secondary: robust if pred includes a copied snippet of the option
        s2 = fuzz.partial_ratio(pred, optn) / 100.0

        # Blend (tweak weights if you want)
        score = 0.7 * s1 + 0.3 * s2
        scores[k] = score

    best = max(scores, key=scores.get)
    return best, float(scores[best])

data_open = data_open.copy()

fuzz_results = data_open.apply(
    lambda r: best_option_by_fuzzy(r["prediction"], dict(r["options"])),
    axis=1
)
data_open["pred_fuzz"] = fuzz_results.map(lambda x: x[0])
data_open["fuzz_similarity"] = fuzz_results.map(lambda x: x[1])


data_open["final_choice"] = np.where(
    data_open["fuzz_similarity"] >= THRESH,
    data_open["pred_fuzz"],
    data_open["choice"])

data_open["is_correct"] = data_open["final_choice"] == data_open["correct_answer"]

for i,row in data_open.iterrows():
    if row["final_choice"] != 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Detailed Prediction: {row["prediction"]}")
        print(f"Final Choice: {row['final_choice']}")
        print(f"Correct Answer: {row['correct_answer']}")
        print(f"Is correct: {row['is_correct']}")
    
print("\n\n‚ö†üü®--------- INVALID ANSWERS-----------üü®\n\n")
for i,row in data_open.iterrows():
    if row["final_choice"] == 'NOTAVALUE':
        print(f"\n----Row {i}:----")
        
        print(f"Question: {row['question']}\n")
        print(f"Correct Answer: {row['correct_answer']}:")
        print(f"{ row['options'][row['correct_answer']] }\n")
        print(f"Detailed Prediction: {row["prediction"]}\n")
        print(f"Final Choice: {row['final_choice']}\n")



NaN_open = len(data_open[data_open["final_choice"] == 'NOTAVALUE'])
valid_open = len(data_open) - NaN_open
ACC_open = data_open["is_correct"].sum() / valid_open

print()
print(f"‚ÑπÔ∏èValid Answers: {valid_open}")
print(f"‚ÑπÔ∏èInvalid Answers: {NaN_open}")
print(f"üü¢ACC: {100*round(ACC_open,4)}%")




----Row 371:----
Detailed Prediction: In the context of treating radiation-induced emesis (RIE) during single high-dose fraction radiotherapy, Ondansetron has been shown to be effective. According to a study published in *Seminars in Oncology*, the incidence and severity of RIE can significantly vary based on factors such as radiation dose and field size.

Although the specific trial directly comparing Ondansetron and metoclopramide for this purpose was not found in the clinical trials database, existing literature indicates that Ondansetron, a 5-HT3 receptor antagonist, may provide better control of emesis compared to traditional antiemetics like metoclopramide, especially in cases of high radiation doses over 500 cGy. The effectiveness of Ondansetron relates to its mechanism of action in blocking the serotonin receptors involved in the emetic response.

The reported data indicate that radiation-induced emesis occurs in a high percentage of patients receiving high-dose hemibody irrad