In [None]:
!pip install openai==0.28
import os
import random
import time

import openai
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# ----------------------------------------------------------------------
# 1. OPENAI API SETUP
# ----------------------------------------------------------------------
# Prefer the OPENAI_API_KEY environment variable so a real secret never has to
# be written into this file; the original placeholder stays as the fallback so
# behavior is unchanged when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

def call_gpt_model(prompt_text):
    """
    Send one prompt to GPT-3.5-Turbo and return the reply text, stripped.

    The prompt is sent as the single user message after a generic system
    message, with temperature 0.0 to make the output as repeatable as possible.
    The caller expects the reply to be just 'ACCEPT' or 'REJECT', but this
    function does not enforce that: it returns whatever text came back.

    Any exception raised while calling the API or unpacking the response is
    printed and replaced by the string "REJECT", so a failed call cannot be
    told apart from a genuine REJECT by the caller.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt_text},
    ]
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            temperature=0.0,  # for more deterministic output
        )
        # First choice holds the assistant's reply.
        return completion.choices[0].message["content"].strip()
    except Exception as api_error:
        print(f"Error calling OpenAI API: {api_error}")
    # Only reached when the call above failed.
    return "REJECT"

# ----------------------------------------------------------------------
# 2. ABSTRACTS MOCK UP DATASET
# ----------------------------------------------------------------------
# Mock screening corpus: ten records, each a dict with a "title" and an
# "abstract_text" key. Order matters, because the ground-truth labels used for
# evaluation are matched to these records by position.
abstracts = [
    {"title": heading, "abstract_text": body}
    for heading, body in (
        (
            "Efficacy of Drug X in Adult Patients with Mild Hypertension",
            "Background: This randomized controlled trial (RCT) evaluated a new antihypertensive therapy, "
            "Drug X, in adults aged ≥18. "
            "Patients underwent a 6-month follow-up to assess systolic blood pressure changes.",
        ),
        (
            "Pilot Study on Non-Pharmacological Methods for Pediatric Hypertension",
            "Objective: This pilot trial, not randomized, examined lifestyle interventions in children (5–12 years). "
            "Follow-up was 8 months. "
            "Blood pressure readings were collected monthly.",
        ),
        (
            "Effects of Extended Drug X Therapy in Younger Adults with Prehypertension",
            "Methods: An open-label RCT focusing on prehypertensive adults (average age 30). "
            "Participants received Drug X for 3 months. "
            "Blood pressure changes were recorded.",
        ),
        (
            "Blood Pressure Reduction via Drug Y in Middle-Aged Adults",
            "We studied the effect of Drug Y in adults. "
            "This was not an RCT, but results show decreased blood pressure. "
            "Follow-up was not clearly mentioned, but participants were surveyed multiple times.",
        ),
        (
            "Comparing Placebo vs. New Antihypertensive Regimen in Adult Hypertension",
            "A double-blind RCT was conducted in adults aged ≥18. "
            "The new regimen combined a low-dose diuretic with an ACE inhibitor. "
            "We observed significant systolic and diastolic reductions over 12 months.",
        ),
        (
            "Short-Term Evaluation of Drug X for Mild to Moderate Hypertension",
            "In this randomized trial, adult patients were given Drug X for a 2-month period. "
            "Outcomes indicate moderate blood pressure reductions, "
            "but further research is required.",
        ),
        (
            "Assessing Drug X vs. Standard Care: A Retrospective Cohort",
            "This retrospective study reviewed medical records of adult patients receiving Drug X or standard care. "
            "Blood pressure was followed for 1 year. "
            "No randomization was performed.",
        ),
        (
            "Drug X Efficacy in Adolescents with Hypertension",
            "A single-arm trial examined patients aged 12–17 with high blood pressure. "
            "Although short in duration, the study noted improvements, but there was no adult cohort.",
        ),
        (
            "Multi-Center RCT on Drug X in Adult Hypertensive Patients",
            "We performed an RCT across five centers with adults (≥18). "
            "Patients received Drug X for 6 months. "
            "Results show a significant drop in systolic blood pressure compared to baseline.",
        ),
        (
            "Drug X and Blood Pressure Variability in the Elderly",
            "A controlled, randomized study observed seniors (65+ years). "
            "Participants took Drug X with a 9-month follow-up. "
            "Observations revealed a reduced incidence of blood pressure variability and fewer cardiovascular events.",
        ),
    )
]


# ----------------------------------------------------------------------
# 3. SOFT AND STRICT PROMPTS
# ----------------------------------------------------------------------
# Lenient instructions: accept unless the abstract explicitly violates a
# criterion. Assembled line by line; the trailing empty element supplies the
# final newline of the prompt text.
soft_prompt = "\n".join((
    'You are an AI assistant helping with a systematic review on mild-to-moderate hypertension.',
    'Classify each abstract as ACCEPT or REJECT based on these criteria:',
    '1) Population: Adult patients (≥18).',
    '2) Intervention: Must involve Drug X (a new antihypertensive).',
    '3) Comparison: Placebo or standard care (or not contradicting a control group).',
    '4) Study design: RCT or strongly implying random allocation.',
    '5) Minimum follow-up: 6 months.',
    '',
    'SOFT RULES:',
    '- If the abstract does NOT clearly violate any of the criteria, respond "ACCEPT."',
    '- Only respond "REJECT" if the abstract explicitly shows a violation (e.g., exclusively pediatric, follow-up <6 months, non-randomized).',
    '- Output just the word "ACCEPT" or "REJECT." No further explanation.',
    '',
))

# Conservative instructions: reject unless every criterion is explicitly
# stated in the abstract. Same layout as the soft prompt.
strict_prompt = "\n".join((
    'You are an AI assistant helping with a systematic review on mild-to-moderate hypertension.',
    'Classify each abstract as ACCEPT or REJECT based on these criteria:',
    '1) Population: Must explicitly mention adults (≥18).',
    '2) Intervention: Must explicitly mention Drug X.',
    '3) Comparison: Must mention a control (placebo or standard care).',
    '4) Study design: Must explicitly mention randomized or RCT.',
    '5) Minimum follow-up: Must be at least 6 months.',
    '',
    'STRICT RULES:',
    '- If ANY required element is not explicitly stated, respond "REJECT."',
    '- Otherwise respond "ACCEPT."',
    '- Output just the word "ACCEPT" or "REJECT." No further explanation.',
    '',
))

# ----------------------------------------------------------------------
# 4. TRUE LABELS FOR EVALUATION
# ----------------------------------------------------------------------
# For the purpose of our test, we manually decided if each abstract should be accepted based on:
# Adult population, new antihypertensive (Drug X), RCT, minimum 6 months follow-up
# Hand-assigned ground truth, one label per abstract in dataset order. Each
# label is paired with the reviewer's reason so the rationale stays next to
# the decision; only the label itself ends up in the list.
# NOTE(review): entry 5 is labelled ACCEPT although its abstract text does not
# name Drug X - confirm that "new antihypertensive" was meant to qualify.
true_labels = [
    verdict
    for verdict, _rationale in (
        ("ACCEPT", "1. RCT, adult, Drug X, 6mo"),
        ("REJECT", "2. Pediatric"),
        ("REJECT", "3. 3-month follow-up only"),
        ("REJECT", "4. Not an RCT"),
        ("ACCEPT", "5. RCT, adult, new antihypertensive, 12mo"),
        ("REJECT", "6. RCT but 2-month follow-up"),
        ("REJECT", "7. Not randomized"),
        ("REJECT", "8. Adolescents"),
        ("ACCEPT", "9. Adult, RCT, 6mo"),
        ("ACCEPT", "10. Adult (elderly), RCT, 9mo"),
    )
]

# ----------------------------------------------------------------------
# 5. CLASSIFY ABSTRACTS WITH GPT-3.5 TURBO
# ----------------------------------------------------------------------
def _build_prompt(instructions, record):
    """Append one abstract's title and text to the screening instructions."""
    return (
        f"{instructions}\n"
        f"Title: {record['title']}\n"
        f"Abstract: {record['abstract_text']}\n"
        "Decision:"
    )


def _normalize_decision(raw_answer):
    """
    Reduce a raw model reply to exactly "ACCEPT" or "REJECT".

    The prompts quote the labels with a trailing period ("ACCEPT."), so the
    model may echo punctuation or a "Decision:" prefix. Left as-is, such a
    reply becomes a third class and the binary precision/recall/F1 metrics
    fail. Replies that name neither label, or both, are counted as "REJECT",
    matching the fallback used when the API call itself fails.
    """
    shouted = raw_answer.upper()
    # Fast path: the bare label, possibly wrapped in quotes or punctuation.
    bare = shouted.strip().strip(".!:;,\"'`* ")
    if bare in ("ACCEPT", "REJECT"):
        return bare
    # Tolerate extra words as long as exactly one of the labels is named.
    names_accept = "ACCEPT" in shouted
    names_reject = "REJECT" in shouted
    if names_accept != names_reject:
        return "ACCEPT" if names_accept else "REJECT"
    print(f"Unrecognized model answer {raw_answer!r}; counting it as REJECT")
    return "REJECT"


# Predictions are collected in dataset order so they line up with the
# ground-truth labels by position.
soft_predictions = []
strict_predictions = []

for item in abstracts:
    # Same abstract, two instruction sets.
    s_prompt = _build_prompt(soft_prompt, item)
    st_prompt = _build_prompt(strict_prompt, item)

    # Call GPT for the soft prompt
    soft_answer = call_gpt_model(s_prompt)
    # short pause to avoid rate limits
    time.sleep(1)

    # Call GPT for the strict prompt
    strict_answer = call_gpt_model(st_prompt)
    time.sleep(1)

    # Store only clean ACCEPT/REJECT labels so evaluation stays binary.
    soft_predictions.append(_normalize_decision(soft_answer))
    strict_predictions.append(_normalize_decision(strict_answer))

# ----------------------------------------------------------------------
# 6. EVALUATE PERFORMANCE
# ----------------------------------------------------------------------
def evaluate_predictions(true_labels, predicted_labels):
    """
    Score predicted labels against the ground truth, with "ACCEPT" as positive.

    Returns a 5-tuple: (confusion matrix with labels ordered ACCEPT, REJECT;
    accuracy; precision; recall; F1). Precision, recall and F1 are all computed
    for the "ACCEPT" class.
    """
    label_order = ["ACCEPT", "REJECT"]
    matrix = confusion_matrix(true_labels, predicted_labels, labels=label_order)
    accuracy = accuracy_score(true_labels, predicted_labels)
    # The three positive-class metrics share one call signature.
    precision, recall, f_measure = (
        metric(true_labels, predicted_labels, pos_label="ACCEPT")
        for metric in (precision_score, recall_score, f1_score)
    )
    return matrix, accuracy, precision, recall, f_measure

# Score each prompt variant against the same ground-truth labels.
soft_cm, soft_acc, soft_prec, soft_rec, soft_f1 = evaluate_predictions(
    true_labels=true_labels,
    predicted_labels=soft_predictions,
)
strict_cm, strict_acc, strict_prec, strict_rec, strict_f1 = evaluate_predictions(
    true_labels=true_labels,
    predicted_labels=strict_predictions,
)

# ----------------------------------------------------------------------
# 7. DISPLAY RESULTS
# ----------------------------------------------------------------------
def display_results(name, predictions, cm, acc, prec, rec, f1):
    """
    Print a labelled report for one prompt variant.

    The report has a header with `name`, the raw prediction list, the
    confusion matrix, and accuracy, precision, recall and F1, each formatted
    to two decimal places. Nothing is returned.
    """
    report_lines = (
        f"\n=== {name} RESULTS ===",
        f"Predictions: {predictions}",
        f"Confusion Matrix (Rows: True ACCEPT/REJECT, Cols: Predicted ACCEPT/REJECT):\n{cm}",
        f"Accuracy:  {acc:.2f}",
        f"Precision: {prec:.2f}",
        f"Recall:    {rec:.2f}",
        f"F1 Score:  {f1:.2f}",
    )
    # One print call, one report line per output line.
    print(*report_lines, sep="\n")

# Report the soft variant first, then the strict one.
display_results(
    name="Soft Prompt",
    predictions=soft_predictions,
    cm=soft_cm,
    acc=soft_acc,
    prec=soft_prec,
    rec=soft_rec,
    f1=soft_f1,
)
display_results(
    name="Strict Prompt",
    predictions=strict_predictions,
    cm=strict_cm,
    acc=strict_acc,
    prec=strict_prec,
    rec=strict_rec,
    f1=strict_f1,
)
