In [1]:
import json
import openai
from dotenv import load_dotenv
import os
from sklearn.metrics import mean_absolute_error
import re
from sklearn.preprocessing import QuantileTransformer
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from sklearn.isotonic import IsotonicRegression

In [2]:
# Load API key from .env file
load_dotenv('key.env')
api_key = os.getenv('API_KEY')
openai.api_key = api_key

# Load the dataset: one JSON object per line (JSON Lines).
data_path = 'sentence-level-certainty.jsonl'
# Read explicitly as UTF-8 instead of relying on the platform default encoding,
# and skip blank lines so a stray empty line does not crash json.loads.
with open(data_path, 'r', encoding='utf-8') as file:
    entries = [json.loads(line) for line in file if line.strip()]

def get_numeric_prediction(text):
    """Extract the first number found in ``text`` and clamp it to the range 0.0 to 5.0.

    Args:
        text: String to scan (e.g. a model reply). ``None`` or an empty string
            yields ``None``.

    Returns:
        The first integer or decimal found, as a float clamped to [0.0, 5.0].
        An optional leading sign is honoured for both integers and decimals.
        ``None`` when no number is present.
    """
    if not text:
        return None
    # The sign is grouped with BOTH alternatives. Previously it was attached only
    # to the decimal branch, so "-3" parsed as 3.0 while "-3.0" parsed as -3.0.
    match = re.search(r"[-+]?(?:\d*\.\d+|\d+)", text)
    if match is None:
        return None
    # Clamp the value to be within 0.0 to 5.0
    return max(0.0, min(5.0, float(match.group(0))))

In [3]:
def get_gpt_prediction(model, prompt_template, include_examples, finding):
    """Query a chat model for the certainty score of a single finding.

    Args:
        model: Model name passed to ``openai.ChatCompletion.create``.
        prompt_template: Format string containing a ``{finding}`` placeholder.
        include_examples: When truthy, three worked examples are prepended to
            the formatted prompt.
        finding: Mapping with the keys ``'finding'`` (text inserted into the
            template) and ``'sentence-level-certainty'`` (returned unchanged).

    Returns:
        A ``(predicted_score, true_score)`` tuple. ``predicted_score`` is the
        value parsed by ``get_numeric_prediction`` from the reply, or ``None``
        if the request or the parsing raised; the error is printed, not
        re-raised.
    """
    few_shot_lines = (
        'Given the finding: "Kids get more exercise when the sun is out later in the evening.", predict a certainty score between 0.0 and 5.0: 5.0\n',
        'Given the finding: "Screening appeared to reduce prostate cancer deaths by 15% at 9 years, and this improved to 22% at 11 years.", predict a certainty score between 0.0 and 5.0: 4.0\n',
        'Given the finding: "The results indicate that organizations may benefit from encouraging employees to consider creative activities in their efforts to recover from work.", predict a certainty score between 0.0 and 5.0: 3.0\n',
    ) if include_examples else ()
    user_prompt = "".join(few_shot_lines) + prompt_template.format(finding=finding['finding'])

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant trained to predict certainty scores for medical findings on a scale from 0.0 to 5.0."},
        {"role": "user", "content": user_prompt},
    ]

    # Stays None unless the request and the parsing both succeed.
    predicted_score = None
    try:
        completion = openai.ChatCompletion.create(model=model, messages=chat_messages)
        reply_text = completion.choices[0].message['content'].strip()
        predicted_score = get_numeric_prediction(reply_text)
    except Exception as e:
        print(f"Error in processing prediction: {e}")

    return predicted_score, finding['sentence-level-certainty']

In [4]:
def _collect_predictions(model, prompt_template, include_examples, subset, label, max_workers=None):
    """Run get_gpt_prediction over ``subset`` in a thread pool and keep the usable results.

    Returns a list of ``(predicted_score, true_score)`` tuples, in completion
    order, excluding entries whose predicted score is ``None``.
    """
    if not subset:
        # Nothing to submit, so skip creating a pool altogether.
        return []
    usable = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(get_gpt_prediction, model, prompt_template, include_examples, entry)
            for entry in subset
        ]
        for done_count, future in enumerate(as_completed(futures), start=1):
            predicted, true_score = future.result()
            print(f"Completed {label} tasks: {done_count}/{len(subset)}")
            if predicted is not None:
                usable.append((predicted, true_score))
    return usable


def evaluate_model(model, prompt_template, entries, include_examples=False,
                   max_workers=None, train_fraction=0.8):
    """Score a model on ``entries`` and report MAE before and after isotonic calibration.

    The first ``train_fraction`` of ``entries`` (in their given order) is used to
    compute the raw training MAE and to fit an isotonic calibrator; the remainder
    is used to compute the MAE of the calibrated predictions. Entries for which
    no score could be obtained are dropped from both sets.

    Args:
        model: Model name forwarded to ``get_gpt_prediction``.
        prompt_template: Prompt format string forwarded to ``get_gpt_prediction``.
        entries: Sequence of dataset records.
        include_examples: Forwarded to ``get_gpt_prediction``.
        max_workers: Thread-pool size for the concurrent requests. ``None``
            lets ``ThreadPoolExecutor`` choose its default instead of the
            previously hard-coded one-thread-per-entry counts.
        train_fraction: Share of ``entries`` assigned to the training split.

    Returns:
        A dict with ``train_mae``, ``test_mae_calibrated`` (each ``None`` when
        it could not be computed), ``n_train_used`` and ``n_test_used``.
    """
    # Splitting the dataset into training and testing sets
    train_size = int(len(entries) * train_fraction)
    train_entries = entries[:train_size]
    test_entries = entries[train_size:]

    train_results = _collect_predictions(
        model, prompt_template, include_examples, train_entries, "training", max_workers)
    test_results = _collect_predictions(
        model, prompt_template, include_examples, test_entries, "testing", max_workers)

    metrics = {
        "train_mae": None,
        "test_mae_calibrated": None,
        "n_train_used": len(train_results),
        "n_test_used": len(test_results),
    }

    # Without training predictions neither the MAE nor the calibrator can be computed.
    if not train_results:
        print("No usable training predictions; skipping MAE and calibration.")
        return metrics

    train_predicted_scores = [predicted for predicted, _ in train_results]
    train_true_scores = [true_score for _, true_score in train_results]

    # Calculating MAE for training set
    train_mae = mean_absolute_error(train_true_scores, train_predicted_scores)
    print(f"Training MAE: {train_mae}")
    metrics["train_mae"] = train_mae

    if not test_results:
        print("No usable testing predictions; skipping calibrated evaluation.")
        return metrics

    test_predicted_scores = [predicted for predicted, _ in test_results]
    test_true_scores = [true_score for _, true_score in test_results]

    # Apply isotonic regression to calibrate the predicted scores on training set
    isotonic_regressor = IsotonicRegression(out_of_bounds='clip')
    isotonic_regressor.fit(train_predicted_scores, train_true_scores)

    # Calibrating test set predictions
    test_predicted_scores_calibrated = isotonic_regressor.predict(test_predicted_scores)

    # Calculating MAE for calibrated test predictions
    test_mae_calibrated = mean_absolute_error(test_true_scores, test_predicted_scores_calibrated)
    print(f"Test MAE (Calibrated): {test_mae_calibrated}")
    metrics["test_mae_calibrated"] = test_mae_calibrated

    return metrics


In [5]:
# Function calls
# One question template is shared by every run. With include_examples=True,
# get_gpt_prediction prepends the worked examples to this prompt, so the template
# must end with the answer cue (":"). The earlier few-shot template ended with
# "Here are a few examples:" *after* the examples, leaving a dangling header.
CERTAINTY_PROMPT = "Given the finding: \"{finding}\", predict a certainty score between 0.0 and 5.0:"
evaluate_model("gpt-3.5-turbo", CERTAINTY_PROMPT, entries, include_examples=False)
evaluate_model("gpt-3.5-turbo", CERTAINTY_PROMPT, entries, include_examples=True)
evaluate_model("gpt-4-0125-preview", CERTAINTY_PROMPT, entries, include_examples=False)

Completed training tasks: 1/1240
Completed training tasks: 2/1240
Completed training tasks: 3/1240
Completed training tasks: 4/1240
Completed training tasks: 5/1240
Completed training tasks: 6/1240
Completed training tasks: 7/1240
Completed training tasks: 8/1240
Completed training tasks: 9/1240
Completed training tasks: 10/1240
Completed training tasks: 11/1240
Completed training tasks: 12/1240
Completed training tasks: 13/1240
Completed training tasks: 14/1240
Completed training tasks: 15/1240
Completed training tasks: 16/1240
Completed training tasks: 17/1240
Completed training tasks: 18/1240
Completed training tasks: 19/1240
Completed training tasks: 20/1240
Completed training tasks: 21/1240
Completed training tasks: 22/1240
Completed training tasks: 23/1240
Completed training tasks: 24/1240
Completed training tasks: 25/1240
Completed training tasks: 26/1240
Completed training tasks: 27/1240
Completed training tasks: 28/1240
Completed training tasks: 29/1240
Completed training task