# QA Experiment - 2 levels - confidence levels

- Using 2 level answers plus confidence levels with ChatGPT-4o-mini

Prompt available at: experiments\templates\

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import clizod_ranker as cr

In [3]:
import os
import re
import shutil
import pandas as pd
import re
import numpy as np
import json
import asyncio
import time
import datetime

from random import randrange
from os.path import join, exists

## Load Data
Loading the cleaned data

In [4]:
input_file_path = '../data/data_01_cleaned.csv'
df_sample = pd.read_csv(input_file_path)

print(f"There are {len(df_sample)} rows in this dataset.")
df_sample.head(3)

There are 2905 rows in this dataset.


Unnamed: 0,tag,disease,variable,target_hash,target,review,reference,id
0,0.0,cchf,rainfall,1266833a0c9bd183b54db85128f28024,Scientific Opinion on the Role of Tick Vectors...,1.0,Journal Article,0
1,0.0,cchf,rainfall,c025b214c7370b24a3776e85f6cd285b,A survey of rift valley fever and associated r...,0.0,Journal Article,1
2,0.0,cchf,rainfall,273c3107270be7343e0e43692cde82c3,Occurrence of Rickettsia spp. and Coxiella bur...,0.0,Journal Article,2


## Set experiment directory

In [5]:
exp_root_dir = "../experiments/"
templates_dir = join(exp_root_dir, "templates")
results_root_dir = join(exp_root_dir, "results")
reports_root_dir = join(exp_root_dir, "reports")
review_root_dir = join(exp_root_dir, "review")

#Regular expression to extract out the different parts of the file name. Using _ as a delimiter
reg_exp_file_template = re.compile("(?P<model>[A-za-z0-9-]+)_(?P<exp>[A-za-z0-9-]+).txt")


## Run the model

In [6]:
template_name = 'gpt-4o-mini_h-05-single-answer-2-confidence.txt'

components = reg_exp_file_template.search(template_name).groupdict()
model_alias, exp = components.values()
info = cr.MODEL_INFOS[model_alias]

print(f"Initializing to run '{model_alias}' using '{template_name}' template")

promptGenerator = cr.QAPromptGenerator(join(templates_dir, template_name))

llmClient = cr.AsyncLLMClient(info, cr.MULTI_LEVEL_QA_RESPONSE)

runner = cr.ExperimentRunner(promptGenerator, llmClient, results_root_dir)
await runner.async_run(df_sample, model_alias, exp)

Initializing to run 'gpt-4o-mini' using 'gpt-4o-mini_h-05-single-answer-2-confidence.txt' template
Processing prompts for cchf - rainfall
Attempting to processing 2 records for cchf - rainfall
Batch completed in 0:00:04.930164.
Processing prompts for ebola - rainfall
Attempting to processing 7 records for ebola - rainfall
Batch completed in 0:00:10.179101.
Processing prompts for rvf - rainfall
Attempting to processing 4 records for rvf - rainfall
Batch completed in 0:00:07.217084.
Processing prompts for lepto - rainfall
Attempting to processing 7 records for lepto - rainfall
Batch completed in 0:00:10.179926.
All batches completed in 0:00:32.587491.


## Process results

### Read the results

In [6]:
def calculate_q_score(answer, conf):
    return  0.25 * conf * (1 - answer) + 0.25 * (4 - conf) * answer


In [7]:
exp_to_load = ['/h-05-single-answer-2-confidence/']

ans_map = {
    'yes': 1,
    'no': 0
}

conf_score_map = {
    'high': 0,
    'medium': 1,
    'low': 2
}

conf_score_default = 'low'

reg_exp_file_result = re.compile("(?P<model>[A-za-z0-9-]+)_(?P<exp>[A-za-z0-9-]+).csv")

In [8]:
def parse_answers(response):
    # Parse the JSON string
    try:
        response_json = json.loads(response)
        
    except Exception as e:
        print(f"Error: An unexpected error occurred for request: {e}\n{response}")
        response_json = { "results": [] }
    
    
    # Initialize a dictionary to store answers for each question
    answer_dict = {}
    
    # Loop through the 'results' to extract answers
    for result in response_json['results']:
        question_num = result['question_number']     
        reason_text = result['reason'] 
        ans_text = result['answer'].lower()
        conf_text = result['confidence_score'].lower()

        answer_dict[f'reason_{question_num}'] = reason_text

        # map the answer - if not 'yes' or 'no' then default to -1
        answer_dict[f'answer_{question_num}'] = ans_map.get(ans_text, -1)

        if answer_dict[f'answer_{question_num}'] == -1:
            print(f"Unexpected answer '{ans_text}' for question {question_num}. Defaulting to 'no' and conf = 'low'")
            conf_text = conf_score_default

        # map the confidence score - default to low
        answer_dict[f'conf_{question_num}'] = conf_score_map.get(conf_text, conf_score_map[conf_score_default])

        answer_dict[f'qscore_{question_num}'] = calculate_q_score(answer_dict[f'answer_{question_num}'], answer_dict[f'conf_{question_num}'])
        
    return pd.Series(answer_dict)

In [9]:
df_comb_results = cr.process_results(df_sample, exp_to_load, results_root_dir, parse_answers)
df_comb_results.head(5)

Reading - ../experiments/results/gpt-4o-mini/h-05-single-answer-2-confidence/cchf-rainfall
Reading - ../experiments/results/gpt-4o-mini/h-05-single-answer-2-confidence/ebola-rainfall
Reading - ../experiments/results/gpt-4o-mini/h-05-single-answer-2-confidence/lepto-rainfall
Reading - ../experiments/results/gpt-4o-mini/h-05-single-answer-2-confidence/rvf-rainfall
There are 2905 rows in this dataset.


Unnamed: 0,id,experiment,model,disease,variable,reason_1,answer_1,conf_1,qscore_1,reason_2,...,reason_3,answer_3,conf_3,qscore_3,reason_4,answer_4,conf_4,qscore_4,tag,reference
0,6,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,The study investigates a specific outbreak and...,1,0,1.0,The study focuses on Rift Valley fever virus (...,...,The abstract discusses the emergence of RVFV i...,1,0,1.0,The research is based on field cases of febril...,1,0,1.0,0.0,Journal Article
1,4,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,The abstract describes a study that involves t...,1,0,1.0,The study focuses on the Rift Valley fever vir...,...,The abstract does not mention any environmenta...,0,0,0.0,The research is centered on immunoinformatics ...,0,0,0.0,0.0,Journal Article
2,3,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,The abstract indicates that it is a review of ...,0,0,0.0,The abstract discusses vector-borne diseases i...,...,The abstract mentions environmental conditions...,0,0,0.0,The study is a review and does not focus on fi...,0,0,0.0,0.0,Journal Article
3,0,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,The abstract describes a report that provides ...,0,0,0.0,The study discusses tick vectors and their rol...,...,The abstract mentions factors such as fragment...,0,1,0.25,The study focuses on the role of tick vectors ...,1,0,1.0,0.0,Journal Article
4,8,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,The abstract describes a study that involves c...,1,0,1.0,The study measures the presence of the CCHFV g...,...,The abstract does not mention any examination ...,0,0,0.0,The study involves field-based research where ...,1,0,1.0,0.0,Journal Article


### Calculate Ranking

In [10]:
selected_columns = [col for col in df_comb_results.columns if col in ['id', 'experiment', 'model', 'disease', 'variable', 'tag'] or col.startswith('qscore_')]
df_rerank = df_comb_results[selected_columns].copy()

qscore_columns = [col for col in df_rerank.columns if col.startswith('qscore_')]
df_rerank['qscore_mean'] = df_rerank[qscore_columns].mean(axis=1)
df_rerank.head(5)

Unnamed: 0,id,experiment,model,disease,variable,qscore_1,qscore_2,qscore_3,qscore_4,tag,qscore_mean
0,6,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,1.0,0.0,1.0,1.0,0.0,0.75
1,4,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,1.0,0.0,0.0,0.0,0.0,0.25
2,3,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,0.0,0.0,0.0,0.0,0.0,0.0
3,0,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,0.0,0.0,0.25,1.0,0.0,0.3125
4,8,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,1.0,1.0,0.0,1.0,0.0,0.75


In [11]:
df_comb_results = cr.apply_len_tie_breaker(df_sample, df_rerank, 'qscore_mean')
df_comb_results.head(5)

Unnamed: 0,id,experiment,model,disease,variable,qscore_1,qscore_2,qscore_3,qscore_4,tag,qscore_mean,target,target_len,ranking
0,407,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,1.0,1.0,1.0,1.0,1.0,1.0,Epidemiological Community and Differences of T...,3765,1
1,221,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,1.0,1.0,1.0,1.0,1.0,1.0,Climatic prerequisites for changing activity i...,2414,2
2,441,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,1.0,1.0,1.0,1.0,1.0,1.0,"Temporal tendency, seasonality and relationshi...",2385,3
3,80,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,1.0,1.0,1.0,1.0,1.0,1.0,Effect of meteorological factors on Hyalomma s...,1978,4
4,178,h-05-single-answer-2-confidence,gpt-4o-mini,cchf,rainfall,1.0,1.0,1.0,1.0,1.0,1.0,Applying network analysis process (Anp) and ge...,1836,5


## Review Document

Creating a CSV file for reviewing the results.

In [12]:
selected_columns = [col for col in df_comb_results.columns if col in ['id', 'experiment', 'model'] or col.startswith('reason_') or col.startswith('answer_') or col.startswith('conf_')]
df_review_ref = pd.merge(
    df_comb_results[selected_columns], 
    df_sample[['id', 'target']], 
    how='left',
    left_on='id', 
    right_on='id'
)

In [13]:
df_review = pd.merge(
    df_comb_results,
    df_review_ref, 
    how='left',
    left_on=['id', 'experiment', 'model'], 
    right_on=['id', 'experiment', 'model']
)


Creating separate files for each experiment

In [14]:
print(df_review.shape)
groups = df_review.groupby(["model", "experiment"])
for key, data in groups:
    print(f"{key[0]} - {key[1]}")
    df_output = df_review.query(f"model == '{key[0]}' & experiment == '{key[1]}'")
    print(df_output.shape)
    df_output.to_csv(review_root_dir + f'/review_h-01_{key[0]}_{key[1]}.csv', header=True, index=False, encoding='utf-8')

    #reporting
    df_reporting = df_output[['id', 'experiment', 'model', 'disease', 'variable', 'tag', 'qscore_mean', 'ranking']].copy()
    df_reporting.rename(columns={"qscore_mean": "score"}, inplace=True)
    df_reporting.to_csv(reports_root_dir + f'/report_{key[0]}_{key[1]}.csv', header=True, index=False, encoding='utf-8')

(2905, 15)
gpt-4o-mini - h-05-single-answer-2-confidence
(2905, 15)
