# QA Experiment - 2 levels

- Using 2 level answers with ChatGPT-4o-mini

Prompt available at: experiments\templates\

In [1]:
%load_ext autoreload
%autoreload 2

Loading internal package

In [2]:
import clizod_ranker as cr

In [3]:
import os
import re
import shutil
import pandas as pd
import re
import numpy as np
import json
import asyncio
import time
import datetime

from random import randrange
from os.path import join, exists

## Load Data
Loading the cleaned data

In [4]:
input_file_path = '../data/data_01_cleaned.csv'
df_sample = pd.read_csv(input_file_path)

print(f"There are {len(df_sample)} rows in this dataset.")
df_sample.head(3)

There are 2905 rows in this dataset.


Unnamed: 0,tag,disease,variable,target_hash,target,review,reference,id
0,0.0,cchf,rainfall,1266833a0c9bd183b54db85128f28024,Scientific Opinion on the Role of Tick Vectors...,1.0,Journal Article,0
1,0.0,cchf,rainfall,c025b214c7370b24a3776e85f6cd285b,A survey of rift valley fever and associated r...,0.0,Journal Article,1
2,0.0,cchf,rainfall,273c3107270be7343e0e43692cde82c3,Occurrence of Rickettsia spp. and Coxiella bur...,0.0,Journal Article,2


## Set experiment directory

In [5]:
exp_root_dir = "../experiments/"
templates_dir = join(exp_root_dir, "templates")
results_root_dir = join(exp_root_dir, "results")
reports_root_dir = join(exp_root_dir, "reports")
review_root_dir = join(exp_root_dir, "review")

#Regular expression to extract out the different parts of the file name. Using _ as a delimiter
reg_exp_file_template = re.compile("(?P<model>[A-za-z0-9-]+)_(?P<exp>[A-za-z0-9-]+).txt")


## Run the model

In [6]:
template_name = 'gpt-4o-mini_h-05-single-answer-2.txt'

components = reg_exp_file_template.search(template_name).groupdict()
model_alias, exp = components.values()
info = cr.MODEL_INFOS[model_alias]

print(f"Initializing to run '{model_alias}' using '{template_name}' template")

promptGenerator = cr.QAPromptGenerator(join(templates_dir, template_name))

llmClient = cr.AsyncLLMClient(info, cr.SINGLE_LEVEL_QA_RESPONSE)

runner = cr.ExperimentRunner(promptGenerator, llmClient, results_root_dir)
await runner.async_run(df_sample, model_alias, exp)

Initializing to run 'gpt-4o-mini' using 'gpt-4o-mini_h-05-single-answer-2.txt' template
Processing prompts for cchf - rainfall
Attempting to processing 454 records for cchf - rainfall
Batch completed in 0:03:05.867062.
Processing prompts for ebola - rainfall
Attempting to processing 915 records for ebola - rainfall
Batch completed in 0:05:47.272297.
Processing prompts for rvf - rainfall
Attempting to processing 537 records for rvf - rainfall
Batch completed in 0:03:12.311930.
Processing prompts for lepto - rainfall
Attempting to processing 999 records for lepto - rainfall
Batch completed in 0:06:23.837144.
All batches completed in 0:18:29.374268.


## Process results

### Read the results

In [16]:
exp_to_load = ['/h-05-single-answer-2/']

ans_map = {
    'yes': 1,
    'no': 0
}

In [17]:
def parse_answers(response):
    # Parse the JSON string
    try:
        response_json = json.loads(response)
        
    except Exception as e:
        print(f"Error: An unexpected error occurred for request: {e}\n{response}")
        response_json = { "results": [] }
    
    
    # Initialize a dictionary to store answers for each question
    answer_dict = {}
    
    # Loop through the 'results' to extract answers
    for result in response_json['results']:
        question_num = result['question_number']     
        reason_text = result['reason'] 
        ans_text = result['answer'].lower()

        answer_dict[f'reason_{question_num}'] = reason_text

        # map the answer - if not 'yes' or 'no' then default to 0.5
        answer_dict[f'answer_{question_num}'] = ans_map.get(ans_text, 0.5)
        
    return pd.Series(answer_dict)

In [18]:
df_comb_results = cr.process_results(df_sample, exp_to_load, results_root_dir, parse_answers)
df_comb_results.head(5)

Reading - ../experiments/results/gpt-4o-mini/h-05-single-answer-2/cchf-rainfall
Reading - ../experiments/results/gpt-4o-mini/h-05-single-answer-2/ebola-rainfall
Reading - ../experiments/results/gpt-4o-mini/h-05-single-answer-2/lepto-rainfall
Reading - ../experiments/results/gpt-4o-mini/h-05-single-answer-2/rvf-rainfall
There are 2905 rows in this dataset.


Unnamed: 0,id,experiment,model,disease,variable,reason_1,answer_1,reason_2,answer_2,reason_3,answer_3,reason_4,answer_4,tag,reference
0,2,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,The study describes the collection and analysi...,1,The study focuses on the detection of Ricketts...,0,The abstract mentions that most infections occ...,1,The research involves field collection of tick...,1,0.0,Journal Article
1,9,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,The study describes a specific investigation i...,1,The study does not measure the incidence or pr...,0,The abstract mentions that the disease appeare...,1,The study is focused on investigating a diseas...,1,0.0,Journal Article
2,5,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,The study presents original data collected fro...,1,The study focuses on Aedes albopictus and its ...,0,The abstract does not discuss environmental fa...,0,The research is based on field surveys and the...,1,0.0,Journal Article
3,8,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,The study involves the collection and analysis...,1,The study measures the presence of the CCHFV g...,1,The abstract does not mention any examination ...,0,The research involves field collection of tick...,1,0.0,Journal Article
4,11,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,The study describes a retrospective screening ...,1,The study focuses on Dengue and Alkhumra Hemor...,0,The abstract does not mention any examination ...,0,The study involves screening samples and analy...,1,0.0,Journal Article


### Calculate Ranking

In [19]:
selected_columns = [col for col in df_comb_results.columns if col in ['id', 'experiment', 'model', 'disease', 'variable', 'tag'] or col.startswith('answer_')]
df_rerank = df_comb_results[selected_columns].copy()

qscore_columns = [col for col in df_rerank.columns if col.startswith('answer_')]
df_rerank['answer_mean'] = df_rerank[qscore_columns].mean(axis=1)
df_rerank.head(5)

Unnamed: 0,id,experiment,model,disease,variable,answer_1,answer_2,answer_3,answer_4,tag,answer_mean
0,2,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,0,1,1,0.0,0.75
1,9,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,0,1,1,0.0,0.75
2,5,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,0,0,1,0.0,0.5
3,8,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,1,0,1,0.0,0.75
4,11,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,0,0,1,0.0,0.5


In [20]:
df_comb_results = cr.apply_len_tie_breaker(df_sample, df_rerank, 'answer_mean')
df_comb_results.head(5)

Unnamed: 0,id,experiment,model,disease,variable,answer_1,answer_2,answer_3,answer_4,tag,answer_mean,target,target_len,ranking
0,407,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,1,1,1,1.0,1.0,Epidemiological Community and Differences of T...,3765,1
1,221,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,1,1,1,1.0,1.0,Climatic prerequisites for changing activity i...,2414,2
2,39,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,1,1,1,1.0,1.0,The Crimean-Congo haemorrhagic fever tick vect...,2393,3
3,441,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,1,1,1,1.0,1.0,"Temporal tendency, seasonality and relationshi...",2385,4
4,436,h-05-single-answer-2,gpt-4o-mini,cchf,rainfall,1,1,1,1,1.0,1.0,Projecting the potential distribution of ticks...,2236,5


## Review Document

Creating a CSV file for reviewing the results.

In [21]:
selected_columns = [col for col in df_comb_results.columns if col in ['id', 'experiment', 'model'] or col.startswith('reason_') ]
df_review_ref = pd.merge(
    df_comb_results[selected_columns], 
    df_sample[['id', 'target']], 
    how='left',
    left_on='id', 
    right_on='id'
)

df_review = pd.merge(
    df_comb_results,
    df_review_ref, 
    how='left',
    left_on=['id', 'experiment', 'model'], 
    right_on=['id', 'experiment', 'model']
)

Creating separate files for each experiment

In [22]:
print(df_review.shape)
groups = df_review.groupby(["model", "experiment"])
for key, data in groups:
    print(f"{key[0]} - {key[1]}")
    df_output = df_review.query(f"model == '{key[0]}' & experiment == '{key[1]}'")
    print(df_output.shape)
    df_output.to_csv(review_root_dir + f'/review_{key[0]}_{key[1]}.csv', header=True, index=False, encoding='utf-8')

    #reporting
    df_reporting = df_output[['id', 'experiment', 'model', 'disease', 'variable', 'tag', 'answer_mean', 'ranking']].copy()
    df_reporting.rename(columns={"answer_mean": "score"}, inplace=True)
    df_reporting.to_csv(reports_root_dir + f'/report_{key[0]}_{key[1]}.csv', header=True, index=False, encoding='utf-8')

(2905, 15)
gpt-4o-mini - h-05-single-answer-2
(2905, 15)
