In [46]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt


In [69]:

def get_auc(test_question):
    label_encoder = LabelEncoder()
    test_question['label_encoded'] = label_encoder.fit_transform(test_question['label'])
    test_question['extracted_answer_encoded'] = label_encoder.transform(test_question['extracted_answer'])
    auc_score = roc_auc_score(test_question['label_encoded'], test_question['extracted_answer_encoded'])
    return auc_score

def extract_answer(text):
    pattern = r"(True|False|Don't know)"
    matches = re.findall(pattern, text)
    return matches



In [93]:
# RESPONSE_PATH = "../../../data/analysis_results/Llama_2_13b_chat_hf_node_retrieval_rag_based_response.csv"

RESPONSE_PATH = "../../../data/analysis_results/Llama_2_13b_chat_hf_prompt_based_response.csv"


In [94]:
response_df = pd.read_csv(RESPONSE_PATH)

response_df.loc[:, 'extracted_answer'] = response_df['llm_answer'].apply(extract_answer)
response_df.loc[:, "answer_count"] = response_df.extracted_answer.apply(lambda x:len(x))

response_df_multiple_answers = response_df[response_df.answer_count > 1]
response_df_single_answer = response_df.drop(response_df_multiple_answers.index)
response_df_single_answer.drop("answer_count", axis=1, inplace=True)


response_df_multiple_answers_ = []
for index, row in response_df_multiple_answers.iterrows():
    if row["extracted_answer"][0] == row["extracted_answer"][1]:
        response_df_multiple_answers_.append((row["question"], row["label"], row["llm_answer"], row["extracted_answer"][0]))
    else:
        response_df_multiple_answers_.append((row["question"], row["label"], row["llm_answer"], "Don't know"))

response_df_multiple_answers_ = pd.DataFrame(response_df_multiple_answers_, columns=["question", "label", "llm_answer", "extracted_answer"])

response_df_final = pd.concat([response_df_single_answer, response_df_multiple_answers_], ignore_index=True)
response_df_final = response_df_final.explode("extracted_answer")
response_df_final['extracted_answer'].fillna("Don't know", inplace=True)
response_df_final.head()


Unnamed: 0,question,label,llm_answer,extracted_answer
0,enhanced S-cone syndrome is not a vitreoretina...,False,"{\n""answer"": ""False""\n}\n\nEnhanced S-cone s...",False
1,metronidazole treats crohn's disease,True,"{\n""answer"": ""False""\n}\n\nMetronidazole is ...",False
2,KLEEFSTRA SYNDROME 1 is not associated with Ge...,False,"{\n ""answer"": ""False""\n}",False
3,STARGARDT DISEASE 1 (disorder) is not associat...,False,"{\n ""answer"": ""False""\n}",False
4,Glycogen storage disease type II associates Ge...,True,"{\n""answer"": ""True""\n}",True


In [95]:
response_df_uncertain_response = response_df_final[response_df_final.extracted_answer == "Don't know"]
response_df_certain_response = response_df_final[response_df_final.extracted_answer != "Don't know"]
# response_df_certain_response.loc[:, "extracted_answer"] = response_df_certain_response.extracted_answer.astype(bool)


response_transform = {
    "True" : True,
    "False" : False
}

response_df_certain_response.extracted_answer = response_df_certain_response.extracted_answer.apply(lambda x:response_transform[x])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  response_df_certain_response.extracted_answer = response_df_certain_response.extracted_answer.apply(lambda x:response_transform[x])


In [91]:
total_certain_response = response_df_certain_response.shape[0]
total_uncertain_response = response_df_uncertain_response.shape[0]
total_response = response_df_final.shape[0]

correct_response = response_df_certain_response[response_df_certain_response.label == response_df_certain_response.extracted_answer].shape[0]
incorrect_response = response_df_certain_response[response_df_certain_response.label != response_df_certain_response.extracted_answer].shape[0]

correct_response_ = correct_response/total_response
incorrect_response_ = incorrect_response/total_response
uncertainty = total_uncertain_response/total_response


print("Correct response = ",correct_response_)
print("Incorrect response = ",incorrect_response_)
print("Uncertainty = ",uncertainty)


Correct response =  0.8669950738916257
Incorrect response =  0.12315270935960591
Uncertainty =  0.009852216748768473


In [97]:
total_certain_response = response_df_certain_response.shape[0]
total_uncertain_response = response_df_uncertain_response.shape[0]
total_response = response_df_final.shape[0]

correct_response = response_df_certain_response[response_df_certain_response.label == response_df_certain_response.extracted_answer].shape[0]
incorrect_response = response_df_certain_response[response_df_certain_response.label != response_df_certain_response.extracted_answer].shape[0]

correct_response_ = correct_response/total_response
incorrect_response_ = incorrect_response/total_response
uncertainty = total_uncertain_response/total_response


print("Correct response = ",correct_response_)
print("Incorrect response = ",incorrect_response_)
print("Uncertainty = ",uncertainty)


Correct response =  0.8768472906403941
Incorrect response =  0.10673234811165845
Uncertainty =  0.016420361247947456


In [52]:
total_certain_response = response_df_certain_response.shape[0]
total_uncertain_response = response_df_uncertain_response.shape[0]
total_response = response_df_final.shape[0]

correct_response = response_df_certain_response[response_df_certain_response.label == response_df_certain_response.extracted_answer].shape[0]
incorrect_response = response_df_certain_response[response_df_certain_response.label != response_df_certain_response.extracted_answer].shape[0]

correct_response_ = correct_response/total_response
incorrect_response_ = incorrect_response/total_certain_response
uncertainty = total_uncertain_response/total_response


print("Correct response = ",correct_response_)
print("Incorrect response = ",incorrect_response_)
print("Uncertainty = ",uncertainty)


Correct response =  0.8669950738916257
Incorrect response =  0.12146422628951747
Uncertainty =  0.013136288998357963


In [92]:
from scipy.stats import binom_test

N = response_df_certain_response.shape[0]
x = correct_response
p = response_df_certain_response[response_df_certain_response.label==True].shape[0]/N
p_value = binom_test(x, N, p=p, alternative='greater') 
p_value

H = np.divide(incorrect_response, total_uncertain_response)
print(p_value, H)



8.487249606902626e-35 12.5


  p_value = binom_test(x, N, p=p, alternative='greater')


In [13]:
from scipy.stats import binom_test

N = response_df_certain_response.shape[0]
x = correct_response
p = response_df_certain_response[response_df_certain_response.label==True].shape[0]/N
p_value = binom_test(x, N, p=p, alternative='greater') 
p_value

H = np.divide(false_response, uncertainty)
print(p_value, H)

7.657548241650208e-44 3.8511804384485666


  p_value = binom_test(x, N, p=p, alternative='greater')


In [286]:
total_certain_response = response_df_certain_response.shape[0]
total_uncertain_response = response_df_uncertain_response.shape[0]
total_response = response_df_final.shape[0]

correct_response = response_df_certain_response[response_df_certain_response.label == response_df_certain_response.extracted_answer].shape[0]
incorrect_response = response_df_certain_response[response_df_certain_response.label != response_df_certain_response.extracted_answer].shape[0]

true_response = correct_response/total_certain_response
false_response = incorrect_response/total_certain_response
uncertainty = total_uncertain_response/total_response


print("True response = ",true_response)
print("False response = ",false_response)
print("Uncertainty = ",uncertainty)


True response =  0.8988195615514334
False response =  0.10118043844856661
Uncertainty =  0.026272577996715927


In [291]:
auc_score = get_auc(response_df_certain_response)
print("AUC score = ", auc_score)


AUC score =  0.8829975227085054


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_question['label_encoded'] = label_encoder.fit_transform(test_question['label'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_question['extracted_answer_encoded'] = label_encoder.transform(test_question['extracted_answer'])
