In [25]:
import pandas as pd
import re
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder


In [46]:

def get_auc(test_question):
    label_encoder = LabelEncoder()
    test_question['label_encoded'] = label_encoder.fit_transform(test_question['label'])
    test_question['random_answer_encoded'] = label_encoder.transform(test_question['random_answer'])
    auc_score = roc_auc_score(test_question['label_encoded'], test_question['random_answer_encoded'])
    return auc_score

def extract_answer(text):
    pattern = r"(True|False|Don't know)"
    matches = re.findall(pattern, text, re.IGNORECASE)
    return matches

# def extract_answer(text):
#     pattern = r'{answer : <(True|False|Don\'t know)>}'
#     matches = re.search(pattern, text)
#     if matches:
#         return matches.group(1)
#     else:
#         return None


In [44]:
RESPONSE_PATH = "../../../data/analysis_results/Llama_2_7b_Chat_GPTQ_prompt_based_response.csv"


In [35]:
response_df = pd.read_csv(RESPONSE_PATH)

pattern = r'{answer : (?:<([^>]+)>)|([^}\n]+)}'

response_df['extracted_answer'] = response_df['llm_answer'].apply(lambda x: ' or '.join(filter(None, re.findall(pattern, x))))
response_df.head()

# print(response_df.shape)


TypeError: sequence item 0: expected str instance, tuple found

In [41]:
response_df.iloc[1].llm_answer

"  {answer : <True> or <False> or <Don't know>}\n\n{answer : False}"

In [47]:
response_df = pd.read_csv(RESPONSE_PATH)

response_df.loc[:, 'extracted_answer'] = response_df['llm_answer'].apply(extract_answer)
response_df.head()
# response_df = response_df.explode("llm_answer")



Unnamed: 0,question,label,llm_answer,extracted_answer
0,enhanced S-cone syndrome is not a vitreoretina...,False,{answer : <False>},[False]
1,metronidazole treats crohn's disease,True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, False]"
2,KLEEFSTRA SYNDROME 1 is not associated with Ge...,False,{answer : <False>},[False]
3,STARGARDT DISEASE 1 (disorder) is not associat...,False,{answer : False},[False]
4,Juvenile polyposis syndrome associates Gene SMAD4,True,{answer : True},[True]


In [8]:
response_df.llm_answer.unique()

array(['False', "Don't know", 'True', 'true'], dtype=object)

In [168]:
response_df_uncertain_response = response_df[response_df.llm_answer == "Don't know"]
response_df_certain_response = response_df[response_df.llm_answer != "Don't know"]
response_df_certain_response.loc[:, "llm_answer"] = response_df_certain_response.llm_answer.astype(bool)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  response_df_certain_response.loc[:, "llm_answer"] = response_df_certain_response.llm_answer.astype(bool)
  response_df_certain_response.loc[:, "llm_answer"] = response_df_certain_response.llm_answer.astype(bool)


In [169]:
total_certain_response = response_df_certain_response.shape[0]
total_uncertain_response = response_df_uncertain_response.shape[0]
total_response = response_df.shape[0]

correct_response = response_df_certain_response[response_df_certain_response.label == response_df_certain_response.llm_answer].shape[0]
incorrect_response = response_df_certain_response[response_df_certain_response.label != response_df_certain_response.llm_answer].shape[0]

true_response = correct_response/total_response
false_response = incorrect_response/total_response
uncertainty = total_uncertain_response/total_response


print("True response = ",true_response)
print("False response = ",false_response)
print("Uncertainty = ",uncertainty)


True response =  0.9507575757575758
False response =  0.04734848484848485
Uncertainty =  0.001893939393939394


In [170]:
response_df_certain_response

Unnamed: 0,question,label,llm_answer
0,Sandhoff Disease associates Gene HEXB,True,True
1,Liver carcinoma associates Gene TP53,True,True
2,Malignant neoplasm of breast associates Gene TP53,True,True
3,Li-Fraumeni Syndrome associates Gene TP53,True,True
4,Sarcoma associates Gene TP53,True,True
...,...,...,...
523,Disease ontology identifier for Norrie disease...,False,True
524,Disease ontology identifier for infantile myof...,False,True
525,Disease ontology identifier for otospondylomeg...,False,True
526,Disease ontology identifier for Pelizaeus-Merz...,False,True


In [171]:
response_df

Unnamed: 0,question,label,llm_answer
0,Sandhoff Disease associates Gene HEXB,True,False
1,Liver carcinoma associates Gene TP53,True,True
2,Malignant neoplasm of breast associates Gene TP53,True,True
3,Li-Fraumeni Syndrome associates Gene TP53,True,True
4,Sarcoma associates Gene TP53,True,True
...,...,...,...
523,Disease ontology identifier for Norrie disease...,False,True
524,Disease ontology identifier for infantile myof...,False,True
525,Disease ontology identifier for otospondylomeg...,False,False
526,Disease ontology identifier for Pelizaeus-Merz...,False,True


In [48]:
response_df.loc[:, "answer_count"] = response_df.extracted_answer.apply(lambda x:len(x))
response_df.answer_count.max()

5

In [51]:
response_df[response_df.answer_count > 1]

Unnamed: 0,question,label,llm_answer,extracted_answer,answer_count
1,metronidazole treats crohn's disease,True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, False]",4
24,Variant rs4402960 associates Diabetes Mellitus...,True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, Don't know]",4
50,Deficiency of pyruvate kinase associates Gene ...,True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, False]",4
90,Variant rs13266634 associates Diabetes Mellitu...,True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, Don't know]",4
116,Alpha-Sarcoglycanopathies associates Gene SGCA,True,{SGCA : <True> or <False> or <Don't know>},"[True, False, Don't know]",3
118,Shprintzen syndrome associates Gene TBX1,True,{True : True},"[True, True]",2
176,Juvenile Myelomonocytic Leukemia associates Ge...,True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, False]",4
178,Ornithine carbamoyltransferase deficiency asso...,True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, Don't know]",4
201,Deficiency of butyryl-CoA dehydrogenase associ...,True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, True]",4
244,"FUNDUS DYSTROPHY, PSEUDOINFLAMMATORY, OF SORSB...",True,{answer : <True> or <False> or <Don't know>}...,"[True, False, Don't know, True]",4


In [64]:
ind = 349
response_df.iloc[ind].llm_answer

"  {OAT : <True> or <False> or <Don't know>}"

In [65]:
response_df.iloc[ind].llm_answer.split("\n\n")[0]

"  {OAT : <True> or <False> or <Don't know>}"

In [24]:
response_df.iloc[577]

question      Congenital bilateral aplasia of vas deferens a...
label                                                      True
llm_answer          {answer : <True> or <False> or <Don't know>
Name: 577, dtype: object