In [49]:
import pandas as pd
import numpy as np
import json
import ast
from tqdm import tqdm
import re
import os
import seaborn as sns
import matplotlib.pyplot as plt
from neo4j import GraphDatabase, basic_auth
from dotenv import load_dotenv
import random
from scipy.stats import sem


In [50]:
def jaccard_similarity(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    intersection = len(set1.intersection(set2))
    union = len(set1) + len(set2) - intersection    
    if union == 0:
        return 0.0
    else:
        jaccard_similarity = intersection / union
        return jaccard_similarity
    
def extract_answer(text):
    pattern = r'{[^{}]*}'
    match = re.search(pattern, text)
    if match:
        return match.group()
    else:
        return None
    
def extract_by_splitting(text):
    compound_list = text.split(':')[1].split("Diseases")[0].split("], ")[0]+"]"
    disease_list = text.split(':')[-1].split("}")[0]
    resp = {}
    resp["Compounds"] = ast.literal_eval(compound_list)
    resp["Diseases"] = ast.literal_eval(disease_list)
    return resp
    

In [58]:
PARENT_PATH = "../../../data/analysis_results/"

FILES = [
    "Llama_2_13b_chat_hf_entity_recognition_based_node_retrieval_rag_based_drug_repurposing_questions_response.csv",
    "gpt_35_turbo_entity_recognition_based_node_retrieval_rag_based_drug_repurposing_questions_response.csv",
    "gpt_4_entity_recognition_based_node_retrieval_rag_based_drug_repurposing_questions_response.csv"
]

MODEL_LIST = ["Llama-13b", "GPT-3.5-Turbo", "GPT-4"]


In [59]:

files = FILES

llm_performance_list = []
llm_performance_list_metric = []
for file_index, file in tqdm(enumerate(files)):
    df = pd.read_csv(os.path.join(PARENT_PATH, file))
    df.dropna(subset=["llm_answer"], inplace=True)
    llm_performance_list_across_questions = []
    for index, row in df.iterrows():
        cmp_gt = ast.literal_eval(row["compound_groundTruth"])
        disease_gt = ast.literal_eval(row["disease_groundTruth"])
        try:
            llm_answer = json.loads(extract_answer(row["llm_answer"]))
        except:
            try:
                llm_answer = json.loads(row.llm_answer + '"]}')
            except:
                try:
                    split_string = row.llm_answer.rsplit(',', 1)
                    join_string = split_string[0] + split_string[1]
                    llm_answer = json.loads(join_string + ']}')
                except:
                    llm_answer = extract_by_splitting(row["llm_answer"])
        cmp_llm = llm_answer["Compounds"]
        disease_llm = llm_answer["Diseases"]
        cmp_similarity = jaccard_similarity(cmp_gt, cmp_llm)
        disease_similarity = jaccard_similarity(disease_gt, disease_llm)
        llm_performance = np.mean([cmp_similarity, disease_similarity])
        llm_performance_list_across_questions.append(llm_performance)
    llm_performance_list.append(llm_performance_list_across_questions)
    llm_performance_list_metric.append((np.mean(llm_performance_list_across_questions), np.std(llm_performance_list_across_questions), sem(llm_performance_list_across_questions), MODEL_LIST[file_index]))
drug_repurpose_perf = pd.DataFrame(llm_performance_list_metric, columns=["performance_mean", "performance_std", "performance_sem", "model_name"])        


0it [00:00, ?it/s]


SyntaxError: closing parenthesis ']' does not match opening parenthesis '{' (<unknown>, line 3)

In [57]:
drug_repurpose_perf

Unnamed: 0,performance_mean,performance_std,performance_sem,model_name
0,0.550097,0.176328,0.021542,GPT-3.5-Turbo
1,0.621062,0.200532,0.024499,GPT-4


In [60]:
row.llm_answer

'  As an expert biomedical researcher, based on the provided context, I can provide the following answer:\n\n{Compounds: [1-(Dimethylamino)-2-methylpropan-2-yl] 2-hydroxy-2,2-diphenylacetate, Methylhomatropine, Drotaverine, Methylatropine, Oxyphencyclimine, Otilonium, Oxyphenonium, Fenoverine, Diphemanil, Mebeverine, Metoclopramide, Mepenzolate, Cinitapride, Mosapride, Itopride, Propulsid, Cimetropium, Dicyclomine, Imipramine, Tridihexethyl, Tiropramide, Alosetron, Atropine, Phloroglucinol, Pipenzolate, Dexmedetomidine, Isopropamide, Isometheptene, Trimebutine, Methantheline, Ciprofloxacin, Butylscopolamine, BELLADONNA, Hexocyclium, Glycopyrronium bromide, Pinaverium, Alverine, Foscarnet, Domperidone, Clidinium, ZINC ion, ANXA11, CHRNA1, CHRNA3, CYP27A1, HOXB1, REPS1, MYORG, DNAH5, PELVIC PAIN, DYSPEPSIA, CHRND, SOX4, SLC10A2, CAMK2B, PLXND1, MED17, HFE, CAMK2A, POLR1D, SOX3, H3-3B, POLR3A, SLC2A3, HSPA9, PHGDH, MRPS25, ARSL, ECM1, SLC46A1, CARMIL2, RPL35A, RYR1, IDS, POLR3B, PHKA2, HL