In [90]:
import pandas as pd
import numpy as np
import json
import ast
from tqdm import tqdm
import re
import os
import seaborn as sns
import matplotlib.pyplot as plt
from neo4j import GraphDatabase, basic_auth
from dotenv import load_dotenv
import random
from scipy.stats import sem


In [91]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is the size of the intersection divided by the size
    of the union.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        A float in [0, 1]; 0.0 when both collections are empty (empty union).
    """
    first, second = set(list1), set(list2)
    shared = len(first & second)
    # Inclusion-exclusion: size of union = |first| + |second| - |shared|.
    total = len(first) + len(second) - shared
    if not total:
        return 0.0
    return shared / total
    
def extract_answer(text):
    """Return the first brace-delimited span of ``text`` that has no nested braces.

    The span starts at a ``{``, ends at the next ``}``, and contains neither
    character in between (newlines are allowed).

    Args:
        text: String to search.

    Returns:
        The matched substring including its braces, or None if there is no match.
    """
    found = re.search(r'{[^{}]*}', text)
    return found.group() if found else None
    
def extract_by_splitting(text):
    """Recover the compound and disease lists from an answer by string splitting.

    The compound list is taken from the second colon-delimited piece of
    ``text``, cut before the word "Diseases" and before the first "], ",
    with the closing bracket put back. The disease list is the last
    colon-delimited piece, cut before the first "}". Both pieces are then
    evaluated as Python literals.

    Args:
        text: Raw answer string.

    Returns:
        A dict with keys "Compounds" and "Diseases".

    Raises:
        IndexError: if ``text`` contains no colon.
        ValueError, SyntaxError: if a piece is not a valid Python literal.
    """
    colon_parts = text.split(':')
    before_diseases = colon_parts[1].split("Diseases")[0]
    # Splitting on "], " removes the closing bracket, so it is re-appended.
    compounds_src = before_diseases.split("], ")[0] + "]"
    diseases_src = colon_parts[-1].split("}")[0]
    return {
        "Compounds": ast.literal_eval(compounds_src),
        "Diseases": ast.literal_eval(diseases_src),
    }

def _parse_llm_answer(raw_answer):
    """Turn one raw LLM response string into a dict with the answer lists.

    Strategies are tried in order; each runs only if the previous one failed:

    1. JSON-decode the first flat ``{...}`` span found by ``extract_answer``.
    2. JSON-decode the raw text with ``'"]}'`` appended (presumably repairs an
       answer that was cut off inside its last string item).
    3. Remove the last comma and append ``']}'`` (presumably repairs an answer
       that was cut off right after a separator).
    4. Fall back to ``extract_by_splitting``; its errors propagate to the caller.
    """
    # Explicit types instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit are no longer swallowed. json.JSONDecodeError is a ValueError;
    # TypeError covers extract_answer returning None; IndexError covers an
    # answer that contains no comma in strategy 3.
    recoverable = (ValueError, TypeError, IndexError)
    try:
        return json.loads(extract_answer(raw_answer))
    except recoverable:
        pass
    try:
        return json.loads(raw_answer + '"]}')
    except recoverable:
        pass
    try:
        head_and_tail = raw_answer.rsplit(',', 1)
        return json.loads(head_and_tail[0] + head_and_tail[1] + ']}')
    except recoverable:
        pass
    return extract_by_splitting(raw_answer)


def get_gpt_performance(files, model_names=None, parent_path=None):
    """Score LLM answers against ground truth for each response file.

    For every row with a non-null ``llm_answer``, the compound and disease
    lists parsed from the answer are compared to the ground-truth lists with
    ``jaccard_similarity``; the row's score is the mean of the two similarities.

    Args:
        files: CSV file names, each joined onto ``parent_path``. Each file must
            have the columns ``llm_answer``, ``compound_groundTruth`` and
            ``disease_groundTruth``.
        model_names: Display names, one per entry of ``files`` and in the same
            order. Defaults to the module-level ``GPT_MODEL_LIST`` (the
            original behaviour); pass another list to score other model
            families without mislabelling them.
        parent_path: Directory containing the files. Defaults to the
            module-level ``PARENT_PATH``.

    Returns:
        A tuple ``(llm_performance_list, drug_repurpose_perf)``:
        ``llm_performance_list`` holds one list of per-question scores per
        file; ``drug_repurpose_perf`` is a DataFrame with columns
        ``performance_mean``, ``performance_std``, ``performance_sem`` and
        ``model_name``, one row per file.

    Raises:
        IndexError: if ``model_names`` has fewer entries than ``files``.
        KeyError: if a parsed answer lacks "Compounds" or "Diseases".
    """
    # Defaults are resolved at call time because the globals are assigned in a
    # later cell than this definition.
    if model_names is None:
        model_names = GPT_MODEL_LIST
    if parent_path is None:
        parent_path = PARENT_PATH
    files = list(files)  # gives tqdm a length to report progress against

    llm_performance_list = []
    llm_performance_list_metric = []
    for file_index, file in tqdm(enumerate(files), total=len(files)):
        # Reassign instead of dropna(inplace=True); the result is the same.
        responses = pd.read_csv(os.path.join(parent_path, file))
        responses = responses.dropna(subset=["llm_answer"])

        llm_performance_list_across_questions = []
        for _, row in responses.iterrows():
            # Ground-truth columns store Python list literals as strings.
            cmp_gt = ast.literal_eval(row["compound_groundTruth"])
            disease_gt = ast.literal_eval(row["disease_groundTruth"])
            llm_answer = _parse_llm_answer(row["llm_answer"])
            cmp_similarity = jaccard_similarity(cmp_gt, llm_answer["Compounds"])
            disease_similarity = jaccard_similarity(disease_gt, llm_answer["Diseases"])
            llm_performance_list_across_questions.append(
                np.mean([cmp_similarity, disease_similarity])
            )

        llm_performance_list.append(llm_performance_list_across_questions)
        llm_performance_list_metric.append((
            np.mean(llm_performance_list_across_questions),
            np.std(llm_performance_list_across_questions),
            sem(llm_performance_list_across_questions),
            model_names[file_index],
        ))

    drug_repurpose_perf = pd.DataFrame(
        llm_performance_list_metric,
        columns=["performance_mean", "performance_std", "performance_sem", "model_name"],
    )
    return llm_performance_list, drug_repurpose_perf
    

In [92]:
# Directory holding the per-model response CSVs; each file name below is
# joined onto this path before being read.
PARENT_PATH = "../../../data/analysis_results/"

# Response files for the GPT models. The order must line up with
# GPT_MODEL_LIST: get_gpt_performance labels the i-th file with
# GPT_MODEL_LIST[i] by default.
GPT_FILES = [
    "gpt_35_turbo_entity_recognition_based_node_retrieval_rag_based_drug_repurposing_questions_response.csv",
    "gpt_4_entity_recognition_based_node_retrieval_rag_based_drug_repurposing_questions_response.csv"
]
# Response file(s) for the Llama model.
LLAMA_FILES = [
    "Llama_2_13b_chat_hf_entity_recognition_based_node_retrieval_rag_based_drug_repurposing_questions_response.csv"
]

# Display names used in the summary table, in the same order as GPT_FILES.
GPT_MODEL_LIST = ["GPT-3.5-Turbo", "GPT-4"]
# Display name for the Llama file; not referenced by any cell shown here.
LLAMA_MODEL_LIST = ["Llama-2-13b"]


In [95]:
# Score both GPT response files: the first result holds the per-question
# scores (one list per file), the second is the per-model summary DataFrame.
gpt_llm_performance_list, gpt_drug_repurpose_perf = get_gpt_performance(GPT_FILES)


2it [00:00, 39.06it/s]


In [96]:
# Display mean / std / SEM of the per-question scores for each GPT model.
gpt_drug_repurpose_perf

Unnamed: 0,performance_mean,performance_std,performance_sem,model_name
0,0.550097,0.176328,0.021542,GPT-3.5-Turbo
1,0.621062,0.200532,0.024499,GPT-4


In [98]:
# Load the Llama response file (no null filtering applied here) and display it.
df = pd.read_csv(os.path.join(PARENT_PATH, LLAMA_FILES[0]))
df

Unnamed: 0,disease,compound_groundTruth,disease_groundTruth,text,llm_answer
0,hemolytic-uremic syndrome,"['ECULIZUMAB', 'GALASOMITE', 'RAVULIZUMAB']","['HELLP syndrome', 'membranoproliferative glom...",What are the drugs that treat 'hemolytic-uremi...,"{\n""Compounds"": [\n""RAVULIZUMAB"",\n""ECULIZUM..."
1,hemophagocytic lymphohistiocytosis,"['Methylprednisolone', 'Ruxolitinib', 'GLOBULI...","['Crimean-Congo hemorrhagic fever', 'histoplas...",What are the drugs that treat 'hemophagocytic ...,"{\n""Compounds"": [\n""Ruxolitinib"",\n""EMAPALUM..."
2,hepatobiliary disease,"['Dexmedetomidine', 'Oxyphencyclimine', 'Otilo...","['splenic disease', 'gastrointestinal system c...",What are the drugs that treat 'hepatobiliary d...,{Compounds: [1-(Dimethylamino)-2-methylpropa...
3,hereditary hemorrhagic telangiectasia,"['BEVACIZUMAB', 'Propranolol', 'Mupirocin', 'S...","['hepatopulmonary syndrome', 'Coats disease', ...",What are the drugs that treat 'hereditary hemo...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
4,hereditary lymphedema I,"['ZINC hydrate', 'Fibrin']","['elephantiasis', 'hypotrichosis', 'immunodefi...",What are the drugs that treat 'hereditary lymp...,"Based on the context provided, I can provide..."
...,...,...,...,...,...
63,urea cycle disorder,"['4-Phenylbutyric acid', 'Glycerol phenylbutyr...","['autosomal dominant osteopetrosis 2', 'hypera...",What are the drugs that treat 'urea cycle diso...,"{\n""Compounds"": [\n""4-Phenylbutyric acid"",\n..."
64,urinary bladder cancer,"['Valrubicin', 'Vinblastine', 'Lenvatinib', 'S...",['transitional cell carcinoma'],What are the drugs that treat 'urinary bladder...,{Compounds: [11-Cyclobutanedicarboxylic acid...
65,vascular disease,"['Clopidogrel', 'Oxygen', 'Aspirin', 'Ambrisen...","['aortic aneurysm', 'anterior spinal artery sy...",What are the drugs that treat 'vascular diseas...,"{\n""Compounds"": [\n""Ambrisentan"",\n""Clopidog..."
66,viral infectious disease,"['Abacavir', 'Podofilox', 'Amenamevir', 'Falda...","['tick-borne encephalitis', 'Crimean-Congo hem...",What are the drugs that treat 'viral infectiou...,"{\n""Compounds"": [\n""Vidarabine"",\n""Amprenavi..."


In [103]:
# Show the raw llm_answer string of the first row.
df.iloc[0].llm_answer

'  {\n"Compounds": [\n"RAVULIZUMAB",\n"ECULIZUMAB",\n"mitomycin C",\n"quinine",\n"Everolimus",\n"GALASOMITE"\n],\n"Diseases": [\n"hemolytic-uremic syndrome",\n"membranoproliferative glomerulonephritis",\n"malignant hypertension",\n"HELLP syndrome"\n]\n}\n\nThe drugs that treat hemolytic-uremic syndrome are:\n\n* RAVULIZUMAB\n* ECULIZUMAB\n* mitomycin C\n* quinine\n* Everolimus\n* GALASOMITE\n\nThese drugs can be repurposed to treat other diseases that resemble hemolytic-uremic syndrome, such as:\n\n* Membranoproliferative glomerulonephritis\n* Malignant hypertension\n* HELLP syndrome'

In [87]:
# NOTE(review): re and json are already imported in the first cell; these
# repeated imports are redundant.
import re
import json

# Your input string
# NOTE(review): `row` is not assigned at top level in any cell visible here
# (it only appears as a loop variable inside get_gpt_performance), so this
# presumably relies on leftover kernel state; on a fresh kernel it would raise
# NameError unless `row` is defined first. TODO confirm which row was intended.
input_string = row.llm_answer

# Use regular expression to extract the desired portion:
# everything between the literal "{Compounds:" and the next "Diseases:".
# re.DOTALL lets "." match newlines, so the capture may span several lines.
pattern = r'{Compounds:(.*?)Diseases:'
match = re.search(pattern, input_string, re.DOTALL)

if match:
    compounds_section = match.group(1)
    # Remove leading and trailing whitespace
    compounds_section = compounds_section.strip()
    # Add square brackets to make it a valid JSON list
    # NOTE(review): in the recorded run the captured text already began with
    # "[" and its items were unquoted, so the wrapped string is not valid JSON.
    compounds_section = f"[{compounds_section}]"

    # Parse the string into a list
    # NOTE(review): the recorded run fails on this line with JSONDecodeError.
    compounds_list = json.loads(compounds_section)
    print(compounds_list)
else:
    print("Pattern not found.")


JSONDecodeError: Expecting ',' delimiter: line 1 column 4 (char 3)

In [88]:
# Inspect the bracket-wrapped string that was handed to json.loads.
compounds_section

'[[1-(Dimethylamino)-2-methylpropan-2-yl] 2-hydroxy-2,2-diphenylacetate, Methylhomatropine, Drotaverine, Methylatropine, Oxyphencyclimine, Otilonium, Oxyphenonium, Fenoverine, Diphemanil, Mebeverine, Metoclopramide, Mepenzolate, Cinitapride, Mosapride, Itopride, Propulsid, Cimetropium, Dicyclomine, Imipramine, Tridihexethyl, Tiropramide, Alosetron, Atropine, Phloroglucinol, Pipenzolate, Dexmedetomidine, Isopropamide, Isometheptene, Trimebutine, Methantheline, Ciprofloxacin, Butylscopolamine, BELLADONNA, Hexocyclium, Glycopyrronium bromide, Pinaverium, Alverine, Foscarnet, Domperidone, Clidinium, ZINC ion, ANXA11, CHRNA1, CHRNA3, CYP27A1, HOXB1, REPS1, MYORG, DNAH5, PELVIC PAIN, DYSPEPSIA, CHRND, SOX4, SLC10A2, CAMK2B, PLXND1, MED17, HFE, CAMK2A, POLR1D, SOX3, H3-3B, POLR3A, SLC2A3, HSPA9, PHGDH, MRPS25, ARSL, ECM1, SLC46A1, CARMIL2, RPL35A, RYR1, IDS, POLR3B, PHKA2, HLA-B, TYMS, ROR2, ANTXR2, Fibrin},]'