# GPT-NER 

Workflow used to run GPT queries for the entities under study. The workflow is run separately for each entity.

For each entity, separate results are obtained for notes containing 0, 1 and >1 entities of that type.

In [None]:
import os
import json
import spacy
from main import LargeLanguageModel, PARAMS, log_ner
from costs import GPT3, GPT4
from models import Message, Sample
from database import load_promts
from utils import load_datasets
from dotenv import load_dotenv

In [None]:
load_dotenv()

In [None]:
# Paths are read from environment variables (populated by load_dotenv()).
# os.getenv returns None when a variable is not set.
MODEL_PATH = os.getenv("MODEL_CLINICAL_PATH")
# NOTE(review): CORPUS_PATH is not referenced by the code visible in this
# notebook; only CORPUS_PATH_OUT is passed to load_datasets.
CORPUS_PATH = os.getenv("CORPUS_CLINICAL_PATH")
CORPUS_PATH_OUT = os.getenv("CORPUS_CLINICAL_FILTERED_PATH")

In [None]:
ENTS = spacy.info(MODEL_PATH)['labels']['ner']

In [None]:
label = ENTS[0]
label

In [None]:
MODEL = spacy.load(MODEL_PATH)

# Load corpus

In [None]:
docs_dict = load_datasets(MODEL, CORPUS_PATH_OUT, label)
docs_dict.keys()

In [None]:
docs_1ent = docs_dict["eq1_ents"]
len(docs_1ent)

In [None]:
docs_0ent = docs_dict["eq0_ents"]
len(docs_0ent)

In [None]:
docs_mt1ent = docs_dict["gt1_ents"]
len(docs_mt1ent)

# Load Prompts

In [None]:
prompts = load_promts("prompts-clinical.xlsx")

In [None]:
def load_json(file_name: str):
    """Load and return the parsed content of a JSON file in the ``ds`` folder.

    Args:
        file_name: Name of the JSON file, relative to the ``ds`` directory
            (which is itself relative to the current working directory).

    Returns:
        The deserialized JSON document (whatever ``json.load`` produces).

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    ds_folder = "ds"
    file_path = os.path.join(ds_folder, file_name)

    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform default encoding, which would garble non-ASCII values on
    # systems whose locale is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Load ds/good_values.json; later cells index it by entity label and expect
# each entry to be a list of strings (it is joined with ', ').
file_name = "good_values.json"
json_data = load_json(file_name)

# Show the loaded content for a quick visual check.
if json_data is not None:
    print(json_data)

In [None]:
# Look up the reference values for the current label and flatten them into a
# comma-separated string; it is substituted into the second prompt further
# down.
if json_data is not None and label in json_data:
    values = ', '.join(json_data[label])
    print(label)
    print(values)
else:
    # Fail here instead of only printing: later cells read `values`
    # unconditionally, so continuing would either raise a NameError or, in a
    # re-run notebook, silently reuse the values of a previously used label.
    raise KeyError(f"Not found '{label}' in JSON.")

In [None]:
# First prompt variant: take the message of prompts[1] and replace the "$$"
# placeholder with the current entity label.
prompt = prompts[1]
# NOTE(review): if `prompt.msg` returns the stored object rather than a copy,
# the assignment below also modifies the loaded prompt in place, so re-running
# this cell with a different label would find no "$$" left to replace.
behave =  prompt.msg
behave["content"] = behave["content"].replace("$$", label)

In [None]:
# Second prompt variant: take the message of prompts[2] and fill in two
# placeholders: "$$" with the entity label and "&&" with the comma-separated
# `values` string built from good_values.json.
prompt2 = prompts[2]
# NOTE(review): if `prompt2.msg` returns the stored object rather than a copy,
# the replacements below also modify the loaded prompt in place.
behave2 = prompt2.msg
behave2["content"] = behave2["content"].replace("$$", label)
behave2["content"] = behave2["content"].replace("&&", values)

## Configure LLM

In [None]:
# One model wrapper per cost profile (GPT3 and GPT4), both built with the same
# shared PARAMS; every prompt is later evaluated against each entry.
llms = [ LargeLanguageModel(GPT3, **PARAMS), LargeLanguageModel(GPT4, **PARAMS)]

# Split Corpus

## 0 ent

In [None]:
from utils import select_notes

In [None]:
# The few-shot examples for the 0-entity run are drawn from the notes with
# exactly one entity (docs_1ent). `other_docs` is kept and reused later as the
# evaluation set of the "1 ent" section.
# Presumably select_notes returns 5 selected notes plus the remaining ones;
# verify against utils.select_notes.
sample_docs, other_docs = select_notes(docs_1ent, 5, label)

In [None]:
print(len(sample_docs))

In [None]:
eval_docs = docs_0ent
print(len(eval_docs))

# Generate Samples

In [None]:
from gptFormat import encoder

In [None]:
# Build one few-shot example per selected note: the user turn is the raw note
# text and the assistant turn is the output of encoder(doc, label) for it.
samples = [Sample(user=Message(role="user", content=doc.text),
                  agent=Message(role="assistant", content=encoder(doc, label)))
           for doc in sample_docs]

# Query openAI

In [None]:
from models import SetEvalDocs, Promt
from utils import eval_pipline

In [None]:
# Name the evaluation set after the current label (as the result path already
# is) instead of a hard-coded entity name, so the name stays correct when the
# notebook is run for a different entity.
sets_eval = [SetEvalDocs({"name": f"set_test_{label}_eq0", "docs": eval_docs})]

In [None]:
# Four prompt configurations: the "we" names use `behave`, the others use
# `behave2`; each is tested without examples and with the few-shot `samples`.
promts2test = [
    Promt(behave=behave, name="Zero Shot we"),
    Promt(behave=behave, name="5 Few-shot we", samples=samples),
    Promt(behave=behave2, name="Zero Shot"),
    Promt(behave=behave2, name="5 Few-shot", samples=samples),
]
promts2test

In [None]:
results = eval_pipline(llms, sets_eval, promts2test, label)

In [None]:
results

In [None]:
from utils import dump_results
# Write the 0-entity results to results/sample/<label>_eq0.json.
label_eq0 = f"{label}_eq0"
path = f"results/sample/{label_eq0}.json"
dump_results(results, path)

## 1 ent

In [None]:
print(len(sample_docs))

In [None]:
print(len(other_docs))

In [None]:
eval_docs = other_docs
print(len(eval_docs))

# Generate Samples

In [None]:
# Build one few-shot example per selected note (user turn: note text,
# assistant turn: encoder output). `sample_docs` has not been reassigned since
# the 0-entity section, so the same notes are used here.
samples = [Sample(user=Message(role="user", content=doc.text),
                  agent=Message(role="assistant", content=encoder(doc, label)))
           for doc in sample_docs]

# Query openAI

In [None]:
# Name the evaluation set after the current label (as the result path already
# is) instead of a hard-coded entity name, so the name stays correct when the
# notebook is run for a different entity.
sets_eval = [SetEvalDocs({"name": f"set_test_{label}_eq1", "docs": eval_docs})]

In [None]:
# Four prompt configurations: the "we" names use `behave`, the others use
# `behave2`; each is tested without examples and with the few-shot `samples`.
promts2test = [
    Promt(behave=behave, name="Zero Shot we"),
    Promt(behave=behave, name="5 Few-shot we", samples=samples),
    Promt(behave=behave2, name="Zero Shot"),
    Promt(behave=behave2, name="5 Few-shot", samples=samples),
]
promts2test

In [None]:
results = eval_pipline(llms, sets_eval, promts2test, label)

In [None]:
results

In [None]:
# Write the 1-entity results to results/sample/<label>_eq1.json.
label_eq1 = f"{label}_eq1"
path = f"results/sample/{label_eq1}.json"
dump_results(results, path)

## >1 ent

In [None]:
sample_docs, other_docs = select_notes(docs_mt1ent, 5, label)

In [None]:
print(len(sample_docs))

In [None]:
print(len(other_docs))

In [None]:
eval_docs = other_docs
print(len(eval_docs))

# Generate Samples

In [None]:
# Build one few-shot example per note selected from the >1-entity notes:
# the user turn is the raw note text and the assistant turn is the output of
# encoder(doc, label) for it.
samples = [Sample(user=Message(role="user", content=doc.text),
                  agent=Message(role="assistant", content=encoder(doc, label)))
           for doc in sample_docs]

# Query openAI

In [None]:
# Name the evaluation set after the current label (as the result path already
# is) instead of a hard-coded entity name, so the name stays correct when the
# notebook is run for a different entity.
sets_eval = [SetEvalDocs({"name": f"set_test_{label}_gt1", "docs": eval_docs})]

In [None]:
# Four prompt configurations: the "we" names use `behave`, the others use
# `behave2`; each is tested without examples and with the few-shot `samples`.
promts2test = [
    Promt(behave=behave, name="Zero Shot we"),
    Promt(behave=behave, name="5 Few-shot we", samples=samples),
    Promt(behave=behave2, name="Zero Shot"),
    Promt(behave=behave2, name="5 Few-shot", samples=samples),
]
promts2test

In [None]:
results = eval_pipline(llms, sets_eval, promts2test, label)

In [None]:
results

In [None]:
# Write the >1-entity results to results/sample/<label>_gt1.json.
label_gt1 = f"{label}_gt1"
path = f"results/sample/{label_gt1}.json"
dump_results(results, path)