In [1]:
import os
import json
from tqdm import tqdm

from configs import ConfigPath
from knowledge_graph.loader import GraphLoader

In [2]:


def read_file(file_path: str):
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        return data

data = read_file(file_path=os.path.join(ConfigPath.RAW_DATA_DIR, "pqa_labeled.json"))


file_path = os.path.join(ConfigPath.KG_CONFIG_DIR, "schema_config.json")
graph_loader = GraphLoader.from_json_file(path=file_path)             
        

In [None]:
def _map_nodes(schema_nodes, data):
    pass

def _map_document_node(document_schema, item_info, pmid):

    label_key = "label"
    properties_key = "properties"
    pmid_key = "pmid"

    info = {key.lower(): value for key, value in item_info.items()}
    item = {
        label_key: document_schema[label_key],
        properties_key: {key: info[key] for key in document_schema[properties_key].keys() if key != pmid_key}
    }
    item[properties_key][pmid_key] = pmid
    return item




def map_data_to_schema(schema_config, data):
    nodes_schema = schema_config['nodes']

    document_key = "document"

    mapped_items = []
    for pmid, info in tqdm(data.items(), total=len(data), desc="mapping data to schema..."):
        
        document_node = _map_document_node(document_schema=nodes_schema[document_key], item_info=info, pmid=pmid)
        print(document_node)
        break


    

In [22]:
map_data_to_schema(schema_config=graph_loader.schema_config, data=data)

mapping data to schema...:   0%|          | 0/1000 [00:00<?, ?it/s]

{'label': 'DOCUMENT', 'properties': {'year': '2011', 'reasoning_required_pred': 'yes', 'reasoning_free_pred': 'yes', 'final_decision': 'yes', 'pmid': '21645374'}}





### Loading to graph

In [3]:
from knowledge_graph.loader import GraphLoader

graph_loader = GraphLoader(data=data)
graph_loader.create_schema_node_data()

  0%|          | 0/1000 [00:00<?, ?it/s]


[{'article': {'properties': {'pmid': '21645374', 'year': '2011'}},
  'context': {'properties': {'text_content': 'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in

In [3]:
from data_preprocessing.text_splitter import TextSplitter

text = """[
"The transanal endorectal pull-through (TERPT) is becoming the most popular procedure in the treatment of Hirschsprung disease (HD), but overstretching of the anal sphincters remains a critical issue that may impact the continence. This study examined the long-term outcome of TERPT versus conventional transabdominal (ABD) pull-through for HD.",
"Records of 41 patients more than 3 years old who underwent a pull-through for HD (TERPT, n = 20; ABD, n = 21) were reviewed, and their families were thoroughly interviewed and scored via a 15-item post-pull-through long-term outcome questionnaire. Patients were operated on between the years 1995 and 2003. During this time, our group transitioned from the ABD to the TERPT technique. Total scoring ranged from 0 to 40: 0 to 10, excellent; 11 to 20 good; 21 to 30 fair; 31 to 40 poor. A 2-tailed Student t test, analysis of covariance, as well as logistic and linear regression were used to analyze the collected data with confidence interval higher than 95%.",
"Overall scores were similar. However, continence score was significantly better in the ABD group, and the stool pattern score was better in the TERPT group. A significant difference in age at interview between the 2 groups was noted; we therefore reanalyzed the data controlling for age, and this showed that age did not significantly affect the long-term scoring outcome between groups."
        ]"""


text_splitter = TextSplitter(chunk_size=50, chunk_overlap=10)

chunks = text_splitter.split_text(text=text)
chunks


['[\n"The transanal endorectal pull-through (TERPT) is becoming the most popular procedure in the treatment of Hirschsprung disease (HD), but overstretching of the anal sphincters remains a critical issue that may impact the contin',
 'ers remains a critical issue that may impact the continence. This study examined the long-term outcome of TERPT versus conventional transabdominal (ABD) pull-through for HD.",\n"Records of 41 patients more than 3 years old who',
 ' 41 patients more than 3 years old who underwent a pull-through for HD (TERPT, n = 20; ABD, n = 21) were reviewed, and their families were thoroughly interviewed and scored via a 15-item post',
 ' thoroughly interviewed and scored via a 15-item post-pull-through long-term outcome questionnaire. Patients were operated on between the years 1995 and 2003. During this time, our group transitioned from the ABD to the TERPT technique',
 'ed from the ABD to the TERPT technique. Total scoring ranged from 0 to 40: 0 to 10, excellent; 11

In [9]:
graph_loader = GraphLoader(data=data)

result = graph_loader.create_schema_node_data()
result

  0%|          | 0/1000 [00:00<?, ?it/s]


[{'article': {'properties': {'pmid': '21645374', 'year': '2011'}},
  'context': {'properties': {'text_content': 'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in

### Huggingface embeddings


In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

# model_name = "sentence-transformers/all-mpnet-base-v2"
model_name = "neuml/pubmedbert-base-embeddings"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
from llms.embedding_model import EmbeddingModel

test = "hello world"

model_name = "neuml/pubmedbert-base-embeddings"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}
embedding_model = EmbeddingModel(model_name=model_name,
                                 model_kwargs=model_kwargs,
                                encode_kwargs=encode_kwargs)

2025-02-12 20:47:07,392 [DEBUG] embedding_model - CUDA is available, using GPU
  from .autonotebook import tqdm as notebook_tqdm
2025-02-12 20:47:32,115 [DEBUG] embedding_model - Embedding model initialized: neuml/pubmedbert-base-embeddings


In [2]:
embedding = embedding_model.embed_query("Hello world")
len(embedding)

768

In [4]:
import getpass
import os

# if not os.environ.get("GROQ_API_KEY"):
#   os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

from langchain.chat_models import init_chat_model

model = init_chat_model("llama3-8b-8192", model_provider="groq")

In [2]:
model.invoke("hey")

AIMessage(content="Hey! How's it going?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 8, 'prompt_tokens': 11, 'total_tokens': 19, 'completion_time': 0.006666667, 'prompt_time': 0.001745382, 'queue_time': 0.322890651, 'total_time': 0.008412049}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_179b0f92c9', 'finish_reason': 'stop', 'logprobs': None}, id='run-3206ac3a-685f-4af3-9e96-b15f3dd34e39-0', usage_metadata={'input_tokens': 11, 'output_tokens': 8, 'total_tokens': 19})

In [5]:
from llms.llm import LLM

model_name = "deepseek-r1-distill-llama-70b"
model = LLM.from_name(model_name=model_name)
model.invoke("hellooo")

AIMessage(content='<think>\n\n</think>\n\nHellooo! 😊 How can I assist you today?', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 6, 'total_tokens': 23, 'completion_time': 0.061818182, 'prompt_time': 0.003499071, 'queue_time': 0.614788378, 'total_time': 0.065317253}, 'model_name': 'deepseek-r1-distill-llama-70b', 'system_fingerprint': 'fp_95405939f8', 'finish_reason': 'stop', 'logprobs': None}, id='run-503a256f-026d-4011-96e4-39675bd74914-0', usage_metadata={'input_tokens': 6, 'output_tokens': 17, 'total_tokens': 23})

In [81]:
from pydantic import BaseModel, create_model, Field

def create_dynamic_basemodel(model_name: str, properties: dict, model_description: str = None):
    DynamicModel = create_model(model_name, __doc__=model_description, **properties)
    return DynamicModel

properties = {"reference": (str, Field(description="The extracted reference segment from the input text"))}
MitoModel = create_dynamic_basemodel(model_name="Mitochondria", properties=properties)
MitoModel

__main__.Mitochondria

In [82]:
MitoModel.model_json_schema()

{'properties': {'reference': {'description': 'The extracted reference segment from the input text',
   'title': 'Reference',
   'type': 'string'}},
 'required': ['reference'],
 'title': 'Mitochondria',
 'type': 'object'}

In [8]:
test_text = """"Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.",
"The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leaves were stained with the mitochondrial dye MitoTracker Red CMXRos and examined. Mitochondrial dynamics were delineated into four categories (M1-M4) based on characteristics including distribution, motility, and membrane potential (\u0394\u03a8m). A TUNEL assay showed fragmented nDNA in a gradient over these mitochondrial stages. Chloroplasts and transvacuolar strands were also examined using live cell imaging. The possible importance of mitochondrial permeability transition pore (PTP) formation during PCD was indirectly examined via in vivo cyclosporine A (CsA) treatment. This treatment resulted in lace plant leaves with a significantly lower number of perforations compared to controls, and that displayed mitochondrial dynamics similar to that of non-PCD cells."
"""

In [86]:
from typing import List

model_desc = "Extract the following entities only if exist. You working with medical data which means are critical, the extraction should be performed carefully."
ExtractionModel = create_dynamic_basemodel(model_name="ExtractionEntities", properties={"MitochondriaEntities": (List[MitoModel], None)}, model_description=model_desc)
ExtractionModel.model_json_schema()

{'$defs': {'Mitochondria': {'properties': {'reference': {'description': 'The extracted reference segment from the input text',
     'title': 'Reference',
     'type': 'string'}},
   'required': ['reference'],
   'title': 'Mitochondria',
   'type': 'object'}},
 'description': 'Extract the following entities only if exist. You working with medical data which means are critical, the extraction should be performed carefully.',
 'properties': {'MitochondriaEntities': {'default': None,
   'items': {'$ref': '#/$defs/Mitochondria'},
   'title': 'Mitochondriaentities',
   'type': 'array'}},
 'title': 'ExtractionEntities',
 'type': 'object'}

In [6]:
model_with_structured_output = model.with_structured_output(schema=ExtractionModel)
model_with_structured_output.invoke(test_text)

NameError: name 'ExtractionModel' is not defined

## model factory

In [1]:
from utils.model_factory import ModelFactory

model_factory = ModelFactory()

prop_dict = {
    "name": "reference",
    "type": str,
    "description": "The extracted reference segment from the input text"
}

properties = model_factory.construct_property_fields(properties=[prop_dict])

In [2]:
model_dict = {
    "name": "Mitochondria"
}

mito_model = model_factory.create_dynamic_base_model(model_name=model_dict["name"], properties=properties)
mito_model.model_json_schema()

{'properties': {'reference': {'description': 'The extracted reference segment from the input text',
   'title': 'Reference',
   'type': 'string'}},
 'required': ['reference'],
 'title': 'Mitochondria',
 'type': 'object'}

In [3]:
from typing import List

extraction_model_dict = {
    "name": "ExtractionEntities",
    "description": "Extract the following entities only if exist. You working with medical data which means are critical, the extraction should be performed carefully."
}

prop_dict = {
    "name": "Mitochondria",
    "type": List[mito_model]
}

prop_field_dict = model_factory.construct_property_fields(properties=[prop_dict])


extraction_model = model_factory.create_dynamic_base_model(
    model_name=extraction_model_dict["name"],
    model_description=extraction_model_dict["description"],
    properties=prop_field_dict)
extraction_model.model_json_schema()

{'$defs': {'Mitochondria': {'properties': {'reference': {'description': 'The extracted reference segment from the input text',
     'title': 'Reference',
     'type': 'string'}},
   'required': ['reference'],
   'title': 'Mitochondria',
   'type': 'object'}},
 'description': 'Extract the following entities only if exist. You working with medical data which means are critical, the extraction should be performed carefully.',
 'properties': {'Mitochondria': {'items': {'$ref': '#/$defs/Mitochondria'},
   'title': 'Mitochondria',
   'type': 'array'}},
 'required': ['Mitochondria'],
 'title': 'ExtractionEntities',
 'type': 'object'}

In [9]:
model_with_structured_output = model.with_structured_output(schema=extraction_model)
model_with_structured_output.invoke(test_text)

ExtractionEntities(Mitochondria=[Mitochondria(reference='The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.'), Mitochondria(reference='The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis.'), Mitochondria(reference='Mitochondrial dynamics were delineated into four categories (M1-M4) based on characteristics including distribution, motility, and membrane potential (ΔΨm).'), Mitochondria(reference='The possible importance of mitochondrial permeability transition pore (PTP) formation during PCD was indirectly examined via in vivo cyclosporine A (CsA) treatment.')])