In [None]:
# Instruction text for the entity/relationship extraction task. It is
# concatenated in front of the text to analyse (see `PROMPT + user_input`).
# NOTE(review): the example structure below contains `// ...` lines, which are
# not valid JSON, and rule 4 asks for "ONLY valid JSON" while also asking the
# answer to start with a ```json fence -- confirm the model/parser copes with both.
PROMPT = """
You are a professional JSON programmer. Your task is to extract data from the text and represent it as JSON.

Output the result ONLY in JSON format.
The JSON structure must be EXACTLY like this:
{
  "entities": [
    {
      "name": "Entity Name",
      "description": "Concise description based ONLY on the text"
    }
    // ... other entities found
  ],
  "relationships": [
    {
	"source": "Source Entity Name", 
	"description": "Relationship Description", 
	"target": "Target Entity Name"
    }
    // ... other relationships found 
  ]
}
**Rules:**
1.  **entities**: List entities with descriptions from the text.
2.  **relationships**: List relationships. Use EXACT names from `entities`.
3.  Use ONLY information from the text.
4.  Your response MUST be ONLY valid JSON. Nothing else. (start with ```json)

**TEXT TO ANALYZE:**
 """

In [None]:
# ChatML-style template with two `str.format` placeholders:
# {system_message} for the system turn and {prompt} for the user turn.
# The assistant turn is left open so the model continues from there.
prompt_tempalte = """
<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""

In [None]:
# Sample text (descriptions of three cabins and the posters/items in them).
# Presumably a manual test input for the extraction prompt -- the only visible
# uses are in commented-out cells; verify before relying on it.
user_input = """
Olga Dmitrievna and Semyon's Cabin
A poster of Fantômas can be seen on the wall.
Near the Fantômas poster is a poster of a bitard.
A poster of a typical bitard hangs on the right wall.
A hammer lies under Olga Dmitrievna's bed, referencing the Banhammer-chan mascot, which served as the prototype for the counselor.
The map in each cabin shows its occupants. In the counselor and Semyon's cabin, Semyon is depicted as the same bitard with a bag on his head.
Slavya and Zhenya's Cabin
A poster of Mitsgol (the basis for Mitsgël, which served as the prototype for Zhenya) can be seen on the left wall.
Famous paintings by Vasnetsov hang on the right wall.
The shelves above Zhenya's bed are filled with books, indicating that she reads all the time, even in her cabin (since she spends the rest of her time in the library).
Alya and Ulyana's Cabin
A poster from the anime "Detroit Metal City" hangs above Alya's bed.
A poster of Stalin from the game "Stalin vs. Martians" hangs above Ulyana's bed.
Next to the Stalin poster is a poster of a kitten named Gav.
On the other side of the Stalin poster is a poster of Soviet-era hockey players, likely one of the following: Larionov, Krutov, Fetisov, Makarov, Kasatonov.
The wallpaper in the cabin features drawings of a sickle and hammer.
"""

In [None]:
# third-party imports
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
# from set_enviroment import *

from dotenv import load_dotenv
import os

# Read variables from a .env file before the os.getenv() lookups below.
load_dotenv()

# local imports

# Chat model piped into SimpleJsonOutputParser: whatever the model returns is
# passed through the JSON parser before it reaches the caller.
# The model name and endpoint come from RAG_LLM_MODEL / RAG_BASE_URL.
llm = (
    ChatOpenAI(
        model=os.getenv("RAG_LLM_MODEL"),
        base_url=os.getenv("RAG_BASE_URL"),
        temperature=0.7,
        top_p=0.9,
        max_tokens=None,
        timeout=None,
    )
    | SimpleJsonOutputParser()
)



def stream_graph_updates(user_input: str):
    """Stream the extraction chain for ``user_input`` and print every update.

    The text is appended to ``PROMPT`` and sent through ``llm``. Because that
    chain ends in ``SimpleJsonOutputParser``, each streamed event is the JSON
    parsed so far (a plain Python value), not a LangGraph state dict. The
    previous ``event["messages"][-1].pretty_print()`` therefore failed on the
    first event, since the extraction schema has no ``"messages"`` key.

    Args:
        user_input: Text to analyse.

    Returns:
        None. Each update is printed as it arrives.
    """
    for event in llm.stream(PROMPT + user_input):
        # Later events supersede earlier ones as more of the JSON is parsed.
        print(event)

# Visual separator in the cell output.
print("-----------------------------------------------------")

In [None]:
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_input_files(files_dir):
    """Load every .txt, .docx and .pdf file in ``files_dir`` into one flat list.

    Each loader's ``load()`` returns a list of documents. Those lists are
    concatenated, so the result can be passed straight to
    ``split_documents``. Appending them instead would produce a list of lists.

    Args:
        files_dir: Directory to scan (not recursive). Other extensions are
            ignored. Matching is case-sensitive.

    Returns:
        A flat list of loaded documents, ordered by file name.
    """
    # Built at call time so the loader classes are resolved when the function runs.
    loaders_by_suffix = {
        ".txt": TextLoader,
        ".docx": Docx2txtLoader,
        ".pdf": PyPDFLoader,
    }

    documents = []
    # os.listdir order is arbitrary; sorting keeps re-runs reproducible.
    for file_name in sorted(os.listdir(files_dir)):
        for suffix, loader_cls in loaders_by_suffix.items():
            if file_name.endswith(suffix):
                file_path = os.path.join(files_dir, file_name)
                documents.extend(loader_cls(file_path).load())
                break

    return documents

def get_chunks(documents, chunk_size=1000, chunk_overlap=200):
    """Split ``documents`` into overlapping chunks.

    Args:
        documents: Documents to split, passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Chunk size handed to the splitter (default 1000).
        chunk_overlap: Overlap between consecutive chunks (default 200).

    Returns:
        Whatever ``split_documents`` returns for the given documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,  # chunk size
        chunk_overlap=chunk_overlap,  # overlap between chunks
    )

    return text_splitter.split_documents(documents)


In [None]:
# Load the files found in the directory named by the DATA_PATH environment
# variable and split them into chunks.
chunks = get_chunks(get_input_files(os.getenv("DATA_PATH")))
# Print the first chunk; raises IndexError when no chunks were produced.
print(chunks[0])

In [None]:
# response2 = llm.invoke(str(prompt_tempalte.format(
#     system_message=PROMPT,
#     prompt=user_input
# )))

# response1, response2 = llm.batch(
#     [
#         str(prompt_tempalte.format(
#             system_message=PROMPT,
#             prompt=user_input
#         )),
#         str(prompt_tempalte.format(
#             system_message=PROMPT,
#             prompt=user_input
#         ))
#     ]
# )

In [None]:
# Module-level accumulator that exctract_data() merges every response into.
results = dict(entities=[], relationships=[])

def _find_entity(name):
    """Return the first entity in ``results`` whose name equals ``name``, else None."""
    return next(
        (item for item in results["entities"] if item["name"] == name),
        None,
    )


def exctract_data(response):
    """Merge one parsed extraction response into the global ``results``.

    Entities are de-duplicated by name: a name seen before gets its new
    description appended to the existing entry instead of creating a second
    entry. The previous inner ``continue`` only skipped to the next existing
    entity, so the duplicate was still appended. A missing description is
    tolerated and results in no description being added.

    Relationships without a source or target are skipped. A relationship's
    description is also appended to the source entity's descriptions when
    that entity is already known, as before. The stored relationship keeps
    its original shape: the description string, or ``[]`` when absent.
    NOTE(review): confirm that copying relationship text into the source
    entity's descriptions is intended.

    Args:
        response: Dict with optional ``"entities"`` and ``"relationships"``
            lists. ``None`` or any non-dict value is ignored.

    Returns:
        None. ``results`` is updated in place.
    """
    # Nothing to merge when the parser produced no JSON object.
    if not isinstance(response, dict):
        return

    # Extract entities
    for entity in response.get("entities") or []:
        if not isinstance(entity, dict):
            continue

        name = entity.get("name")
        if name is None:
            continue  # skip entities without a name

        known_entity = _find_entity(name)
        if known_entity is None:
            known_entity = {"name": name, "description": []}
            results["entities"].append(known_entity)

        description = entity.get("description")
        if description:
            known_entity["description"].append(description)

    # Extract relationships
    for relationship in response.get("relationships") or []:
        if not isinstance(relationship, dict):
            continue

        source = relationship.get("source")
        target = relationship.get("target")
        if source is None or target is None:
            continue  # skip relationships with a missing endpoint

        description = relationship.get("description", False)

        source_entity = _find_entity(source)
        if description and source_entity is not None:
            source_entity["description"].append(description)

        results["relationships"].append({
            "source": source,
            "description": description if description else [],
            "target": target
        })

# response1/response2 were only assigned in a cell that is now commented out,
# so on a fresh kernel these calls raised NameError. Rebuild them here with the
# same prompt wiring that cell used (the same sample prompt sent twice).
response1, response2 = llm.batch(
    [prompt_tempalte.format(system_message=PROMPT, prompt=user_input)] * 2
)

exctract_data(response1)
exctract_data(response2)

In [None]:
from json import dumps, dump
# Write UTF-8 explicitly and keep non-ASCII text (e.g. "Fantômas") readable
# instead of \uXXXX escapes; the file is still valid JSON.
with open("./output.json", "w", encoding="utf-8") as f:
    dump(results, f, indent=4, ensure_ascii=False)

# with open("./output.json", "w", encoding='utf-8') as f:
#     f.write(response.content)