In [2]:
import os
from crewai import LLM

os.environ["OPENAI_MODEL_NAME"] = "gpt-4o-mini"

gpt_4o_llm = LLM(
    model="gpt-4o-mini",
    temperature=0.1,
)

gpt_3 = LLM(
    model="gpt-3.5-turbo-1106",
    temperature=0.1,
)

In [23]:
from pydantic import BaseModel
from typing import List

class filePath(BaseModel):
    filePath: List[str]

class mainFileContent(BaseModel):
    mainFile: str

class associateFilePath(BaseModel):
    mainFile: str
    relatedFiles: List[str]
    imageFiles: List[str]

In [32]:
from crewai import Agent
from crewai_tools import FileReadTool, DirectoryReadTool, VisionTool


class Agents:
    def markdownPathSearcher(self):
        return Agent(
            role="pathSearcher",
            goal="Finds the markdown files inside {file_path} path. Should never modify the path of the file.",
            backstory="You are very good at finding markdown files.",
            allow_delegation=False,
            verbose=True,
            llm=gpt_4o_llm,
            tools=[
                DirectoryReadTool(),
            ],
        )

    def imgPathSearcher(self):
        return Agent(
            role="pathSearcher",
            goal="Finds the img files inside {img_path} path",
            backstory="You are fluent in Korean, and you are very good at finding image files.",
            allow_delegation=False,
            verbose=True,
            llm=gpt_4o_llm,
            tools=[
                DirectoryReadTool(),
            ],
        )

    def mainFilesearcher(self):
        return Agent(
            role="mainFilesearcher",
            goal="""
            Print out ONLY one document path that can answer {question}.
            if filepath does not appear to be related to the question.
            """,
            backstory="You are fluent in Korean. You have a talent for finding files that seem to solve questions.",
            allow_delegation=False,
            verbose=True,
            llm=gpt_3,
        )

    def fileReader(self):
        return Agent(
            role="fileReader",
            goal="""
            Print out ONLY one document. To use the Tool, The parameter MUST be file_path = `filepath`. 
            It should be outputted as it is without modification.
            """,
            backstory="You are fluent in Korean. You are a bookworm.",
            allow_delegation=False,
            verbose=True,
            llm=gpt_3,
            tools=[
                FileReadTool(),
            ],
        )

    def fileSelector(self):
        return Agent(
            role="fileSelector",
            goal="Find out the path of all other files that correspond to the document and print them out.",
            backstory="You are a file search expert and fluent in Korean. You have a great ability to read and analyze the details of the file.",
            llm=gpt_4o_llm,
            allow_delegation=False,
            verbose=True,
        )

    def imgExtracter(self):
        return Agent(
            role="imgExtracter",
            goal="Extract the image files. and Add supplementary content to understand the contents of the existing answer. To use the Tool, The parameter MUST be image_path = `image_path`.",
            backstory="You are fluent in Korean, and You have a good ability to read images and convert them into text.",
            allow_delegation=False,
            verbose=True,
            llm=gpt_3,
            tools=[
                VisionTool(),
            ],
        )

In [42]:
from crewai import Task


class Tasks:
    def markdownPathSearch(self, agent):
        return Task(
            description="Finds ALL the markdown files and inside {file_path} path",
            expected_output="Your final answer MUST be markdown file path. NEVER arbitrarily modify the path. Just Answer path in file_path",
            agent=agent,
            output_json=filePath,
            output_file="MarkdownPath.md",
        )

    def imgPathSearch(self, agent):
        return Task(
            description="Finds ALL the image files and inside {img_path} path. but NOT Include svg Image.",
            expected_output="Your final answer MUST be image path. svg images should NEVER be included. NEVER arbitrarily modify the path. Just Answer path in file_path",
            agent=agent,
            output_json=filePath,
            output_file="ImgPath.md",
        )

    def mainFileSearch(self, agent, context):
        return Task(
            description="""
            Based on the markdownPathSearch, markdownPathSearch is in json format. Document Paths are stored in 'filePath' key value and is in List. Search Only one file path that can solve question.
            NEVER modify the file path in fileSelect.
            
            question : {question}
            """,
            expected_output="""
            Print out the path of the file you read.

            if filepath does not appear to be related to the question,
            DON'T read ANY Files. JUST Answer 'No files are associated.'
            """,
            agent=agent,
            context=context,
            output_json=mainFileContent,
            output_file="mainFilePath.md",
        )

    def fileRead(self, agent):
        return Task(
            description="""
            Read the file in the given path and output it as it is WITHOUT modification.
            
            Read the entire contents of the file based on the file path and print it out.
            DON'T do this more than once.

            file_path : {file_path}
            """,
            expected_output="""
            Never modify it and print out the file you read as it is.
            """,
            agent=agent,
            output_file="mainFileSearch.md",
        )

    def fileSelect(self, agent, context):
        return Task(
            description="""
            Based on the mainFileSearch, 
            There are other documents linked by the symbol '[[...]]' and '![[...]]' in that file NOT '[...]
            '[[...]]' symbol means a markdown file and '![[...]]' means an image file.
            
            Find all of the '[[...]]' and '![[...]]' and print out the ONLY file path associated with the word in it in markdownPathSearch or imgPathSearch. 
            All file paths should EXIST in that markdownPathSearch Output or imgPathSearch Output. 
            DON'T make it up and look for it.
            If the relevant document/image does not exist, JUST Return EMPTY List.",
            """,
            expected_output="""
            Your final answer MUST include the path of the first file and the path of other files within that file.
            It doesn't include ANYTHING other than file paths. 

            minaFile and relatedFiles Include ONLY markdown File!

            Example Answer 1
            {
                "mainFile": "./Algorithm/Algorithm Content/Tree/MST(Minimum Spanning Tree).md",
                "relatedFiles": [
                    "./Algorithm/Algorithm Content/Graph Theory/DFS(Depth-First Search).md",
                    "./Algorithm/Algorithm Content/Graph Theory/BFS(Breadth-First Search).md",
                    "./Algorithm/Algorithm Content/Tree/Union Find.md",
                ],
                "imageFiles": ["./Algorithm/Reference/Tree Reference/MST Ref/MST Graph.png",]
            }

            Example Answer 2
            {
                "mainFile": "./Algorithm/Algorithm Content/Graph Theory/BFS(Breadth-First Search).md",
                "relatedFiles": [],
                "imageFiles": [
                    "./Algorithm/Reference/Graph Theory Reference/BASE TREE.png",
                    "./Algorithm/Reference/Graph Theory Reference/BFS Ref/BFS Queue.png",
                ]
            }

            Example Answer 3
            {
                "mainFile": "./c/k.md",
                "relatedFiles": [
                    "./c/g.md",
                    "./c/c.md",
                    "./c/d.md",
                ],
                "imageFiles": []
            }
            """,
            agent=agent,
            context=context,
            output_json=associateFilePath,
            output_file="associateFilePath.md",
        )

    def imgExtract(self, agent):
        return Task(
            description="""
            Read all the img files and Add supplementary content to understand the contents of the existing answer.
            NEVER modify the file path in fileSelect.
            You are very good at using Korean and English.
            We have provied an existing answer to a certain point : {existing_content}
            We have the opportunity to refine the existing answer (only if needed) with some more context below.
            ------
            {img_path}
            ------
            """,
            expected_output="""
            Given the new context, refine the original answer.
            If the context ins't useful, RETURN the original answer.
            """,
            agent=agent,
            output_file="ImgExtractContent.md",
        )

In [43]:
from crewai import Crew

agents = Agents()
tasks = Tasks()

In [8]:
markdownPathSearcher = agents.markdownPathSearcher()

markdownPathSearcher_task = tasks.markdownPathSearch(markdownPathSearcher)

filePathCrew = Crew(
    agents=[markdownPathSearcher],
    tasks=[markdownPathSearcher_task],
    verbose=True,
)

filePathResult = filePathCrew.kickoff(
    dict(
        file_path=".\Algorithm\Algorithm Content",
    )
)

  file_path=".\Algorithm\Algorithm Content",


[1m[95m# Agent:[00m [1m[92mpathSearcher[00m
[95m## Task:[00m [92mFinds ALL the markdown files and inside .\Algorithm\Algorithm Content path[00m


[1m[95m# Agent:[00m [1m[92mpathSearcher[00m
[95m## Thought:[00m [92mI need to find all the markdown files in the specified directory .\Algorithm\Algorithm Content. To do this, I will list the files in that directory.[00m
[95m## Using tool:[00m [92mList files in directory[00m
[95m## Tool Input:[00m [92m
"{\"directory\": \".\\\\Algorithm\\\\Algorithm Content\"}"[00m
[95m## Tool Output:[00m [92m
File paths: 
-.\Algorithm\Algorithm Content/Array\Binary Search.md
- .\Algorithm\Algorithm Content/Array\MITM(Meet in the middle).md
- .\Algorithm\Algorithm Content/Array\PBS(Parallel Binary Search).md
- .\Algorithm\Algorithm Content/Graph Theory\Articulation Points And Bridges.md
- .\Algorithm\Algorithm Content/Graph Theory\BFS(Breadth-First Search).md
- .\Algorithm\Algorithm Content/Graph Theory\CCW(Counter Clock Wise).md

In [11]:
imgPathSearcher = agents.imgPathSearcher()

imgPathSearcher_task = tasks.imgPathSearch(imgPathSearcher)

imgPathCrew = Crew(
    agents=[imgPathSearcher],
    tasks=[imgPathSearcher_task],
    verbose=True,
)

imgPathResult = imgPathCrew.kickoff(
    dict(
        img_path=".\Algorithm\Reference",
    )
)

  img_path=".\Algorithm\Reference",


[1m[95m# Agent:[00m [1m[92mpathSearcher[00m
[95m## Task:[00m [92mFinds ALL the image files and inside .\Algorithm\Reference path. but NOT Include svg Image.[00m


[1m[95m# Agent:[00m [1m[92mpathSearcher[00m
[95m## Thought:[00m [92mI need to find all image files in the specified directory, excluding SVG images. I'll start by listing the contents of the .\Algorithm\Reference path to see what files are available.[00m
[95m## Using tool:[00m [92mList files in directory[00m
[95m## Tool Input:[00m [92m
"{\"directory\": \".\\\\Algorithm\\\\Reference\"}"[00m
[95m## Tool Output:[00m [92m
File paths: 
-.\Algorithm\Reference/BAEKJOON.png
- .\Algorithm\Reference/Array Reference\Binary Search Ref\Binary Search Flowchart.md
- .\Algorithm\Reference/Array Reference\Binary Search Ref\Binary Search Flowchart.png
- .\Algorithm\Reference/Array Reference\Binary Search Ref\Binary Search Flowchart.svg
- .\Algorithm\Reference/Array Reference\MITM Recursion Ref\MITM Recursion.md
-

In [28]:
mainFileSearcher = agents.mainFilesearcher()

mainFileSearcher_task = tasks.mainFileSearch(
    mainFileSearcher, [markdownPathSearcher_task]
)

mainFileResultCrew = Crew(
    agents=[
        mainFileSearcher,
    ],
    tasks=[
        mainFileSearcher_task,
    ],
    verbose=True,
)

question = "HLD"

mainFileResult = mainFileResultCrew.kickoff(
    dict(
        question=question,
    )
)



[1m[95m# Agent:[00m [1m[92mmainFilesearcher[00m
[95m## Task:[00m [92m
            Based on the markdownPathSearch, markdownPathSearch is in json format. Document Paths are stored in 'filePath' key value and is in List. Search Only one file path that can solve question.
            NEVER modify the file path in fileSelect.
            
            Read the entire contents of the file based on the file path and print it out.
            DON'T do this more than once
            question : HLD
            [00m


[1m[95m# Agent:[00m [1m[92mmainFilesearcher[00m
[95m## Final Answer:[00m [92m
{
  "mainFile": ".\\Algorithm\\Algorithm Content\\Tree\\HLD(Heavy Light Decomposition).md"
}[00m




In [49]:
import json

fileReader = agents.fileReader()

fileReader_task = tasks.fileRead(fileReader)

fileReadResultCrew = Crew(
    agents=[
        fileReader,
    ],
    tasks=[
        fileReader_task,
    ],
    verbose=True,
)

mainFilePathResult = mainFileResult.raw

mainFilePathJson = json.loads(mainFilePathResult)
mainFilePath = mainFilePathJson["mainFile"]

fileReadResultCrew.kickoff(dict(file_path = mainFilePath))




[1m[95m# Agent:[00m [1m[92mfileReader[00m
[95m## Task:[00m [92m
            Read the file in the given path and output it as it is WITHOUT modification.
            
            Read the entire contents of the file based on the file path and print it out.
            DON'T do this more than once.

            file_path : .\Algorithm\Algorithm Content\Tree\HLD(Heavy Light Decomposition).md
            [00m


[1m[95m# Agent:[00m [1m[92mfileReader[00m
[95m## Using tool:[00m [92mRead a file's content[00m
[95m## Tool Input:[00m [92m
"{\"file_path\": \".\\\\Algorithm\\\\Algorithm Content\\\\Tree\\\\HLD(Heavy Light Decomposition).md\"}"[00m
[95m## Tool Output:[00m [92m
# Concept
- `Tree`에 **Edge**을 'Heavy Edge'와 'Light Edge'로 나누어 구분하는 알고리즘이다.
- 부모 Node(`u`)와 u의 자식 Node(`v`)를 있는 Edge(`e`)가 있을 때, v의 Sub Tree 크기가 u의 Sub Tree 크기의 1/2 이상일 때 `e`를 **Heavy Edge**라 정의하며 이 이외에는 모두 **Light Edge**이다.
- `Size[Node] : Node의 Sub Tree 크기`라 정의하면 `Heavy Edge는 Size[v] >= Size[u] / 2`를

CrewOutput(raw='# Concept\n- `Tree`에 **Edge**을 \'Heavy Edge\'와 \'Light Edge\'로 나누어 구분하는 알고리즘이다.\n- 부모 Node(`u`)와 u의 자식 Node(`v`)를 있는 Edge(`e`)가 있을 때, v의 Sub Tree 크기가 u의 Sub Tree 크기의 1/2 이상일 때 `e`를 **Heavy Edge**라 정의하며 이 이외에는 모두 **Light Edge**이다.\n- `Size[Node] : Node의 Sub Tree 크기`라 정의하면 `Heavy Edge는 Size[v] >= Size[u] / 2`를 만족한다. \n- 어떠한 Node에서 Light Edge을 타고 올라갈 경우 `무조건 Sub Tree의 크기가 2배 이상`이 되게 되며 이는 다시 말해 어떠한 Node에서 Root Node로 가는 경우 최대 **logN**개의 Light Edge을 거쳐가게 된다는 것을 뜻한다.\n- 특정 Node u와 v가 있을 때 그 둘을 잇는 Light Edge는 최대 **2 * logN**개 이다.\n- Edge를 각각 Heavy Edge와 Light Edge를 분리하여 **이어지는 Heavy Edge를 하나의 그룹**으로써 그리고 **Light Edge는 개별적인 그룹**으로써 값을 관리하면 `구간의 Edge`를 효율적으로 관리할 수 있다.\n- 쉽게 말해, Edge를 Heavy, Light로 나누고 이어지는 Heavy는 하나의 그룹으로 보면서 Node u와 v를 잇는 Edge들을 하나하나 보는 것이 아닌 각 `Edge 그룹` 별로 처리하는 알고리즘이다.\n- [[DFS(Depth-First Search)]]를 이용해 Edge를 Heavy Edge와 Light Edge로 나누기 때문에 `O(N)`의 시간복잡도가 소요된다.\n- 각각의 나누어진 Edge들은 구간 계산을 위해 [[Segment Tree]]을 이용하게 되며, `O(logN)`만큼의 시간복잡도가 소요된다.\n# HLD 원리\n- HLD의

In [None]:
mainFileSearcher = agents.mainFilesearcher()
fileSelector = agents.fileSelector()

mainFileSearcher_task = tasks.mainFileSearch(
    mainFileSearcher, [markdownPathSearcher_task]
)
fileSelector_task = tasks.fileSelect(
    fileSelector,
    [mainFileSearcher_task, markdownPathSearcher_task, imgPathSearcher_task],
)

fileSelectorCrew = Crew(
    agents=[
        mainFileSearcher,
        fileSelector,
    ],
    tasks=[
        mainFileSearcher_task,
        fileSelector_task,
    ],
    verbose=True,
)

question = "HLD의 예제 코드를 출력해줘"

fileSelectorResult = fileSelectorCrew.kickoff(
    dict(
        question=question,
    )
)

In [10]:
from langchain_openai import ChatOpenAI
from langchain.callbacks import StreamingStdOutCallbackHandler

llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
)

In [11]:
from langchain_unstructured import UnstructuredLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def document_split(file_path, includeCode=True):
    loader = UnstructuredLoader(file_path=file_path)
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500,
        chunk_overlap=60,
    )
    docs = loader.load()

    content = ""
    codeDocs = []
    codeDoc = ""
    flag = False
    for doc in docs:
        if "```" in doc.page_content:
            if flag:
                codeDocs.append(codeDoc)
                doc.page_content.replace("```", " ")
                flag = False
            else:
                codeDoc = ""
                flag = True

        if flag:
            codeDoc += doc.page_content + " "
        else:
            content += doc.page_content + " "
    textDocs = splitter.split_text(content)
    if includeCode:
        textDocs.extend(codeDocs)
    return textDocs

In [12]:
import json
import os.path

fileSelector_Json = json.loads(fileSelectorResult.raw)

relatedDocs = []
mainDocs = []

mainFilePath = fileSelector_Json["mainFile"]

if os.path.isfile(mainFilePath):
    mainDocs = document_split(mainFilePath)

for filePath in fileSelector_Json["relatedFiles"]:
    if os.path.isfile(filePath):
        relatedDocs.extend(document_split(filePath, False))



In [13]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

first_prompt = ChatPromptTemplate.from_template(
    """
    Your job is to find the right answer to the {question}.
    You are very good at using Korean and English.
    We have provied an existing answer to a certain point : {existing_content}
    We have the opportunity to refine the existing answer (only if needed) with some more context below.
    ------
    {context}
    ------
    Given the new context, refine the original answer.
    If the context ins't useful, RETURN the original answer.    
    """
)

first_chain = first_prompt | llm | StrOutputParser()

answer = ""

for doc in mainDocs:
    answer = first_chain.invoke({"question" : question, "existing_content" : answer, "context" : doc})

In [14]:
imgExtracter = agents.imgExtracter()

imgExtracter_task = tasks.imgExtract(imgExtracter)

imgExtractCrew = Crew(
    agents=[imgExtracter],
    tasks=[imgExtracter_task],
    verbose=True,
)

imgPath = fileSelector_Json["imageFiles"]


answer = imgExtractCrew.kickoff(
    dict(
        existing_content = answer,
        img_path = imgPath
    )
).raw



[1m[95m# Agent:[00m [1m[92mimgExtracter[00m
[95m## Task:[00m [92m
            Read all the img files and Add supplementary content to understand the contents of the existing answer.
            NEVER modify the file path in fileSelect.
            You are very good at using Korean and English.
            We have provied an existing answer to a certain point : The provided context enhances the original explanation by introducing additional details about the implementation of Heavy Light Decomposition (HLD) in C++. It includes a more comprehensive understanding of how to manage connectivity queries and edge removals using a boolean segment tree. Here’s a refined version of the original answer, incorporating the new context:

---

HLD(Heavy Light Decomposition)는 트리를 효율적으로 쿼리하고 업데이트하기 위한 알고리즘입니다. 이 알고리즘은 트리를 'Heavy Edge'와 'Light Edge'로 나누어 구분합니다. Heavy Edge는 부모 노드(u)와 자식 노드(v) 사이의 엣지(e)로, v의 서브트리 크기가 u의 서브트리 크기의 1/2 이상일 때 정의됩니다. 즉, 다음과 같은 조건을 만족합니다:

\[ \text{Size}[v] \geq \frac{

In [15]:
refine_prompt  = ChatPromptTemplate.from_template(
    """
    Your job is to add supplementary content to understand the contents of the existing answer.
    You are very good at using Korean and English.
    We have provied an existing answer to a certain point : {existing_content}
    We have the opportunity to refine the existing answer (only if needed) with some more context below.
    ------
    {context}
    ------
    Given the new context, refine the original answer.
    If the context ins't useful, RETURN the original answer.    
    """
)

refine_chain = refine_prompt | llm | StrOutputParser()

for doc in relatedDocs:
    answer = refine_chain.invoke({"existing_content" : answer, "context" : doc})

answer

"The provided context about the Segment Tree adds valuable insight into the Heavy Light Decomposition (HLD) algorithm and its application in efficiently managing range queries and updates. Here’s a refined version of the original answer, incorporating this new context:\n\n---\n\nHLD(Heavy Light Decomposition)는 트리를 효율적으로 쿼리하고 업데이트하기 위한 알고리즘입니다. 이 알고리즘은 트리를 'Heavy Edge'와 'Light Edge'로 나누어 구분합니다. Heavy Edge는 부모 노드(u)와 자식 노드(v) 사이의 엣지(e)로, v의 서브트리 크기가 u의 서브트리 크기의 1/2 이상일 때 정의됩니다. 즉, 다음과 같은 조건을 만족합니다:\n\n\\[ \\text{Size}[v] \\geq \\frac{\\text{Size}[u]}{2} \\]\n\n이 외의 엣지는 모두 Light Edge로 간주됩니다. 서브트리 크기(Size[Node])는 해당 노드의 서브트리에 포함된 노드의 수를 의미합니다. Light Edge를 따라 올라갈 경우, 서브트리의 크기가 항상 2배 이상 증가하게 되므로, 어떤 노드에서 루트 노드로 가는 경우 최대 logN개의 Light Edge를 거치게 됩니다.\n\n특정 노드 u와 v가 있을 때, 이 둘을 연결하는 Light Edge는 최대 2 * logN개가 존재합니다. Edge를 각각 Heavy Edge와 Light Edge로 분리하여 이어지는 Heavy Edge를 하나의 그룹으로 보고, Light Edge는 개별적인 그룹으로써 값을 관리하면 구간의 Edge를 효율적으로 관리할 수 있습니다. 쉽게 말해, Edge를 Heavy, Light로 나누고 이어지는 Heavy는 하나의 그룹으로 보면서 N

In [24]:
strList = {"Introduction.md" : "./file/Project/Document AI Secretary/Introduction.md", "kk.md" : "kkkkk.md"}
target = "Introduction.md"

if "kkkkk.md" in strList:
    print(strList["kk.md"])
else:
    print("NO")

NO


In [2]:
import sqlite3

sqlite3.version

  sqlite3.version


'2.6.0'