In [None]:
# 새로운 Jupyter notebook에서 리서치 AI 에이전트를 만들고 커스텀 도구를 부여합니다.
# 에이전트는 다음 작업을 수행할 수 있어야 합니다:
# Wikipedia에서 검색
# DuckDuckGo에서 검색
# 웹사이트의 텍스트를 스크랩하고 추출합니다.
# 리서치 결과를 .txt 파일에 저장하기
# 다음 쿼리로 에이전트를 실행합니다: "Research about the XZ backdoor" 라는 쿼리로 에이전트를 실행하면, 
# 에이전트는 Wikipedia 또는 DuckDuckGo에서 검색을 시도하고,
#  DuckDuckGo에서 웹사이트를 찾으면 해당 웹사이트에 들어가서 콘텐츠를 추출한 다음 
# .txt 파일에 조사 내용을 저장하는 것으로 완료해야 합니다.

import os
from typing import Any, Type
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from langchain.agents import initialize_agent, AgentType
from langchain.utilities import DuckDuckGoSearchAPIWrapper
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.document_loaders import WebBaseLoader

llm = ChatOpenAI(
    temperature=0.1
)
    

class DuckDuckGoSearchingToolArgsSchema(BaseModel):
    query: str = Field(
        description="The query you will search for.Example query: Contents for subject"
    )

class DuckDuckGoSearchingTool(BaseTool):
    name: str = "DuckDuckGoSearchingTool"
    description: str = """
    Use this tool to find links of content on a given topic using a query. 
    It takes a query as an argument.
    
    """
    args_schema: Type[DuckDuckGoSearchingToolArgsSchema] = DuckDuckGoSearchingToolArgsSchema

    def _run(self, query):
        ddg = DuckDuckGoSearchResults()
        return ddg.run(query)


class WikipediaSearchingToolArgsSchema(BaseModel):
    query: str = Field(
        description="The query you will search for.Example query: Contents for subject"
    )

class WikipediaSearchingTool(BaseTool):
    name: str = "WikipediaSearchingTool"
    description: str = """
    Use this tool to find content on a given topic using a query. 
    It takes a query as an argument.
    
    """
    args_schema: Type[WikipediaSearchingToolArgsSchema] = WikipediaSearchingToolArgsSchema

    def _run(self, query):
        wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
        return wikipedia.run(query)
    
class WebBaseLoaderArgsSchema(BaseModel):
    urls: list = Field(
        description="list of urls to search the content and to save files with content."
    )
    
class WebBaseLoaderTool(BaseTool):
    name: str = "WebBaseLoaderTool"
    description: str = """
        This tool fetches documents from predefined web pages, processes them, 
        and saves the summarized content into a specified file.
    """
    args_schema: Type[WebBaseLoaderArgsSchema] = WebBaseLoaderArgsSchema

    # 정리된 데이터 출력 함수
    def save_docs_to_file(docs, directory, filename):
        # 디렉토리가 존재하지 않으면 생성
        if not os.path.exists(directory):
            os.makedirs(directory)
        
        # 파일 경로 생성
        filepath = os.path.join(directory, filename)
        
        with open(filepath, 'w', encoding='utf-8') as file:
            for i, doc in enumerate(docs):
                file.write(f"Document {i+1}:\n")
                file.write(f"Source: {doc.metadata['source']}\n")
                file.write(f"Title: {doc.metadata['title']}\n")
                file.write(f"Description: {doc.metadata['description']}\n")
                file.write(f"Language: {doc.metadata['language']}\n")
                file.write(f"Content Preview: {doc.page_content[:100]}...\n")  # 첫 100자만 출력
                file.write("\n" + "="*80 + "\n\n")

    def _run(self, urls):
        # 웹에서 문서를 로드
        loader = WebBaseLoader(urls)
        docs = loader.load()

        # 디렉토리와 파일 이름 지정
        directory = 'c:/Users/wonjooLAPTOP/Dropbox/github/study_gpt/output'
        filename = 'challenge9_output.txt'

        # 파일에 저장
        # self.save_docs_to_file(docs, directory, filename)
        WebBaseLoaderTool.save_docs_to_file(docs, directory, filename)
        return f"Documents successfully saved to {os.path.join(directory, filename)}."


agent = initialize_agent(
    llm=llm,
    verbose=True,
    agent=AgentType.OPENAI_FUNCTIONS,
    handle_parsing_errors=True,
    tools=[
        DuckDuckGoSearchingTool(),
        WikipediaSearchingTool(),
        WebBaseLoaderTool()
    ],
)

prompt = "Research about the XZ backdoor"
# 프롬프트 실행
# prompt = "Fetch and save information about XZ backdoor"

# agent.invoke(prompt)

# 에이전트 실행 순서
search_results = agent.invoke([{
    "tool": "DuckDuckGoSearchingTool",
    "query": "Research about the XZ backdoor"
}])





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `DuckDuckGoSearchingTool` with `{'query': 'Research about the XZ backdoor'}`


[0m[36;1m[1;3msnippet: Recently, a backdoor was discovered in XZ, a popular library for lossless data compression. Initial research efforts were predominantly concentrated on unpacking the well-disguised attack vector, while the social aspects of the attack received only murmurings. To investigate this attack, I never read a line of code., title: Everything I Know About the XZ Backdoor - Boehs, link: https://boehs.org/node/everything-i-know-about-the-xz-backdoor, snippet: What does the backdoor do? Malicious code added to xz Utils versions 5.6.0 and 5.6.1 modified the way the software functions. The backdoor manipulated sshd, the executable file used to make remote ..., title: What we know about the xz Utils backdoor that almost infected the world, link: https://arstechnica.com/security/2024/04/what-we-know-about-the-xz-utils-backdoor

In [37]:
# DuckDuckGo 검색 결과를 WebBaseLoaderTool로 처리
response = agent.invoke([{
    "tool": "WebBaseLoaderTool",
    "urls": search_results
}])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `WebBaseLoaderTool` with `{'urls': [{'tool': 'DuckDuckGoSearchingTool', 'query': 'Research about the XZ backdoor'}]}`


[0m

AttributeError: 'dict' object has no attribute 'endswith'

In [28]:
import os

loader = WebBaseLoader([
    "https://www.wired.com/story/xz-backdoor-everything-you-need-to-know/",
    "https://arstechnica.com/security/2024/03/backdoor-found-in-widely-used-linux-utility-breaks-encrypted-ssh-connections/"
])
docs = loader.load()
# docs

# 정리된 데이터 출력 함수
def save_docs_to_file(docs, directory, filename):
    # 디렉토리가 존재하지 않으면 생성
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    # 파일 경로 생성
    filepath = os.path.join(directory, filename)
    
    with open(filepath, 'w', encoding='utf-8') as file:
        for i, doc in enumerate(docs):
            file.write(f"Document {i+1}:\n")
            file.write(f"Source: {doc.metadata['source']}\n")
            file.write(f"Title: {doc.metadata['title']}\n")
            file.write(f"Description: {doc.metadata['description']}\n")
            file.write(f"Language: {doc.metadata['language']}\n")
            file.write(f"Content Preview: {doc.page_content[:100]}...\n")  # 첫 100자만 출력
            file.write("\n" + "="*80 + "\n\n")

# 디렉토리와 파일 이름 지정
directory = 'c:/Users/wonjooLAPTOP/Dropbox/github/study_gpt/output'
filename = 'challenge9_output.txt'

# 정리된 데이터 파일에 저장
save_docs_to_file(docs, directory, filename)

In [None]:
# from langchain.document_loaders import SitemapLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# import asyncio
# from fake_useragent import UserAgent
# asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
# ua = UserAgent()

# loader = SitemapLoader("https://www.wired.com/story/xz-backdoor-everything-you-need-to-know/")
# loader.requests_per_second = 1
# loader.headers = {'User-Agent': ua.random}

# splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=1000,
#     chunk_overlap=200,
# )

# docs = loader.load_and_split(text_splitter=splitter)
# docs