Skip to content
This repository has been archived by the owner on Jul 20, 2024. It is now read-only.

Commit

Permalink
feat: added web scraping-function
Browse files Browse the repository at this point in the history
  • Loading branch information
ayoubmrx committed Feb 12, 2024
1 parent 4f4d425 commit 585960e
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Base image. Presumably it already ships Poetry and the hyko SDK, since Poetry
# is invoked below without being installed here — TODO confirm.
FROM hyko-sdk:latest


WORKDIR /app

# Copy the whole build context (the function's sources) into /app.
COPY . .
# Install the extra libraries this function needs via pip, run through Poetry.
# NOTE(review): the function code imports `langchain_community`; presumably it
# is pulled in as a dependency of `langchain` — confirm. Versions are unpinned.
RUN poetry run pip install langchain html2text

# Serve the `func` object of the `main` module with uvicorn, listening on all
# interfaces on port 3000.
CMD ["poetry", "run", "uvicorn", "--host", "0.0.0.0", "--port", "3000", "main:func"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import asyncio

from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer
from metadata import Inputs, Outputs, Params, func


@func.on_execute
async def main(inputs: Inputs, params: Params) -> Outputs:
    """Scrape HTML from the given URLs and convert each page to plain text.

    Args:
        inputs: Function inputs; ``inputs.urls`` is the list of URLs to fetch.
        params: Function parameters. Not used by this function.

    Returns:
        An ``Outputs`` whose ``result`` is the list of page contents as plain
        text, one entry per document produced by the loader.
    """

    def _scrape() -> list[str]:
        # Download the pages, then strip the HTML down to plain text.
        loader = AsyncHtmlLoader(web_path=inputs.urls)
        html_docs = loader.load()
        text_docs = Html2TextTransformer().transform_documents(html_docs)
        return [doc.page_content for doc in text_docs]

    # ``load()`` and ``transform_documents()`` are synchronous calls (neither
    # is awaited). Running them directly inside this coroutine would stall the
    # server's event loop for the whole download, so they are pushed to a
    # worker thread and awaited instead.
    page_contents = await asyncio.to_thread(_scrape)

    return Outputs(result=page_contents)
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from pydantic import Field

from hyko_sdk.function import SDKFunction
from hyko_sdk.metadata import CoreModel

# SDK function object for this module. The `set_input`, `set_param` and
# `set_output` decorators used below are called on it with the model classes.
func = SDKFunction(
    description="Scrape HTML content from URLs and convert it to plain text"
)


@func.set_input
class Inputs(CoreModel):
    # Required field (no default value): the pages to fetch, one URL per entry.
    urls: list[str] = Field(
        default=...,
        description="A list of URLs to scrape. Protocol must be either 'http' or 'https'.",
    )


@func.set_param
class Params(CoreModel):
    # Intentionally empty: this function declares no parameters.
    pass


@func.set_output
class Outputs(CoreModel):
    # Required field (no default value): the scraped pages as plain text.
    result: list[str] = Field(
        default=...,
        description="List of transformed documents as plain text.",
    )

0 comments on commit 585960e

Please sign in to comment.