In [None]:
%pip install -q langchain-openai langchain playwright beautifulsoup4
%run -m playwright install

In [None]:
# Run Playwright's `install-deps` command (system libraries for the browsers)
# through the kernel's Python via `%run -m`.
%run -m playwright  install-deps

In [None]:
%%sh
# Same `install-deps` step as the previous cell, invoked through the shell CLI.
playwright install-deps   

In [None]:
# nest_asyncio is imported and applied in the following cell.
%pip install nest_asyncio

In [None]:
import nest_asyncio
# Patch asyncio so event loops can be nested. The async loaders used later in
# this notebook presumably need this because the notebook kernel already runs
# its own event loop.
nest_asyncio.apply()

In [None]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

# Load HTML: fetch the WSJ front page with AsyncChromiumLoader.
# `html` holds whatever `load()` returns and is passed to the transformer below.
loader = AsyncChromiumLoader(["https://www.wsj.com"])
html = loader.load()

In [None]:
# Transform: run the loaded documents through BeautifulSoupTransformer,
# asking it to extract only the "p" tags.
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["p"])

In [None]:
# Preview: first 500 characters of the first transformed document
docs_transformed[0].page_content[:500]

In [None]:
from langchain_community.document_loaders import AsyncHtmlLoader

# Fetch two pages with AsyncHtmlLoader and preview the first one.
urls = [
    "https://www.espn.com",
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
]
loader = AsyncHtmlLoader(urls)
docs = loader.load()
docs[0].page_content[:500]

In [None]:
import openai
from langchain_openai import ChatOpenAI

# Chat model client pointed at an OpenAI-compatible endpoint on localhost:11434.
# NOTE(review): the API key is a literal placeholder string; presumably the
# local server does not validate it -- confirm before pointing this elsewhere.
llm = ChatOpenAI(
    model_name="custom",
    openai_api_key="OPENAI_API_KEY",
    openai_api_base="http://localhost:11434/v1",
    # Disabled experiment: suppress specific token ids via logit_bias.
    # model_kwargs={"logit_bias": {"20122": -100}},  # also tried "320", "2474"
    temperature=0,
    streaming=True,
    verbose=True,
)

In [None]:
from langchain.chains import create_extraction_chain
from langchain.prompts import PromptTemplate
from langchain_core.prompts import BasePromptTemplate, ChatPromptTemplate
# Extraction schema: every listed field is a plain string and every field is required.
_ARTICLE_FIELDS = ("news_article_title", "news_article_summary")
schema = {
    "properties": {field: {"type": "string"} for field in _ARTICLE_FIELDS},
    "required": list(_ARTICLE_FIELDS),
}


# Chat prompt with a single `{input}` placeholder that receives the passage text.
# The backslash at the end of the second line of the template joins it with the
# next line, so that sentence reaches the model as one line.
# NOTE(review): the prompt refers to an 'information_extraction' function;
# presumably that is the function name the extraction chain registers with the
# model -- confirm against the installed langchain version.
prompt = ChatPromptTemplate.from_template("""### Instruction: 
Extract and save the relevant entities mentioned \
in the following passage together with their properties.

Only extract the properties mentioned in the 'information_extraction' function.

If a property is not present and is not required in the function parameters, do not include it in the output.

Passage:
{input}"""
)

def extract(content: str, schema: dict):
    """Run an extraction chain over ``content`` and return the chain's result.

    A fresh chain is built on every call from ``schema`` together with the
    notebook-level ``llm`` and ``prompt`` objects.
    """
    chain = create_extraction_chain(schema=schema, llm=llm, prompt=prompt)
    return chain.run(content)

In [None]:
from langchain.chains import LLMChain
from langchain.globals import set_debug
from langchain.prompts import PromptTemplate


# Switch on LangChain's global debug flag for the rest of the notebook session.
set_debug(True)


In [None]:
import pprint

from langchain.text_splitter import RecursiveCharacterTextSplitter


def scrape_with_playwright(urls, schema, tags_to_extract=None, chunk_size=1000):
    """Load pages with Chromium, keep selected tags, and extract from the first chunk.

    Args:
        urls: List of page URLs handed to ``AsyncChromiumLoader``.
        schema: Extraction schema forwarded to ``extract``.
        tags_to_extract: HTML tag names passed to ``BeautifulSoupTransformer``.
            Defaults to ``["span"]``.
        chunk_size: Token budget per chunk for the tiktoken-based splitter.
            Defaults to 1000.

    Returns:
        Whatever ``extract`` returns for the first chunk, or an empty list when
        the loaded pages produce no text to split (for example a blocked page
        or a page with none of the requested tags).
    """
    if tags_to_extract is None:
        tags_to_extract = ["span"]

    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        docs, tags_to_extract=tags_to_extract
    )
    print("Extracting content with LLM")

    # Only the first `chunk_size` tokens of the site are sent to the model.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=0
    )
    splits = splitter.split_documents(docs_transformed)

    # Guard against pages that yield no chunks; indexing splits[0] would raise
    # IndexError and hide the real cause.
    if not splits:
        print("No content found for the requested tags; nothing to extract")
        return []

    # Process the first split
    extracted_content = extract(schema=schema, content=splits[0].page_content)
    pprint.pprint(extracted_content)
    return extracted_content


# Run the scrape-and-extract pipeline against the WSJ front page.
urls = ["https://www.wsj.com"]
extracted_content = scrape_with_playwright(urls=urls, schema=schema)