In [1]:
import asyncio
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models.openai import ChatOpenAI
from langchain.document_loaders import AsyncHtmlLoader, AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from pydantic import BaseModel
from serpapi import GoogleSearch
import os
from typing import Dict, List, Optional, Union

In [2]:
%pip install nest_asyncio --quiet

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nest_asyncio
# Patch asyncio up front, before any of the async cells below run.
# NOTE(review): presumably required so code that starts its own event loop can
# run inside the notebook's already-running loop — confirm.
nest_asyncio.apply()

## Topic Research:

Use a browsing agent to search for the top 3 posts on a topic and summarize them. This section demonstrates progressive summarization and agent tool use.

- https://python.langchain.com/docs/use_cases/web_scraping/#loader
- https://python.langchain.com/docs/modules/data_connection/retrievers/web_research - Not viable for production because it only supports question-based answering.


In [4]:
%pip install google-search-results --quiet
%pip install html2text --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Installing Playwright:
%pip install pytest-playwright

Note: you may need to restart the kernel to use updated packages.


In [6]:
!playwright install

In [4]:
# Research subject: used as the search query and interpolated into the agent prompts.
TOPIC = "Neural networks"

In [5]:
# Never write the SerpApi key into the notebook: a literal assignment here
# overwrites a key that is already exported and risks committing the secret.
# Reuse the exported key when there is one; otherwise ask for it without echo.
from getpass import getpass

if not os.environ.get("SERPAPI_API_KEY"):
    os.environ["SERPAPI_API_KEY"] = getpass("SerpApi API key: ")

In [6]:
# Chat model shared by the summarisation cells (temperature pinned to 0).
llm = ChatOpenAI(temperature=0)

# Chunker built through `from_tiktoken_encoder`; sizes are presumably token
# counts (1500 per chunk, 400 shared between neighbours) — confirm.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=400,
    chunk_size=1500,
)

In [7]:
# Query Google through SerpApi for the notebook topic.
search = GoogleSearch(
    {
        "q": TOPIC,
        "location": "Austin,Texas",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
)
# Get the results:
result = search.get_dict()

# Fail with a readable message instead of a bare KeyError when the response
# carries no organic results (e.g. rejected API key, exhausted quota, no hits):
if "organic_results" not in result:
    raise ValueError(
        f"SerpApi returned no organic results for {TOPIC!r}: "
        f"{result.get('error', 'no error message in response')}"
    )

# Put the results in a Pandas DataFrame:
serp_results = pd.DataFrame(result["organic_results"])

In [8]:
def get_html_content_from_urls(
    df: pd.DataFrame, number_of_urls: int = 3, url_column: str = "link"
) -> List[Document]:
    """Load the raw HTML for the leading URLs of a results DataFrame.

    Args:
        df: Table of search results (rows are assumed to be in ranking order).
        number_of_urls: How many leading rows of ``url_column`` to consider.
        url_column: Name of the column that holds the page URLs.

    Returns:
        Whatever ``AsyncChromiumLoader(urls).load()`` returns; the loader is
        given the URLs in the same order as the rows of ``df``.

    Raises:
        ValueError: If no usable URL remains after filtering.
    """
    # `.tolist()` always yields a list, even for a single row.
    candidates = df[url_column].values[:number_of_urls].tolist()

    # Keep only real, non-blank strings: missing cells arrive as NaN/None and
    # would otherwise be handed to the browser as if they were URLs.
    candidates = [url for url in candidates if isinstance(url, str) and url.strip()]

    # Drop duplicates while keeping the ranking order; a plain set() would
    # return the URLs in arbitrary order.
    urls = list(dict.fromkeys(candidates))

    # Throw error if no URLs are found:
    if len(urls) == 0:
        raise ValueError("No URLs found!")
    # loader = AsyncHtmlLoader(urls) # Faster but might not always work.
    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    return docs

In [9]:
def extract_text_from_webpages(documents: List[Document]):
    """Run the given documents through ``Html2TextTransformer`` and return the result."""
    return Html2TextTransformer().transform_documents(documents)

In [10]:
class DocumentSummary(BaseModel):
    """Structured summary of a single web page; the schema handed to the output parser."""

    concise_summary: str
    writing_style: str
    key_points: List[str]
    # May be omitted when the article quotes no experts.
    expert_opinions: Optional[List[str]] = None
    # Declared Optional so the annotation agrees with the None default.
    # This comes natively from the LangChain document loader and is assigned
    # after the LLM output has been parsed.
    metadata: Optional[Dict[str, str]] = None

In [11]:
import asyncio

async def create_summary_from_text(
    document: Document, parser: PydanticOutputParser
) -> Union[DocumentSummary, None]:
    """Summarise one document into a ``DocumentSummary`` using a refine chain.

    The document is split with the notebook-level ``text_splitter`` and the
    chunks are passed to a ``refine`` summarize chain built on the
    notebook-level ``llm``. The chain's final text is parsed with ``parser``.

    Args:
        document: The text document to summarise.
        parser: Output parser that supplies the format instructions for the
            prompts and parses the chain's final text.

    Returns:
        ``None`` when the splitter produces no chunks; otherwise the parsed
        summary with its ``metadata`` replaced by ``document.metadata``.

    Raises:
        Exceptions raised by the chain or by ``parser.parse`` are not caught
        here and propagate to the caller.
    """
    # Split the parent document into chunks:
    split_docs = text_splitter.split_documents([document])

    # If there are no documents, return None:
    if len(split_docs) == 0:
        return None

    # Run a refine summarization chain that extracts unique key points and opinions within an article:
    prompt_template = """Act as a content SEO researcher. You are interested in summarizing and extracting key points from the following text. 
    The insights gained will be used to do content research and we will compare the key points, insights and summaries across multiple articles.
    ---
    - You must analyze the text and extract the key points and opinions from the following text
    - You must extract the key points and opinions from the following text:
    {text}

    {format_instructions}
    """
    prompt = PromptTemplate.from_template(prompt_template)

    # Refine template:
    refine_template = (
        "Your job is to produce a final summary.\n"
        "We have provided an existing summary, key points, and expert opinions up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing content (only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary.\n"
        # The trailing newline keeps the format instructions from being glued
        # onto the end of this sentence.
        "If the context isn't useful or does not provide additional key points or expert opinions, you must return the original summary.\n"
        "{format_instructions}"
    )
    refine_prompt = PromptTemplate.from_template(refine_template)

    chain = load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=True,
        input_key="input_documents",
        output_key="output_text",
    )

    print('Summarizing the data!')
    # Use the public async entry point rather than the private `_acall`, so the
    # chain's own input preparation and callback handling are applied.
    summary_result = await chain.acall(
        inputs={
            "input_documents": split_docs,
            "format_instructions": parser.get_format_instructions(),
        },
    )

    print("Parsing the output!")
    document_summary = parser.parse(summary_result["output_text"])
    print("Parsed the output!")

    # Trust the loader's metadata over anything the model produced for this field.
    document_summary.metadata = document.metadata
    return document_summary


In [12]:
# Parser that supplies the format instructions for the prompts and turns the
# LLM's text output into a DocumentSummary:
parser = PydanticOutputParser(pydantic_object=DocumentSummary)

# Fetch the HTML of the leading search results (3 URLs by default):
html_documents = get_html_content_from_urls(serp_results)

# Convert the fetched HTML documents to text:
text_documents = extract_text_from_webpages(html_documents)

In [13]:
async def create_all_summaries(text_documents, parser):
    """Summarise every document concurrently and return the non-empty results.

    One ``create_summary_from_text`` coroutine is started per document and all
    of them are awaited together. Documents that yield ``None`` are dropped.

    Raises:
        ValueError: If not a single summary was produced.
    """
    # Launch one coroutine per document and wait for the whole batch.
    outcomes = await asyncio.gather(
        *[create_summary_from_text(doc, parser) for doc in text_documents]
    )

    # Discard documents that produced nothing.
    kept = [outcome for outcome in outcomes if outcome is not None]

    if not kept:
        raise ValueError("No summaries were created!")

    return kept

In [14]:
# Summarise all scraped pages; raises ValueError if none could be summarised.
summaries = await create_all_summaries(text_documents, parser)

Summarizing the data!
Summarizing the data!
Summarizing the data!
Parsing the output!
Parsed the output!
Parsing the output!
Parsed the output!
Parsing the output!
Parsed the output!


---

## Expert Interview Questions:


Promising agents:
- https://python.langchain.com/docs/modules/agents/
- https://python.langchain.com/docs/modules/agents/agent_types/structured_chat


In [52]:
import langchain
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentExecutor
from langchain.schema import SystemMessage
from langchain.agents import OpenAIFunctionsAgent, OpenAIMultiFunctionsAgent
from langchain.prompts import MessagesPlaceholder
from langchain.memory import ConversationBufferMemory
langchain.debug = True

In [53]:
# Custom tools:
from tools.generate_interview_questions import GenerateInterviewQuestions
from tools.human_in_the_loop import HumanInTheLoop
from tools.load_json import LoadJsonFile
from tools.save_json import SaveJsonFile

In [54]:
# Model that drives the interview agent.
# NOTE: this rebinds the notebook-global `llm` that create_summary_from_text also reads.
llm = ChatOpenAI(temperature=0, model='gpt-4')
# llm = ChatOpenAI(temperature=0)

# Tools the agent may call:
tools = [
    GenerateInterviewQuestions(),
    HumanInTheLoop(),
    LoadJsonFile(),
    SaveJsonFile(),
]

# System prompt; the literal's line breaks and indentation are sent to the model as written.
system_message = SystemMessage(
    content=f'''You are very powerful assistant and are responsible for investigating the following topic: {TOPIC}. 
                               You are bad at extracting key points from articles and need help.
                               Also you must let the human answer questions as this is their interview.
                               ---
                               '''
)

# Conversation memory, exposed to the prompt under MEMORY_KEY:
MEMORY_KEY = "chat_history"
memory = ConversationBufferMemory(memory_key=MEMORY_KEY, return_messages=True)

# Prompt = system message + a placeholder that receives the chat history:
prompt = OpenAIFunctionsAgent.create_prompt(
    system_message=system_message,
    extra_prompt_messages=[MessagesPlaceholder(variable_name=MEMORY_KEY)],
)

# Agent and the executor that runs it with tools and memory:
agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt)
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    memory=memory,
)

In [55]:
print("Starting the agent executor!")
# Display the conversation memory; the recorded output shows it empty before the first run.
agent_executor.memory.buffer

Starting the agent executor!


[]

In [56]:
# First turn: give the agent a fact (the user's name) to remember.
result = agent_executor.run("My name is James, what is your name?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "My name is James, what is your name?",
  "chat_history": []
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are very powerful assistant and are responsible for investigating the following topic: Neural networks. \n                               You are bad at extracting key points from articles and need help.\n                               Also you must let the human answer questions as this is their interview.\n                               ---\n                               \nHuman: My name is James, what is your name?"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] [3.57s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Hello James, I'm OpenAI's language model. I don't have a personal name as I'm an artificial

In [57]:
# Second turn: check that the conversation memory carries the name over.
agent_executor.run("What is my name?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are very powerful assistant and are responsible for investigating the following topic: Neural networks. \n                               You are bad at extracting key points from articles and need help.\n                               Also you must let the human answer questions as this is their interview.\n                               ---\n                               \nHuman: My name is James, what is your name?\nAI: Hello James, I'm OpenAI's language model. I don't have a personal name as I'm an artificial intelligence. How can I assist you today?\nHuman: What is my name?"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] [1.14s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "te

'Your name is James.'

In [59]:
# Hand the agent the page summaries (interpolated as the repr of a list of dicts) and the topic.
# NOTE(review): the recorded run of this cell ended in a ValidationError from
# SaveJsonFileArgsSchema (missing `content`) — confirm the tool call before relying on it.
agent_executor.run(f"""Here is some useful information for later on:
                   document_summaries: {[s.dict() for s in summaries]}
                   topic: {TOPIC}
                   """)

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are very powerful assistant and are responsible for investigating the following topic: Neural networks. \n                               You are bad at extracting key points from articles and need help.\n                               Also you must let the human answer questions as this is their interview.\n                               ---\n                               \nHuman: My name is James, what is your name?\nAI: Hello James, I'm OpenAI's language model. I don't have a personal name as I'm an artificial intelligence. How can I assist you today?\nHuman: What is my name?\nAI: Your name is James.\nHuman: Here is some useful information for later on:\n                   document_summaries: [{'concise_summary': 'Neural network theory helps

ValidationError: 1 validation error for SaveJsonFileArgsSchema
content
  field required (type=value_error.missing)

---

## General Article Outline:


---

## Article Text Generation:


---

## Re-write the Article:

- Read this - https://blog.langchain.dev/using-langsmith-to-support-fine-tuning-of-open-source-llms/
- Use this? https://github.com/langchain-ai/twitter-finetune/tree/main
- Use chat loaders -> https://python.langchain.com/docs/integrations/chat_loaders/?ref=blog.langchain.dev


---

## Title Tag Optimization:


---


## Gradio Interface (use Mike's to do this):
