In [153]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models.openai import ChatOpenAI
from langchain.document_loaders import AsyncHtmlLoader, AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from pydantic import BaseModel, Field
from serpapi import GoogleSearch
import os
from typing import Dict, List, Optional, Union

In [2]:
%pip install nest_asyncio --quiet

Note: you may need to restart the kernel to use updated packages.


In [4]:
import nest_asyncio
nest_asyncio.apply()

## Topic Research:

Use a browsing agent to search for the top 3 posts on a topic and summarize them. Progressive summarization and agent tool use.

- https://python.langchain.com/docs/use_cases/web_scraping/#loader
- https://python.langchain.com/docs/modules/data_connection/retrievers/web_research - Not viable for production, because it is only question based answering.


In [5]:
%pip install google-search-results --quiet
%pip install html2text --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
# Installing Playwright:
%pip install pytest-playwright

Note: you may need to restart the kernel to use updated packages.


In [7]:
!playwright install

In [54]:
os.environ["SERPAPI_API_KEY"] = ""
TOPIC = "What are the disadvantages of using a neural network?"

In [13]:
# LLM + text splitter:
llm = ChatOpenAI(temperature=0)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1500, chunk_overlap=400
)

In [14]:
search = GoogleSearch(
    {
        "q": TOPIC,
        "location": "Austin,Texas",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
)
# Get the results:
result = search.get_dict()

# Put the results in a Pandas DataFrame:
serp_results = pd.DataFrame(result["organic_results"])

In [15]:
def get_html_content_from_urls(
    df: pd.DataFrame, number_of_urls: int = 3, url_column: str = "link"
) -> List[Document]:
    # Get the HTML content of the first 3 URLs:
    urls = df[url_column].values[:number_of_urls].tolist()
    # If there is only one URL, convert it to a list:
    if isinstance(urls, str):
        urls = [urls]
    # Check for empty URLs:
    urls = [url for url in urls if url != ""]

    # Check for duplicate URLs:
    urls = list(set(urls))

    # Throw error if no URLs are found:
    if len(urls) == 0:
        raise ValueError("No URLs found!")
    # loader = AsyncHtmlLoader(urls) # Faster but might not always work.
    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    return docs

In [16]:
def extract_text_from_webpages(documents: List[Document]):
    html2text = Html2TextTransformer()
    return html2text.transform_documents(documents)

In [17]:
class DocumentSummary(BaseModel):
    concise_summary: str
    writing_style: str
    key_points: List[str]
    expert_opinions: Optional[List[str]] = None
    metadata: Dict[
        str, str
    ] = None  # This comes natively from the LangChain document loader

In [18]:
# Create a function to summarize the text --> LangChain documents --> Summarize the documents
def create_summary_from_text(
    document: Document, parser: PydanticOutputParser
) -> Union[DocumentSummary, None]:
    # Split the parent document into chunks:
    split_docs = text_splitter.split_documents([document])

    # If there are no documents, return None:
    if len(split_docs) == 0:
        return None

    # Run a refine summarization chain that extracts unique key points and opinions within an article:
    prompt_template = """Act as a content SEO researcher. You are interested in summarizing and extracting key points from the following text. 
    The insights gained will be used to do content research and we will compare the key points, insights and summaries across multiple articles.
    ---
    - You must analyze the text and extract the key points and opinions from the following text
    - You must extract the key points and opinions from the following text:
    {text}

    {format_instructions}
    """
    prompt = PromptTemplate.from_template(prompt_template)

    # Refine template:
    refine_template = (
        "Your job is to produce a final summary.\n"
        "We have provided an existing summary, key points, and expert opinions up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing content (only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary.\n"
        "If the context isn't useful or does not provide additional key points or expert opinions, you must return the original summary."
        "{format_instructions}"
    )
    refine_prompt = PromptTemplate.from_template(refine_template)
    chain = load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=True,
        input_key="input_documents",
        output_key="output_text",
    )
    summary_result = chain(
        {
            "input_documents": split_docs,
            "format_instructions": parser.get_format_instructions(),
        },
        return_only_outputs=True,
    )

    # Parsing the output:
    document_summary = parser.parse(summary_result["output_text"])
    # Adding on the metadata:
    document_summary.metadata = document.metadata
    return document_summary

In [20]:
parser = PydanticOutputParser(pydantic_object=DocumentSummary)

# Extract the html content from the URLs:
html_documents = get_html_content_from_urls(serp_results)

# Extract the text from the URLs:
text_documents = extract_text_from_webpages(html_documents)

# Create a customized summary with a refine chain for all docs, use a list comprehension:
summaries = [
    create_summary_from_text(document, parser)
    for document in text_documents
    if create_summary_from_text(document, parser) is not None
]

if len(summaries) == 0:
    raise ValueError("No summaries were created!")

---

## Expert Interview Questions:


In [101]:
import langchain
langchain.debug = True
from langchain.prompts.chat import SystemMessagePromptTemplate


In [None]:
from langchain.tools import HumanInputRun, StructuredTool

# def get_input() -> str:
#     print("Insert your text. Enter 'q' or press Ctrl-D (or Ctrl-Z on Windows) to end.")
#     contents = []
#     while True:
#         try:
#             line = input()
#         except EOFError:
#             break
#         if line == "q":
#             break
#         contents.append(line)
#     return "\n".join(contents)

In [185]:
# Creating the tools:
# def generate_a_list_of_interview_questions(
#     document_summaries: str
# ) -> str:

#     # Create an LLM:
#     chat = ChatOpenAI(temperature=0.6)

#     # Create an output parser:
#     class InterviewQuestions(BaseModel):
#         questions: List[str] = Field(description='A list of questions interviewing a specific interview expert.')
    
#     # Set up a parser + inject instructions into the prompt template.
#     parser = PydanticOutputParser(pydantic_object=InterviewQuestions)

#     # Create a system message:
#     system_message = """You are a content SEO researcher. Previously you have summarized and extracted key points from SERP results. 
#     The insights gained will be used to do content research and we will compare the key points, insights and summaries across multiple articles.
#     You are now going to interview a content expert. You will ask them questions about the following topic: {topic}.

#     You must follow the following rules:
#     - Return a list of questions that you would ask a content expert about the topic.
#     - You must ask at least 5 questions.
#     - You are looking for information gain and unique insights that are not already covered in the {document_summaries} information.
#     - You must ask questions that are open-ended and not yes/no questions.
#     {format_instructions}
#     """
#     system_prompt = SystemMessagePromptTemplate.from_template(system_message)
#     system_message = system_prompt.format(document_summaries=document_summaries, topic=TOPIC, format_instructions=parser.get_format_instructions())
#     result = chat([system_message])
#     return parser.parse(result.content)


# # Create all of the tools:
# tools = [StructuredTool.from_function(generate_a_list_of_interview_questions, description="""
#     Generates a list of open-ended questions to ask a content expert about a given topic.

#     Args:
#     - document_summaries: a list of DocumentSummary objects containing key points and opinions about the topic.
#     - topic: a string representing the topic to ask questions about.

#     Returns:
#     - A string representing a list of at least 5 open-ended questions to ask a content expert about the topic.

#     Raises:
#     - ValueError: if no questions are generated.

#     Example usage:
#     ```
#     # create a list of DocumentSummary objects
#     summaries = [create_summary_from_text(document, parser) for document in text_documents if create_summary_from_text(document, parser) is not None]

#     # generate a list of interview questions
#     questions = generate_a_list_of_interview_questions(summaries, "content marketing")
#     print(questions)
#     ```
#     """)]

In [186]:
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory, ReadOnlySharedMemory
from langchain.agents import initialize_agent, Tool

In [187]:
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
from langchain.prompts import BaseChatPromptTemplate
from langchain import SerpAPIWrapper, LLMChain
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, SystemMessage
import re

In [197]:
# Set up the base template
template = """Complete the objective as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

----

Extra information that you have:
# Topic: {topic}
# Document Summaries: {document_summaries}

Question: {input}

----
Chat History {chat_history}

These were previous tasks you completed:
{agent_scratchpad}"""

In [198]:
class CustomOutputParser(AgentOutputParser):
    
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)

In [209]:
# Set up a prompt template
class CustomPromptTemplate(BaseChatPromptTemplate):
    # The template to use
    template: str
    # The list of tools available
    tools: List[Tool]
    
    def format_messages(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools])
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
        # Add the chat history:
        print(kwargs)

        formatted = self.template.format(**kwargs)
        return [SystemMessage(content=formatted)]

In [210]:
# Define which tools the agent can use to answer user queries
search = SerpAPIWrapper(serpapi_api_key=os.environ['SERPAPI_API_KEY'])
tools = [
    Tool(
        name = "Search",
        func=search.run,
        description="useful for when you need to answer questions about current events"
    )
]

In [211]:
prompt = CustomPromptTemplate(
    template=template,
    tools=tools,
    # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
    # This includes the `intermediate_steps` variable because that is needed
    input_variables=["topic", "document_summaries", "input", "intermediate_steps"]
)

In [212]:
# LLM chain consisting of the LLM and a prompt
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [213]:
# Output parser:
output_parser = CustomOutputParser()

# Make the memory read only:
memory = ConversationBufferMemory(memory_key="chat_history")
readonlymemory = ReadOnlySharedMemory(memory=memory)

In [214]:
tool_names = [tool.name for tool in tools]
agent = LLMSingleActionAgent(
    llm_chain=llm_chain, 
    output_parser=output_parser,
    stop=["\nObservation:"], 
    allowed_tools=tool_names
)
agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, 
                                                    memory=memory, verbose=True)

In [215]:
agent_executor.run(topic=TOPIC, document_summaries={str([summary.dict() for summary in summaries])}, input='''My name is James Phoenix, what is your name?''')

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:LLMChain] Entering Chain run with input:
[0m[inputs]
{'topic': 'What are the disadvantages of using a neural network?', 'document_summaries': {'[{\'concise_summary\': \'Neural networks have become popular for modeling complex data, but they have disadvantages.\', \'writing_style\': \'Informative\', \'key_points\': [\'Artificial Neural Networks require lots of computational power.\', \'Neural network models are hard to explain.\', \'Neural network training requires lots of data.\', \'Data preparation for neural network models needs careful attention.\', \'Optimizing neural network models for production can be challenging.\'], \'expert_opinions\': [], \'metadata\': {\'source\': \'https://www.the-analytics.club/disadvantages-of-artificial-neural-networks/\'}}, {\'concise_summary\': "Neural networks are a form of AI-base

KeyError: 'chat_history'

In [63]:
# # Create a custom LLMChain with memory:
# prefix = """Have a conversation with a human, answering the following questions as best you can. You have access to the following tools:"""
# suffix = """Begin!"

# {chat_history}
# Topic: {topic}
# Document Summaries: {document_summaries}
# Task: {task}
# {agent_scratchpad}"""

# prompt = ZeroShotAgent.create_prompt(
#     tools,
#     prefix=prefix,
#     suffix=suffix,
#     input_variables=["topic", "document_summaries", "task" ,"chat_history", "agent_scratchpad"],
# )

# # Attach the custom LLMChain to an agent:
# llm_chain = LLMChain(llm=ChatOpenAI(temperature=0), prompt=prompt)
# agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
# agent_chain = AgentExecutor.from_agent_and_tools(
#     agent=agent, tools=tools, verbose=True, memory=readonlymemory
# )

# agent_chain.run(topic=TOPIC, document_summaries={str([summary.dict() for summary in summaries])}, task='''
#                 You are to act as an interviewer, and are interviewing an expert.  
#                 Given the following Document Summaries from other articles you will need to return 5 question and answers that are answered as part of an interview.
#                 ''')

---

## General Article Outline:


---

## Article Text Generation:


---

## Re-write the Article:

- Read this - https://blog.langchain.dev/using-langsmith-to-support-fine-tuning-of-open-source-llms/
- Use this? https://github.com/langchain-ai/twitter-finetune/tree/main
- Use chat loaders -> https://python.langchain.com/docs/integrations/chat_loaders/?ref=blog.langchain.dev


---

## Title Tag Optimization:


---


## Gradio Interface (use Mike's to do this):
