In [28]:
import os
from getpass import getpass
from typing import Dict, List, Optional, Union

import pandas as pd
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models.openai import ChatOpenAI
from langchain.document_loaders import AsyncHtmlLoader, AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel
from serpapi import GoogleSearch

In [2]:
# Provides nest_asyncio, which the next cell imports and applies.
%pip install nest_asyncio --quiet

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nest_asyncio
# Patch asyncio so event loops can be nested. NOTE(review): presumably required
# because the loaders/chains below run async code while the notebook kernel's
# own event loop is already running — confirm.
nest_asyncio.apply()

## Topic Research:

Use a browsing agent to search for the top 3 posts on a topic and summarize them, combining progressive summarization with agent tool use.

- https://python.langchain.com/docs/use_cases/web_scraping/#loader
- https://python.langchain.com/docs/modules/data_connection/retrievers/web_research - Not viable for production, because it only supports question-based answering.


In [4]:
# google-search-results is the PyPI package that provides the `serpapi` module
# imported at the top of the notebook.
# NOTE(review): html2text is presumably the backend Html2TextTransformer needs — confirm.
%pip install google-search-results --quiet
%pip install html2text --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Installing Playwright:
# NOTE(review): presumably needed by AsyncChromiumLoader, which is used to fetch
# the pages below — confirm. The browser binaries are installed in the next cell.
%pip install pytest-playwright --quiet

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Shell command: run Playwright's own installer (downloads the browsers it drives).
!playwright install

In [4]:
# Research subject: used as the SerpAPI query and interpolated into the agent,
# outline and blog-post prompts further down.
TOPIC = "Neural networks"

In [5]:
# Never hardcode the SerpAPI key in the notebook. Reuse a key that is already
# exported in the environment and only prompt (without echoing it) when it is
# missing; the previous unconditional assignment replaced a valid exported key
# with an empty string.
if not os.environ.get("SERPAPI_API_KEY"):
    os.environ["SERPAPI_API_KEY"] = getpass("SerpAPI API key: ")

In [6]:
# LLM + text splitter:
# Both names are read as module-level globals by create_summary_from_text at call
# time, so whatever `llm` and `text_splitter` are bound to when the summaries run
# is what gets used. Re-binding either name elsewhere changes how summaries are produced.
llm = ChatOpenAI(temperature=0)
# Token-based splitting: chunks of 1500 tokens with a 400-token overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1500, chunk_overlap=400
)

In [7]:
# Query Google through SerpAPI for the topic:
search = GoogleSearch(
    {
        "q": TOPIC,
        "location": "Austin,Texas",
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
)
# Get the results:
result = search.get_dict()

# A failed request (e.g. a missing or invalid API key) does not raise; the
# payload carries an "error" message instead. Surface it here rather than
# failing with an opaque KeyError on "organic_results".
if "error" in result:
    raise RuntimeError(f"SerpAPI search failed: {result['error']}")

organic_results = result.get("organic_results", [])
if not organic_results:
    raise ValueError(f"SerpAPI returned no organic results for: {TOPIC!r}")

# Put the results in a Pandas DataFrame:
serp_results = pd.DataFrame(organic_results)

In [8]:
def get_html_content_from_urls(
    df: pd.DataFrame, number_of_urls: int = 3, url_column: str = "link"
) -> List[Document]:
    """Load the HTML of the first ``number_of_urls`` URLs listed in ``df``.

    Args:
        df: Search results; must contain ``url_column``.
        number_of_urls: How many leading rows to take URLs from.
        url_column: Name of the column that holds the URLs.

    Returns:
        The documents returned by ``AsyncChromiumLoader(urls).load()``, in the
        same order as the rows of ``df``.

    Raises:
        ValueError: If no usable URL is found in the selected rows.
    """
    candidates = df[url_column].values[:number_of_urls].tolist()

    # Keep only non-empty strings (a missing link arrives as None/NaN, not "")
    # and drop duplicates. dict.fromkeys preserves the ranking order, whereas
    # set() returned the URLs in an arbitrary order that could differ per run.
    urls = list(
        dict.fromkeys(
            url for url in candidates if isinstance(url, str) and url.strip()
        )
    )

    # Throw error if no URLs are found:
    if not urls:
        raise ValueError("No URLs found!")

    # AsyncHtmlLoader(urls) is a faster alternative but might not always work.
    loader = AsyncChromiumLoader(urls)
    return loader.load()

In [9]:
def extract_text_from_webpages(documents: List[Document]):
    """Run the loaded HTML documents through ``Html2TextTransformer``.

    Args:
        documents: Documents holding raw HTML, as produced by the loader.

    Returns:
        Whatever ``Html2TextTransformer().transform_documents`` returns for
        ``documents``.
    """
    return Html2TextTransformer().transform_documents(documents)

In [10]:
class DocumentSummary(BaseModel):
    # Structured summary of one article, produced by parsing the LLM output.
    # Documented with `#` comments on purpose: pydantic copies a class docstring
    # into the model schema, and that schema is what the parser sends to the LLM
    # as format instructions.
    concise_summary: str
    writing_style: str
    key_points: List[str]
    expert_opinions: Optional[List[str]] = None
    # This comes natively from the LangChain document loader; it is assigned
    # after parsing, so it defaults to None and is declared Optional to match.
    metadata: Optional[Dict[str, str]] = None

In [11]:
import asyncio

async def create_summary_from_text(
    document: Document, parser: PydanticOutputParser
) -> Optional[DocumentSummary]:
    """Summarise one document with a refine chain and parse the result.

    The document is split with the module-level ``text_splitter`` and the chunks
    are fed to a "refine" summarisation chain built on the module-level ``llm``.
    Both globals are looked up when this coroutine runs.

    Args:
        document: The text document to summarise.
        parser: Supplies the format instructions injected into both prompts and
            parses the chain's final text.

    Returns:
        The parsed summary with ``metadata`` copied from ``document``, or
        ``None`` when the splitter yields no chunks.

    Raises:
        Any error raised by the chain call or by ``parser.parse`` propagates to
        the caller.
    """
    # Split the parent document into chunks:
    split_docs = text_splitter.split_documents([document])

    # If there are no documents, return None:
    if len(split_docs) == 0:
        return None

    # Run a refine summarization chain that extracts unique key points and opinions within an article:
    prompt_template = """Act as a content SEO researcher. You are interested in summarizing and extracting key points from the following text. 
    The insights gained will be used to do content research and we will compare the key points, insights and summaries across multiple articles.
    ---
    - You must analyze the text and extract the key points and opinions from the following text
    - You must extract the key points and opinions from the following text:
    {text}

    {format_instructions}
    """
    prompt = PromptTemplate.from_template(prompt_template)

    # Refine template:
    refine_template = (
        "Your job is to produce a final summary.\n"
        "We have provided an existing summary, key points, and expert opinions up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing content (only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary.\n"
        # The trailing newline keeps the format instructions from being glued
        # onto the end of this sentence.
        "If the context isn't useful or does not provide additional key points or expert opinions, you must return the original summary.\n"
        "{format_instructions}"
    )
    refine_prompt = PromptTemplate.from_template(refine_template)

    chain = load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=True,
        input_key="input_documents",
        output_key="output_text",
    )

    print('Summarizing the data!')
    # Use the public async entry point: calling the private `_acall` directly
    # skips the chain's input/output preparation and callback handling.
    summary_result = await chain.acall(
        inputs={
            "input_documents": split_docs,
            "format_instructions": parser.get_format_instructions(),
        },
    )

    print("Parsing the output!")
    document_summary = parser.parse(summary_result["output_text"])
    print("Parsed the output!")

    # Carry the source document's metadata over to the summary.
    document_summary.metadata = document.metadata
    return document_summary


In [12]:
# Parser for DocumentSummary objects; create_summary_from_text uses it both for
# the format instructions in its prompts and to parse the chain output.
parser = PydanticOutputParser(pydantic_object=DocumentSummary)

# Extract the html content from the URLs:
html_documents = get_html_content_from_urls(serp_results)

# Extract the text from the URLs:
text_documents = extract_text_from_webpages(html_documents)

In [13]:
async def create_all_summaries(text_documents, parser):
    """Summarise every document concurrently, skipping the ones that fail.

    Args:
        text_documents: Iterable of documents to pass to
            ``create_summary_from_text``.
        parser: Output parser forwarded unchanged to ``create_summary_from_text``.

    Returns:
        The summaries that were created, in the order of ``text_documents``.
        Documents that produced ``None`` or raised an error are left out.

    Raises:
        ValueError: If not a single summary could be created.
    """
    documents = list(text_documents)

    # return_exceptions=True keeps one bad page (for example unparsable LLM
    # output) from discarding the summaries of all the other pages.
    outcomes = await asyncio.gather(
        *(create_summary_from_text(document, parser) for document in documents),
        return_exceptions=True,
    )

    summaries = []
    for position, outcome in enumerate(outcomes):
        if isinstance(outcome, Exception):
            print(f"Skipping document {position}: {outcome!r}")
        elif isinstance(outcome, BaseException):
            # Cancellation and similar signals must not be swallowed.
            raise outcome
        elif outcome is not None:
            summaries.append(outcome)

    if not summaries:
        raise ValueError("No summaries were created!")

    return summaries

In [14]:
# Summarise all extracted pages concurrently. `parser` must be the
# DocumentSummary parser when this cell runs.
summaries = await create_all_summaries(text_documents, parser)

Summarizing the data!
Summarizing the data!
Summarizing the data!
Parsing the output!
Parsed the output!
Parsing the output!
Parsed the output!
Parsing the output!
Parsed the output!


---

## Expert Interview Questions:


Promising agents:
- https://python.langchain.com/docs/modules/agents/
- https://python.langchain.com/docs/modules/agents/agent_types/structured_chat


In [15]:
import langchain
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentExecutor
from langchain.schema import SystemMessage
from langchain.agents import OpenAIFunctionsAgent, OpenAIMultiFunctionsAgent
from langchain.prompts import MessagesPlaceholder
from langchain.memory import ConversationBufferMemory
# Global debug switch. NOTE(review): this is presumably what produces the long
# [chain/start] / [llm/start] traces in the outputs below — confirm.
langchain.debug = True

In [16]:
# Custom tools:
from tools.generate_interview_questions import GenerateInterviewQuestions
from tools.human_in_the_loop import HumanInTheLoop

In [17]:
# Dedicated model for the interview agent. It gets its own name so that the
# `llm` global read by create_summary_from_text is not silently swapped for
# gpt-4 when the summary cells are re-run after this one.
agent_llm = ChatOpenAI(temperature=0, model='gpt-4')

# Generate tools:
tools = [GenerateInterviewQuestions(), HumanInTheLoop()]

system_message = SystemMessage(content=f'''You are very powerful assistant and are responsible for investigating the following topic: {TOPIC}. 
                               You are bad at extracting key points from articles and need help.
                               Also you must let the human answer questions as this is their interview.
                               ---
                               ''')

# Generate memory:
MEMORY_KEY = "chat_history"
memory = ConversationBufferMemory(memory_key=MEMORY_KEY, return_messages=True)

# Build the prompt with the same agent class that consumes it (it was previously
# created via OpenAIMultiFunctionsAgent but handed to OpenAIFunctionsAgent).
prompt = OpenAIFunctionsAgent.create_prompt(
    system_message=system_message,
    extra_prompt_messages=[MessagesPlaceholder(variable_name=MEMORY_KEY)],
)

# Create the agent:
agent = OpenAIFunctionsAgent(llm=agent_llm, tools=tools, prompt=prompt)

# Create the agent executor:
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, memory=memory)

In [18]:
# Agent code:
print("Starting the agent executor!")
# Display the conversation memory before the first run.
agent_executor.memory.buffer    

Starting the agent executor!


[]

In [19]:
# First turn: gives the agent a fact (the user's name) to remember.
# Plain string literal: the f-prefix had no placeholders to format.
result = agent_executor.run("My name is James, what is your name?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "My name is James, what is your name?",
  "chat_history": []
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are very powerful assistant and are responsible for investigating the following topic: Neural networks. \n                               You are bad at extracting key points from articles and need help.\n                               Also you must let the human answer questions as this is their interview.\n                               ---\n                               \nHuman: My name is James, what is your name?"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] [4.37s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Hello James, I'm OpenAI's language model. I don't have a personal name as I'm an artificial

In [20]:
# Second turn: checks that the conversation memory carries the name across runs.
# Plain string literal: the f-prefix had no placeholders to format.
agent_executor.run("What is my name?")
# Display the accumulated chat history.
agent_executor.memory.buffer

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are very powerful assistant and are responsible for investigating the following topic: Neural networks. \n                               You are bad at extracting key points from articles and need help.\n                               Also you must let the human answer questions as this is their interview.\n                               ---\n                               \nHuman: My name is James, what is your name?\nAI: Hello James, I'm OpenAI's language model. I don't have a personal name as I'm an artificial intelligence. How can I assist you today?\nHuman: What is my name?"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] [1.09s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "te

[HumanMessage(content='My name is James, what is your name?', additional_kwargs={}, example=False),
 AIMessage(content="Hello James, I'm OpenAI's language model. I don't have a personal name as I'm an artificial intelligence. How can I assist you today?", additional_kwargs={}, example=False),
 HumanMessage(content='What is my name?', additional_kwargs={}, example=False),
 AIMessage(content='Your name is James.', additional_kwargs={}, example=False)]

In [21]:
# Hand the summaries (as plain dicts) and the topic to the agent and ask it to
# produce interview questions for the user to answer.
agent_executor.run(f"""document_summaries: {[s.dict() for s in summaries]}
                       topic: {TOPIC}
                       ---
                       Use the above to make interview questions, then I want to answer them.
                       """)

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are very powerful assistant and are responsible for investigating the following topic: Neural networks. \n                               You are bad at extracting key points from articles and need help.\n                               Also you must let the human answer questions as this is their interview.\n                               ---\n                               \nHuman: My name is James, what is your name?\nAI: Hello James, I'm OpenAI's language model. I don't have a personal name as I'm an artificial intelligence. How can I assist you today?\nHuman: What is my name?\nAI: Your name is James.\nHuman: document_summaries: [{'concise_summary': \"Neural networks are a subset of machine learning and are at the heart of deep learning algor

'Great! You can now use the questions and answers in the file "questions_and_answers.json" for your interview. If you need further assistance, feel free to ask.'

In [22]:
# Display the full chat history after the interview run.
agent_executor.memory.buffer

[HumanMessage(content='My name is James, what is your name?', additional_kwargs={}, example=False),
 AIMessage(content="Hello James, I'm OpenAI's language model. I don't have a personal name as I'm an artificial intelligence. How can I assist you today?", additional_kwargs={}, example=False),
 HumanMessage(content='What is my name?', additional_kwargs={}, example=False),
 AIMessage(content='Your name is James.', additional_kwargs={}, example=False),
 HumanMessage(content='document_summaries: [{\'concise_summary\': "Neural networks are a subset of machine learning and are at the heart of deep learning algorithms. They mimic the way biological neurons signal to one another. Neural networks rely on training data to improve their accuracy over time and are powerful tools in computer science and artificial intelligence. They can classify and cluster data quickly, such as in speech recognition or image recognition tasks. Google\'s search algorithm is a well-known neural network.", \'writing_

---

## General Article Outline:


In [None]:
import json
# Load the interview questions and answers from disk.
# NOTE(review): the agent's final message above mentions "questions_and_answers.json",
# so the file is presumably written by one of the custom tools during the
# interview run — confirm. That run must have completed before this cell.
with open('questions_and_answers.json', 'r') as f:
    questions_and_answers = json.load(f)

In [74]:
# One section heading of the blog outline. The generation loop further down
# writes one blog section per SubHeading, using its `title`.
class SubHeading(BaseModel):
    title: str

# Target structure for the outline chain's parser: an article title plus the
# list of sub-headings.
class BlogOutline(BaseModel):
    title: str
    sub_headings: List[SubHeading]

# Template for the outline request. The {placeholders} are filled from the dict
# passed to outline_chain.invoke below.
prompt = """
Based on my answers and the summary, generate an outline for a blog article on {topic}.
topic: {topic}
document_summaries: {document_summaries}
---
Here is the interview which I answered: {interview_questions_and_answers}
---
Output format: {format_instructions}
"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
)

# Create the prompt
system_message_prompt = SystemMessagePromptTemplate.from_template(prompt)
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt])

# Create an output parser. It gets its own name: re-binding `parser` here would
# make a later re-run of the summary cell parse summaries as BlogOutline objects.
from langchain.output_parsers import PydanticOutputParser
outline_parser = PydanticOutputParser(pydantic_object=BlogOutline)

# outline_chain = LLMChain(llm=llm, prompt=chat_prompt, output_parser=outline_parser, output_key='blog_post_outline')

# LangChain Expression Language does the same as the above:
outline_chain = (
    chat_prompt |
    ChatOpenAI() |
    outline_parser
)

# Invoke the chain; the result is a parsed BlogOutline:
outline_result = outline_chain.invoke({
    "topic": TOPIC,
    "document_summaries": [s.dict() for s in summaries],
    "interview_questions_and_answers": questions_and_answers,
    "format_instructions": outline_parser.get_format_instructions(),
})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "topic": "Neural networks",
  "document_summaries": [
    {
      "concise_summary": "Neural networks are a subset of machine learning and are at the heart of deep learning algorithms. They mimic the way biological neurons signal to one another. Neural networks rely on training data to improve their accuracy over time and are powerful tools in computer science and artificial intelligence. They can classify and cluster data quickly, such as in speech recognition or image recognition tasks. Google's search algorithm is a well-known neural network.",
      "writing_style": "The text provides a clear explanation of neural networks and their structure. It uses examples and equations to illustrate how neural networks work.",
      "key_points": [
        "Neural networks are inspired by the human brain and mimic the way neurons signal to one another.",
        "They consist of node layers, in

---

## Article Text Generation:


In [149]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Create the embeddings:
embeddings = OpenAIEmbeddings()

# Chunk the .html text documents:
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunk_size = 400
chunk_overlap = 100

# Separate name on purpose: create_summary_from_text reads the global
# `text_splitter`, so re-binding it to this smaller splitter would change the
# summaries whenever that cell is re-run after this one.
retrieval_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split the text_documents variables:
chunked_documents = retrieval_text_splitter.split_documents(text_documents)
print(len(chunked_documents))

42


In [150]:
# Vectorize all of the original .html pages:
# (the chunks are embedded with `embeddings`; the section-writing loop below
# queries this store for passages relevant to each sub-heading)
chroma_db = Chroma.from_documents(chunked_documents, embedding=embeddings)

In [160]:
from langchain.memory import ConversationSummaryBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain.schema import SystemMessage
from typing import Any

class OnlyStoreAIMemory(ConversationSummaryBufferMemory):
    """Summary-buffer memory that records only the AI side of each exchange."""

    def save_context(self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None:
        """Save only the AI output of this exchange to the buffer, then prune.

        Args:
            inputs: The chain inputs for this call (the human side is not stored).
            outputs: The chain outputs for this call.
        """
        _, output_str = self._get_input_output(inputs, outputs)
        # The human message is deliberately not stored.
        # NOTE(review): presumably because each human turn in this notebook is a
        # long section prompt (retrieved documents + interview answers) — confirm.
        self.chat_memory.add_ai_message(output_str)
        # Overriding save_context bypasses the parent implementation, which is
        # where the buffer gets pruned; without this call max_token_limit is
        # never enforced and the history grows without bound.
        self.prune()

# System prompt for the section writer. It is formatted once, here, so the topic
# and the JSON outline are baked into the text.
prompt = f"""
Act as a content SEO writer.
You are currently writing a blog post on topic: {TOPIC}.
This is the outline of the blog post: {outline_result.json()}. You will be responsible for writing the blog post sections.
---
Use your previous AI messages to avoid repeating yourself as you continually re-write the blog post sections.
"""

# The same chat model is passed to the memory (llm=chat) and, in the next cell,
# to the chain that writes the sections.
chat = ChatOpenAI(model='gpt-3.5-turbo-16k')
memory = OnlyStoreAIMemory(llm=chat, memory_key='chat_history', return_messages=True, max_token_limit=1200)


# Passed as a ready-made SystemMessage rather than a template, so the prompt
# text above is used as-is.
chat_prompt = ChatPromptTemplate.from_messages([SystemMessage(content=prompt), 
                                                MessagesPlaceholder(variable_name="chat_history"), # Where the memory will be stored.
                                                HumanMessagePromptTemplate.from_template("{human_input}"), # Where the human input will injected
                                                ])


In [161]:
# Creating the chain:
# (one call per section; `memory` records only the AI replies between calls)
blog_post_chain = LLMChain(llm=chat, prompt=chat_prompt, memory=memory, output_key='blog_post')

In [163]:
# Collects the generated section texts; the section-writing loop appends to it.
blog_post = []

In [165]:
# Start from an empty list so that re-running this cell does not append every
# section a second time.
blog_post.clear()

# Largest number of retrieved chunks to try per section.
MAX_RELEVANT_DOCUMENTS = 5

for subheading in outline_result.sub_headings:
    # Retry with fewer and fewer retrieved chunks; the last attempt (k == 0)
    # sends no documents at all, so the fallback really is used.
    for k in range(MAX_RELEVANT_DOCUMENTS, -1, -1):
        try:
            if k > 0:
                # Get the relevant documents. The number of results is set on
                # the retriever through search_kwargs, not on the query call.
                relevant_documents = chroma_db.as_retriever(
                    search_kwargs={"k": k}
                ).get_relevant_documents(subheading.title)
            else:
                print("All attempts to fetch relevant documents have failed. Using an empty string for relevant_documents.")
                relevant_documents = ""

            # Generate the AI HumanMessage:
            section_prompt = f"""
            You are currently writing the section: {subheading.title}
            ---
            Here are the relevant documents for this section: {relevant_documents}.
            If the relevant documents are not useful, you can ignore them.
            You must never copy the relevant documents as this is plagiarism.
            --- 
            Here are the relevant insights that we gathered from our interview questions and answers: {questions_and_answers}. 
            You must include these insights where possible as they are important and will help our content rank better.

            ---
            You must follow the following principles:
            - You must write the section: {subheading.title}
            - Render the output in .md format
            - Include relevant formats such as bullet points, numbered lists, etc.
            ---
            Section text: 
            """
            # Invoke the chain:
            section_text = blog_post_chain.predict(human_input=section_prompt)
        except Exception as e:  # Replace Exception with a more specific exception if possible
            print(f"An error occurred: {e}")
            continue

        # Append the result and stop retrying since it succeeded:
        print('Appending the result!')
        blog_post.append(section_text)
        break
    else:
        # Every attempt failed, including the one without documents. Say so
        # explicitly instead of dropping the section without a trace.
        print(f"Could not generate the section: {subheading.title}")

[32;1m[1;3m[chain/start][0m [1m[1:chain:LLMChain] Entering Chain run with input:
[0m{
  "human_input": "\n            You are currently writing the section: What are Neural Networks?\n            ---\n            Here are the relevant documents for this section: [Document(page_content='Cloud Log in Create IBM Cloud account\\n\\n#  What are neural networks?\\n\\nNeural networks try to emulate the human brain, combining computer science and\\nstatistics to solve common problems in the field of AI\\n\\nDiscover watsonx.ai\\n\\n##  What is a neural network?\\n\\nNeural networks, also known as artificial neural networks (ANNs) or simulated\\nneural networks (SNNs), are a subset of machine learning and are at the heart\\nof deep learning algorithms. Their name and structure are inspired by the\\nhuman brain, mimicking the way that biological neurons signal to one another.\\n\\nArtificial neural networks (ANNs) are comprised of a node layers, containing\\nan input layer, one or more hidd

In [167]:
# Number of sections generated; compare with len(outline_result.sub_headings).
print(len(blog_post))

5


---

## Re-write the Article:

- Read this - https://blog.langchain.dev/using-langsmith-to-support-fine-tuning-of-open-source-llms/
- Use this? https://github.com/langchain-ai/twitter-finetune/tree/main
- Use chat loaders -> https://python.langchain.com/docs/integrations/chat_loaders/?ref=blog.langchain.dev


---

## Title Tag Optimization:


---


## Gradio Interface (use Mike's to do this):
