In [33]:
from langchain.document_loaders.base import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain.utilities import ApifyWrapper

apify = ApifyWrapper()

loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input={"startUrls": [{"url": "https://python.langchain.com/en/latest/"}]},
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"] or "", metadata={"source": item["url"]}
    ),

)


In [44]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [42]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import Pinecone
import os


documents = loader.load()

cleaned = []

common = "Skip to main content \nCtrl+K \n🦜🔗 LangChain 0.0.152\nBuild MongoDB Atlas databases with Python, Java, C# & more. Try it for free today.\nAd by EthicalAds · ℹ️\nv: latest \nVersions latest stable harrison/docs-refactor-3-24 \nDownloads HTML \nOn Read the Docs Project Home Builds Downloads \nOn GitHub View Edit \nSearch \nHosted by Read the Docs · Privacy Policy"

print(len(documents))


counter = 0
for doc in documents:
    original_content = doc.page_content
    doc.page_content = doc.page_content.replace(common, "")
    if original_content != doc.page_content:
        counter += 1
    cleaned.append(doc)

print(counter)

documents = cleaned

len_arr = [len(doc.page_content) for doc in documents]

text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)

len_arr2 = [len(doc.page_content) for doc in docs]

print(len(len_arr))
print(len(len_arr2))

if len_arr == len_arr2:
    print("Error")

embeddings = OpenAIEmbeddings()
import pinecone 

# initialize pinecone
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone.init(
    api_key=pinecone_api_key,
    environment="us-east1-gcp"  # next to api key in console
)

index_name = "langchain-docs"

retriever = Pinecone.from_documents(docs, embeddings, index_name=index_name)

from langchain.chat_models import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

compressed_docs = compression_retriever.get_relevant_documents("What did the president say about Ketanji Jackson Brown")
pretty_print_docs(compressed_docs)

1141
33
1141
6992


ValidationError: 1 validation error for ContextualCompressionRetriever
base_retriever
  instance of BaseRetriever expected (type=type_error.arbitrary_type; expected_arbitrary_type=BaseRetriever)

In [46]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

retriever2 = retriever.as_retriever()

llm = ChatOpenAI(temperature=0, model_name="gpt-4")
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever2)

compressed_docs = compression_retriever.get_relevant_documents("Give me a tool to create a conversational agent with web searching from LangChain.")
pretty_print_docs(compressed_docs)

Document 1:

Conversational Agent with Tools (Langchain AGI)
----------------------------------------------------------------------------------------------------
Document 2:

from langchain.agents import Tool
from langchain.agents import AgentType
from langchain.memory import ConversationBufferMemory
from langchain import OpenAI
from langchain.utilities import SerpAPIWrapper
from langchain.agents import initialize_agent
search = SerpAPIWrapper()
tools = [
    Tool(
        name = "Current Search",
        func=search.run,
        description="useful for when you need to answer questions about current events or the current state of the world"
    ),
]
memory = ConversationBufferMemory(memory_key="chat_history")
llm=OpenAI(temperature=0)
agent_chain = initialize_agent(tools, llm, agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION, verbose=True, memory=memory)
----------------------------------------------------------------------------------------------------
Document 3:

search = SerpAPIWr

In [32]:
query = "What is the latest and most powerful model"

from langchain.llms import OpenAI

llm = OpenAI(model_name="gpt-4", verbose=True)

result = index.query_with_sources(query, llm=llm)

print(result)

# print(result["answer"])
# print(result["sources"])



{'question': 'What is the latest and most powerful model', 'answer': "I don't know the latest and most powerful model as the provided content does not specify the model or the domain it belongs to.\nSOURCES:", 'sources': ''}
