In [20]:
%pip install pinecone-client openai tiktoken

Note: you may need to restart the kernel to use updated packages.


In [21]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
import os
import getpass
import pinecone

In [None]:
# Collect credentials interactively rather than hard-coding them in the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter in your OpenAI API Key:")
PINECONE_API_KEY = getpass.getpass("Pinecone API Key:")
PINECONE_ENV = getpass.getpass("Pinecone Environment:")

# Name of the Pinecone index used by the cells below. Hard-code it here, or
# leave it empty to be prompted for it (the name is not a secret, so plain
# input() is used instead of getpass).
PINECONE_INDEX = ""
if not PINECONE_INDEX:
    PINECONE_INDEX = input("Pinecone Index Name:").strip()
if not PINECONE_INDEX:
    # Fail here with a clear message instead of passing an empty index name
    # to the Pinecone calls further down.
    raise ValueError("PINECONE_INDEX must be set to the name of your Pinecone index.")

In [2]:
# Location of the PDF. A URL is used here; for a local copy, pass a path such
# as "data/principles_of_marketing_book.pdf" to PyPDFLoader instead.
PDF_SOURCE = (
    "https://storage.googleapis.com/strapi_cms_assets/principles_of_marketing_book.pdf"
)
loader = PyPDFLoader(PDF_SOURCE)

# Load the PDF, split it into documents, and show the first one as a sanity check.
raw_documents = loader.load_and_split()
first_document = raw_documents[0]
print(first_document)

page_content='Principles of Mark eting' metadata={'source': '/var/folders/_y/20jl658s4jl0zvy5c0x0c5140000gn/T/tmpr74w14jh/tmp.pdf', 'page': 0}


In [3]:
# Keep the example small: only the final 100 documents are used from here on.
EXAMPLE_DOC_COUNT = 100
raw_documents = raw_documents[-EXAMPLE_DOC_COUNT:]

In [7]:
# Split the selected documents into chunks (chunk_size=1000, no overlap).
# Embedding and upload to the vector store happen in a later cell.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
documents = text_splitter.split_documents(raw_documents)

# Authenticate the Pinecone client with the credentials collected earlier,
# then obtain a handle on the configured index.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV,
)
index = pinecone.Index(index_name=PINECONE_INDEX)

In [11]:
# Creates a new vector store from the chunked documents.
# Run this cell only once: running it again duplicates the vector store
# contents and the number of documents in it.
embeddings = OpenAIEmbeddings()
doc_search = Pinecone.from_documents(
    documents,
    embedding=embeddings,
    index_name=PINECONE_INDEX,
)

Often you won't need to ingest new data into your vector database; you simply want to query the vectors that are already stored in it:


In [23]:
# Attach to the index that already exists (nothing is ingested here) and
# expose it as a retriever for the QA chain.
query_embeddings = OpenAIEmbeddings()
existing_vector_store = Pinecone.from_existing_index(
    embedding=query_embeddings,
    index_name=PINECONE_INDEX,
)
retriever = existing_vector_store.as_retriever()

In [25]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Build a retrieval-backed question-answering chain on top of the retriever.
chat_model = ChatOpenAI()
qa = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=retriever,
    chain_type="map_reduce",
    return_source_documents=True,  # the result also carries a 'source_documents' entry
)

# Ask a question; the bare last expression displays the chain's result.
question = {"query": "What is marketing?"}
qa(question)

{'query': 'What is marketing?',
 'result': 'Marketing refers to the activities and strategies used by a company to promote and sell its products or services. It involves creating advertising and promotional campaigns, participating in trade shows, preparing collateral materials, and building brand awareness. Marketing also includes lead management, which is the process of identifying and qualifying potential customers to generate new business opportunities. Overall, marketing aims to create value for both the company and its customers by effectively communicating product benefits and building strong customer relationships.',
 'source_documents': [Document(page_content='13.5 Integr ating Sales and Mark eting\nLearning Objectiv es\n1.Identify the ways in which the marketing function supports the sales function.\n2.Describe how the sales group of a company can support its marketing ef forts.\nTraditionally , sales and marketing are like oil and water—the departments don’ t mix well. Sales