In [1]:
!pip install openai==0.28 langchain pinecone
!pip install -U langchain-community
!pip install pypdf
!pip install tiktoken



In [2]:
import os
# Expose the service credentials through environment variables; the later
# cells read them back with os.getenv. The values below are placeholders and
# must be replaced before running (do not commit real keys).
os.environ.update({
    'PINECONE_API_KEY': '<pinecone_api_key>',
    'OPENAI_API_KEY': '<openai_api_key>',
})

In [12]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Read documents in directory
def read_doc(directory: str):
    """Load the documents found under ``directory``.

    Args:
        directory: Path handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call returns.
    """
    return PyPDFDirectoryLoader(directory).load()


# Divide document into text chunks
def chunk_data(docs, chunk_size=500, chunk_overlap=20):
    """Split ``docs`` into smaller pieces with a recursive character splitter.

    Args:
        docs: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` (default 500).
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` (default 20).

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)


# Similarity search to retrieve the top-k matches from the vector DB (the distance metric is whatever the index is configured with)
def retrieve_query(index, query, k=2):
    """Return the ``k`` best matches for ``query`` from ``index``.

    Args:
        index: Object exposing ``similarity_search(query, k=...)``.
        query: Query passed through to the search unchanged.
        k: Number of results requested (default 2).

    Returns:
        The value returned by ``index.similarity_search``.
    """
    return index.similarity_search(query, k=k)


# Search answers from Pinecone VectorDB
def retrieve_answers(index, chain, query):
    """Answer ``query`` by running ``chain`` over documents fetched from ``index``.

    Args:
        index: Vector store passed to ``retrieve_query``.
        chain: Object exposing ``run(input_documents=..., question=...)``.
        query: The question; used both for retrieval and as the chain's question.

    Returns:
        The value returned by ``chain.run``.
    """
    # Retrieval uses retrieve_query's default number of results.
    context_docs = retrieve_query(index, query)
    return chain.run(input_documents=context_docs, question=query)


# Get results from query
def qa_manager(query):
    """Answer ``query`` using the notebook-level vector store and QA chain.

    Args:
        query: The question to answer.

    Returns:
        The value returned by ``retrieve_answers``.

    Raises:
        NameError: If the module-level ``index`` and ``chain`` objects have not
            been created yet (they are built in the pipeline setup cell).
    """
    # retrieve_answers requires (index, chain, query); passing only the query
    # raised a TypeError on every call.
    return retrieve_answers(index, chain, query)

In [13]:
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from langchain.vectorstores import Pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [14]:
# Configuration
DOC_DIR_PATH = '/content/documents'  # directory passed to read_doc
PINECONE_INDEX = 'bot'               # Pinecone index name used for storage
OPENAI_LLM = 'gpt-3.5-turbo'         # model name passed to the OpenAI wrapper

# Read docs
doc = read_doc(DOC_DIR_PATH)

# Split docs to smaller chunks
documents = chunk_data(doc)

# Get OpenAI embeddings
embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"), model="text-embedding-ada-002")

# Connect Pinecone client
pc = pinecone.Pinecone(
    api_key=os.getenv("PINECONE_API_KEY")
)

# Database index
index_name = PINECONE_INDEX

# Store embeddings in Pinecone.
# Index the chunks produced by chunk_data rather than the unsplit `doc` list;
# otherwise the chunking step above has no effect on what is stored or retrieved.
index = Pinecone.from_documents(documents, embeddings, index_name=index_name)

# Load LLM from OpenAI
# NOTE(review): 'gpt-3.5-turbo' is a chat model; confirm that the
# completion-style `OpenAI` wrapper is intended here rather than a chat wrapper.
llm = OpenAI(model_name=OPENAI_LLM, temperature=0.3)

# Load Question-Answer Chain from LLM
chain = load_qa_chain(llm, chain_type="stuff")





In [8]:
# Ask about the washer's capacity and show the chain's answer.
query = 'What is the capacity of the washer?'
answer = retrieve_answers(index=index, chain=chain, query=query)
print(answer)

The capacity of the washer is 4.7 cu. ft.


In [9]:
# Ask about the number of temperature settings and show the chain's answer.
# NOTE(review): the query text contains misspellings ('temeprature',
# 'avialable'); kept verbatim — confirm whether that is intentional.
query = 'How many temeprature settings avialable in the washer?'
answer = retrieve_answers(index=index, chain=chain, query=query)
print(answer)

There are 5 temperature settings available in the washer.


In [10]:
# Ask about the number of washing programs and show the chain's answer.
query = 'How many washing programs available in the washer?'
answer = retrieve_answers(index=index, chain=chain, query=query)
print(answer)

There are 12 washing programs available in the washer.


In [16]:
# Ask for the washer's feature list and show the chain's answer.
query = 'what are the features available in the washer?'
answer = retrieve_answers(index=index, chain=chain, query=query)
print(answer)

The features available in the washer include WaveForce™ technology, ColdWash™ option, SmartRinse™ Jet Spray System, SmartDiagnosis™, SlamProof™ Lid, truebalance™ Anti-Vibration System, LoDecibel™ Quiet operation, SenseClean™, Delay Wash (Up to 19 Hours), 4.7 cu. ft. Ultra Large Capacity with neveRust™ Stainless Steel Drum, Direct Drive motor with 10-Year Limited Warranty, 1,100 RPM, 12 Washing Programs, 5 temperature Settings (All Cold Rinses), Electronic Control Panel with Dual LED Display and Dial-A-Cycle™, and Clear Diamond glass Lid.


In [18]:

# Ask for the dryer's UPC codes and show the chain's answer.
query = 'what are the UPC CODES of the dryer?'
answer = retrieve_answers(index=index, chain=chain, query=query)
print(answer)

The UPC CODES of the dryers are 048231012546 for the electric dryer and 048231012553 for the gas dryer.
