In [45]:
import os
import openai
import pinecone
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())


In [46]:
OPENAI_API_VERSION = '0.28.0'

In [47]:
directory = 'D:/question-answering-using-llm/data'   #keep multiple files (.txt, .pdf) in data folder.

def load_docs(directory):
  loader = DirectoryLoader(directory)
  documents = loader.load()
  return documents

documents = load_docs(directory)
len(documents)

1

In [48]:
def split_docs(documents, chunk_size=1000, chunk_overlap=0):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  docs = text_splitter.split_documents(documents)
  return docs

docs = split_docs(documents)
print(len(docs))

842


In [49]:
docs[0]

Document(page_content='Data Science Projects with Python second edition\n\nA case study approach to gaining valuable insights\n\nfrom real data with machine learning\n\nStephen Klosterman\n\nData Science Projects with Python second edition\n\nCopyright © 2021 Packt Publishing\n\nAll rights reserved. No part of this course may be reproduced, stored in a retrieval system, or transmitted in any form or by any means, without the prior written permission of the publisher, except in the case of brief quotations embedded in critical articles or reviews.\n\nEvery effort has been made in the preparation of this course to ensure the accuracy of the information presented. However, the information contained in this course is sold without warranty, either express or implied. Neither the author nor Packt Publishing, and its dealers and distributors will be held liable for any damages caused or alleged to be caused directly or indirectly by this course.', metadata={'source': 'D:\\question-answering-u

In [50]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [51]:
import pinecone      

pinecone.init(      
	api_key=PINECONE_API_KEY,      
	environment='gcp-starter'      
)


index_name = "question-index"



In [52]:
pinecone.create_index(name=index_name, dimension=1536,
                          metric="cosine", pods=1)





In [53]:
docsearch = Pinecone.from_texts(texts=[d.page_content for d in docs], embedding=embeddings, index_name='question-index')

In [54]:
texts=[d.page_content for d in docs]

len(texts)

842

In [55]:
def get_similiar_docs(query, k=2):  # we can control k value to get no. of context with respect to question.

  similar_docs = docsearch.similarity_search(query, k=k)
  return similar_docs

In [56]:
from langchain.llms import OpenAI


query = "how to import pandas library?"

similar_docs = get_similiar_docs(query=query)

llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

chain = load_qa_chain(llm, chain_type="stuff")

chain.run(input_documents=similar_docs, question=query)

'\nYou can import pandas library by using the command "import pandas".'

In [None]:
query = "What is flow of control statements?"

similar_docs = get_similiar_docs(query=query)

print(similar_docs)

In [20]:
from langchain.llms import AzureOpenAI

model_name = "text-davinci-003"

llm = AzureOpenAI(model_name=model_name, openai_api_version=OPENAI_API_VERSION)

chain = load_qa_chain(llm, chain_type="stuff") #we can use map_reduce chain_type also.

def get_answer(query):
  similar_docs = get_similiar_docs(query)
  print(similar_docs)
  answer = chain.run(input_documents=similar_docs, question=query)
  return answer

In [21]:
query = "What are examples of good data science teams?"
answer = get_answer(query)
print(answer)

[]


InvalidRequestError: Invalid URL (POST /v1/openai/deployments/completions)