In [1]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [2]:
# Load the OpenAI API key without embedding the secret in the notebook.
# SECURITY: a literal key was previously hardcoded in this cell. Treat that key as
# compromised and revoke it; never commit credentials to a notebook.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    # Not provided by the environment: prompt for it so the value is never
    # written into the saved notebook source.
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [3]:
# Settings for the OpenAI completion model used by the QA chain.
llm_settings = {
    "temperature": 0.9,
    "max_tokens": 500,
    "model": "gpt-3.5-turbo-instruct",
}
llm = OpenAI(**llm_settings)

### (1) Load data

In [4]:
# Web pages to pull into the pipeline.
article_urls = [
    "https://www.muscleandfitness.com/features/feature-news/joey-swoll-hilariously-surprises-girl-as-she-films-herself-flexing/",
    "https://www.muscleandfitness.com/flexonline/flex-news/lessons-being-learned-on-bodybuildings-best-podcasts/",
]
loaders = UnstructuredURLLoader(urls=article_urls)

# Fetch the pages; the result is a list (see the type check in the next cell).
data = loaders.load()

In [5]:
# Sanity check: inspect the container type returned by loaders.load().
type(data)

list

### (2) Split data to create chunks

In [6]:
# Chunking parameters: target chunk size and the overlap kept between
# neighbouring chunks.
splitter_config = dict(chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(**splitter_config)

# `data` holds document objects, so split_documents (rather than split_text)
# is used to produce the chunks.
docs = text_splitter.split_documents(data)

### (3) Create embeddings for these chunks and save them to FAISS index

In [10]:
# Embedding model for the chunks: OpenAIEmbeddings constructed with no arguments.
embeddings = OpenAIEmbeddings()

# Build a FAISS vector index from the chunked documents and the embedding model.
vectorindex_openai = FAISS.from_documents(
    docs,
    embeddings,
)

In [11]:
# Sanity check: show the repr of the FAISS vector store built from the
# OpenAI embeddings.
vectorindex_openai

<langchain.vectorstores.faiss.FAISS at 0x7f5f18c8b910>

In [15]:
from sentence_transformers import SentenceTransformer

In [16]:
# Persist the OpenAI-backed vector index to a local pickle file.
file_path = "vector_index.pkl"
with open(file_path, mode="wb") as index_file:
    pickle.dump(vectorindex_openai, index_file)

In [17]:
# Reload the vector index from the pickle file at `file_path`.
# NOTE(security): pickle.load can execute arbitrary code contained in the file,
# so only load a pickle that this notebook wrote itself, never an untrusted one.
if not os.path.exists(file_path):
    # Fail here with a clear message instead of silently leaving `vectorIndex`
    # undefined, which would surface later as an unrelated NameError.
    raise FileNotFoundError(f"Vector index file not found: {file_path}")

with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [19]:
# Sanity check: show the repr of the object unpickled from `file_path`.
vectorIndex

<langchain.vectorstores.faiss.FAISS at 0x7fceb32c9d90>

### (4) Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [8]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Name of the HuggingFace embedding model to load.
hf_model_name = "all-mpnet-base-v2"

# Note: this rebinds `embeddings`, which earlier held the OpenAIEmbeddings instance.
embeddings = HuggingFaceEmbeddings(model_name=hf_model_name)

# Second FAISS index over the same chunks, built with the HuggingFace embeddings.
db = FAISS.from_documents(
    docs,
    embeddings,
)


In [9]:
# Sanity check: show the repr of the FAISS store built from the HuggingFace embeddings.
db

<langchain.vectorstores.faiss.FAISS at 0x7f5f3274fa30>

In [12]:
# Wire the LLM and a retriever over the HuggingFace-backed FAISS store into a
# question-answering chain that also reports its sources.
doc_retriever = db.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=doc_retriever)
chain



In [13]:
# Turn on LangChain's global debug flag before the chain is run.
langchain.debug = True

# Question posed to the retrieval chain.
query = "what are these documents about?"

# Run the chain; return_only_outputs=True is passed so that only the chain's
# outputs are returned.
chain_inputs = {"question": query}
chain(chain_inputs, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what are these documents about?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Men\n                                                            \n                                                                \n                                                                \n                                                                Women\n                                                            \n                                                            \n                                                        \n                                

{'answer': ' These documents are about promotions for Muscle & Fitness and its magazine, as well as their podcast. \n',
 'sources': 'https://www.muscleandfitness.com/features/feature-news/joey-swoll-hilariously-surprises-girl-as-she-films-herself-flexing/'}