In [None]:
# 1. Loading URLs into Smaller Documents
#       - Load URL
#       - Split document into chunks
#       - Create text embeddings with OpenAI
#
# 2. Storing Embeddings in Pinecone
# 3. Defining the Retriever
# 4. Creating the QA Chain

In [None]:
# !pip install langchain
# !pip install openai
# !pip install pinecone-client
# !pip install tiktoken
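# !pip install selenium langchain-pinecone
# SeleniumURLLoader is imported from langchain-community and parses pages with unstructured
# !pip install langchain-community unstructured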

In [None]:
import os

# Make sure both API keys are available through the environment, which is
# where the later cells read them from via os.getenv().
#
# Keys are never written into the notebook: a value that is already exported
# in the environment is kept as-is, and a missing one is requested
# interactively (getpass does not echo the input, so it cannot leak into the
# saved cell output).
from getpass import getpass

for _key_name in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f"Enter {_key_name}: ")

In [None]:
# Web pages that serve as the knowledge base for the QA chain.
urls = [
    "https://www.apollodiagnostics.in/blog/10-benefits-of-fruits-in-our-life",
    "https://www.healthline.com/nutrition/healthy-fruit",
]

In [None]:
from langchain_community.document_loaders import SeleniumURLLoader

# Fetch every page in `urls` and turn it into LangChain documents.
loader = SeleniumURLLoader(urls=urls)
documents = loader.load()

# Fail fast when nothing was loaded: the loader may skip URLs it cannot fetch
# (NOTE(review): believed to be its default behaviour - confirm for the
# installed version), and an empty list would otherwise flow silently into
# the splitter, the index, and finally the QA chain.
if not documents:
    raise RuntimeError(f"No documents could be loaded from: {urls}")

In [None]:
# Echo the loaded documents as the cell output for inspection.
documents

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter configuration: break on newlines, with chunk_size=1000 and
# chunk_overlap=150 so neighbouring chunks share some context.
splitter_settings = {
    "separator": "\n",
    "chunk_size": 1000,
    "chunk_overlap": 150,
}
text_splitter = CharacterTextSplitter(**splitter_settings)

# Chunked version of the loaded pages; this is what gets embedded and indexed.
docs = text_splitter.split_documents(documents)

In [None]:
# Show how many chunks the splitter produced.
len(docs)

In [None]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model settings. The key is read from the environment and is
# reused later when the chat model is created.
model_name = "text-embedding-ada-002"
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Client used to embed both the document chunks and the incoming questions.
embeddings = OpenAIEmbeddings(model=model_name, openai_api_key=openai_api_key)

In [None]:
# Echo the embeddings client as the cell output to inspect its configuration.
embeddings

In [None]:
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

# Connect to Pinecone and make sure the target index exists and is ready.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
region = 'us-east-1'

pc = Pinecone(api_key=pinecone_api_key)

index_name = "langchain-chatbot"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        metric='cosine',
        # Must match the vector size produced by the embedding model that
        # fills this index.
        dimension=1536,
        spec=ServerlessSpec(cloud="aws", region=region),
    )

# Wait for readiness on every run, not only right after creation: an index
# left over from an interrupted run can exist while still initialising.
# The wait is bounded so a stuck index raises instead of hanging the kernel.
index_ready_timeout_s = 300
_ready_deadline = time.monotonic() + index_ready_timeout_s
while not pc.describe_index(index_name).status["ready"]:
    if time.monotonic() > _ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after "
            f"{index_ready_timeout_s} seconds"
        )
    time.sleep(1)


In [None]:
import hashlib
from collections import Counter

# Give every chunk a stable ID derived from its source URL and its position
# within that source. Without explicit IDs the vector store assigns new ones
# on each call, so re-running the notebook would insert every chunk again and
# fill the index with duplicates. With stable IDs a re-run overwrites the
# existing vectors instead.
_chunks_seen_per_source = Counter()
chunk_ids = []
for _chunk in docs:
    _source = _chunk.metadata.get("source", "")
    _position = _chunks_seen_per_source[_source]
    _chunks_seen_per_source[_source] += 1
    chunk_ids.append(
        hashlib.sha256(f"{_source}#{_position}".encode("utf-8")).hexdigest()
    )

# Embed the chunks and upsert them into the Pinecone index; `index` is the
# resulting LangChain vector store used to build the retriever.
index = PineconeVectorStore.from_documents(
    docs,
    embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

In [None]:
# Similarity-search retriever over the vector store; search_kwargs asks for
# k=2 chunks per query.
retriever = index.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Chat model that writes the answers; temperature is set to 0.
llm_name = 'gpt-3.5-turbo'
chat_model = ChatOpenAI(temperature=0, model=llm_name, openai_api_key=openai_api_key)

# Question-answering chain that pulls context through `retriever` and
# returns the answer together with its sources.
qa_chain = RetrievalQAWithSourcesChain.from_llm(chat_model, retriever=retriever)

In [None]:
# Question passed to the QA chain in the next cell.
question = "What are the benefits of fruits?"

In [None]:
# Run the chain through `.invoke`; calling the chain object directly
# (`qa_chain({...})`) is deprecated in LangChain. The result is a mapping
# that holds the generated answer and the sources it was drawn from.
result = qa_chain.invoke({"question": question})

print("Answer:", result["answer"])
print("Sources:", result["sources"])