In [None]:
# 1. Loading URLs into Smaller Documents
#       - Load URL
#       - Split document into chunks
#       - Create text embeddings with OpenAI
#
# 2. Storing Embeddings in Pinecone
# 3. Defining the Retriever
# 4. Creating the Chatbot Chain

In [None]:
# !pip install langchain
# !pip install openai
# !pip install pinecone-client
# !pip install tiktoken

In [1]:
import os

os.environ["OPENAI_API_KEY"] = "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"
os.environ["PINECONE_API_KEY"] = "bce703c9-3001-4e5d-86d3-6c2d8c9d72cd"

In [2]:
urls = [
    'https://www.apollodiagnostics.in/blog/10-benefits-of-fruits-in-our-life',
    'https://www.healthline.com/nutrition/healthy-fruit'
]

In [13]:
from langchain_community.document_loaders import SeleniumURLLoader

loader = SeleniumURLLoader(urls=urls)
documents = loader.load()

In [14]:
documents

[Document(page_content="Home\n\nBlog\n\n10 BENEFITS OF FRUITS IN OUR LIFE\n\n10 BENEFITS OF FRUITS IN OUR LIFE\n\nJun 04, 2020\n\nFruits must be an indispensable part of our daily diet if we wish to lead a disease-free life. They come with numerous health benefits owing to the presence of crucial nutrients and minerals.\n\nFruit consumption can cure deficiency problems and also regulate the physiological functions of the body.\n\nRead on to know about the 10 benefits of eating fruits\n\nGreat source of essential vitamins: They are an excellent source of vitamins, minerals and phytonutrients. Eating a variety of fruits in the form of a fruit salad can go a long way in stimulating the energy level and health factor of the body.\n\nKeeping cardiovascular diseases at bay: Fruits also reduce the risk of heart-related diseases and even cancer. Fruits like apricot, apple, grapefruit are rich in flavonoids, carotenoids, fibre, potassium and magnesium that protects the heart against a plethora 

In [15]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    
    separator = '\n',
    chunk_size = 1000,
    chunk_overlap = 150
    
)

docs = text_splitter.split_documents(documents)

In [16]:
len(docs)

32

In [17]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings

openai_api_key = os.getenv('OPENAI_API_KEY')
model_name = "text-embedding-ada-002"

embeddings = OpenAIEmbeddings(
    
    model = model_name,
    openai_api_key = openai_api_key
    
)

In [18]:
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1684aa580>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x176b31430>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

In [21]:
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pinecone_api_key = os.getenv('PINECONE_API_KEY')
region = 'us-east-1'

pc = Pinecone(api_key=pinecone_api_key)

index_name = "langchain-chatbot"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        
        name = index_name,
        metric = 'cosine',
        dimension = 1536,
        spec=ServerlessSpec(cloud="aws", region=region)
        
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)


In [22]:
index = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)

In [28]:
retriever = index.as_retriever(search_type="similarity", search_kwargs={"k":2})

In [33]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

llm_name = 'gpt-3.5-turbo'

qa_chain = RetrievalQAWithSourcesChain.from_llm(
    
    ChatOpenAI(temperature=0, model=llm_name, openai_api_key=openai_api_key),
    retriever = retriever
    
)

In [39]:
question = "What are the benefits of fruits?"

In [40]:
result = qa_chain({"question": question})

print("Answer:", result["answer"])
print("Sources:", result["sources"])

Answer: Fruits have numerous health benefits, including being a great source of essential vitamins, reducing the risk of cardiovascular diseases, aiding in digestion, keeping body weight in check, nourishing the skin and hair, and hydrating the body.

Sources: https://www.apollodiagnostics.in/blog/10-benefits-of-fruits-in-our-life
