In [1]:
!pip install --quiet google-generativeai langchain faiss-cpu


In [2]:
!pip install --upgrade --quiet  langchain-google-genai

In [3]:
!pip install --quiet langchain_community

In [4]:
pip install --quiet unstructured

In [5]:
import os
import pickle
import time
import langchain
from langchain.chat_models import ChatGooglePalm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.embeddings import GooglePalmEmbeddings  # Use Google PaLM embeddings
from langchain.vectorstores import FAISS


In [6]:
# Load the Google Generative AI API key without embedding it in the notebook.
# Secrets must never be hardcoded in a cell: any key that was ever saved in this notebook
# should be treated as leaked and rotated.
from getpass import getpass

if not os.environ.get('GOOGLE_API_KEY'):
    # Prompt only when the environment does not already provide the key
    os.environ['GOOGLE_API_KEY'] = getpass('Enter your Google API key: ')

In [7]:
# Instantiate the chat model used by the QA chain; the API key is read from the environment.
# NOTE(review): the debug trace recorded further down shows requests going to `chat-bison-001`
# and failing with 404, so the `model="gemini-pro"` argument appears not to take effect here —
# TODO confirm, and consider migrating to the Gemini client from `langchain-google-genai`.
llm = ChatGooglePalm(
    model="gemini-pro",
    max_output_tokens=2048,
    google_api_key=os.environ['GOOGLE_API_KEY'],
)

### (1) Load data

In [8]:
!pip install --upgrade --quiet unstructured

In [9]:
pip install --quiet --upgrade nltk


In [10]:
import nltk

# Fetch the sentence-tokenizer data. NLTK 3.9+ (which the `--upgrade` install above can pull in)
# loads the pickle-free 'punkt_tab' resource, while older releases look for 'punkt',
# so request both and report any resource that could not be fetched.
for resource in ('punkt', 'punkt_tab'):
    if not nltk.download(resource):
        print(f"Warning: NLTK resource '{resource}' could not be downloaded")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [37]:
from langchain_community.document_loaders import WebBaseLoader

# Pages to ingest: two terms-of-use documents.
# NOTE(review): the question asked later in this notebook is about Tesla, which these pages
# presumably do not cover — confirm that this is the intended source list.
loader = WebBaseLoader(
    [
        "https://corporate.britannica.com/termsofuse.html",
        "https://www.apple.com/legal/internet-services/terms/site.html",
    ]
)


In [38]:
# Fetch the configured pages into `data`.
data = loader.load()

# Preview the first loaded Document (metadata and page text).
data[0]

Document(metadata={'source': 'https://corporate.britannica.com/termsofuse.html', 'title': 'Encyclopædia Britannica, Inc. Corporate Site', 'language': 'No language found.'}, page_content='\n\n\n\nEncyclopædia Britannica, Inc. Corporate Site\n\n\n\n\n\n\n\n\nENCYCLOPAEDIA BRITANNICA, INC.\nTERMS OF USEFor All Encyclopaedia Britannica, Inc. ("Britannica") Websites, Mobile Application and  Online Services\nAdvertising-Supported and Subscription, both Consumer and Institutional\n\nLast Updated and Effective as of: February 8, 2024\nUnless other terms and conditions or agreements with Britannica expressly govern, these Terms of Use govern your use of our websites, mobile applications or online services that link to these Terms of Use and those of our wholly owned subsidiaries, including, wihtout limitation, Britannica Asia Pacific Pty Ltd., Britannica Japan Co., Ltd., Encyclopaedia Britannica Australia Ltd., Encyclopaedia Britannica (UK) Ltd, Melingo Ltd. and Merriam-Webster, Incorporated.\n

### (2) Split data to create chunks

In [39]:
# Chunking configuration: chunk_size=1000 with chunk_overlap=200, so neighbouring chunks
# share some text and context is not lost at chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# `data` already holds Document objects, so split_documents (rather than split_text) is used.
docs = text_splitter.split_documents(data)

In [40]:
# Number of chunks produced by the splitter
len(docs)

89

In [41]:
# Inspect the first chunk to sanity-check the split
docs[0]

Document(metadata={'source': 'https://corporate.britannica.com/termsofuse.html', 'title': 'Encyclopædia Britannica, Inc. Corporate Site', 'language': 'No language found.'}, page_content='Encyclopædia Britannica, Inc. Corporate Site\n\n\n\n\n\n\n\n\nENCYCLOPAEDIA BRITANNICA, INC.\nTERMS OF USEFor All Encyclopaedia Britannica, Inc. ("Britannica") Websites, Mobile Application and  Online Services\nAdvertising-Supported and Subscription, both Consumer and Institutional')

### (3) Create embeddings for these chunks and save them to FAISS index

In [42]:
!pip install --quiet InstructorEmbedding

In [43]:
pip install --quiet sentence-transformers==2.2.2


In [44]:
!pip install --quiet tiktoken


In [45]:
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

In [46]:
# Identifier of the embedding model handed to the Instructor embeddings wrapper
model_name = "hkunlp/instructor-large"
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name=model_name,
)


load INSTRUCTOR_Transformer
max_seq_length  512


  model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')))


In [None]:

# Build the FAISS vector index from the chunks in `docs`, embedding them with `instructor_embeddings`
vectordb = FAISS.from_documents(docs, instructor_embeddings)

In [23]:
# Confirm the vector store object was created (prints its repr)
print(vectordb)

<langchain_community.vectorstores.faiss.FAISS object at 0x7a4f84c8a560>


In [25]:
# Persist the vector index to a local pickle file so it can be reloaded without re-embedding
file_path = "vector_index.pkl"
with open(file_path, mode="wb") as index_file:
    pickle.dump(vectordb, index_file)

In [26]:
# Reload the vector index that was pickled to `file_path`.
# Fail loudly when the file is missing: silently skipping the load would leave `vectorIndex`
# undefined and surface later as an unrelated NameError when the chain is built.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Vector index file not found: {file_path}")

# SECURITY: pickle.load can execute arbitrary code from the file; only load files
# that were written by this notebook.
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

  return torch.load(io.BytesIO(b))


### (4) Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [28]:
from langchain.chains import RetrievalQAWithSourcesChain


In [29]:
# Build the retrieval QA chain from the LLM and a retriever over the reloaded index
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=vectorIndex.as_retriever(),
)
# Display the assembled chain
chain



In [34]:
query = "When was tesla found ?"

# Emit verbose chain/LLM traces so every intermediate step is visible in the cell output
langchain.debug = True

# Calling the chain object directly (Chain.__call__) is deprecated in LangChain;
# `invoke` is the supported entry point and takes the same input dict and
# `return_only_outputs` flag.
chain.invoke({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "When was tesla found ?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Industry:Auto CEO:Elon MuskRecent NewsSep. 6, 2024, 9:17 PM UTC(AP)Stock market today: Wall Street falls sharply to close its worst week in nearly 18 monthsSep. 5, 2024, 9:25 PM UTC(AP)Stock market today: Most of Wall Street slips as S&P 500 stays on track for worst week since AprilShow MoreTesla, Inc.,  American manufacturer of electric automobiles, solar panels, and batteries for cars and home power storage. It was founded in 2003 by American entrepreneurs Martin Eberhard and Marc Tarpenning and wa



[31;1m[1;3m[llm/error][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain > llm:ChatGooglePalm] [315.16s] LLM run errored with error:
[0m"NotFound('POST http://localhost:36955/v1beta/models/chat-bison-001:generateMessage?%24alt=json%3Benum-encoding%3Dint: Requested entity was not found.')Traceback (most recent call last):\n\n\n  File \"/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/chat_models.py\", line 624, in generate\n    self._generate_with_cache(\n\n\n  File \"/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/chat_models.py\", line 846, in _generate_with_cache\n    result = self._generate(\n\n\n  File \"/usr/local/lib/python3.10/dist-packages/langchain_community/chat_models/google_palm.py\", line 302, in _generate\n    response: genai.types.ChatResponse = chat_with_retry(\n\n\n  File \"/usr/local/lib/python3.10/dist-packages/langchain_community/chat_models/google_palm.py\", line 201, in chat_

NotFound: 404 POST https://generativelanguage.googleapis.com/v1beta/models/chat-bison-001:generateMessage?%24alt=json%3Benum-encoding%3Dint: Requested entity was not found.

In [35]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_html_links(url, timeout=10):
    """Return the absolute URLs of all links on a page whose href ends with ``.html``.

    Args:
        url: Address of the page to scan; relative hrefs are resolved against it.
        timeout: Seconds to wait for the server before giving up (default 10).
            ``requests`` applies no timeout by default, so without one a stalled
            server would block the notebook indefinitely.

    Returns:
        List of absolute URLs in page order with duplicates removed. An empty
        list is returned when the request fails (the error is printed) or when
        the page has no matching links.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat 4xx/5xx responses as failures
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Only anchors that actually carry an href are candidates
    resolved = (
        urljoin(url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith('.html')
    )
    # dict.fromkeys drops repeated links while preserving first-seen order
    return list(dict.fromkeys(resolved))

# Scan an example Apple page and list every .html link found on it
url = 'https://www.apple.com/apple-events/event-stream/'
links = get_html_links(url)

# One link per line; nothing is printed when no links were found
if links:
    print("\n".join(links))


https://www.apple.com/legal/internet-services/terms/site.html


In [36]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# NOTE(review): this cell re-defines get_html_links, which the previous cell already defines;
# keep a single definition when the notebook is consolidated.
def get_html_links(url, timeout=10):
    """Return the absolute URLs of all links on a page whose href ends with ``.html``.

    Args:
        url: Address of the page to scan; relative hrefs are resolved against it.
        timeout: Seconds to wait for the server before giving up (default 10).
            ``requests`` applies no timeout by default, so without one a stalled
            server would block the notebook indefinitely.

    Returns:
        List of absolute URLs in page order with duplicates removed. An empty
        list is returned when the request fails (the error is printed) or when
        the page has no matching links.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat 4xx/5xx responses as failures
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Only anchors that actually carry an href are candidates
    resolved = (
        urljoin(url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith('.html')
    )
    # dict.fromkeys drops repeated links while preserving first-seen order
    return list(dict.fromkeys(resolved))

# Scan the Britannica Tesla article and list every .html link found on it
url = 'https://www.britannica.com/money/Tesla-Motors'
links = get_html_links(url)

# One link per line; nothing is printed when no links were found
if links:
    print("\n".join(links))


https://corporate.britannica.com/termsofuse.html
