# Information extraction from Web URL using FAISS Vector Database

Import all necessary libraries

In [2]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores.faiss import FAISS
from langchain.prompts import PromptTemplate 
from langchain.chains import LLMChain
from dotenv import load_dotenv
import os

Load all the environmental variables

In [3]:
load_dotenv()

True

Load the data from Web URL

In [4]:
# Define Web URL
URL = ["https://ibighit.com/bts/eng/profile/","https://www.geeksforgeeks.org/"]
# Load the data
data = WebBaseLoader(URL)

Extract the content

In [5]:
content = data.load()

Convert the content into chunks

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 256,chunk_overlap = 50)
chunking = text_splitter.split_documents(content)

Convert these chunks into embedding

In [7]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key = os.environ["HUGGINGFACEHUB_API_TOKEN"], model_name = "BAAI/bge-base-en-v1.5")

Define the vector database

In [10]:
vector_store =  FAISS.from_documents(chunking,embeddings)

Retrieval the right content

In [11]:
# Retrieval
def retrieve_content(query):
    docs_rel = vector_store.similarity_search(query,k=3)
    docs_page_content = " ".join([d.page_content for d in docs_rel])
    return docs_page_content

Augmentation

In [12]:
prompt = PromptTemplate(
        input_variables = ["question","docs"], template="""
        You are an AI Assistant that follows instructions extremely well.
        Please be truthful and give direct answers. Please tell 'I don't know' if user query is not in content
        
        Answer the following question: \n{question}\n\n
        By searching the following transcript:\n{docs}.
        \nAnswer:"""
    )

Read the LLM model

In [13]:
model = HuggingFaceHub(repo_id = "HuggingFaceH4/zephyr-7b-alpha", model_kwargs = {"temperature":0.5, "max_new_tokens":512, "max_length":64})

  warn_deprecated(
  from .autonotebook import tqdm as notebook_tqdm


Create a application around the LLM model

In [14]:
chain = LLMChain(llm=model,prompt=prompt)

Shoot your question

In [19]:
query = "What is React.js"
docs_page_content = retrieve_content(query)
docs_page_content

'Trending NowDSAWeb TechFoundational CoursesData SciencePractice ProblemPythonMachine LearningJavaScriptSystem DesignDjangoDevOps TutorialJavaC C++ReactJSNodeJSWeb DesignWeb BrowserCP LiveAptitudePuzzlesProjects \n\n\n\n\n▲ QuestionsJavaScript Cheat SheetDSA using JavaScriptFree JavaScript CourseJavaScript A to Z Complete GuideReactJSReactJS TutorialFree ReactJS CourseReactJS FrameworksNextJSReact Material UIReact BootstrapReact SuiteAnt DesignReactJS UIReact BootstrapReact SuiteAnt DesignReactJS ReactstrapBlueprintJSNode.jsNode.js TutorialExpress.jsPHPPHP TutorialPHP Programming ExamplesAngularJSAngularJS TutorialAngularJS Cheat SheetAngularJS FrameworksAngular PrimeNGAngular ngx BootstrapjQueryjQuery'

In [20]:
result=chain.run(question = query, docs = docs_page_content)
result

"\n        You are an AI Assistant that follows instructions extremely well.\n        Please be truthful and give direct answers. Please tell 'I don't know' if user query is not in content\n        \n        Answer the following question: \nWhat is React.js\n\n\n        By searching the following transcript:\nTrending NowDSAWeb TechFoundational CoursesData SciencePractice ProblemPythonMachine LearningJavaScriptSystem DesignDjangoDevOps TutorialJavaC C++ReactJSNodeJSWeb DesignWeb BrowserCP LiveAptitudePuzzlesProjects \n\n\n\n\n▲ QuestionsJavaScript Cheat SheetDSA using JavaScriptFree JavaScript CourseJavaScript A to Z Complete GuideReactJSReactJS TutorialFree ReactJS CourseReactJS FrameworksNextJSReact Material UIReact BootstrapReact SuiteAnt DesignReactJS UIReact BootstrapReact SuiteAnt DesignReactJS ReactstrapBlueprintJSNode.jsNode.js TutorialExpress.jsPHPPHP TutorialPHP Programming ExamplesAngularJSAngularJS TutorialAngularJS Cheat SheetAngularJS FrameworksAngular PrimeNGAngular ngx 

Let's try to ask a question which is out of context

In [21]:
query = "Where does vishali live?"
docs_page_content = retrieve_content(query)
docs_page_content

'▲\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n                     A-143, 9th Floor, Sovereign Corporate Tower, Sector-136, Noida, Uttar Pradesh - 201305 CompanyAbout UsLegalCareersIn MediaContact UsAdvertise with usGFG Corporate SolutionPlacement Training ProgramExploreJob-A-Thon Hiring ChallengeHack-A-ThonGfG Weekly ContestOffline Classes (Delhi/NCR)DSA in JAVA/C++Master System DesignMaster AwarenessUPSC Study MaterialGeography NotesHistory NotesModern Indian History NotesMedieval Indian History NotesAncient Indian History NotesComplete History NotesScience & Tech. NotesEthics NotesPolity NotesEconomics NotesGovernment Schemes (Updated)UPSC'

In [22]:
result=chain.run(question = query, docs = docs_page_content)
result

"\n        You are an AI Assistant that follows instructions extremely well.\n        Please be truthful and give direct answers. Please tell 'I don't know' if user query is not in content\n        \n        Answer the following question: \nWhere does vishali live?\n\n\n        By searching the following transcript:\n▲\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n                     A-143, 9th Floor, Sovereign Corporate Tower, Sector-136, Noida, Uttar Pradesh - 201305 CompanyAbout UsLegalCareersIn MediaContact UsAdvertise with usGFG Corporate SolutionPlacement Training ProgramExploreJob-A-Thon Hiring ChallengeHack-A-ThonGfG Weekly ContestOffline Classes (Delhi/NCR)DSA in JAVA/C++Master System DesignMaster AwarenessUPSC Study MaterialGeography NotesHistory NotesModern Indian History NotesMedieval Indian History NotesAncient Indian History NotesComplete History NotesScience & Tech. NotesEthics NotesPolity NotesEconomics Notes