In [1]:
import os
import pickle
import time

import langchain
import streamlit as st
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.globals import set_debug
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline




In [2]:
# Checkpoint used for question answering; any other checkpoint suited to the
# task can be substituted here.
model_name = "deepset/roberta-base-squad2"

# Hugging Face pipeline that answers a question from a supplied context string.
qa_pipeline = pipeline(task="question-answering", model=model_name)

In [3]:
# Load the sentence-embedding model directly through sentence-transformers.
# NOTE(review): no later cell visible in this notebook reads `embedding_model`;
# the FAISS index is built from a separate HuggingFaceEmbeddings wrapper.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [4]:
# News articles that make up the knowledge base for this notebook.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]

# Loader configured with the article URLs; nothing is fetched until `.load()` is called.
loaders = UnstructuredURLLoader(urls=article_urls)

In [5]:
# Run the URL loader; `data` holds the documents it returns.
data = loaders.load()
# Display how many documents were loaded.
len(data)

2

In [6]:
# Recursive character splitter: chunks of up to 1000 characters, with 200
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [7]:
# Break the loaded documents into smaller chunks using the splitter configured above.
docs = text_splitter.split_documents(data)
# Display how many chunks were produced.
len(docs)

16

In [8]:
pip install langchain transformers sentence-transformers faiss-cpu


Note: you may need to restart the kernel to use updated packages.


In [9]:
# `FAISS` and `HuggingFaceEmbeddings` are already imported in the first cell.
# `HuggingFaceEmbeddings` is deliberately not re-imported from
# `langchain.embeddings` here: doing so would shadow the `langchain_huggingface`
# class imported at the top with the deprecated re-export.

# Plain text of every chunk produced by the splitter.
doc_texts = [doc.page_content for doc in docs]

# Embedding wrapper; it takes the model name as a string rather than a loaded
# model object.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the FAISS index from the chunk texts. Each chunk's metadata is passed
# along so that retrieved documents keep whatever the loader recorded about
# their origin instead of coming back with empty metadata.
vectorindex_faiss = FAISS.from_texts(
    doc_texts,
    embeddings,
    metadatas=[doc.metadata for doc in docs],
)


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [10]:
# Persist the freshly built vector index to a local pickle file.
# NOTE(review): LangChain's FAISS store also offers save_local/load_local as a
# pickle-free way to persist the index — consider switching both cells together.
file_path = "vector_index.pkl"
with open(file_path, mode="wb") as index_file:
    pickle.dump(vectorindex_faiss, index_file)

In [11]:
# Reload the persisted vector index from disk into `vectorIndex`.
# Fail loudly when the file is missing: silently skipping the load would leave
# `vectorIndex` undefined (or stale from an earlier run) and only surface later
# as a confusing NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"Vector index file {file_path!r} not found; run the cell that writes it first."
    )

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load index files that this notebook itself produced.
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [12]:
class HuggingFaceQAWithSourcesChain:
    """Minimal retrieve-then-answer chain built on a question-answering pipeline.

    For a given question the chain fetches documents from ``retriever``, joins
    their text into a single context string, and asks ``qa_pipeline`` to
    extract an answer from that context.
    """

    def __init__(self, retriever, qa_pipeline):
        """Store the collaborators used by ``__call__``.

        Args:
            retriever: Object exposing ``invoke(query)`` (preferred) or the
                legacy ``get_relevant_documents(query)``. It must return
                documents that have a ``page_content`` attribute.
            qa_pipeline: Callable accepting ``question=`` and ``context=``
                keyword arguments and returning a mapping with an
                ``"answer"`` key.
        """
        self.retriever = retriever
        self.qa_pipeline = qa_pipeline

    def _retrieve(self, question):
        """Return the retriever's documents for ``question`` as a list."""
        retriever = self.retriever
        # Prefer `invoke`; `get_relevant_documents` is the legacy entry point
        # and is kept only as a fallback for retrievers without `invoke`.
        if hasattr(retriever, "invoke"):
            return list(retriever.invoke(question))
        return list(retriever.get_relevant_documents(question))

    def __call__(self, inputs, return_only_outputs=False):
        """Answer ``inputs['question']`` from the retrieved documents.

        Args:
            inputs: Mapping with a ``'question'`` key.
            return_only_outputs: When true, return only ``{"answer": ...}``.

        Returns:
            ``{"answer": str}`` when ``return_only_outputs`` is true; otherwise
            ``{"answer": str, "context": str, "sources": list}``, where
            ``sources`` lists the distinct ``metadata["source"]`` values of the
            retrieved documents in retrieval order (empty when the documents
            carry no such metadata).

        Raises:
            KeyError: If ``inputs`` has no ``'question'`` key.
        """
        question = inputs['question']
        docs = self._retrieve(question)

        # Combine all retrieved document texts into one context string.
        context = " ".join(doc.page_content for doc in docs)

        if context.strip():
            answer = self.qa_pipeline(question=question, context=context)['answer']
        else:
            # Nothing was retrieved: skip the pipeline, which cannot answer
            # from an empty context, and report an empty answer instead.
            answer = ""

        if return_only_outputs:
            return {"answer": answer}

        sources = []
        for doc in docs:
            source = (getattr(doc, "metadata", None) or {}).get("source")
            if source and source not in sources:
                sources.append(source)
        return {"answer": answer, "context": context, "sources": sources}

# `vectorIndex` is the FAISS vector store loaded from disk; expose it through
# the retriever interface that the chain expects.
retriever = vectorIndex.as_retriever()

# Wire the retriever and the question-answering pipeline into the custom chain.
hf_chain = HuggingFaceQAWithSourcesChain(retriever, qa_pipeline)



In [13]:
# Example query
query = "what is the price of Tiago iCNG?"
# Alternative queries to try:
#query = "when did tata Motors launched the CNG variant of its micro SUV Punch priced between Rs 7.1 lakh and Rs 9.68 lakh?"
#query = "what are the main features of punch iCNG?"

# Turn on LangChain's global debug logging through the supported setter
# rather than by assigning to the legacy `langchain.debug` module attribute.
set_debug(True)

# Execute the chain; only the answer is returned because of return_only_outputs.
result = hf_chain({"question": query}, return_only_outputs=True)

# Display the result
print(result)

  docs = self.retriever.get_relevant_documents(inputs['question'])


{'answer': 'between Rs 6.55 lakh and Rs 8.1 lakh'}
