In [37]:
import os
import pickle
import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms.base import LLM
from langchain.chains import RetrievalQAWithSourcesChain
import logging

In [38]:
genai.configure(api_key="AIzaSyCLgVGupysmVb9GrHoWUMmGYliNoefVbVg")
model = genai.GenerativeModel("gemini-1.5-flash")

In [39]:
from bs4 import BeautifulSoup
import requests
from langchain.schema import Document

# Function to fetch and parse the webpage
def fetch_and_parse(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

# URLs to scrape
urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html"
]

# Fetch and parse the content
data = [fetch_and_parse(url) for url in urls]

# Convert the data into a list of Document objects
documents = [Document(page_content=text) for text in data]

In [40]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,  # Adjust chunk size as needed
    chunk_overlap=20,  # Optional overlap to maintain context
    length_function=len,
)

In [41]:
docs = text_splitter.split_documents(documents)

In [42]:
print(f"Number of chunks: {len(docs)}")

Number of chunks: 280


In [43]:
docs

[Document(page_content='Wall Street rises as Tesla soars on AI optimism', metadata={}),
 Document(page_content='EnglishHindiGujaratiSpecialsSearch Quotes, News, Mutual Fund NAVsTrending StocksOla Electric', metadata={}),
 Document(page_content='StocksOla Electric INE0LXG01040, OLAELEC, 544225Kalyan Jeweller INE303R01014, KALYANKJIL, 543278ITC', metadata={}),
 Document(page_content='543278ITC Hotels INE379A01028, ITCHOTELS, 544325Suzlon Energy INE040H01021, SUZLON, 532667Reliance', metadata={}),
 Document(page_content='532667Reliance INE002A01018, RELIANCE, 500325QuotesMutual FundsCommoditiesFutures &', metadata={}),
 Document(page_content='& OptionsCurrencyNewsCryptocurrencyForumNoticesVideosGlossaryAll  Hello, Login Hello, LoginLog-inor', metadata={}),
 Document(page_content='LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistMy AlertsMy MessagesPrice', metadata={}),
 Document(page_content='MessagesPrice AlertsMy Profile My PROMy PortfolioMy WatchlistMy AlertsMy Message

In [44]:
docs[1]

Document(page_content='EnglishHindiGujaratiSpecialsSearch Quotes, News, Mutual Fund NAVsTrending StocksOla Electric', metadata={})

In [47]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [48]:
vectorindex_openai = FAISS.from_documents(docs, embeddings)

In [52]:
# Storing vector index create in local
file_path= "vector_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vectorindex_openai, f)

In [53]:
if os.path.exists(file_path):
    with open(file_path, "rb") as f:
        vectorIndex = pickle.load(f)

In [54]:
from langchain.schema import Document
# Securely configure the API key using an environment variable
os.environ["GOOGLE_API_KEY"] = "AIzaSyCLgVGupysmVb9GrHoWUMmGYliNoefVbVg"
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Custom GeminiLLM integrated with LangChain
class GeminiLLM(LLM):
    """Custom Gemini model wrapper for LangChain"""

    def _call(self, prompt: str, stop=None, run_manager=None, **kwargs) -> str:
        try:
            model = genai.GenerativeModel("gemini-1.5-flash")
            response = model.generate_content(prompt)
            logging.debug(f"Response: {response.text}")
            return response.text  # Ensure we return a plain string
        except Exception as e:
            logging.error(f"Error generating content: {e}")
            return "Error generating response"

    @property
    def _llm_type(self) -> str:
        return "Gemini"

# Instantiate the Gemini model
llm = GeminiLLM()

data = [
    "The Tiago iCNG is a fuel-efficient car priced around 7.5 lakh INR.",
    "The Tiago iCNG comes with features like dual airbags and ABS.",
    "It is available in multiple variants with different prices."
]

# Convert data into Document objects with metadata
documents = [
    Document(page_content=text, metadata={"source": f"doc_{i}"}) for i, text in enumerate(data)
]

# Initialize HuggingFace Embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create FAISS vector store
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

# Create and test retriever
retriever = vectorstore.as_retriever()

In [55]:
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever()  # Fixed variable name
)

In [56]:
chain



In [58]:
query = "What is the price of Tiago iCNG?"
response = chain({"question": query}, return_only_outputs=True)
print(response)

{'answer': 'The Tiago iCNG is priced around 7.5 lakh INR.  However, note that the provided text also mentions that it is available in multiple variants with different prices.\n', 'sources': 'doc_0, doc_2'}


In [59]:
import gc
gc.collect()

4413

In [60]:
import gc
gc.collect()

0

In [61]:
import gc
gc.collect()

0

In [62]:
import torch
torch.mps.empty_cache()

In [63]:
import gc
gc.collect()

0