In [1]:
import os
import streamlit as st
import pickle
import time

from langchain_openai import OpenAI
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain

In [2]:
# Load the OpenAI API key from the environment (optionally populated from .env).
from dotenv import load_dotenv
import os

# Pull key/value pairs from a local .env file into the process environment.
load_dotenv()

# Fail fast when the key is missing OR blank. A bare `OPENAI_API_KEY=` line
# yields an empty string, which an `is None` test would let through and which
# would only surface later as an authentication error inside the OpenAI client.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")

# Re-export the validated key so downstream libraries that read it from
# os.environ see exactly the value checked above.
os.environ['OPENAI_API_KEY'] = api_key

In [3]:
# Completion-style OpenAI client: sampling temperature 0.9, responses capped
# at 500 tokens.
# NOTE: `llm` is rebound to a ChatOpenAI instance in the retrieval section
# further down, so this object is not the one the QA chain ends up using.
llm = OpenAI(
    max_tokens=500,
    temperature=0.9,
)

### (1) Load data

In [4]:
# Articles that serve as the knowledge base for this notebook.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]

# Fetch the pages and load them as Document objects.
loaders = UnstructuredURLLoader(urls=article_urls)
data = loaders.load()

# Show how many documents were loaded.
len(data)

2

### (2) Split data to create chunks

In [5]:
# Splitter configured with a chunk size of 1000 and an overlap of 200 between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# `data` already holds Document objects, so split_documents (rather than
# split_text) can be applied to it directly to obtain the chunks.
docs = text_splitter.split_documents(data)

In [6]:
# Number of chunks produced by the splitter.
len(docs)

17

In [7]:
# Inspect the first chunk (its metadata and page content).
docs[0]

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹50 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nNetwork 18\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nMoneycontrol\n\nGo PRO@₹1/dayPRO\n\nMoneycontrol PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsWall Street rises as Tesla soars on AI optimism\n\nTrending To

### (3) Create embeddings for these chunks and save them to FAISS index

In [8]:
# Embedding model (OpenAIEmbeddings with its default settings) used to
# vectorise the chunks.
embeddings = OpenAIEmbeddings()

# Embed the chunks and build the FAISS vector index from them.
vectorindex_openai = FAISS.from_documents(documents=docs, embedding=embeddings)

In [13]:
# Folder the FAISS index is persisted to. No visible cell assigns `index_path`,
# so on a fresh kernel (Restart & Run All) this cell raised NameError; define
# it here, right before its first use.
# NOTE(review): the folder name originally used is not recoverable from the
# notebook -- "faiss_index" is a placeholder; change it if an existing index
# lives elsewhere.
index_path = "faiss_index"

# Persist the in-memory index to disk so later sessions can reload it instead
# of re-embedding every chunk.
vectorindex_openai.save_local(index_path)

In [14]:
# Reload the persisted index from disk.
#
# SECURITY: allow_dangerous_deserialization=True permits unpickling of the
# stored index data. That is acceptable here only because the files were
# written by this notebook's own save_local call; never point index_path at
# files from an untrusted source.
#
# Start from the in-memory index so `loaded_index` is always bound. Otherwise a
# missing folder or a failed load would leave the name undefined and the
# retrieval cell below would die with an unrelated NameError.
loaded_index = vectorindex_openai
if os.path.exists(index_path):
    try:
        loaded_index = FAISS.load_local(
            folder_path=index_path,
            embeddings=embeddings,
            allow_dangerous_deserialization=True,  # Required for loading
        )
        print("FAISS index loaded successfully!")
    except Exception as e:
        # Report the problem and keep using the in-memory index bound above.
        print(f"Error loading index: {e}")
else:
    print("No saved index found - create a new one")

FAISS index loaded successfully!


### (4) Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [17]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI

# Chat model that answers the questions ("gpt-4-turbo" is an alternative model
# name). This rebinds `llm`, replacing the completion-style client created
# earlier in the notebook.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5, max_tokens=1000)

# Retriever over the loaded FAISS index: plain similarity search returning the
# 4 closest chunks ("mmr" is the alternative search type when diversity matters).
index_retriever = loaded_index.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

# QA chain with source tracking. "stuff" is the chain type used here;
# "map_reduce" or "refine" are the options for larger document sets.
# return_source_documents=True keeps the retrieved chunks in the result.
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=index_retriever,
    return_source_documents=True,
)

In [19]:
# Ask a sample question, then print the generated answer and the sources the
# chain attributes it to.
query = "what is the price of Tiago iCNG?"
result = qa_chain.invoke({"question": query})
print("Answer: {}".format(result["answer"]))
print("Sources: {}".format(result["sources"]))

Answer: The price of Tiago iCNG is between Rs 6.55 lakh and Rs 8.1 lakh.

Sources: https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html
