In [1]:
from langchain_community.document_loaders import UnstructuredURLLoader
urls = ['https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html',
        'https://cleartax.in/s/budget-2024-highlights',
        'https://www.hindustantimes.com/budget',
        'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr']
loader = UnstructuredURLLoader(urls=urls)
data = loader.load()  

In [2]:
len(data)

4

In [4]:
#data

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)


print("Total number of documents: ",len(docs))

Total number of documents:  164


In [7]:
docs[7]

Document(metadata={'source': 'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html'}, page_content='“It was an intriguing trading session. Key events like the Budget often set the tone for future trends, but today, the closing did not show significant changes. The impact of such events can sometimes be seen in the following sessions, making the next few days crucial. Observing the daily chart, the market shows a defined range. The upper end, marked by a bearish engulfing pattern formed on Friday at 24850, serves as a key resistance, followed by the psychological level of 25000. On the downside, the panic low around the 20EMA at 24100 - 24000 acts as a strong support zone. In our previous outlook, we highlighted the fresh sell signal by the RSI Smoothened in the overbought zone, which remains a key observation for our firm cautious stance. However, recent observations indicat

In [8]:
# Embedding models: https://python.langchain.com/v0.1/docs/integrations/text_embedding/
# Let's load the Hugging Face Embedding class.  sentence_transformers
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

vector = embeddings.embed_query("hello, world!")
vector[:5]
#vector

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


[0.03492265194654465,
 0.01883004419505596,
 -0.017854733392596245,
 0.00013885056250728667,
 0.07407364994287491]

In [17]:
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())

In [18]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

retrieved_docs = retriever.invoke("Budget highlights")

In [19]:
len(retrieved_docs)

3

In [20]:
print(retrieved_docs[2].page_content)

25 Jul 2024, 08:43:33 AM IST

Action-Packed Budget Session Ends with Minimal Change, says Rajesh Bhosale, Equity Technical Analyst, Angel One

Rajesh Bhosale, Equity Technical Analyst, Angel One, said, “On the key Budget day, the Benchmark Index began on a positive note. However, as the day progressed, prices traded cautiously within a range, influenced by the Budget announcement. In the second half, volatility spiked, causing prices to drop sharply below 24100. Despite this, prices quickly rebounded, reclaiming lost ground and ending just below 24500 with a marginal loss of 0.14%."


In [21]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


#model_id = "meta-llama/Meta-Llama-3-8B"
model_id = "tiiuae/falcon-7b"

text_generation_pipeline = pipeline(
    "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, max_new_tokens=400, device=0)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [22]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [23]:
question = "2024 Budget Highlights"

In [24]:
rag_chain.invoke(question)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(metadata={\'source\': \'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html\'}, page_content=\'24 Jul 2024, 04:21:26 PM IST\\n\\nBudget 2024 Key Highlights Live Updates: Rajeev Tiwari, Co founder of STEMROBO Technologies, said\\n\\nBudget 2024 Key Highlights Live Updates: "The Union Budget 2024 focuses on the provisions for the education and startup sectors, reflecting an approach to boost economic growth and development. The budget allocation for education stands at ₹1.48 lakh crore, marking an increase aimed at enhancing educational infrastructure, skilling, and employment opportunities.\'), Document(metadata={\'source\': \'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-jul

In [26]:
question = "2024 Budget Highlights"
response = rag_chain.invoke(question)

# Making the response readable
response = response.replace("</s>", "").strip()
print("Response:", response)



Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Response: <|system|>
Answer the question based on your knowledge. Use the following context to help:

[Document(metadata={'source': 'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html'}, page_content='24 Jul 2024, 04:21:26 PM IST\n\nBudget 2024 Key Highlights Live Updates: Rajeev Tiwari, Co founder of STEMROBO Technologies, said\n\nBudget 2024 Key Highlights Live Updates: "The Union Budget 2024 focuses on the provisions for the education and startup sectors, reflecting an approach to boost economic growth and development. The budget allocation for education stands at ₹1.48 lakh crore, marking an increase aimed at enhancing educational infrastructure, skilling, and employment opportunities.'), Document(metadata={'source': 'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-1172165

In [33]:
question = "What is the Union Budget?"
response = rag_chain.invoke(question)

# Making the response readable
response = response.replace("</s>", "").strip()
print("Response:", response)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Response: <|system|>
Answer the question based on your knowledge. Use the following context to help:

[Document(metadata={'source': 'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html'}, page_content='48 min read . Updated: 25 Jul 2024, 08:43 AM IST\n\nLivemint, Written By Ankit Gohel, Anubhav Mukherjee, Shivangini\n\nBudget 2024 Key Highlights: Finance Minister Nirmala Sitharaman presented the Union Budget 2024-25 in the Lok Sabha today. FM listed out roadmap for nine priorities for India opportunity.\n\nPremium\n\nRead Full Story\n\nBudget 2024 Key Highlights: Finance Minister Nirmala Sitharaman announced the Union Budget 2024-25 in the parliament on July 23. The Budget focused on provisioning for the agricultural sector, the introduction of schemes related to employment, loan schemes, announcements for financial support to the MSME sector, infrastructural development, a

In [27]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


model_name = "EleutherAI/gpt-neo-1.3B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [28]:


prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)



In [29]:
llm_chain = prompt | llm | StrOutputParser()

In [30]:
from langchain_core.runnables import RunnablePassthrough
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [32]:
question = "What is the Union Budget?"

response = rag_chain.invoke(question)

# Making the response readable
response = response.replace("</s>", "").strip()
print("Response:", response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
