In [5]:
from bs4 import BeautifulSoup
import pandas as pd
import os

# Column-oriented accumulator: one title and one text entry per parsed page.
data = {'title': [], 'text': []}

# Local folder holding the downloaded notification pages (*.html).
data_folder = "/Users/amankothari/Downloads/toc_notifications_2023_1991/notification_1991_2023"

# os.listdir() returns entries in arbitrary order; sorting the names makes the
# row order of the resulting frame (and any positional slice taken from it
# later) reproducible from run to run.
for file_name in sorted(os.listdir(data_folder)):
    if not file_name.endswith(".html"):
        continue

    file_path = os.path.join(data_folder, file_name)
    # BeautifulSoup consumes the whole file when constructed, so the handle
    # can be closed before the parsed tree is queried.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # The title is read from the second <td class="tableheader"> cell; pages
    # that have fewer than two such cells get a placeholder title.
    headers = soup.find_all('td', class_='tableheader')
    if len(headers) > 1:
        title = headers[1].get_text(strip=True)
    else:
        title = "Notification Details"

    # Body text: the text of every <p> element, joined with single spaces.
    text = ' '.join(p.text for p in soup.find_all('p'))

    data['title'].append(title)
    data['text'].append(text)

df = pd.DataFrame(data)

print(df)


                                                   title  \
0                       GOI Notification - 5.48 per cent   
1                             Marginal Standing Facility   
2      Tender for "7.49 percent Government Stock, 201...   
3      Foreign Exchange Management (Transfer or Issue...   
4        Auction of Government of India Dated Securities   
...                                                  ...   
12245                                 RBI - Notification   
12246  RRBs - Master Circular on Priority Sector Lending   
12247                              PM Vishwakarma Scheme   
12248  List of Terrorist individuals / organizations ...   
12249  Auction for Sale (Re-issue ) of ‘7.35 per cent...   

                                                    text  
0      GOVERNMENT OF INDIA MINISTRY OF FINANCE (Depar...  
1      RBI/2018-2019/161    FMOD.MAOG. No.131/01.18.0...  
2       Annexure I   For office use only  Regn no.   ...  
3      RESERVE BANK OF INDIA\n          FOR

In [7]:
# Preview the first five parsed notifications.
df.head(n=5)

Unnamed: 0,title,text
0,GOI Notification - 5.48 per cent,GOVERNMENT OF INDIA MINISTRY OF FINANCE (Depar...
1,Marginal Standing Facility,RBI/2018-2019/161 FMOD.MAOG. No.131/01.18.0...
2,"Tender for ""7.49 percent Government Stock, 201...",Annexure I For office use only Regn no. ...
3,Foreign Exchange Management (Transfer or Issue...,RESERVE BANK OF INDIA\n FOREIGN EXCHA...
4,Auction of Government of India Dated Securities,Ref.No.IDMC. 4144 /08.02.30.01/2002-03 April 1...


In [8]:
# Give the columns descriptive headers. The renamed frame is bound back to `df`
# instead of mutating the existing object in place (pandas discourages
# inplace=True; it brings no benefit and prevents method chaining).
df = df.rename(
    columns={'title': 'RBI Notification Title', 'text': 'RBI Notification Text'}
)


In [18]:
# Keep only rows whose title is not the 'Notification Details' placeholder.
has_real_title = df['RBI Notification Title'].ne('Notification Details')
df_filtered = df.loc[has_real_title]

In [33]:
# Independent copy of the first 500 filtered rows, used as a small working set.
df_small = df_filtered.head(500).copy()

In [34]:
# to_csv does not create missing parent folders, so make sure 'data' exists;
# this lets the cell run on a fresh checkout as well.
os.makedirs('data', exist_ok=True)
df_small.to_csv('data/notifications_data_small.csv', index=False)

In [35]:
# Index the contents of the local 'data' folder and create a query engine that
# answers with the OpenAI model configured in the global Settings.
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, download_loader
from llama_index.llms.openai import OpenAI

# Global configuration: answering model and chunking parameters.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
Settings.chunk_size = 1024
Settings.chunk_overlap = 20

# Load every file found in 'data', then build the vector index from it
# (with progress bars shown while it runs).
documents = SimpleDirectoryReader('data').load_data()
index = VectorStoreIndex.from_documents(documents, show_progress=True)

query_engine_service_context = index.as_query_engine(llm=Settings.llm)

Parsing nodes: 100%|██████████| 1/1 [00:03<00:00,  3.54s/it]
Generating embeddings: 100%|██████████| 1275/1275 [00:45<00:00, 27.94it/s]


In [36]:
# Sanity check: ask the query engine for a high-level description of the data.
response = query_engine_service_context.query(
    'What is this data about?'
)
print(response)

The data provided discusses various aspects related to the banking sector, including the disclosure of information by banks, operational procedures for old and incapacitated persons, surveys of business expectations, lead indicators of the services sector activity, and credit extended by scheduled commercial banks.


In [37]:
# Topic-specific question against the same query engine.
response = query_engine_service_context.query(
    'Tell me about the Marginal Standing Facility'
)
print(response)

The Marginal Standing Facility (MSF) rate has been adjusted to 6.25 per cent with immediate effect. All other terms and conditions of the extant MSF scheme will remain unchanged.


In [41]:
# Write the index's storage context to disk.
index.storage_context.persist()
# Author's notes on the result: creates a storage folder containing .json files
# vector_store : embeddings computed for each chunk
# docstore : different chunks from documents
# index_store: hash address for different chunks, determines which embeddings belong to which chunk

# Uncomment the following lines to reload the index from a specific storage
# folder (the import path matches the llama_index.core package used by the
# other cells of this notebook):
# from llama_index.core import StorageContext, load_index_from_storage

# storage_context = StorageContext.from_defaults(persist_dir = './storage')
# index = load_index_from_storage(storage_context=storage_context)

In [43]:
# Provide a custom question-answering prompt (persona + instruction to answer
# in detail) to improve the quality of the responses.
# PromptTemplate is the current class name; `Prompt` is only kept by
# llama_index as a backwards-compatibility alias for it.
from llama_index.core import PromptTemplate

# {context_str} receives the retrieved chunks and {query_str} the user question.
template = (
    "You are a RBI representative. We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this information, please answer the question and provide detailed responses: {query_str}\n"
)
qa_template = PromptTemplate(template)

# Query engine over the same index that uses the custom QA prompt.
query_engine_service_context_prompt = index.as_query_engine(text_qa_template=qa_template)

In [44]:
# Ask a domain question through the query engine that uses the custom prompt.
response = query_engine_service_context_prompt.query(
    "Tell me about the detection and impounding of counterfeit notes"
)
print(response)

The detection and impounding of counterfeit notes is a crucial process to prevent fake currency from circulating in the economy. The Reserve Bank of India (RBI) has issued guidelines and instructions to banks and financial institutions on how to detect and impound counterfeit notes effectively.

According to the Master Circular on Detection and Impounding of Counterfeit Notes, counterfeit notes can be impounded by all branches of Public Sector Banks, Private Sector Banks, Foreign Banks, Co-operative Banks, Regional Rural Banks, Treasuries, Sub-Treasuries, and Reserve Bank Issue Offices.

When a counterfeit note is detected, it should be stamped with a "COUNTERFEIT BANKNOTE" stamp and recorded in a register under authentication. An acknowledgement receipt should be issued to the tenderer, and the impounded note should be forwarded to the local police authorities for investigation by filing a First Information Report (FIR).

Banks are also advised to ensure that banknotes in denomination

In [None]:
# Evaluating responses using GPT-4 (work in progress: still working on an error).
from llama_index.core.evaluation import FaithfulnessEvaluator

# GPT-4 at temperature 0.0 is the model handed to the evaluator.
llm_gpt4 = OpenAI(model="gpt-4", temperature=0.0)
evaluator = FaithfulnessEvaluator(llm=llm_gpt4)

# Answer the question with a default query engine built on the index, then
# pass that response to the evaluator.
query_engine_for_eval = index.as_query_engine()
response = query_engine_for_eval.query("Tell me about the detection and impounding of counterfeit notes")
eval_result = evaluator.evaluate_response(response=response)

# print() applies str() itself, so no explicit conversion is needed.
print(eval_result.passing)