In [255]:
import os
import json
import nltk
import logging
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms.base import LLM
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
import google.generativeai as genai
from google.oauth2 import service_account
from pydantic import BaseModel, Field
from rouge_score import rouge_scorer
import sacrebleu
from docx import Document as DocxDocument
from tqdm import tqdm
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [256]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [257]:
# Download required NLTK resources
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [258]:
# Path to the service account's JSON file
service_account_path = "adv-nlp-uts-faa7595a22eb.json"

# Create credentials using the service account JSON file
try:
    credentials = service_account.Credentials.from_service_account_file(service_account_path, scopes=["https://www.googleapis.com/auth/generative-language"])
except FileNotFoundError:
    logger.error(f"Service account file not found at {service_account_path}.")
    raise
except Exception as e:
    logger.error(f"Error creating credentials from the service account file: {e}")
    raise

# Configure the Gemini API client with the credentials
genai.configure(credentials=credentials)

In [259]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [260]:
# Define text preprocessing function with lemmatization
def preprocess_text(text):
    # 1. Strip whitespace
    text = text.strip()

    # 2. Tokenize the text
    tokens = nltk.word_tokenize(text)

    # 3. Remove stopwords and apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tqdm(tokens, desc="Processing tokens") if token.lower() not in stop_words]

    # 4. Join the tokens back into a string
    preprocessed_text = " ".join(lemmatized_tokens)

    return preprocessed_text

In [261]:
# Read .docx files from 'dataset/word_standards' folder
def read_docx_files(folder_path):
    documents = []
    for filename in tqdm(os.listdir(folder_path), desc="Reading .docx files"):
        if filename.endswith(".docx"):
            file_path = os.path.join(folder_path, filename)
            try:
                docx_doc = DocxDocument(file_path)
                full_text = []
                for para in docx_doc.paragraphs:
                    full_text.append(para.text)
                text = "\n".join(full_text)
                # Create a LangChain Document object with text and metadata
                langchain_doc = LangchainDocument(page_content=text, metadata={"source": filename})
                documents.append(langchain_doc)
            except Exception as e:
                logger.error(f"Error reading {file_path}: {e}")
    return documents

In [262]:
# Load and preprocess documents from the folder
folder_path = "../data/word_standards"
documents = read_docx_files(folder_path)

Reading .docx files:   0%|          | 0/91 [00:00<?, ?it/s]

Reading .docx files: 100%|██████████| 91/91 [00:05<00:00, 15.21it/s]


In [263]:
# Preprocess the text in each document
for doc in documents:
    doc.page_content = preprocess_text(doc.page_content)

Processing tokens: 100%|██████████| 6363/6363 [00:00<00:00, 348990.58it/s]
Processing tokens: 100%|██████████| 1783/1783 [00:00<00:00, 395559.30it/s]
Processing tokens: 100%|██████████| 2738/2738 [00:00<00:00, 271175.34it/s]
Processing tokens: 100%|██████████| 1775/1775 [00:00<00:00, 253589.81it/s]
Processing tokens: 100%|██████████| 3804/3804 [00:00<00:00, 341834.65it/s]
Processing tokens: 100%|██████████| 2582/2582 [00:00<00:00, 363156.60it/s]
Processing tokens: 100%|██████████| 2464/2464 [00:00<00:00, 404856.23it/s]
Processing tokens: 100%|██████████| 5387/5387 [00:00<00:00, 359142.24it/s]
Processing tokens: 100%|██████████| 6725/6725 [00:00<00:00, 417864.57it/s]
Processing tokens: 100%|██████████| 6145/6145 [00:00<00:00, 407590.70it/s]
Processing tokens: 100%|██████████| 2835/2835 [00:00<00:00, 350876.44it/s]
Processing tokens: 100%|██████████| 2313/2313 [00:00<00:00, 385788.57it/s]
Processing tokens: 100%|██████████| 2000/2000 [00:00<00:00, 329106.99it/s]
Processing tokens: 100%|█

In [264]:
# Initialize the text splitter with overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500,
    chunk_overlap=500,
    separators=["\n\n", "\n", " ", "", "\t", "\r\n", "\r", "\v", "\f", "\u0085", "\u2028", "\u2029"]
)

In [265]:
# Split the documents into chunks
split_docs = text_splitter.split_documents(documents)

logger.info(f"Total number of documents after splitting: {len(split_docs)}")

INFO:__main__:Total number of documents after splitting: 1022


In [266]:
# Initialize the embeddings model
embedding_model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Initialize FAISS vector store
vector_store = FAISS.from_documents(split_docs, embeddings)

# Save the vector store locally
vector_store.save_local("faiss_index")

# To load the vector store from disk
# vector_store = FAISS.load_local("faiss_index", embeddings)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [267]:
# Implement the Gemini LLM class
class GeminiLLM(LLM, BaseModel):
    model_name: str = Field(default="gemini-1.5-flash")
    temperature: float = Field(default=0.7)

    @property
    def _llm_type(self):
        return "gemini"

    def _call(self, prompt: str, stop: list[str] = None) -> str:
        try:
            # Initialize the model
            model = genai.GenerativeModel(model_name=self.model_name)

            # Generate content using the Gemini API
            response = model.generate_content(
                prompt,
                # temperature=self.temperature,
                # max_output_tokens=512  # Adjust token limit as needed
            )

            # Extract generated text from the response
            generated_text = response.text

            # Handle stop tokens if provided
            if stop:
                for token in stop:
                    generated_text = generated_text.split(token)[0]

            return generated_text.strip()

        except Exception as e:
            logger.error(f"Gemini API error: {e}")
            return "I'm sorry, but I couldn't process your request at this time."

In [268]:
# Initialize the Gemini LLM client
llm = GeminiLLM(model_name="gemini-1.5-flash", temperature=0.7)

# Define a prompt template
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are an AI assistant with professional expertise in financial regulations and banking statistics, particularly knowledgeable about Australian APRA guidelines.

Based on the provided context, please answer the following question in a clear, well detailed, and informative manner. Ensure your response directly addresses the query.

Context:
{context}

Question:
{question}

Answer:
""",
)

In [269]:
# Create a RetrievalQA chain with the custom prompt
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # You can experiment with 'refine' or 'map_reduce'
    retriever=vector_store.as_retriever(search_kwargs={"k": 100}),
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)

In [270]:
# Function to handle user queries
def answer_query(query):
    try:
        response = qa_chain({"query": query})
        answer = response["result"]
        source_docs = response["source_documents"]
        # print("Response:")
        print(answer)
        # print("\nRelevant Source Documents:")
        # for doc in source_docs:
        #     print(f"Source: {doc.metadata.get('source', 'Unknown Source')}")
        #     print(doc.page_content)
        #     print("-" * 80)
    except Exception as e:
        logger.error(f"Error during query processing: {e}")

In [271]:
query = "What is RWA?"
answer_query(query)

RWA stands for **Risk-Weighted Asset**. 

It is a key concept in banking regulations, particularly within the context of capital adequacy requirements. RWA is calculated by multiplying an institution's assets by a risk weight that reflects the level of risk associated with those assets. This process is used to determine the amount of capital that an institution needs to hold to cover potential losses.

**In the provided context, the RWA is calculated based on several factors:**

* **Total IRRBB Capital Requirement:** This refers to the capital required to cover the interest rate risk in the banking book (IRRBB). 
* **Diversification Benefit Amount:** This represents the reduction in capital requirements due to diversification of assets. 
* **Factor of 12.5:** This is a constant factor applied to the total capital charge to arrive at the risk-weighted equivalent amount.

**Here is a breakdown of how RWA is derived in the provided text:**

1. The total IRRBB capital requirement is derive

In [272]:
# Test the RAG system with a query
query = "What is the quality control on International Banking Statistics Balance Sheet Items?"
answer_query(query)

The quality control for International Banking Statistics (IBS) Balance Sheet Items (ARF 731.4) involves a combination of internal review and external audit. 

**Internal Review:** 

* **Process Control:** Australian-owned banks, as the reporting entities, are required to have developed and implemented a process control system that ensures the completeness and reliability of the information provided in their ARF 731.4 submissions. 
* **Authorisation:** An authorized officer from the bank is responsible for submitting the information using the "Direct APRA" application method and digitally signing the relevant information using a digital certificate accepted by APRA.

**External Audit:**

* **Annual Review and Testing:** The information provided in ARF 731.4 must be subject to review and testing by the bank's external auditor on an annual basis, or more frequently as required. 
* **Scope and Nature of Review:** The external auditor must conduct a review to form an opinion on the accuracy

In [273]:
query = "What are the key elements in the definition of Effective Maturity?"
answer_query(query)

The context you provided does not define "Effective Maturity." The context focuses on the Australian Prudential Regulation Authority (APRA) guidelines for various aspects of financial institutions, including capital adequacy, liquidity risk, and remuneration. It extensively discusses the calculation of risk weights, available stable funding (ASF), required stable funding (RSF), and other related metrics. 

However, the definition of "Effective Maturity" is not explicitly mentioned in the text. To answer your question, I need more information about the context in which you are referring to "Effective Maturity." 

Please provide:

* **The specific document or section** where you encountered the term "Effective Maturity."
* **The context in which the term is used.** 

With this additional information, I can help you understand the key elements of the "Effective Maturity" definition and provide a clear and detailed answer.


In [274]:
# # Evaluation functions (remain unchanged)
# def evaluate_rouge(predicted, reference):
#     scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
#     return scorer.score(reference, predicted)


# def evaluate_bleu(predicted, reference):
#     bleu = sacrebleu.corpus_bleu([predicted], [[reference]])
#     return bleu.score


# def evaluate_f1(predicted, reference):
#     predicted_tokens = nltk.word_tokenize(preprocess_text(predicted))
#     reference_tokens = nltk.word_tokenize(preprocess_text(reference))
#     common_tokens = set(predicted_tokens) & set(reference_tokens)

#     precision = len(common_tokens) / len(predicted_tokens) if predicted_tokens else 0
#     recall = len(common_tokens) / len(reference_tokens) if reference_tokens else 0

#     if precision + recall == 0:
#         return 0.0
#     return 2 * (precision * recall) / (precision + recall)


# def run_evaluation(file_path):
#     with open(file_path, "r") as f:
#         data = json.load(f)

#     rouge_scores, bleu_scores, f1_scores = [], [], []
#     chat_history = []

#     for entry in data:
#         question = entry["question"]
#         reference_answer = entry["answer"]
#         predicted_answer = answer_query(question, chat_history)

#         rouge = evaluate_rouge(predicted_answer, reference_answer)
#         bleu = evaluate_bleu(predicted_answer, reference_answer)
#         f1 = evaluate_f1(predicted_answer, reference_answer)

#         rouge_scores.append(rouge["rougeL"].fmeasure)
#         bleu_scores.append(bleu)
#         f1_scores.append(f1)

#         print(f"Q: {question}")
#         print(f"Predicted: {predicted_answer}")
#         print(f"Reference: {reference_answer}")
#         print(f"ROUGE-L: {rouge}")
#         print(f"BLEU: {bleu}")
#         print(f"F1: {f1}")
#         print("-" * 80)

#     print("\n=== Evaluation Summary ===")
#     print(f"Average ROUGE-L: {sum(rouge_scores) / len(rouge_scores):.4f}")
#     print(f"Average BLEU: {sum(bleu_scores) / len(bleu_scores):.2f}")
#     print(f"Average F1: {sum(f1_scores) / len(f1_scores):.4f}")

In [275]:
# run_evaluation("questions_answers.json")