In [25]:
import json
import os
import pickle
import re

import faiss
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google import genai
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer



In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are concatenated in document order with no separator. A page whose
    ``extract_text()`` result is falsy (``None`` or ``""``) contributes an
    empty string.
    """
    pages = PdfReader(pdf_path).pages
    return "".join(page.extract_text() or "" for page in pages)


# Source documents: the COREP own-funds instructions and the PRA rulebook
# "Reporting (CRR)" part. Each is flattened to a single text string.
file_path1, file_path2 = (
    'documents/corep-own-funds-instructions.pdf',
    'documents/Reporting (CRR)_06-02-2026.pdf',
)
own_funds_instructions, reporting_crr = (
    extract_text_from_pdf(path) for path in (file_path1, file_path2)
)


In [6]:
# Character counts of the two extracted texts (COREP instructions, PRA rulebook).
print(len(own_funds_instructions), len(reporting_crr))

586822 83341


In [7]:
# Cut the COREP instructions into 1,200-character windows that start every
# 1,000 characters, so consecutive chunks share a 200-character overlap.
# The provisional chunk_id is the window's ordinal (start offset // 1000).
data_list = []
for i in range(0, len(own_funds_instructions), 1000):
    data_list.append({
        'chunk_id': f"corep_{i // 1000}",
        'text': own_funds_instructions[i:i + 1200],
        'source': 'COREP_Annex_II',
    })

data1 = pd.DataFrame(data_list)
def is_junk_chunk(text, min_len=200):
    """Decide whether a text chunk is too noisy to be worth indexing.

    A chunk is junk when any of the following holds:

    * it is not a string;
    * it is empty after stripping, or shorter than ``min_len`` characters;
    * more than 85% of its non-whitespace characters are not ASCII letters
      or digits;
    * it contains a run of 15 or more consecutive dots;
    * fewer than 15% of its characters are alphabetic.

    Args:
        text: The chunk to inspect.
        min_len: Minimum stripped length for a chunk to be kept.

    Returns:
        ``True`` if the chunk should be discarded, ``False`` otherwise.
    """
    if not isinstance(text, str):
        return True

    t = text.strip()

    # Empty text is junk whatever min_len is. Rejecting it here also keeps
    # the two ratio divisions below from dividing by zero when min_len <= 0.
    if not t or len(t) < min_len:
        return True

    no_space = re.sub(r"\s+", "", t)
    if not no_space:
        return True

    # mostly punctuation/dots
    non_alnum = len(re.sub(r"[A-Za-z0-9]", "", no_space))
    if non_alnum / len(no_space) > 0.85:
        return True

    # long dotted separator
    if re.search(r"\.{15,}", t):
        return True

    # very low alphabetic content
    alpha_chars = sum(c.isalpha() for c in t)
    if alpha_chars / len(t) < 0.15:
        return True

    return False


# Flag junk chunks once, report the row counts, and keep only the usable rows.
junk_mask = data1["text"].apply(is_junk_chunk)

print("Before:", len(data1))

data1_clean = data1.loc[~junk_mask].reset_index(drop=True)

print("After:", len(data1_clean))

# Keep the COREP rows and renumber them so that chunk ids are contiguous and
# zero-padded after the junk rows have been removed.
corep_df = (
    data1_clean.loc[data1_clean["source"] == "COREP_Annex_II"]
    .reset_index(drop=True)
    .copy()
)
corep_df["chunk_id"] = [f"corep_{n:04d}" for n in range(len(corep_df))]

# Merge back
data1 = pd.concat([corep_df], ignore_index=True)

# Check
data1



Before: 587
After: 562


Unnamed: 0,chunk_id,text,source
0,corep_0000,"erage; \n(b) group solvency, an overview of t...",COREP_Annex_II
1,corep_0001,rows and cells of the templates. Those numeri...,COREP_Annex_II
2,corep_0002,Abbreviations \n10. For the purposes of thi...,COREP_Annex_II
3,corep_0003,"s of certain types of undertakings, amending D...",COREP_Annex_II
4,corep_0004,\n11. The CA templates contain information abo...,COREP_Annex_II
...,...,...,...
557,corep_0557,"(row 0060), the part of NPEs secured by immov...",COREP_Annex_II
558,corep_0558,er Article 47c(6) CRR secured by immovable pro...,COREP_Annex_II
559,corep_0559,tion of the exposure as non-performing. \nEff...,COREP_Annex_II
560,corep_0560,"ints (a), (b), (c), (e) and (g) of Article 47c...",COREP_Annex_II


In [8]:
# Cut the PRA rulebook text into 1,200-character windows that start every
# 1,000 characters, so consecutive chunks share a 200-character overlap.
data_list_new = [
    {
        'chunk_id': f"pra_{start // 1000}",
        'text': reporting_crr[start:start + 1200],
        'source': 'PRA_RULEBOOK',
    }
    for start in range(0, len(reporting_crr), 1000)
]

data2 = pd.DataFrame(data_list_new)

# is_junk_chunk is already defined in the COREP chunking cell. The identical
# copy that used to be re-declared here only shadowed it, so the existing
# definition is reused instead.
data2_clean = data2.copy()
data2_clean["is_junk"] = data2_clean["text"].apply(is_junk_chunk)

print("Before:", len(data2_clean))

data2_clean = (
    data2_clean[~data2_clean["is_junk"]]
    .drop(columns=["is_junk"])
    .reset_index(drop=True)
)

print("After:", len(data2_clean))

# Renumber so that chunk ids are contiguous and zero-padded after filtering.
pra_df = data2_clean[data2_clean["source"] == "PRA_RULEBOOK"].copy().reset_index(drop=True)
pra_df["chunk_id"] = [f"pra_{n:04d}" for n in range(len(pra_df))]

# Merge back
data2 = pd.concat([pra_df], ignore_index=True)

# Check
data2



Before: 84
After: 84


Unnamed: 0,chunk_id,text,source
0,pra_0000,Prudential Regulation Authority Rulebook\nPart...,PRA_RULEBOOK
1,pra_0001,s\n1.1 This Part applies to:\n(a) a firm that ...,PRA_RULEBOOK
2,pra_0002,5 and annexes X and XI of\nChapter 6.\n31/12/...,PRA_RULEBOOK
3,pra_0003,s on a consolidated basis\n2.4 A CRR consolida...,PRA_RULEBOOK
4,pra_0004,'consolidation situation' is defined in Artic...,PRA_RULEBOOK
...,...,...,...
79,pra_0079,.246 [Deleted.]\n01/09/2022\n2.247 [Deleted.]0...,PRA_RULEBOOK
80,pra_0080,22\n6.257 [Note: Provision left blank]\n01/09/...,PRA_RULEBOOK
81,pra_0081,/09/2022\n6.269 Annex XVI Template F 32.04 can...,PRA_RULEBOOK
82,pra_0082,found here\nO .\n01/09/2022\n6.279 Annex XVII...,PRA_RULEBOOK


In [9]:
# Stack the COREP and PRA chunk tables into one frame with a fresh 0..n-1 index.
data_final = pd.concat([data1,data2],ignore_index=True)
data_final

Unnamed: 0,chunk_id,text,source
0,corep_0000,"erage; \n(b) group solvency, an overview of t...",COREP_Annex_II
1,corep_0001,rows and cells of the templates. Those numeri...,COREP_Annex_II
2,corep_0002,Abbreviations \n10. For the purposes of thi...,COREP_Annex_II
3,corep_0003,"s of certain types of undertakings, amending D...",COREP_Annex_II
4,corep_0004,\n11. The CA templates contain information abo...,COREP_Annex_II
...,...,...,...
641,pra_0079,.246 [Deleted.]\n01/09/2022\n2.247 [Deleted.]0...,PRA_RULEBOOK
642,pra_0080,22\n6.257 [Note: Provision left blank]\n01/09/...,PRA_RULEBOOK
643,pra_0081,/09/2022\n6.269 Annex XVI Template F 32.04 can...,PRA_RULEBOOK
644,pra_0082,found here\nO .\n01/09/2022\n6.279 Annex XVII...,PRA_RULEBOOK


In [None]:

# Placeholder phrases that mark a chunk as containing no usable rule text,
# and the capital-related terms a PRA chunk must mention to be kept.
bad_patterns = r"\[Deleted\]|\[ Deleted \]|Provision left blank|can be found here|\[Deleted\.\]"
keep_keywords = r"COREP|own funds|CET1|Tier 1|Tier 2|capital requirements|CRR"

# Step 1: drop every chunk (from either source) that contains a placeholder.
has_placeholder = data_final["text"].str.contains(
    bad_patterns, regex=True, flags=re.IGNORECASE
)
data_final = data_final.loc[~has_placeholder].copy().reset_index(drop=True)

# Step 2: keep all COREP chunks, but only the PRA chunks that mention one of
# the capital keywords (case-insensitive).
is_pra = data_final["source"] == "PRA_RULEBOOK"
mentions_capital = data_final["text"].str.contains(
    keep_keywords, regex=True, flags=re.IGNORECASE
)
pra_useful = data_final[is_pra & mentions_capital]
corep_all = data_final[data_final["source"] == "COREP_Annex_II"]

# ignore_index=True already yields a clean 0..n-1 index for the result.
data_final = pd.concat([corep_all, pra_useful], ignore_index=True)



In [11]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode every chunk with unit-length (normalised) output vectors, stored as
# float32 before they are added to the FAISS index in a later cell.
chunk_texts = data_final["text"].tolist()
embeddings = np.asarray(
    model.encode(chunk_texts, show_progress_bar=True, normalize_embeddings=True)
).astype("float32")

# One embedding row per chunk, stored alongside the text it came from.
data_final["embedding_text"] = list(embeddings)

Batches: 100%|██████████| 19/19 [00:01<00:00, 10.07it/s]


In [12]:
# Display the chunk table, which now carries one embedding vector per row.
data_final

Unnamed: 0,chunk_id,text,source,embedding_text
0,corep_0000,"erage; \n(b) group solvency, an overview of t...",COREP_Annex_II,"[-0.011413075, 0.050157834, -0.06911405, -0.02..."
1,corep_0001,rows and cells of the templates. Those numeri...,COREP_Annex_II,"[-0.05913836, 0.056985375, -0.046240084, -0.06..."
2,corep_0002,Abbreviations \n10. For the purposes of thi...,COREP_Annex_II,"[-0.05379025, 0.00073226163, 0.024320433, -0.0..."
3,corep_0003,"s of certain types of undertakings, amending D...",COREP_Annex_II,"[-0.048888933, 0.013849427, 0.017130297, -0.01..."
4,corep_0004,\n11. The CA templates contain information abo...,COREP_Annex_II,"[-0.013768392, 0.037880532, 0.020612482, -0.03..."
...,...,...,...,...
589,pra_0041,tional liquidity monitoring metrics specified ...,PRA_RULEBOOK,"[0.017277189, -0.051744733, -0.06506463, -0.01..."
590,pra_0042,to report information on asset encumbrance in ...,PRA_RULEBOOK,"[-0.02410535, 0.074167915, -0.041559182, -0.04..."
591,pra_0044,ial holding\ncompanies and UK parent mixed fin...,PRA_RULEBOOK,"[-0.004714869, -0.03567785, 0.015623904, -0.01..."
592,pra_0045,"book, the following shall apply with regard to...",PRA_RULEBOOK,"[-0.0073001846, 0.019241735, -0.05826596, 0.00..."


In [13]:
# Build a flat inner-product FAISS index over the chunk embeddings. The
# embeddings were encoded with normalize_embeddings=True, so the inner product
# of two vectors equals their cosine similarity.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

# Persist the index so the retrieval cell can reload it from disk.
faiss.write_index(index, "corep_faiss.index")


In [14]:
# One record per chunk, in the same row order as the embeddings that were
# added to the index, so position i here describes vector i in the index.
metadata = data_final[["chunk_id", "source", "text"]].to_dict(orient="records")

with open("corep_metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)


In [15]:
# Reload the index and chunk metadata written by the two cells above.
# NOTE(review): pickle.load executes arbitrary code from the file it reads;
# only load corep_metadata.pkl when it was produced by this notebook.
index = faiss.read_index("corep_faiss.index")
with open("corep_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)
def embed_query(query):
    """Embed a single query string for searching the chunk index.

    The chunk embeddings were encoded with ``normalize_embeddings=True`` and
    stored in an inner-product index, so the query vector is normalised the
    same way; otherwise the scores would not be comparable cosine
    similarities. This matches what ``retrieve_chunks`` does.

    Args:
        query: The query text.

    Returns:
        A float32 NumPy array holding the embedding of the one-element batch
        ``[query]``.
    """
    q_embedding = model.encode([query], normalize_embeddings=True)
    return np.array(q_embedding).astype("float32")
def retrieve_chunks(query, top_k=5):
    """Return the chunks most similar to ``query``, best match first.

    The query is embedded with normalised output (matching how the chunk
    embeddings were produced) and searched against the module-level FAISS
    ``index``; each hit is looked up by position in ``metadata``.

    Args:
        query: Free-text query.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with keys ``chunk_id``, ``source``, ``text`` and
        ``score`` (the inner-product similarity as a float). The list may be
        shorter than ``top_k`` when the index holds fewer vectors.
    """
    q_vec = model.encode([query], normalize_embeddings=True)
    q_vec = np.array(q_vec).astype("float32")

    distances, indices = index.search(q_vec, top_k)

    results = []
    for score, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with id -1 when fewer than top_k neighbours
        # exist. Indexing metadata with -1 would silently return the LAST
        # chunk, so padded slots are skipped.
        if idx < 0:
            continue

        chunk_data = metadata[idx]
        results.append({
            "chunk_id": chunk_data["chunk_id"],
            "source": chunk_data["source"],
            "text": chunk_data["text"],
            "score": float(score)
        })

    return results

# Example scenario and question used to smoke-test retrieval.
scenario = """
CET1 capital = 540 million GBP
AT1 capital = 100 million GBP
Tier 2 capital = 80 million GBP
Intangible assets deduction = 40 million GBP
Deferred tax assets deduction = 20 million GBP
"""

question = "How should Tier 1 capital be calculated and reported in COREP Own Funds template C 01.00?"

# The scenario and the question are embedded together as one retrieval query.
query = f"""
Scenario:
{scenario}

Question:
{question}
"""


retrieved = retrieve_chunks(query, top_k=5)

# Show the id, source and first 300 characters of each retrieved chunk.
for r in retrieved:
    print(r["chunk_id"], r["source"])
    print(r["text"][:300])
    print("-----")


corep_0103 COREP_Annex_II
 5.2 provides further details on the calculation of those grandfathered 
instruments which do not constitute state aid.   
18. Institutions shall report in the first four columns the adjustments to Common 
Equity Tier 1 capital, Additional Tier 1 capital a nd Tier 2 capital as well as the 
amount to
-----
corep_0073 COREP_Annex_II
the base of the 
threshold and multiplying the sum thus obtained by 10%.  
Effective from 1 January 20220210  10  17.65% CET1 threshold   
Article 48(1) CRR  
This item contains the 17.65% threshold for holdings in financial sector 
entities where an institution has a significant investment, and for
-----
corep_0139 COREP_Annex_II
NDS INCLUDED IN CONSOLIDATED OWN FUNDS  
Article 87 CRR  
0310  QUALIFYING TIER 1 INSTRUMENTS INCLUDED IN CONSOLIDATED TIER 1 
CAPITAL  
Article 85 CRR  
Effective from 1 January 20220320  MINORITY INTERESTS INCLUDED IN CONSOLIDATED COMMON  
EQUITY TIER 1 CAPITAL  
Article 84 CRR  
The amount to be 
-----
c

In [36]:
# Read the Gemini API key from the environment so that no credential is stored
# in the notebook. NOTE(review): the key that used to be hard-coded on this
# line has been exposed to everyone with access to the file and should be
# revoked and replaced.
_gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; export it before running this cell."
    )
client = genai.Client(api_key=_gemini_api_key)


# JSON skeleton that build_prompt appends verbatim to the prompt as the shape
# the model must return. Later cells read its "populated_cells" and
# "audit_log" lists and overwrite "validation_flags".
schema_template = """
{
  "template": "C 01.00",
  "currency": "GBP",
  "scenario_summary": "",
  "populated_cells": [
    {
      "row": "",
      "column": "",
      "item": "",
      "value": null,
      "unit": "GBP",
      "confidence": "",
      "source_chunk_ids": []
    }
  ],
  "validation_flags": [
    {
      "type": "missing_data|inconsistency|warning",
      "message": ""
    }
  ],
  "audit_log": [
    {
      "field": "",
      "value": null,
      "justification": "",
      "source_chunk_ids": []
    }
  ]
}
"""


def build_context(retrieved_chunks):
    """Render retrieved chunks as one context string for the prompt.

    Each chunk becomes a ``[chunk_id | source]`` header line followed by the
    chunk text and a blank line. An empty input yields an empty string.
    """
    return "".join(
        f"[{chunk['chunk_id']} | {chunk['source']}]\n" + chunk["text"] + "\n\n"
        for chunk in retrieved_chunks
    )


def build_prompt(question, scenario, retrieved_chunks):
    """Assemble the full instruction prompt sent to the model.

    The prompt consists of a fixed role/task/rules preamble, the scenario,
    the question, the retrieved chunks rendered by ``build_context``, and
    the module-level ``schema_template`` the answer must follow.
    """
    regulatory_context = build_context(retrieved_chunks)

    return f"""
You are a PRA COREP regulatory reporting assistant.

TASK:
Generate a structured COREP Own Funds reporting output for Template C 01.00.

RULES:
- Use ONLY the scenario and the retrieved regulatory context.
- Output MUST be valid JSON only (no markdown, no explanation).
- Every populated cell MUST include source_chunk_ids.
- If a value cannot be derived, set it to null and add a validation flag.
- Populate Tier 1 = CET1 + AT1 if values exist.

Scenario:
{scenario}

Question:
{question}

Retrieved regulatory context:
{regulatory_context}

Return JSON strictly following this schema:
{schema_template}
"""


def call_gemini(prompt, model="models/gemini-3-flash-preview"):
    """Send ``prompt`` to Gemini and return the response text.

    Args:
        prompt: The full prompt string.
        model: Model identifier passed to the API. Defaults to the preview
            model this notebook was developed against; pass another id to
            switch models without editing the function.

    Returns:
        The ``text`` attribute of the API response.
    """
    response = client.models.generate_content(
        model=model,
        contents=prompt
    )
    return response.text



def parse_json_output(llm_output):
    """Parse the model's reply as JSON, tolerating Markdown code fences.

    Any ```` ```json ```` / ```` ``` ```` fence markers are removed before
    parsing. A missing reply (``None``) is treated like an empty string.

    Args:
        llm_output: Raw text returned by the model, or ``None``.

    Returns:
        The decoded JSON value, or ``None`` if the text is not valid JSON
        (in which case the cleaned text is printed for inspection).
    """
    cleaned = re.sub(r"```json|```", "", llm_output or "").strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Catch only decode failures so that unrelated errors (and
        # KeyboardInterrupt) are not swallowed.
        print("❌ JSON Parsing Failed. Raw output below:\n")
        print(cleaned)
        return None


def generate_corep_json(question, scenario, retrieved_chunks):
    """Run the prompt → model → JSON pipeline for one question.

    Returns whatever ``parse_json_output`` produces for the model's reply,
    i.e. the decoded JSON or ``None`` when the reply could not be parsed.
    """
    return parse_json_output(
        call_gemini(build_prompt(question, scenario, retrieved_chunks))
    )


In [37]:
# End-to-end run: retrieve supporting chunks for the scenario + question, then
# ask the model for the structured C 01.00 output.
scenario = """
CET1 = 540 million GBP
AT1 = 100 million GBP
Tier 2 = 80 million GBP
Intangible assets deduction = 40 million GBP
Deferred tax assets deduction = 20 million GBP
"""

question = "How should Tier 1 capital and Total Own Funds be reported in COREP Own Funds template C 01.00?"

query = f"Scenario: {scenario}\nQuestion: {question}"
retrieved = retrieve_chunks(query, top_k=8)

# result_json is None when the model's reply could not be parsed as JSON.
result_json = generate_corep_json(question, scenario, retrieved)

print(result_json)


{'template': 'C 01.00', 'currency': 'GBP', 'scenario_summary': 'Reporting of Own Funds for Template C 01.00. CET1 base of 540M is adjusted for Intangible assets (40M) and Deferred tax assets (20M). Tier 1 is calculated by adding Additional Tier 1 (100M) to the net CET1. Total Own Funds includes Tier 2 (80M).', 'populated_cells': [{'row': '0010', 'column': '0010', 'item': 'COMMON EQUITY TIER 1 CAPITAL', 'value': 540000000, 'unit': 'GBP', 'confidence': 'High', 'source_chunk_ids': ['corep_0004', 'corep_0139']}, {'row': '0300', 'column': '0010', 'item': '(-) Intangible assets', 'value': 40000000, 'unit': 'GBP', 'confidence': 'High', 'source_chunk_ids': ['corep_0004', 'corep_0140']}, {'row': '0340', 'column': '0010', 'item': '(-) Deferred tax assets that rely on future profitability', 'value': 20000000, 'unit': 'GBP', 'confidence': 'High', 'source_chunk_ids': ['corep_0004', 'corep_0073']}, {'row': '0530', 'column': '0010', 'item': 'Common Equity Tier 1 capital', 'value': 480000000, 'unit': 

In [38]:
# Tabulate the populated template cells returned by the model, one row per cell.
df_report = pd.DataFrame(result_json["populated_cells"])
df_report

Unnamed: 0,row,column,item,value,unit,confidence,source_chunk_ids
0,10,10,COMMON EQUITY TIER 1 CAPITAL,540000000,GBP,High,"[corep_0004, corep_0139]"
1,300,10,(-) Intangible assets,40000000,GBP,High,"[corep_0004, corep_0140]"
2,340,10,(-) Deferred tax assets that rely on future pr...,20000000,GBP,High,"[corep_0004, corep_0073]"
3,530,10,Common Equity Tier 1 capital,480000000,GBP,High,[corep_0004]
4,540,10,ADDITIONAL TIER 1 CAPITAL,100000000,GBP,High,"[corep_0004, corep_0139]"
5,740,10,TIER 1 CAPITAL,580000000,GBP,High,"[corep_0004, corep_0103]"
6,750,10,TIER 2 CAPITAL,80000000,GBP,High,"[corep_0004, corep_0139]"
7,980,10,OWN FUNDS,660000000,GBP,High,[corep_0004]


In [39]:
def validate_corep_output(result_json):
    """Cross-check the capital totals in a generated C 01.00 result.

    Checks performed:

    1. Tier 1 (row 0740) equals net CET1 (row 0530) plus AT1 (row 0540).
    2. Own funds (row 0980) equals Tier 1 (row 0740) plus Tier 2 (row 0750).
    3. Each of those five rows is present and has a non-null value.

    Row ids are normalised before lookup, so ``530``, ``"530"`` and
    ``"0530"`` all refer to the same row.

    NOTE(review): the row ids used here are the ones the model emitted in
    this notebook's run; they have not been checked against the official
    C 01.00 row numbering. Confirm before relying on this.

    Args:
        result_json: The decoded model output (a dict with a
            ``populated_cells`` list), or ``None`` when parsing failed.

    Returns:
        A list of ``{"type": ..., "message": ...}`` flags; empty when every
        check passes.
    """
    def _row_key(row):
        # Models return row ids inconsistently (int, unpadded or padded
        # string); bring purely numeric ids to the 4-character padded form.
        label = str(row).strip()
        return label.zfill(4) if label.isdigit() else label

    flags = []

    # A failed parse yields None and a malformed reply may lack the list;
    # both are treated as "no cells" so every required row is reported
    # missing instead of the validator crashing.
    cells = (result_json or {}).get("populated_cells") or []

    # Convert populated_cells into dict by row for quick lookup
    row_map = {_row_key(cell.get("row")): cell.get("value") for cell in cells}

    cet1 = row_map.get("0530")   # net CET1
    at1 = row_map.get("0540")
    tier1 = row_map.get("0740")
    tier2 = row_map.get("0750")
    own_funds = row_map.get("0980")

    # Rule 1: Tier 1 = CET1 + AT1
    if cet1 is not None and at1 is not None and tier1 is not None:
        expected_tier1 = cet1 + at1
        if tier1 != expected_tier1:
            flags.append({
                "type": "inconsistency",
                "message": f"Tier 1 mismatch: expected {expected_tier1}, got {tier1}"
            })

    # Rule 2: Own Funds = Tier 1 + Tier 2
    if tier1 is not None and tier2 is not None and own_funds is not None:
        expected_own_funds = tier1 + tier2
        if own_funds != expected_own_funds:
            flags.append({
                "type": "inconsistency",
                "message": f"Own Funds mismatch: expected {expected_own_funds}, got {own_funds}"
            })

    # Rule 3: Missing required rows. A row that is present but null is also
    # reported, because the arithmetic rules above skip it silently.
    required_rows = ["0530", "0540", "0740", "0750", "0980"]
    for r in required_rows:
        if r not in row_map:
            flags.append({
                "type": "missing_data",
                "message": f"Missing required row {r}"
            })
        elif row_map[r] is None:
            flags.append({
                "type": "missing_data",
                "message": f"Required row {r} has no value"
            })

    return flags


In [44]:
# Run the rule-based checks and store their flags on the result. This
# assignment replaces whatever "validation_flags" the model returned.
validation_flags = validate_corep_output(result_json)

result_json["validation_flags"] = validation_flags

print(result_json["validation_flags"])


[]


In [45]:
# Print each audit-log entry returned by the model: the field, its value, the
# stated justification, and the chunk ids cited as sources.
for entry in result_json["audit_log"]:
    print("FIELD:", entry["field"])
    print("VALUE:", entry["value"])
    print("JUSTIFICATION:", entry["justification"])
    print("SOURCE CHUNKS:", entry["source_chunk_ids"])
    
    print("-----")


FIELD: Row 0530 (CET1)
VALUE: 480000000
JUSTIFICATION: Calculated as Gross CET1 (540M) minus Intangible assets (40M) and Deferred tax assets (20M).
SOURCE CHUNKS: ['corep_0004']
-----
FIELD: Row 0740 (Tier 1)
VALUE: 580000000
JUSTIFICATION: Sum of net Common Equity Tier 1 (480M) and Additional Tier 1 (100M) as per core rules.
SOURCE CHUNKS: ['corep_0103']
-----
FIELD: Row 0980 (Own Funds)
VALUE: 660000000
JUSTIFICATION: Sum of Tier 1 Capital (580M) and Tier 2 Capital (80M).
SOURCE CHUNKS: ['corep_0004']
-----


In [46]:
def attach_evidence(audit_log, retrieved_chunks, snippet_len=300):
    """Add the cited chunk text to each audit-log entry.

    For every entry, the ids in ``source_chunk_ids`` are looked up among the
    retrieved chunks and the first ``snippet_len`` characters of each match
    are stored under ``evidence_snippets``. Ids that were not retrieved are
    skipped. An entry with a missing or null ``source_chunk_ids`` gets an
    empty snippet list rather than raising.

    Args:
        audit_log: List of audit entries (dicts); modified in place.
        retrieved_chunks: Chunks as returned by retrieval, each with
            ``chunk_id`` and ``text`` keys.
        snippet_len: Maximum number of characters kept per snippet.

    Returns:
        The same ``audit_log`` list, with ``evidence_snippets`` set on every
        entry.
    """
    chunk_map = {c["chunk_id"]: c["text"] for c in retrieved_chunks}

    for entry in audit_log:
        # The entries come from model output, so the key may be absent or null.
        cited_ids = entry.get("source_chunk_ids") or []
        entry["evidence_snippets"] = [
            chunk_map[cid][:snippet_len] for cid in cited_ids if cid in chunk_map
        ]

    return audit_log
# Enrich the audit log with snippets from the chunks retrieved for this run.
result_json["audit_log"] = attach_evidence(result_json["audit_log"], retrieved)


In [47]:
# Pretty-print the audit log, now including the attached evidence snippets.
print(json.dumps(result_json["audit_log"], indent=4))

[
    {
        "field": "Row 0530 (CET1)",
        "value": 480000000,
        "justification": "Calculated as Gross CET1 (540M) minus Intangible assets (40M) and Deferred tax assets (20M).",
        "source_chunk_ids": [
            "corep_0004"
        ],
        "evidence_snippets": [
            "\n11. The CA templates contain information about Pillar 1 numerators (own funds, Tier \n1, Common Equity Tier 1), denominator (own funds requirements), and the \napplication of CRR and CRD transitional provisions and is structured in five \ntemplates:   \n(a) Template CA1 c ontains the amount of own fund"
        ]
    },
    {
        "field": "Row 0740 (Tier 1)",
        "value": 580000000,
        "justification": "Sum of net Common Equity Tier 1 (480M) and Additional Tier 1 (100M) as per core rules.",
        "source_chunk_ids": [
            "corep_0103"
        ],
        "evidence_snippets": [
            " 5.2 provides further details on the calculation of those grandfathered \nin