In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pypdf import PdfReader
import re
from sentence_transformers import SentenceTransformer
import faiss
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


def extract_text_from_pdf(pdf_path):
    """Extracts all text from a given PDF file."""
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        full_text += page.extract_text() or "" 
    return full_text


file_path1 = 'documents/corep-own-funds-instructions.pdf'
file_path2 = 'documents/Reporting (CRR)_06-02-2026.pdf'
own_funds_instructions = extract_text_from_pdf(file_path1)
reporting_crr = extract_text_from_pdf(file_path2)



In [3]:
print(len(own_funds_instructions), len(reporting_crr))

586822 83341


In [4]:
data_list = list()
i =0
while (i < len(own_funds_instructions)):
    row = {'chunk_id': 'corep_'+str(int(i/1000)),'text':own_funds_instructions[i:i+1200]	,'source':'COREP_Annex_II'}
    i=i+1000
    data_list.append(row)

data1=pd.DataFrame(data_list)
def is_junk_chunk(text, min_len=200):
    if not isinstance(text, str):
        return True

    t = text.strip()

    if len(t) < min_len:
        return True

    no_space = re.sub(r"\s+", "", t)

    # mostly punctuation/dots
    if len(re.sub(r"[A-Za-z0-9]", "", no_space)) / len(no_space) > 0.85:
        return True

    # long dotted separator
    if re.search(r"\.{15,}", t):
        return True

    # very low alphabetic content
    alpha_chars = sum(c.isalpha() for c in t)
    if alpha_chars / len(t) < 0.15:
        return True

    return False


data1_clean = data1.copy()
data1_clean["is_junk"] = data1_clean["text"].apply(is_junk_chunk)

print("Before:", len(data1_clean))

data1_clean = data1_clean[data1_clean["is_junk"] == False] \
    .drop(columns=["is_junk"]) \
    .reset_index(drop=True)

print("After:", len(data1_clean))
# Separate COREP and PRA
corep_df = data1_clean[data1_clean["source"] == "COREP_Annex_II"].copy().reset_index(drop=True)


# Reassign sequential chunk_ids
corep_df["chunk_id"] = [f"corep_{i:04d}" for i in range(len(corep_df))]


# Merge back
data1 = pd.concat([corep_df], ignore_index=True)

# Check
data1



Before: 587
After: 562


Unnamed: 0,chunk_id,text,source
0,corep_0000,"erage; \n(b) group solvency, an overview of t...",COREP_Annex_II
1,corep_0001,rows and cells of the templates. Those numeri...,COREP_Annex_II
2,corep_0002,Abbreviations \n10. For the purposes of thi...,COREP_Annex_II
3,corep_0003,"s of certain types of undertakings, amending D...",COREP_Annex_II
4,corep_0004,\n11. The CA templates contain information abo...,COREP_Annex_II
...,...,...,...
557,corep_0557,"(row 0060), the part of NPEs secured by immov...",COREP_Annex_II
558,corep_0558,er Article 47c(6) CRR secured by immovable pro...,COREP_Annex_II
559,corep_0559,tion of the exposure as non-performing. \nEff...,COREP_Annex_II
560,corep_0560,"ints (a), (b), (c), (e) and (g) of Article 47c...",COREP_Annex_II


In [5]:
data_list_new = list()
i =0
while (i < len(reporting_crr)):
    row = {'chunk_id': 'pra_'+str(int(i/1000)),'text':reporting_crr[i:i+1200]	,'source':'PRA_RULEBOOK'}
    i=i+1000
    data_list_new.append(row)

data2=pd.DataFrame(data_list_new)
def is_junk_chunk(text, min_len=200):
    if not isinstance(text, str):
        return True

    t = text.strip()

    if len(t) < min_len:
        return True

    no_space = re.sub(r"\s+", "", t)

    # mostly punctuation/dots
    if len(re.sub(r"[A-Za-z0-9]", "", no_space)) / len(no_space) > 0.85:
        return True

    # long dotted separator
    if re.search(r"\.{15,}", t):
        return True

    # very low alphabetic content
    alpha_chars = sum(c.isalpha() for c in t)
    if alpha_chars / len(t) < 0.15:
        return True

    return False


data2_clean = data2.copy()
data2_clean["is_junk"] = data2_clean["text"].apply(is_junk_chunk)

print("Before:", len(data2_clean))

data2_clean = data2_clean[data2_clean["is_junk"] == False] \
    .drop(columns=["is_junk"]) \
    .reset_index(drop=True)

print("After:", len(data2_clean))
pra_df   = data2_clean[data2_clean["source"] == "PRA_RULEBOOK"].copy().reset_index(drop=True)

pra_df["chunk_id"]   = [f"pra_{i:04d}" for i in range(len(pra_df))]

# Merge back
data2 = pd.concat([ pra_df], ignore_index=True)

# Check
data2



Before: 84
After: 84


Unnamed: 0,chunk_id,text,source
0,pra_0000,Prudential Regulation Authority Rulebook\nPart...,PRA_RULEBOOK
1,pra_0001,s\n1.1 This Part applies to:\n(a) a firm that ...,PRA_RULEBOOK
2,pra_0002,5 and annexes X and XI of\nChapter 6.\n31/12/...,PRA_RULEBOOK
3,pra_0003,s on a consolidated basis\n2.4 A CRR consolida...,PRA_RULEBOOK
4,pra_0004,'consolidation situation' is defined in Artic...,PRA_RULEBOOK
...,...,...,...
79,pra_0079,.246 [Deleted.]\n01/09/2022\n2.247 [Deleted.]0...,PRA_RULEBOOK
80,pra_0080,22\n6.257 [Note: Provision left blank]\n01/09/...,PRA_RULEBOOK
81,pra_0081,/09/2022\n6.269 Annex XVI Template F 32.04 can...,PRA_RULEBOOK
82,pra_0082,found here\nO .\n01/09/2022\n6.279 Annex XVII...,PRA_RULEBOOK


In [6]:
data_final = pd.concat([data1,data2],ignore_index=True)
data_final

Unnamed: 0,chunk_id,text,source
0,corep_0000,"erage; \n(b) group solvency, an overview of t...",COREP_Annex_II
1,corep_0001,rows and cells of the templates. Those numeri...,COREP_Annex_II
2,corep_0002,Abbreviations \n10. For the purposes of thi...,COREP_Annex_II
3,corep_0003,"s of certain types of undertakings, amending D...",COREP_Annex_II
4,corep_0004,\n11. The CA templates contain information abo...,COREP_Annex_II
...,...,...,...
641,pra_0079,.246 [Deleted.]\n01/09/2022\n2.247 [Deleted.]0...,PRA_RULEBOOK
642,pra_0080,22\n6.257 [Note: Provision left blank]\n01/09/...,PRA_RULEBOOK
643,pra_0081,/09/2022\n6.269 Annex XVI Template F 32.04 can...,PRA_RULEBOOK
644,pra_0082,found here\nO .\n01/09/2022\n6.279 Annex XVII...,PRA_RULEBOOK


In [7]:
import re

bad_patterns = r"\[Deleted\]|\[ Deleted \]|Provision left blank|can be found here|\[Deleted\.\]"

data_final = data_final[
    ~data_final["text"].str.contains(bad_patterns, regex=True, flags=re.IGNORECASE)
].copy()

data_final = data_final.reset_index(drop=True)
keep_keywords = r"COREP|own funds|CET1|Tier 1|Tier 2|capital requirements|CRR"

pra_useful = data_final[
    (data_final["source"] == "PRA_RULEBOOK") &
    (data_final["text"].str.contains(keep_keywords, regex=True, flags=re.IGNORECASE))
]

corep_all = data_final[data_final["source"] == "COREP_Annex_II"]

data_final = pd.concat([corep_all, pra_useful], ignore_index=True)
data_final = data_final.reset_index(drop=True)



In [8]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

embeddings = model.encode(
    data_final["text"].tolist(),
    show_progress_bar=True
)

embeddings = np.array(embeddings).astype("float32")
data_final["embedding_text"] = list(embeddings)

# for i in data_final['text']:
#     data_final['text'] = model.encode(i)

Batches: 100%|██████████| 19/19 [00:02<00:00,  9.12it/s]


In [9]:
data_final

Unnamed: 0,chunk_id,text,source,embedding_text
0,corep_0000,"erage; \n(b) group solvency, an overview of t...",COREP_Annex_II,"[-0.011413075, 0.050157834, -0.06911405, -0.02..."
1,corep_0001,rows and cells of the templates. Those numeri...,COREP_Annex_II,"[-0.05913836, 0.056985375, -0.046240084, -0.06..."
2,corep_0002,Abbreviations \n10. For the purposes of thi...,COREP_Annex_II,"[-0.053790245, 0.0007322616, 0.024320431, -0.0..."
3,corep_0003,"s of certain types of undertakings, amending D...",COREP_Annex_II,"[-0.048888933, 0.013849427, 0.017130297, -0.01..."
4,corep_0004,\n11. The CA templates contain information abo...,COREP_Annex_II,"[-0.013768392, 0.037880532, 0.020612482, -0.03..."
...,...,...,...,...
589,pra_0041,tional liquidity monitoring metrics specified ...,PRA_RULEBOOK,"[0.017277189, -0.051744733, -0.06506463, -0.01..."
590,pra_0042,to report information on asset encumbrance in ...,PRA_RULEBOOK,"[-0.02410535, 0.074167915, -0.041559182, -0.04..."
591,pra_0044,ial holding\ncompanies and UK parent mixed fin...,PRA_RULEBOOK,"[-0.0047148685, -0.035677847, 0.015623903, -0...."
592,pra_0045,"book, the following shall apply with regard to...",PRA_RULEBOOK,"[-0.0073001846, 0.019241735, -0.05826596, 0.00..."


In [None]:
dimension = embeddings.shape[1]  
index = faiss.IndexFlatL2(dimension)
faiss.write_index(index, "corep_faiss.index")

In [None]:
metadata = data_final[["chunk_id", "source", "text"]].to_dict(orient="records")

with open("corep_metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)


In [None]:
index = faiss.read_index("corep_faiss.index")
with open("corep_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)
def embed_query(query):
    q_embedding = model.encode([query])
    return np.array(q_embedding).astype("float32")
def retrieve_chunks(query, top_k=5):
    q_vec = embed_query(query)

    distances, indices = index.search(q_vec, top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        chunk_data = metadata[idx]
        results.append({
            "chunk_id": chunk_data["chunk_id"],
            "source": chunk_data["source"],
            "text": chunk_data["text"],
            "distance": float(dist)
        })

    return results
query = "How is Tier 1 capital calculated in COREP own funds?"
retrieved = retrieve_chunks(query, top_k=5)

for r in retrieved:
    print(r["chunk_id"], r["source"])
    print(r["text"][:300])
    print("-----")


pra_0046 PRA_RULEBOOK
tutions shall submit the information referred to in this Chapter 5 of this Reporting
(CRR) Part of the PRA Rulebook in the data exchange formats and representations
specified by the competent authorities and respecting the data point definition of the
data point model referred to in Annex XIV and th
-----
pra_0046 PRA_RULEBOOK
tutions shall submit the information referred to in this Chapter 5 of this Reporting
(CRR) Part of the PRA Rulebook in the data exchange formats and representations
specified by the competent authorities and respecting the data point definition of the
data point model referred to in Annex XIV and th
-----
pra_0046 PRA_RULEBOOK
tutions shall submit the information referred to in this Chapter 5 of this Reporting
(CRR) Part of the PRA Rulebook in the data exchange formats and representations
specified by the competent authorities and respecting the data point definition of the
data point model referred to in Annex XIV and th
-----
pra_0046 PRA_

In [13]:
data_final

Unnamed: 0,chunk_id,text,source,embedding_text
0,corep_0000,"erage; \n(b) group solvency, an overview of t...",COREP_Annex_II,"[-0.011413075, 0.050157834, -0.06911405, -0.02..."
1,corep_0001,rows and cells of the templates. Those numeri...,COREP_Annex_II,"[-0.05913836, 0.056985375, -0.046240084, -0.06..."
2,corep_0002,Abbreviations \n10. For the purposes of thi...,COREP_Annex_II,"[-0.053790245, 0.0007322616, 0.024320431, -0.0..."
3,corep_0003,"s of certain types of undertakings, amending D...",COREP_Annex_II,"[-0.048888933, 0.013849427, 0.017130297, -0.01..."
4,corep_0004,\n11. The CA templates contain information abo...,COREP_Annex_II,"[-0.013768392, 0.037880532, 0.020612482, -0.03..."
...,...,...,...,...
589,pra_0041,tional liquidity monitoring metrics specified ...,PRA_RULEBOOK,"[0.017277189, -0.051744733, -0.06506463, -0.01..."
590,pra_0042,to report information on asset encumbrance in ...,PRA_RULEBOOK,"[-0.02410535, 0.074167915, -0.041559182, -0.04..."
591,pra_0044,ial holding\ncompanies and UK parent mixed fin...,PRA_RULEBOOK,"[-0.0047148685, -0.035677847, 0.015623903, -0...."
592,pra_0045,"book, the following shall apply with regard to...",PRA_RULEBOOK,"[-0.0073001846, 0.019241735, -0.05826596, 0.00..."
