## KPIs

In [1]:
import pandas as pd
# Load the three raw extracts used throughout the KPI section.
customers = pd.read_csv("data/raw/customers.csv")
loans = pd.read_csv("data/raw/loan_applications.csv")
repayments = pd.read_csv("data/raw/repayments.csv")

# Left joins keep every loan application even when it has no matching
# customer record or repayment history.
df = (
    loans
    .merge(customers, on="customer_id", how="left")
    .merge(repayments, left_on="application_id", right_on="loan_id", how="left")
)

# The repayments join can emit several rows for one application (one per
# matching repayment record). Headline application KPIs are therefore computed
# on one row per application_id so they are not weighted by repayment count.
# NOTE(review): assumes application_stage is constant per application_id — confirm.
applications = df.drop_duplicates(subset="application_id")

print("Total applications:", len(applications))
print("Approval rate:", (applications["application_stage"] == "Approved").mean())



Total applications: 8
Approval rate: 0.625


In [2]:
def calculate_kpis(df):
    """Compute headline application KPIs for a merged applications frame.

    Args:
        df: DataFrame with an ``application_stage`` column and a
            ``days_past_due`` column.

    Returns:
        dict with three entries:
        ``approval_rate``  - share of rows whose stage equals "Approved",
        ``rejection_rate`` - share of rows whose stage equals "Rejected",
        ``avg_dpd``        - mean days past due, missing values counted as 0.
    """
    stage = df["application_stage"]
    approved_share = stage.eq("Approved").mean()
    rejected_share = stage.eq("Rejected").mean()

    # Rows with no days_past_due value are treated as 0 days past due.
    dpd_filled = df["days_past_due"].fillna(0)

    return {
        "approval_rate": approved_share,
        "rejection_rate": rejected_share,
        "avg_dpd": dpd_filled.mean(),
    }


In [3]:
def risk_kpis(df):
    """Compute portfolio delinquency KPIs.

    The frame may hold several rows per loan (for example one per repayment
    record), so the delinquency shares are computed over distinct loans: a
    loan is counted once if any of its rows exceeds the threshold. This keeps
    numerator and denominator in the same unit and the share within [0, 1].

    Args:
        df: DataFrame with ``application_id`` and ``days_past_due`` columns.

    Returns:
        dict with:
        ``avg_dpd``    - row-level mean days past due, missing values as 0,
        ``pct_dpd_30`` - share of distinct loans with days_past_due > 30,
        ``pct_dpd_90`` - share of distinct loans with days_past_due > 90.
        Both shares are NaN when the frame contains no loans.
    """
    dpd = df["days_past_due"]
    loan_ids = df["application_id"]
    total_loans = loan_ids.nunique()

    def _share_of_loans_past(threshold):
        # Avoid dividing by zero on an empty frame; NaN signals "undefined".
        if total_loans == 0:
            return float("nan")
        # Missing days_past_due compares False, so such rows never count.
        return loan_ids[dpd > threshold].nunique() / total_loans

    return {
        "avg_dpd": dpd.fillna(0).mean(),
        "pct_dpd_30": _share_of_loans_past(30),
        "pct_dpd_90": _share_of_loans_past(90),
    }


In [4]:
# Display the delinquency KPIs for the merged applications frame built above.
risk_kpis(df)

{'avg_dpd': np.float64(24.25),
 'pct_dpd_30': np.float64(0.375),
 'pct_dpd_90': np.float64(0.0)}

In [5]:
def segment_risk(df, segment_col):
    """Summarise approval and delinquency metrics per segment.

    Args:
        df: DataFrame with ``application_stage``, ``days_past_due`` and
            ``application_id`` columns plus the segmentation column.
        segment_col: Name of the column (or list of columns) to group by.

    Returns:
        DataFrame with one row per segment and the columns
        ``approval_rate`` (share of rows with stage "Approved"),
        ``avg_dpd`` (mean days past due; missing values are skipped, not
        zero-filled) and ``loan_count`` (non-null ``application_id`` count).
    """
    def _approved_share(stages):
        # Fraction of the group's rows whose stage is exactly "Approved".
        return (stages == "Approved").mean()

    aggregations = {
        "approval_rate": ("application_stage", _approved_share),
        "avg_dpd": ("days_past_due", "mean"),
        "loan_count": ("application_id", "count"),
    }
    per_segment = df.groupby(segment_col).agg(**aggregations)
    return per_segment.reset_index()


In [6]:
def early_warning(df, dpd_threshold=30, segment_col="employment_type"):
    """Count stressed rows per segment as an early-warning signal.

    A row is "stressed" when its ``days_past_due`` is strictly greater than
    ``dpd_threshold``. Rows with a missing value never qualify.

    Args:
        df: DataFrame with a ``days_past_due`` column and the segment column.
        dpd_threshold: Days-past-due cut-off; defaults to 30.
        segment_col: Column to group the stressed rows by; defaults to
            ``"employment_type"``.

    Returns:
        Series indexed by segment value holding the number of stressed rows.
        Segments without any stressed row are absent from the result.
    """
    stressed = df[df["days_past_due"] > dpd_threshold]
    return stressed.groupby(segment_col).size()


In [7]:
# Display the count of rows more than 30 days past due per employment type.
early_warning(df)

employment_type
Salaried         1
Self-Employed    2
dtype: int64

## RAG PART

In [8]:
from langchain_community.document_loaders import DirectoryLoader,PyPDFLoader,TextLoader

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Loader for every PDF directly inside documents/, parsed with PyPDFLoader.
loader = DirectoryLoader(  # ignore
    "documents/",
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)

In [10]:
# Loader for every .txt file directly inside documents/, read with TextLoader.
text_loader = DirectoryLoader(
    "documents/",
    glob="*.txt",
    loader_cls=TextLoader,
)

In [11]:
# Read the source files from disk into document objects.
pdf_docs = loader.load()
text_docs = text_loader.load()

In [12]:
# Single corpus: PDF documents first, then the text documents.
all_docs = [*pdf_docs, *text_docs]


In [13]:
# Sanity check: report how many documents each loader produced.
pdf_count = len(pdf_docs)
txt_count = len(text_docs)
total_count = len(all_docs)
print(f"PDFs loaded: {pdf_count}")
print(f"TXTs loaded: {txt_count}")
print(f"Total docs: {total_count}")

PDFs loaded: 24
TXTs loaded: 1
Total docs: 25


In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Chunking parameters, in characters: window size and the overlap shared by
# consecutive chunks.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split every loaded document into overlapping chunks for embedding.
chunks = splitter.split_documents(all_docs)


In [15]:
import os

from dotenv import load_dotenv

# Copy variables from the local .env file into the process environment.
load_dotenv()
api_key = os.environ.get("GOOGLE_API_KEY")

In [16]:
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings

In [17]:
# Embedding client used for both document chunks and queries.
embedding = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
)

In [18]:
# A vector store used with text-embedding-004 has to be created with
# dimension 768, so confirm the embedding length on a sample query.

vectors = embedding.embed_query("What is machine learning?")
embedding_length = len(vectors)
print(embedding_length)

768


In [19]:
import os
from pinecone import Pinecone, ServerlessSpec
load_dotenv()

# Create Pinecone client instance
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "vectordb"

# Length of the vectors stored in this index. It must match the embedding
# model used to build and query the vectors (text-embedding-004 returns 768
# values, as printed earlier in this notebook).
EMBEDDING_DIMENSION = 768

# Create the index only when it does not exist yet. The notebook computes its
# own embeddings and upserts raw vectors, so a plain serverless index with an
# explicit dimension is created. An index with a Pinecone-hosted embedding
# model would describe a different model than the one producing the stored
# vectors.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Connect to the index
index = pc.Index(index_name)


In [20]:
# Embed all chunk texts through a single embed_documents call instead of one
# API round-trip per chunk; the result is one flat vector per input text, in
# input order.
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_embeddings = embedding.embed_documents(chunk_texts)

# One Pinecone record per chunk. The chunk text is stored under "chunk_text"
# in the metadata so it can be read back at query time.
vectors = [
    {
        "id": f"chunk-{i}",
        "values": values,
        "metadata": {
            "source": chunk.metadata.get("source", ""),
            "chunk_text": chunk.page_content,
        },
    }
    for i, (chunk, values) in enumerate(zip(chunks, chunk_embeddings))
]

# Upsert into Pinecone
vector_db = index.upsert(vectors=vectors)
print(vector_db)


{'upserted_count': 110}


In [21]:
# query=str(input("Enter your query: "))
# query_vector = embedding.embed_query(query)
# search_results = index.query(
#     vector=query_vector,
#     top_k=3,
#     include_metadata=True
# )


In [None]:
from langchain_groq.chat_models import ChatGroq
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.tools import tool
from groq import Groq
from langchain_pinecone import PineconeVectorStore



load_dotenv()

# Groq SDK client, authenticated from the environment.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# LangChain wrapper around the Pinecone index; the chunk text is read from the
# "chunk_text" metadata field of each record.
vectorstore = PineconeVectorStore(index=index, embedding=embedding, text_key="chunk_text")

# Retriever view over the same store, returning the 2 nearest chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})


def retrieve_context(query: str, k: int = 3):
    """Retrieve information to help answer a query.

    Args:
        query: Natural-language question to search the vector store with.
        k: Number of most similar chunks to fetch; defaults to 3.

    Returns:
        A ``(serialized, retrieved_docs)`` tuple. ``serialized`` is one string
        with a "Source: ... / Content: ..." section per chunk, sections
        separated by a blank line. ``retrieved_docs`` is the list of documents
        returned by the similarity search.
    """
    retrieved_docs = vectorstore.similarity_search(query, k=k)
    serialized = "\n\n".join(
        f"Source: {doc.metadata}\nContent: {doc.page_content}"
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs


In [52]:
# Example question answered from the indexed documents.
query = (
    "What must lenders communicate to small borrowers "
    "if a loan application up to Rs. 2 lakhs is rejected?"
)

# context: serialized chunks for the prompt; docs: the retrieved documents.
context, docs = retrieve_context(query)

In [54]:
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate
)
from langchain_groq import ChatGroq

# LLM
# Temperature 0 keeps the answer as deterministic as possible. The prompt
# below demands strictly context-grounded, factual output, which the maximum
# sampling temperature (2) works against.
chat_model = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    temperature=0
)

# Prompt
# The system message fixes the answering rules. The human message carries the
# retrieved context and the user question as template variables.
prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(
            """You are a Financial Research Assistant specialized in providing clear, accurate, and context-grounded answers.

Strict Rules:
- Answer ONLY using the information explicitly present in the provided context.
- Do NOT use external knowledge, assumptions, or interpretations.
- Do NOT infer reasons, motivations, or implications not stated in the context.
- Do NOT add examples, explanations, or background beyond the context.

Answering Guidelines:
- Present the answer in simple, clear, and professional language.
- Use short sentences or bullet points if it improves readability.
- Keep the response concise and directly relevant to the question.
- Maintain a formal and factual tone suitable for financial and regulatory use.

If the answer cannot be found explicitly in the context, respond exactly with:
"The provided context does not contain the information required to answer this question."
"""
        ),
        HumanMessagePromptTemplate.from_template(
            "Context: {context}\n\nQuestion: {question}"
        )
    ],
    input_variables=["context", "question"]
)

# Create messages
messages = prompt.invoke({
    "context": context,
    "question": query
})

# Invoke model
answer = chat_model.invoke(messages)

print("Answer:", answer.content)


Answer: Lenders must convey in writing the main reason/reasons for rejection of the loan application of small borrowers seeking loans up to Rs. 2 lakhs.
