In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import google.generativeai as genai 


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Credentials must never be committed inside a notebook. The key is read from
# the environment instead (e.g. `export GOOGLE_API_KEY=...` before starting
# Jupyter). NOTE(review): any key that was previously hardcoded in this cell
# should be treated as leaked and revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before starting the notebook kernel."
    )
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [3]:
# Load the loan applications and keep only rows that have a loan amount.
data = (
    pd.read_csv('project_root/data/Training Dataset.csv')
    .dropna(subset=['LoanAmount'])
)
data


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [4]:
data['Loan_Status'].value_counts()

Loan_Status
Y    411
N    181
Name: count, dtype: int64

In [5]:
# Split the applications by outcome; each subset gets a fresh 0..n-1 index so
# that row positions line up with positions in the document lists built later.
approved_data = data.loc[data['Loan_Status'].eq('Y')].reset_index(drop=True)
denied_data = data.loc[data['Loan_Status'].eq('N')].reset_index(drop=True)
all_data = data.reset_index(drop=True)

In [6]:
def row_to_text(row):
    """Render one loan-application record as a single descriptive sentence.

    Args:
        row: Mapping-like record (for example a pandas Series yielded by
            ``DataFrame.iterrows()``) with the keys ``Loan_ID``, ``Gender``,
            ``Education``, ``Married``, ``Self_Employed``, ``ApplicantIncome``,
            ``LoanAmount``, ``Loan_Amount_Term``, ``Credit_History``,
            ``Property_Area`` and ``Loan_Status``.

    Returns:
        str: The sentence describing the record.

    Missing values (NaN/None) are reported as unknown instead of being
    rendered as "nan" or silently mapped to the negative branch (a missing
    ``Married`` value used to be described as "single"). Records without
    missing values produce exactly the same text as before.
    """
    def known(column):
        # Raw value, or the word "unknown" when the field is missing.
        value = row[column]
        return "unknown" if pd.isna(value) else value

    def choose(column, match, if_match, if_other, if_missing):
        # Two-way wording for a categorical field, with an explicit third
        # outcome for missing data.
        value = row[column]
        if pd.isna(value):
            return if_missing
        return if_match if value == match else if_other

    return (
        f"Loan ID: {known('Loan_ID')}. "
        f"{known('Gender')} {known('Education')} applicant, "
        f"{choose('Married', 'Yes', 'married', 'single', 'marital status unknown')}, "
        f"{choose('Self_Employed', 'Yes', 'self-employed', 'not self-employed', 'employment status unknown')}, "
        f"income: {known('ApplicantIncome')}, "
        f"loan amount: {known('LoanAmount')} for {known('Loan_Amount_Term')} months, "
        f"credit history: {known('Credit_History')}, "
        f"property area: {known('Property_Area')}. "
        f"Loan status: {choose('Loan_Status', 'Y', 'approved', 'denied', 'unknown')}."
    )


In [7]:
# Turn every row of each subset into its text document; the position of a
# document in its list matches the row position in the corresponding frame.
approved_docs, denied_docs, all_docs = (
    [row_to_text(record) for _, record in frame.iterrows()]
    for frame in (approved_data, denied_data, all_data)
)


In [8]:
# Load the sentence-embedding model once; it is reused later to embed queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed the three document collections in the same order as the original cell.
approved_embeddings, denied_embeddings, all_embeddings = (
    model.encode(docs) for docs in (approved_docs, denied_docs, all_docs)
)

In [9]:
def create_index(embeddings):
    """Build a ``faiss.IndexFlatL2`` index containing every row of ``embeddings``.

    Args:
        embeddings: 2-D array of vectors; ``embeddings.shape[1]`` is used as
            the index dimensionality.

    Returns:
        The populated index. Vectors are added in row order.
    """
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

# One index per document collection, built in the same order as the embeddings.
approved_index, denied_index, all_index = map(
    create_index,
    (approved_embeddings, denied_embeddings, all_embeddings),
)


In [10]:
def classify_query_intent(query):
    """Decide which document collection a question should be answered from.

    Args:
        query: The user's question as free text.

    Returns:
        str: ``"approved"`` when the question is only about approved loans,
        ``"denied"`` when it is only about denied/rejected loans, and
        ``"all"`` when it mentions neither or both outcomes.

    Matching is done on whole words so that "disapproved" or "unapproved" is
    not mistaken for "approved", and "not approved" is treated as a denial.
    """
    # Lower-case and turn punctuation into spaces, then pad with spaces so
    # every word (including the first and last) is surrounded by spaces.
    cleaned = "".join(ch if ch.isalnum() else " " for ch in query.lower())
    padded = f" {' '.join(cleaned.split())} "

    # A negated approval is a denial; rewrite it before looking for markers.
    padded = padded.replace(" not approved ", " denied ")

    # " got loan" is only anchored at the start so "got loans" still matches.
    wants_approved = " approved " in padded or " got loan" in padded
    wants_denied = " denied " in padded or " rejected " in padded

    if wants_approved and not wants_denied:
        return "approved"
    if wants_denied and not wants_approved:
        return "denied"
    # Neither outcome, or both (e.g. a comparison): search every application.
    return "all"


In [11]:
def retrieve_top_k(query, k=5):
    """Return the documents most similar to ``query``.

    The collection searched (approved, denied or all applications) is chosen
    by ``classify_query_intent(query)``.

    Args:
        query: The user's question as free text.
        k: Maximum number of documents to return.

    Returns:
        list[str]: Up to ``k`` documents, in the order returned by the index
        search. Fewer than ``k`` are returned when the chosen collection is
        smaller than ``k``; an empty list is returned when ``k`` is not
        positive or the collection is empty.
    """
    stores = {
        "approved": (approved_index, approved_docs),
        "denied": (denied_index, denied_docs),
    }
    index, docs = stores.get(classify_query_intent(query), (all_index, all_docs))

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which Python indexing would silently turn into the
    # last document of the list.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vec = model.encode([query])
    _, neighbour_ids = index.search(query_vec, k)
    # Defensive filter against any remaining -1 placeholders.
    return [docs[i] for i in neighbour_ids[0] if i >= 0]


In [12]:
def generate_answer_gemini(query, context_docs, model_name="models/gemini-1.5-flash-latest", temperature=0.4):
    """Ask a Gemini model to answer ``query`` using the retrieved documents.

    Args:
        query: The user's question.
        context_docs: Iterable of document strings; they are joined with
            newlines and inserted into the prompt as context.
        model_name: Name of the generative model to instantiate.
        temperature: Sampling temperature forwarded to the model through
            ``generation_config`` (previously accepted but never used).

    Returns:
        str: The response text with surrounding whitespace stripped.
    """
    # Local import keeps the function usable when copied out of the notebook.
    import google.generativeai as genai

    context = "\n".join(context_docs)

    prompt = f"""
You are an intelligent assistant helping with loan data analysis.

Use the following context from past loan applications to answer the user's question clearly and concisely.

Context:
{context}

Question: {query}
"""

    # Named `llm` so it is not confused with the notebook-level embedding `model`.
    llm = genai.GenerativeModel(model_name)
    response = llm.generate_content(
        prompt,
        generation_config={"temperature": temperature},
    )
    return response.text.strip()


In [13]:
# Disabled helper: uncomment to print the names of the models available to
# the configured API key. Kept as real comments (not a string literal) so the
# cell does not echo the snippet back as its output.
# for m in genai.list_models():
#     print(m.name)

'for m in genai.list_models():\n    print(m.name)\n'

In [15]:
# Example question: retrieve the closest documents, then let the LLM answer.
query = (
    "who are the people that got their loan denied "
    "and were married and graduated?"
)
top_docs = retrieve_top_k(query, k=5)
generate_answer_gemini(query, top_docs)


'Four married graduate applicants (Loan IDs: LP001146, LP001197, LP001497, LP002912) had their loan applications denied.'