In [13]:
!pip install -q pandas faiss-cpu sentence-transformers transformers openai


In [14]:

import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch


# Read the loan-application CSV and replace every missing cell with an empty
# string, so the text templates downstream never interpolate NaN.
df = pd.read_csv('/content/Training Dataset.csv').fillna("")

def row_to_text(row):
    """Render one loan-application record as a short English description.

    Args:
        row: Mapping-like record (a DataFrame row or a dict) providing the keys
            'Gender', 'Education', 'Self_Employed', 'ApplicantIncome',
            'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
            'Credit_History', 'Property_Area' and 'Loan_Status'.

    Returns:
        str: A paragraph with every field interpolated verbatim, except
        'Self_Employed', which is turned into a phrase.
    """
    self_employed = row['Self_Employed']
    if self_employed == 'Yes':
        employment = 'is self-employed'
    elif pd.isna(self_employed) or str(self_employed).strip() == "":
        # A blank/missing answer is not a "No": stating "is not self-employed"
        # here would put a false fact into the retrieval corpus.
        employment = 'has an unknown self-employment status'
    else:
        employment = 'is not self-employed'

    return (
        f"The applicant is a {row['Gender']} who is a {row['Education']} and "
        f"{employment}. "
        f"They have an income of {row['ApplicantIncome']}, a coapplicant income of {row['CoapplicantIncome']}, "
        f"a loan amount of {row['LoanAmount']} and a loan term of {row['Loan_Amount_Term']}. "
        f"The credit history is {row['Credit_History']} and the property area is {row['Property_Area']}. "
        f"The loan status is {row['Loan_Status']}."
    )

# Turn every row into its text description; `docs` keeps the row order, so a
# position in this list identifies the same record as the DataFrame row.
row_texts = df.apply(row_to_text, axis=1)
docs = row_texts.tolist()
print(f"Processed {len(docs)} documents.")


Processed 614 documents.


In [15]:

# Sentence encoder shared by the document corpus and, later, the user queries.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every document once up front; the progress bar shows batch progress.
print("Embedding documents")
doc_embeddings = embed_model.encode(docs, show_progress_bar=True)

# Flat L2 index sized to the embedding width; vectors are added in `docs`
# order, so a FAISS result label can be used directly as a position in `docs`.
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)


Embedding documents


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [16]:
# Tokenizer and seq2seq model are loaded from the same checkpoint name.
GENERATOR_CHECKPOINT = "google/flan-t5-base"

print("Loading language model")
tokenizer = AutoTokenizer.from_pretrained(GENERATOR_CHECKPOINT)
generator_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_CHECKPOINT)


Loading language model


In [17]:

def retrieve_docs(query, k=5):
    """Return the stored documents closest to `query` in embedding space.

    Args:
        query: Question text; encoded with the same `embed_model` used for the
            documents.
        k: Maximum number of documents to return. Values larger than the
            number of indexed vectors are clamped to that number.

    Returns:
        list[str]: Up to `k` entries of `docs`, in the order FAISS returns
        them. Empty when `k` is not positive or the index holds no vectors.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the result
    # with label -1, and docs[-1] would silently return the last document.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vec = embed_model.encode([query])
    _distances, indices = index.search(query_vec, k)
    # Defensive filter in case any padding label is still present.
    return [docs[i] for i in indices[0] if i >= 0]


def generate_answer(context, question, max_input_tokens=512):
    """Generate an answer to `question` from `context` with the seq2seq model.

    The prompt places the context before the question and the closing
    instruction. Tokenizer truncation cuts the END of the prompt, so an
    oversized context would silently remove the question. To avoid that,
    trailing context lines are dropped until the whole prompt fits within
    `max_input_tokens`; a prompt that already fits is sent unchanged.

    Args:
        context: Newline-separated context text. Later lines are dropped first
            when the prompt is too long.
        question: The user's question.
        max_input_tokens: Token budget for the encoded prompt.

    Returns:
        str: The decoded model output with special tokens removed.
    """
    def build_prompt(context_text):
        return (
            f"You are an assistant for analyzing loan applications.\n\n"
            f"Context:\n{context_text}\n\n"
            f"Question: {question}\n\n"
            f"Answer the question based only on the context above."
        )

    def token_count(text):
        return len(tokenizer(text)["input_ids"])

    # Drop whole lines from the end of the context while the prompt is over
    # budget. The last remaining line is kept; if the prompt is still too long
    # then, the tokenizer's own truncation below is the fallback.
    context_lines = context.split("\n")
    while (
        len(context_lines) > 1
        and token_count(build_prompt("\n".join(context_lines))) > max_input_tokens
    ):
        context_lines.pop()

    prompt = build_prompt("\n".join(context_lines))
    inputs = tokenizer(
        prompt, return_tensors="pt", max_length=max_input_tokens, truncation=True
    )

    outputs = generator_model.generate(
        **inputs,
        max_new_tokens=100,
        repetition_penalty=2.0,
        no_repeat_ngram_size=3
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def answer_question(question, top_k=5):
    """Answer `question` by retrieving documents and prompting the generator.

    Args:
        question: The user's question.
        top_k: Number of documents requested from `retrieve_docs`.

    Returns:
        Whatever `generate_answer` returns for the newline-joined documents.
    """
    retrieved = retrieve_docs(question, k=top_k)
    return generate_answer("\n".join(retrieved), question)


In [None]:
# Interactive question loop: runs until the user types 'exit' (any case,
# surrounding whitespace ignored) or the input stream ends / is interrupted.
while True:
    try:
        question = input("\nAsk a question (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Leave the loop cleanly instead of surfacing a traceback in the cell.
        break
    if not question:
        # Skip blank input rather than running retrieval/generation on nothing.
        continue
    if question.lower() == 'exit':
        break
    answer = answer_question(question)
    print(f"\nAnswer: {answer}")



Ask a question (or type 'exit' to quit): Does income affect the loan amount?

Answer: yes

Ask a question (or type 'exit' to quit): Is credit history important for loan status?

Answer: Yes

Ask a question (or type 'exit' to quit): Does being self-employed impact loan approval?

Answer: no

Ask a question (or type 'exit' to quit): What factors affect loan approval?

Answer: income
