In [25]:
import pandas as pd

df = pd.read_csv("Training Dataset.csv")
df.dropna(inplace=True)  # Optional: clean NaN values

# Combine all rows into "documents" to give to the retriever
documents = []
for index, row in df.iterrows():
    text = " | ".join([f"{col}: {str(row[col])}" for col in df.columns])
    documents.append(text)


In [26]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Use a light model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert to embeddings
doc_embeddings = model.encode(documents)

# Create FAISS index
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)


  return forward_call(*args, **kwargs)


In [27]:
def retrieve_similar_docs(query, top_k=5):
    query_embedding = model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)
    return [documents[i] for i in indices[0]]


In [28]:
from transformers import pipeline

generator = pipeline("text-generation", model="sshleifer/tiny-gpt2", trust_remote_code=True)

def generate_answer(context, query):
    prompt = f"Answer the question based on the following loan data:\n{context}\n\nQuestion: {query}"
    output = generator(prompt, max_new_tokens=200, do_sample=True)[0]['generated_text']
    return output


Device set to use cpu


In [29]:
def rag_qa_bot(query):
    context_docs = retrieve_similar_docs(query)
    context = "\n".join(context_docs)
    return generate_answer(context, query)

# Example Query
query = "What are the common reasons for loan rejection?"
print(rag_qa_bot(query))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer the question based on the following loan data:
Loan_ID: LP001013 | Gender: Male | Married: Yes | Dependents: 0 | Education: Not Graduate | Self_Employed: No | ApplicantIncome: 2333 | CoapplicantIncome: 1516.0 | LoanAmount: 95.0 | Loan_Amount_Term: 360.0 | Credit_History: 1.0 | Property_Area: Urban | Loan_Status: Y
Loan_ID: LP001238 | Gender: Male | Married: Yes | Dependents: 3+ | Education: Not Graduate | Self_Employed: Yes | ApplicantIncome: 7100 | CoapplicantIncome: 0.0 | LoanAmount: 125.0 | Loan_Amount_Term: 60.0 | Credit_History: 1.0 | Property_Area: Urban | Loan_Status: Y
Loan_ID: LP002370 | Gender: Male | Married: No | Dependents: 0 | Education: Not Graduate | Self_Employed: No | ApplicantIncome: 2717 | CoapplicantIncome: 0.0 | LoanAmount: 60.0 | Loan_Amount_Term: 180.0 | Credit_History: 1.0 | Property_Area: Urban | Loan_Status: Y
Loan_ID: LP001603 | Gender: Male | Married: Yes | Dependents: 0 | Education: Not Graduate | Self_Employed: Yes | ApplicantIncome: 4344 | Coappli

In [None]:
import pickle
import faiss

# Save FAISS index
faiss.write_index(index, "faiss_index.bin")

# Save documents and embedding model
with open("documents.pkl", "wb") as f:
    pickle.dump(documents, f)

with open("faiss_index.pkl", "wb") as f:
    pickle.dump(index, f)


