**Load the Datasets**

In [None]:
#import necessary library
import pandas as pd

# Load the Training Dataset
train_df = pd.read_csv("Training Dataset.csv")
train_preview = train_df.head()

# Load the Test Dataset
test_df = pd.read_csv("Test Dataset.csv")
test_preview = test_df.head()

# Load the Sample Submission
sample_df = pd.read_csv("Sample_Submission.csv")
sample_preview = sample_df.head()

train_preview, test_preview, sample_preview


(    Loan_ID Gender Married Dependents     Education Self_Employed  \
 0  LP001002   Male      No          0      Graduate            No   
 1  LP001003   Male     Yes          1      Graduate            No   
 2  LP001005   Male     Yes          0      Graduate           Yes   
 3  LP001006   Male     Yes          0  Not Graduate            No   
 4  LP001008   Male      No          0      Graduate            No   
 
    ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
 0             5849                0.0         NaN             360.0   
 1             4583             1508.0       128.0             360.0   
 2             3000                0.0        66.0             360.0   
 3             2583             2358.0       120.0             360.0   
 4             6000                0.0       141.0             360.0   
 
    Credit_History Property_Area Loan_Status  
 0             1.0         Urban           Y  
 1             1.0         Rural           N  
 2 

**Preproccesing the datasets**

In [5]:
import pandas as pd

train_df = pd.read_csv("Training Dataset.csv")

# Drop rows with missing target
train_df = train_df.dropna(subset=["Loan_Status"])

# Fill missing values with median/mode
train_df.fillna({
    "LoanAmount": train_df["LoanAmount"].median(),
    "Loan_Amount_Term": train_df["Loan_Amount_Term"].median(),
    "Credit_History": train_df["Credit_History"].mode()[0]
}, inplace=True)

def row_to_text(row):
    return (
        f"Loan ID: {row['Loan_ID']}. "
        f"Applicant is a {row['Gender']} who is {'married' if row['Married'] == 'Yes' else 'not married'} "
        f"with {row['Dependents']} dependents. "
        f"Education: {row['Education']}, Self Employed: {row['Self_Employed']}. "
        f"Income: {row['ApplicantIncome']}, Coapplicant Income: {row['CoapplicantIncome']}. "
        f"Requested Loan Amount: {row['LoanAmount']} for {row['Loan_Amount_Term']} months. "
        f"Credit History: {'Good' if row['Credit_History'] == 1 else 'Poor'}. "
        f"Property Area: {row['Property_Area']}. "
        f"Loan Status: {'Approved' if row['Loan_Status'] == 'Y' else 'Rejected'}."
    )

text_chunks = train_df.apply(row_to_text, axis=1).tolist()


In [6]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


**Embed Text Chunks (using sentence-transformers or Hugging Face models)**

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load embedder
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Embed the text chunks
embeddings = embedder.encode(text_chunks, convert_to_tensor=False, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [8]:
import faiss

dimension = embeddings.shape[1]

index = faiss.IndexFlatL2(dimension)

index.add(embeddings)

faiss.write_index(index, "loan_qa_index.faiss")


In [9]:
query = "Who usually gets approved for a loan?"

query_embedding = embedder.encode([query])[0].astype("float32").reshape(1, -1)

D, I = index.search(query_embedding, k=3)

for idx in I[0]:
    print(text_chunks[idx])


Loan ID: LP002983. Applicant is a Male who is married with 1 dependents. Education: Graduate, Self Employed: No. Income: 8072, Coapplicant Income: 240.0. Requested Loan Amount: 253.0 for 360.0 months. Credit History: Good. Property Area: Urban. Loan Status: Approved.
Loan ID: LP002938. Applicant is a Male who is married with 0 dependents. Education: Graduate, Self Employed: Yes. Income: 16120, Coapplicant Income: 0.0. Requested Loan Amount: 260.0 for 360.0 months. Credit History: Good. Property Area: Urban. Loan Status: Approved.
Loan ID: LP002266. Applicant is a Male who is married with 2 dependents. Education: Graduate, Self Employed: No. Income: 3100, Coapplicant Income: 1400.0. Requested Loan Amount: 113.0 for 360.0 months. Credit History: Good. Property Area: Urban. Loan Status: Approved.


In [10]:
!pip install sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [12]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the model
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Create embeddings
embeddings = embedder.encode(text_chunks, convert_to_tensor=False, show_progress_bar=True)

# Convert to float32 for FAISS compatibility
embeddings = np.array(embeddings).astype("float32")


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [17]:
!pip install transformers




In [33]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [36]:
def generate_answer(query, top_k=3):
    retrieved = semantic_search(query, top_k)
    context = "\n".join(retrieved)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(**inputs, max_new_tokens=128)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, context


In [34]:
import faiss
import numpy as np

# Ensure float32 for FAISS
embeddings = np.array(embeddings).astype("float32")

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)


In [20]:
question = "Do graduates have higher loan approval chances?"
answer, retrieved = generate_answer(question)

print("Answer:\n", answer)
print("\nRetrieved Context:\n", retrieved)


Answer:
 yes

Retrieved Context:
 Loan ID: LP001673. Applicant is a Male who is not married with 0 dependents. Education: Graduate, Self Employed: Yes. Income: 11000, Coapplicant Income: 0.0. Requested Loan Amount: 83.0 for 360.0 months. Credit History: Good. Property Area: Urban. Loan Status: Rejected.
Loan ID: LP001259. Applicant is a Male who is married with 1 dependents. Education: Graduate, Self Employed: Yes. Income: 1000, Coapplicant Income: 3022.0. Requested Loan Amount: 110.0 for 360.0 months. Credit History: Good. Property Area: Urban. Loan Status: Rejected.
Loan ID: LP002949. Applicant is a Female who is not married with 3+ dependents. Education: Graduate, Self Employed: nan. Income: 416, Coapplicant Income: 41667.0. Requested Loan Amount: 350.0 for 180.0 months. Credit History: Good. Property Area: Urban. Loan Status: Rejected.


In [24]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [25]:
index = faiss.IndexFlatL2(384)
index.add(embeddings)  # embeddings must be float32


In [26]:
len(text_chunks) == len(embeddings)


True

**Implement Retrieval (semantic search)**

> Semantic Retrieval with FAISS



In [35]:
def semantic_search(query, top_k=3):
    query_embedding = embedder.encode([query])[0].astype("float32").reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)
    return [text_chunks[i] for i in indices[0]]


In [28]:
retrieved_chunks = semantic_search(query, top_k=3)
context = "\n".join(retrieved_chunks)

prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

In [30]:
def rag_chatbot(query):
    answer, _ = generate_answer(query)
    return answer


**Using Gradio interface to deploy the projects**

In [21]:
!pip install gradio




In [29]:
outputs = model.generate(**inputs, max_new_tokens=128)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Answer:\n", answer)
print("\nRetrieved Context:\n", retrieved_chunks)

Answer:
 Male who is married with 3+ dependents

Retrieved Context:
 ['Loan ID: LP002979. Applicant is a Male who is married with 3+ dependents. Education: Graduate, Self Employed: No. Income: 4106, Coapplicant Income: 0.0. Requested Loan Amount: 40.0 for 180.0 months. Credit History: Good. Property Area: Rural. Loan Status: Approved.', 'Loan ID: LP002983. Applicant is a Male who is married with 1 dependents. Education: Graduate, Self Employed: No. Income: 8072, Coapplicant Income: 240.0. Requested Loan Amount: 253.0 for 360.0 months. Credit History: Good. Property Area: Urban. Loan Status: Approved.', 'Loan ID: LP002266. Applicant is a Male who is married with 2 dependents. Education: Graduate, Self Employed: No. Income: 3100, Coapplicant Income: 1400.0. Requested Loan Amount: 113.0 for 360.0 months. Credit History: Good. Property Area: Urban. Loan Status: Approved.']


In [38]:
# Sanity test without Gradio
test_query = "Do graduates get more approvals?"
ans, ctx = generate_answer(test_query)
print("Answer:", ans)
print("Context:", ctx)


Answer: yes
Context: Loan ID: LP002082. Applicant is a Male who is married with 0 dependents. Education: Graduate, Self Employed: Yes. Income: 5818, Coapplicant Income: 2160.0. Requested Loan Amount: 184.0 for 360.0 months. Credit History: Good. Property Area: Semiurban. Loan Status: Approved.
Loan ID: LP001788. Applicant is a Female who is not married with 0 dependents. Education: Graduate, Self Employed: Yes. Income: 3463, Coapplicant Income: 0.0. Requested Loan Amount: 122.0 for 360.0 months. Credit History: Good. Property Area: Urban. Loan Status: Approved.
Loan ID: LP001504. Applicant is a Male who is not married with 0 dependents. Education: Graduate, Self Employed: Yes. Income: 6950, Coapplicant Income: 0.0. Requested Loan Amount: 175.0 for 180.0 months. Credit History: Good. Property Area: Semiurban. Loan Status: Approved.


**Integrate with LLMs (OpenAI, HuggingFace) and Build Chat Interface**

*   RAG with Hugging Face LLM




In [None]:
# RAG Q&A Chatbot using Hugging Face (No API keys)

import pandas as pd
import os
import faiss
import numpy as np
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load training data
data_path = "Training Dataset.csv"
df = pd.read_csv(data_path)
df.fillna({
    "LoanAmount": df["LoanAmount"].median(),
    "Loan_Amount_Term": df["Loan_Amount_Term"].median(),
    "Credit_History": df["Credit_History"].mode()[0]
}, inplace=True)
df.dropna(subset=["Loan_Status"], inplace=True)


def row_to_text(row):
    return (
        f"Loan ID: {row['Loan_ID']}. "
        f"Applicant is a {row['Gender']} who is {'married' if row['Married'] == 'Yes' else 'not married'} "
        f"with {row['Dependents']} dependents. "
        f"Education: {row['Education']}, Self Employed: {row['Self_Employed']}. "
        f"Income: {row['ApplicantIncome']}, Coapplicant Income: {row['CoapplicantIncome']}. "
        f"Requested Loan Amount: {row['LoanAmount']} for {row['Loan_Amount_Term']} months. "
        f"Credit History: {'Good' if row['Credit_History'] == 1 else 'Poor'}. "
        f"Property Area: {row['Property_Area']}. "
        f"Loan Status: {'Approved' if row['Loan_Status'] == 'Y' else 'Rejected'}."
    )

texts = df.apply(row_to_text, axis=1).tolist()


embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
corpus_embeddings = embedder.encode(texts, convert_to_tensor=False, show_progress_bar=True)
corpus_embeddings = np.array(corpus_embeddings).astype("float32")

index = faiss.IndexFlatL2(corpus_embeddings.shape[1])
index.add(corpus_embeddings)

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_answer(question, top_k=3):
    
    question_embedding = embedder.encode([question])[0].astype("float32")

    _, top_indices = index.search(np.array([question_embedding]), top_k)
    context = "\n".join([texts[idx] for idx in top_indices[0]])

    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(**inputs, max_new_tokens=128)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer, context

# Gradio UI
def chat_interface(question):
    answer, context = generate_answer(question)
    return answer, context

demo = gr.Interface(
    fn=chat_interface,
    inputs=gr.Textbox(label="Ask a Question"),
    outputs=[
        gr.Textbox(label="Generated Answer"),
        gr.Textbox(label="Retrieved Context")
    ],
    title="RAG Q&A Chatbot",
    description="Ask a question based on the training dataset and get a context-aware answer using Hugging Face models."
)

demo.launch()

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://883a09450f0d67a6ed.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


