In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the cleaned loan dataset.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# script will only run as-is on a machine that has this exact file layout.
df = pd.read_csv(r"C:\Users\kumar\OneDrive\Desktop\Celebal Technology\loan_chatbot_rag\data\cleaned_loan_dataset.csv")

# Build one text "document" per row: missing values become empty strings,
# every cell is stringified, and the cells are joined with single spaces.
# Column names are not included in the text, only the cell values.
df['combined_text'] = df.fillna('').apply(lambda row: ' '.join(str(x) for x in row), axis=1)

# Fit a TF-IDF model (English stop words removed) on the row documents.
# `vectorizer` and `tfidf_matrix` are module-level state reused by
# retrieve_context() to score queries against every row.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

def retrieve_context(query: str, top_k: int = 3) -> str:
    """Return the rows most similar to ``query`` as a single string.

    The query is projected into the TF-IDF space fitted on
    ``df['combined_text']`` and compared with every row by cosine
    similarity. The ``top_k`` best-scoring rows are returned, best first,
    joined with newlines.

    Args:
        query: Free-text question to match against the dataset rows.
        top_k: Maximum number of rows to return. If it exceeds the number
            of rows, all rows are returned.

    Returns:
        The ``combined_text`` of the selected rows separated by ``"\\n"``,
        or an empty string when ``top_k`` is zero or negative.
    """
    # Guard against non-positive values: the slice ``[-0:]`` selects the
    # whole array and a negative ``top_k`` turns the slice into ``[k:]``,
    # so without this check the function would return (almost) every row
    # instead of none.
    if top_k <= 0:
        return ""

    query_vec = vectorizer.transform([query])
    # cosine_similarity returns a (1, n_rows) array; flatten to 1-D scores.
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # argsort is ascending, so take the tail and reverse it for best-first.
    top_indices = similarity.argsort()[-top_k:][::-1]
    return "\n".join(df['combined_text'].iloc[top_indices])

# Checkpoint identifier for the sequence-to-sequence model used to generate
# answers. Tokenizer and model are loaded once at module level and reused by
# generate_answer().
# NOTE(review): presumably from_pretrained fetches the weights from the
# Hugging Face Hub on first use — confirm cache/offline behaviour.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_answer(query):
    """Answer ``query`` using retrieved dataset rows as context.

    The rows returned by ``retrieve_context`` are placed in a
    "Context / Question / Answer" prompt, the prompt is tokenized
    (truncated to 512 tokens) and passed to the seq2seq model, which may
    produce up to 100 new tokens.

    Args:
        query: The user's question.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    retrieved_rows = retrieve_context(query)
    prompt_text = f"Context: {retrieved_rows}\nQuestion: {query}\nAnswer:"

    # NOTE(review): the question comes after the context, so a very long
    # context could push it beyond the 512-token cut-off — confirm the
    # tokenizer's truncation side if retrieved rows can be large.
    encoded_prompt = tokenizer(
        prompt_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    generated_ids = model.generate(**encoded_prompt, max_new_tokens=100)
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Demo run: ask a single question and print it alongside the generated answer.
query = "Why was the loan rejected for some applicants?"
answer = generate_answer(query)
print("Query:", query)
print("Answer:", answer)


Query: Why was the loan rejected for some applicants?
Answer: The loan was not a graduate.
