<a href="https://colab.research.google.com/github/ByronFC3/2/blob/main/MC_EI2_Prediction_RAG_Model_18June_28April.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a RAG System for Analyzing NSF Grants

The point of the code below is to see if I can plug any PDF into it and use RAG to query the content of that PDF to answer questions about the document. This is to simulate a company's newly onboarded employee who needs to know how to do something on the job and does not have access to a manager or co-worker.

In [None]:
# Mount Google Drive at /content/drive; the proposal folders used for
# classifier training are configured under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import gradio as gr
import joblib
from sklearn.linear_model import LogisticRegression
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from getpass import getpass

# === Configuration ===
# Ask for the OpenAI key (hidden prompt) only when the environment has no
# non-empty value for it yet.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# Embedding model shared by the PDF retrievers and the funding classifier.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# --- Default document used for question answering when nothing is uploaded ---
MC_GENAI_STUDY_PDF = "/content/the-economic-potential-of-generative-ai-the-next-productivity-frontier (1).pdf"
print(f"Default RAG PDF set to: {MC_GENAI_STUDY_PDF}")

# Folders of labelled proposals (update as needed). They are read when the
# classifier is trained, so they must exist and contain PDFs for training
# to produce a model.
funded_dir = "/content/drive/MyDrive/3.Google_Colab_repo_11Apr/approved"
rejected_dir = "/content/drive/MyDrive/3.Google_Colab_repo_11Apr/Rejection"
# File the trained classifier is saved to and reloaded from.
MODEL_PATH = "proposal_classifier.pkl"

# === Build Global Retriever (from funded/rejected proposals) ===
# Loads a FAISS index that an earlier run is expected to have saved under
# GLOBAL_INDEX_DIR. The index is optional: when it is missing or unreadable,
# both globals stay None and the app simply has no proposal-corpus fallback.
GLOBAL_INDEX_DIR = "grant_faiss_index_with_labels"
global_vectorstore = None
global_retriever = None
if os.path.exists(GLOBAL_INDEX_DIR):
    try:
        # SECURITY NOTE(review): allow_dangerous_deserialization=True lets
        # FAISS.load_local unpickle the stored docstore. Only load an index
        # directory you created yourself; never one from an untrusted source.
        global_vectorstore = FAISS.load_local(
            GLOBAL_INDEX_DIR,
            embedding_model,
            allow_dangerous_deserialization=True,
        )
        global_retriever = global_vectorstore.as_retriever()
        print("Global FAISS vectorstore (funded/rejected proposals) loaded successfully.")
    except Exception as e:
        # Best effort: a corrupt or incompatible index must not stop the app.
        # Reset both names so a half-initialised state is never observed.
        global_vectorstore = None
        global_retriever = None
        print(f"Could not load global FAISS vectorstore: {e}.")
else:
    # The classifier is trained from the PDF folders, not from this index,
    # so the only consequence of a missing index is the lost QA fallback.
    print(
        f"Global FAISS vectorstore '{GLOBAL_INDEX_DIR}' not found. "
        "Question answering cannot fall back to the funded/rejected proposals."
    )


# === Initialize LLM ===
# Chat model used to write the final answers; temperature is set to 0.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo",
)

# === Utility for PDF-based retriever ===
# One splitter instance is shared by retrieval, classifier training and
# prediction so that all three cut documents into chunks the same way.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

# Retrievers already built in this session, keyed by
# (absolute path, modification time, size). Parsing and embedding a PDF is
# the slow part of answering a question, so an unchanged file is only
# processed once instead of on every query.
_PDF_RETRIEVER_CACHE = {}
_PDF_RETRIEVER_CACHE_MAX = 8  # bound memory when many different PDFs are used

def create_pdf_retriever(pdf_path, is_default_pdf=False):
    """Create (or reuse) a FAISS retriever built from a single PDF file.

    Args:
        pdf_path: Filesystem path of the PDF to index.
        is_default_pdf: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A retriever over the PDF's text chunks, or None when the path is
        empty/missing, the PDF yields no text, or processing fails. The
        reason is printed in each of those cases.
    """
    if not pdf_path or not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        return None
    try:
        file_stat = os.stat(pdf_path)
        cache_key = (os.path.abspath(pdf_path), file_stat.st_mtime, file_stat.st_size)
        cached = _PDF_RETRIEVER_CACHE.get(cache_key)
        if cached is not None:
            print(f"Reusing cached retriever for {pdf_path}.")
            return cached

        pages = PyPDFLoader(pdf_path).load()
        docs = text_splitter.split_documents(pages)
        if not docs:
            # Nothing to index (for example an image-only PDF); report it
            # plainly instead of letting the index build fail obscurely.
            print(f"Error creating retriever for {pdf_path}: no extractable text found.")
            return None

        # Keep only the file name and page number so answers can cite them.
        source_name = os.path.basename(pdf_path)
        tagged = [
            Document(
                page_content=d.page_content,
                metadata={"source": source_name, "page": d.metadata.get("page", "N/A")},
            )
            for d in docs
        ]
        retriever = FAISS.from_documents(tagged, embedding_model).as_retriever()
        print(f"Retriever created for {pdf_path} with {len(docs)} chunks.")

        # Evict the oldest entry first (dicts preserve insertion order).
        if len(_PDF_RETRIEVER_CACHE) >= _PDF_RETRIEVER_CACHE_MAX:
            _PDF_RETRIEVER_CACHE.pop(next(iter(_PDF_RETRIEVER_CACHE)), None)
        _PDF_RETRIEVER_CACHE[cache_key] = retriever
        return retriever
    except Exception as e:
        # Boundary handler: callers treat None as "no retriever available".
        print(f"Error creating retriever for {pdf_path}: {e}")
        return None

# === Train or Load Funding Classifier ===
def load_labeled_docs_for_classifier():
    """Load every PDF in the funded and rejected folders as labelled chunks.

    Returns:
        A list of Document objects, funded chunks first and rejected chunks
        after them. Each carries metadata {"label": "funded" | "rejected",
        "source": <file name>}. A missing folder or an unreadable PDF is
        reported with a printed message and contributes no chunks.
    """
    def load_split_tag(dir_path, label):
        """Return the chunks of all PDFs directly inside dir_path, tagged with label."""
        # isdir (not just exists) so a stray file at this path cannot make
        # os.listdir raise.
        if not os.path.isdir(dir_path):
            print(f"Directory not found for classifier training: {dir_path}")
            return []

        labeled_chunks = []
        # Sorted so the training set is assembled in the same order each run;
        # os.listdir gives no ordering guarantee.
        for fname in sorted(os.listdir(dir_path)):
            # Case-insensitive so files such as "proposal.PDF" are not skipped.
            if not fname.lower().endswith(".pdf"):
                continue
            full_path = os.path.join(dir_path, fname)
            try:
                pages = PyPDFLoader(full_path).load()
                chunks = text_splitter.split_documents(pages)
            except Exception as e:
                # One bad file must not abort loading the rest of the folder.
                print(f"Error loading {full_path} for classifier: {e}")
                continue
            labeled_chunks.extend(
                Document(page_content=c.page_content, metadata={"label": label, "source": fname})
                for c in chunks
            )
        return labeled_chunks

    funded = load_split_tag(funded_dir, "funded")
    rejected = load_split_tag(rejected_dir, "rejected")
    return funded + rejected

# Reuse a previously saved classifier when one exists; otherwise (or when
# loading fails) fall through to training below.
# NOTE(review): joblib.load unpickles the file, so MODEL_PATH should only ever
# point at a model this notebook wrote itself.
classifier = None
if not os.path.exists(MODEL_PATH):
    print("Classifier model not found. Training new classifier...")
else:
    try:
        classifier = joblib.load(MODEL_PATH)
    except Exception as e:
        print(f"Error loading classifier: {e}. Retraining...")
        classifier = None  # make sure the training branch below runs
    else:
        print("Classifier loaded successfully.")

# Train from the labelled proposal folders when no usable model was loaded.
if classifier is None:
    docs_for_clf = load_labeled_docs_for_classifier()
    if not docs_for_clf:
        print("No documents found for classifier training. Prediction functionality disabled.")
    else:
        texts = [d.page_content for d in docs_for_clf]
        # Target is 1 for chunks from funded proposals and 0 for all others.
        labels = [int(d.metadata["label"] == "funded") for d in docs_for_clf]
        try:
            embeddings = embedding_model.embed_documents(texts)
            clf = LogisticRegression(max_iter=1000)
            clf.fit(embeddings, labels)
            joblib.dump(clf, MODEL_PATH)
            classifier = clf
            print("Classifier trained and saved.")
        except Exception as e:
            # classifier stays None, which the prediction function reports.
            print(f"Error training classifier: {e}. Prediction functionality may not work.")


# === Functions ===
def _uploaded_file_path(uploaded_file):
    """Return the filesystem path of an uploaded file, or None if there is none."""
    if not uploaded_file:
        return None
    # The upload may arrive as an object exposing the path as `.name` or as a
    # plain path string; accept both forms.
    path = getattr(uploaded_file, "name", uploaded_file)
    return str(path) if path else None


def _format_sources(source_docs):
    """Render retrieved chunks as 'Source: <file>, Page: <n>: <snippet>' entries."""
    entries = []
    for doc in source_docs or []:
        src = doc.metadata.get("source", "N/A")
        page = doc.metadata.get("page", "N/A")
        text = doc.page_content.replace("\n", " ")
        # Mark the snippet with an ellipsis only when text was actually cut.
        snippet = text[:200] + ("..." if len(text) > 200 else "")
        entries.append(f"Source: {src}, Page: {page}: {snippet}")
    return "\n\n".join(entries)


def answer_question(question, uploaded_pdf_file):
    """Answer a question with retrieval-augmented generation.

    The document source is chosen in this order:
    1. The user-uploaded PDF.
    2. The default PDF at MC_GENAI_STUDY_PDF, if that file exists.
    3. The global retriever built from the funded/rejected proposals.

    Args:
        question: The user's question; None, empty or whitespace-only input
            is rejected with a prompt to enter a question.
        uploaded_pdf_file: The upload from the UI (an object with a `.name`
            path, or a path string), or None when nothing was uploaded.

    Returns:
        A tuple (answer_text, formatted_sources). On any failure the first
        element is an error message and the second is an empty string; this
        function does not raise for retrieval or LLM errors.
    """
    if not question or not question.strip():
        return "Please enter a question.", ""

    current_retriever = None
    source_description = ""

    upload_path = _uploaded_file_path(uploaded_pdf_file)
    # Priority 1: the PDF uploaded through the UI.
    if upload_path:
        current_retriever = create_pdf_retriever(upload_path)
        source_description = f"using uploaded PDF: {os.path.basename(upload_path)}"
        print(f"Using retriever for uploaded PDF: {upload_path}.")
    # Priority 2: the configured default PDF, when it is present on disk.
    elif os.path.exists(MC_GENAI_STUDY_PDF):
        current_retriever = create_pdf_retriever(MC_GENAI_STUDY_PDF)
        source_description = f"using default PDF: {os.path.basename(MC_GENAI_STUDY_PDF)}"
        print(f"Using retriever for default PDF: {MC_GENAI_STUDY_PDF}.")
    # Priority 3: the pre-built index over funded/rejected proposals.
    elif global_retriever is not None:
        current_retriever = global_retriever
        source_description = "using pre-loaded funded/rejected proposals"
        print("Using global retriever (funded/rejected proposals).")
    else:
        return "Error: No document source available to answer the question. Please upload a PDF or ensure the default PDF path is correct and accessible.", ""

    # A chosen PDF that could not be indexed is reported rather than silently
    # answered from a different source.
    if current_retriever is None:
        return f"Error: Could not create a retriever from the specified document source ({source_description}).", ""

    try:
        # Built inside the try so construction errors are reported like
        # query errors instead of escaping to the UI as a traceback.
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=current_retriever,
            return_source_documents=True,
        )
        # invoke() replaces the deprecated direct call on the chain object.
        result = qa_chain.invoke({"query": question})
        return result["result"], _format_sources(result.get("source_documents"))
    except Exception as e:
        return f"An error occurred during answering from {source_description}: {e}", ""


def predict_likelihood(pdf_file):
    """Estimate the funding probability of an uploaded PDF.

    The PDF is split into chunks, each chunk is embedded and scored by the
    trained classifier, and the per-chunk probabilities are averaged.

    Args:
        pdf_file: The upload from the UI (an object with a `.name` path, or a
            path string), or None when nothing was uploaded.

    Returns:
        A human-readable string: the estimated probability as a percentage,
        or a message explaining why no prediction could be made. Errors are
        returned as text rather than raised.
    """
    # Identity check: "no classifier" means exactly None, independent of how
    # the estimator object happens to evaluate in a boolean context.
    if classifier is None:
        return "Classifier not available. Please ensure training data directories are correctly configured and contain PDFs."

    if not pdf_file:
        return "Upload a PDF file to predict likelihood."

    # The upload may arrive as an object exposing the path as `.name` or as a
    # plain path string; accept both forms.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        pages = PyPDFLoader(pdf_path).load()
        chunks = text_splitter.split_documents(pages)
        if not chunks:
            return "No text could be extracted from the uploaded PDF."

        texts = [c.page_content for c in chunks]
        embeddings = embedding_model.embed_documents(texts)
        # Column 1 is the probability of label 1, i.e. "funded" in training.
        probs = classifier.predict_proba(embeddings)[:, 1]
        avg_prob = float(probs.mean())
        return f"Estimated funding probability: {avg_prob*100:.2f}%"
    except Exception as e:
        return f"An error occurred during prediction: {e}"

# Combined chat+predict function for UI
def chat_and_predict(question, pdf_file):
    """Answer the question and, if a PDF was uploaded, also score that PDF.

    Returns:
        A tuple (answer, sources, likelihood) matching the three output boxes
        of the chat tab. Without an upload, likelihood is a hint to upload a
        PDF instead of a prediction.
    """
    answer, sources = answer_question(question, pdf_file)

    # The default document is never scored; only an explicit upload is.
    if pdf_file:
        likelihood = predict_likelihood(pdf_file)
    else:
        likelihood = "Upload a PDF for prediction in this tab."

    return answer, sources, likelihood

# === Gradio Interface ===
# Two-tab UI: "Chat & Search" answers questions (and scores an uploaded PDF),
# "Predict Funding" only scores an uploaded PDF. Components are created in
# the order written here, inside the Blocks/Tab contexts they belong to.
with gr.Blocks() as demo:
    gr.Markdown("## Grant Proposal Reviewer & Success Predictor Chatbot")
    # Intro text; names the default PDF that is used when nothing is uploaded.
    gr.Markdown(
        f"Ask questions about grant proposals. "
        f"By default, the chatbot will answer questions based on **'{os.path.basename(MC_GENAI_STUDY_PDF)}'** if no other PDF is uploaded. "
        "You can also upload a different PDF to chat with it, or use the 'Predict Funding' tab to assess funding likelihood of a proposal."
    )
    with gr.Tab("Chat & Search"):
        inp = gr.Textbox(label="Your Question", placeholder="Ask about the economic potential of GenAI or upload a proposal...")
        pdf_q = gr.File(label="(Optional) Upload a PDF to chat with it (overrides default)", file_types=[".pdf"])
        btn_q = gr.Button("Submit Query")
        out_answer = gr.Textbox(label="AI Answer", interactive=False, lines=5)
        out_sources = gr.Textbox(label="Source Snippets", interactive=False, lines=8)
        out_prob_chat = gr.Textbox(label="Funding Likelihood (for uploaded PDF)", interactive=False)
        # chat_and_predict returns (answer, sources, likelihood), one value per output box.
        btn_q.click(fn=chat_and_predict, inputs=[inp, pdf_q], outputs=[out_answer, out_sources, out_prob_chat])
    with gr.Tab("Predict Funding"):
        pdf_p = gr.File(label="Upload PDF to Predict Funding", file_types=[".pdf"])
        btn_p = gr.Button("Predict Likelihood")
        out_prob = gr.Textbox(label="Funding Probability", interactive=False)
        # predict_likelihood returns a single message string for the output box.
        btn_p.click(fn=predict_likelihood, inputs=pdf_p, outputs=out_prob)

# Launch the UI only when this code runs as the main module, not on import.
if __name__ == "__main__":
    # NOTE(review): share=True requests a shareable link from Gradio; confirm
    # that exposing this app (and the OpenAI key behind it) that way is intended.
    demo.launch(share=True)