<a href="https://colab.research.google.com/github/Alina-89/Academic_RAG/blob/main/Academic_RAG_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio pymupdf sentence-transformers
!pip install chromadb
!pip install torch transformers



In [None]:
!pip install nbstripout

In [None]:
import os
import gradio as gr
import fitz
from sentence_transformers import SentenceTransformer
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch





# Extract the text from the PDFs

In [None]:
#Function that extracts text from the pdf
def extract_text(pdf_file):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: In-memory PDF content handed to PyMuPDF as a stream
            (e.g. the bytes produced by a ``gr.File(type="binary")`` upload).

    Returns:
        str: The text of all pages joined without any separator.
    """
    # The context manager closes the document even if a page fails to parse,
    # so repeated uploads do not accumulate open PyMuPDF documents.
    with fitz.open(stream=pdf_file, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

# Split the text into sentences

In [None]:
#Split text into sentences
def naive_sent_tokenize(text):
    """Split text into sentences with a simple punctuation heuristic.

    A sentence boundary is any run of whitespace (spaces, tabs, newlines) that
    directly follows ``.``, ``!`` or ``?``. Text extracted from PDFs is mostly
    newline-separated, so newlines must count as boundaries too.

    Args:
        text: The text to split; ``None`` or blank text is allowed.

    Returns:
        list[str]: The non-empty sentences in order; ``[]`` for blank input.
    """
    if not text:
        return []
    # The look-behind keeps the terminating punctuation attached to its sentence.
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    return [piece for piece in pieces if piece]


# Combine sentences into chunks of at most 100 whitespace-separated tokens (words)


In [None]:

#Chunk the sentences

def chunk_text_by_sentences(text, max_tokens=100):
    """Group consecutive sentences into chunks of at most ``max_tokens`` words.

    Sentences come from ``naive_sent_tokenize`` and are never split: a single
    sentence longer than ``max_tokens`` becomes a chunk of its own. Length is
    measured in whitespace-separated words, not model tokens.

    Args:
        text: The text to chunk.
        max_tokens: Maximum number of words per chunk (default 100).

    Returns:
        list[str]: Non-empty chunks in document order; ``[]`` when the text
        contains no words.
    """
    chunks = []
    pending = []          # sentences collected for the chunk being built
    pending_length = 0    # word count of the sentences in `pending`

    for sentence in naive_sent_tokenize(text):
        sentence_length = len(sentence.split())
        if sentence_length == 0:
            # Whitespace-only fragments carry no content and must not create chunks.
            continue
        # Flush only when something is pending; otherwise an over-long first
        # sentence would emit an empty chunk ahead of itself.
        if pending and pending_length + sentence_length > max_tokens:
            chunks.append(" ".join(pending).strip())
            pending = []
            pending_length = 0
        pending.append(sentence)
        pending_length += sentence_length

    if pending:
        chunks.append(" ".join(pending).strip())
    return chunks





# Import Chroma and create a collection

In [None]:
#Import Chroma and create a collection

import chromadb
from chromadb.config import Settings

# Use the persistent on-disk client: in chromadb 0.4 and later a plain
# chromadb.Client(...) is in-memory only, so the directory setting alone
# would not write the indexed chunks to disk.
client = chromadb.PersistentClient(
    path="chroma_db",  # where to store your DB files
    settings=Settings(anonymized_telemetry=False),
)

# Create or get your collection
collection = client.get_or_create_collection(name="pdf_chunks")

# Load the embedding model and embed the chunks

In [None]:
# Sentence-embedding model used both to index the chunks and to embed user queries
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

#Embed the text chunks
def embed_text(text, file_name="document.pdf"):
    """Chunk ``text``, embed every chunk and store the results in the Chroma collection.

    Each chunk is stored under the id ``"{file_name}_{index}"`` with metadata
    recording its index and source. ``upsert`` is used so that indexing the same
    file again overwrites its entries instead of colliding with existing ids.

    Args:
        text: Full document text.
        file_name: Label used for the chunk ids and the ``source`` metadata.

    Returns:
        str: A status message stating how many chunks were stored.
    """
    # Blank chunks carry no information and would only pollute retrieval.
    chunks = [chunk for chunk in chunk_text_by_sentences(text) if chunk]
    if not chunks:
        return f"Stored {len(chunks)} chunks in Chroma for {file_name}"

    embeddings = embedding_model.encode(chunks)

    # Write in moderately sized batches: far fewer round-trips than one call per
    # chunk, while staying well below any per-call batch limit of the store.
    batch_size = 500
    for start in range(0, len(chunks), batch_size):
        stop = min(start + batch_size, len(chunks))
        collection.upsert(
            documents=chunks[start:stop],
            # Plain Python lists are accepted regardless of the array type encode() returns.
            embeddings=[embeddings[i].tolist() for i in range(start, stop)],
            ids=[f"{file_name}_{i}" for i in range(start, stop)],
            metadatas=[{"chunk_index": i, "source": file_name} for i in range(start, stop)],
        )

    return f"Stored {len(chunks)} chunks in Chroma for {file_name}"

# Load the LLM locally


In [None]:
# Pick the device first so the weight precision can follow it:
# half precision on a GPU, full precision on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Phi-2 tokenizer and causal language model
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.to(device)
# Switch to inference mode (the model is only used for generation here)
model.eval()

# Answer questions: the user's query is embedded, the most similar chunks are retrieved, and a prompt (style + retrieved chunks + user query) is sent to the model, which generates the answer.

In [None]:
# Embeds the user's query, retrieves the most relevant chunks and lets the model
# answer from those chunks in the requested style
def chat_with_pdf(user_query, style, top_k=5):
    """Answer ``user_query`` from the indexed chunks, written in the chosen style.

    Args:
        user_query: The question typed by the user.
        style: One of "Academic", "Friendly" or "Explain like I am 5"; any other
            value (including ``None``) adds no style instruction to the prompt.
        top_k: Number of chunks to retrieve as context (default 5).

    Returns:
        str: The generated answer, or a short message when the query is blank
        or no chunks could be retrieved.
    """
    if not user_query or not user_query.strip():
        return "Please enter a question."

    # Embed the query; a plain list is accepted regardless of the array type returned
    query_embedding = embedding_model.encode([user_query])[0].tolist()

    # Search in ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )

    # Results hold one list per query, so "nothing found" looks like [[]] and the
    # inner list has to be checked, not the outer one.
    documents = results["documents"]
    chunks = documents[0] if documents else []
    if not chunks:
        return "No relevant chunks found."

    # Prepare prompt
    prompt_style = {
        "Academic": "Answer the question in an academic tone.",
        "Friendly": "Answer the question like you are someone's best friend.",
        "Explain like I am 5": "Answer the question in a simple way, like you're explaining to a 5 year old child."
    }

    prompt = f"{prompt_style.get(style, '')}\n\nContext:\n" + "\n\n".join(chunks) + f"\n\nQuestion: {user_query}\nAnswer:"

    # Reserve room for the generated tokens inside the 2048-token budget so that
    # prompt plus answer never exceed it.
    context_window = 2048
    max_new_tokens = 200
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=context_window - max_new_tokens,
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            # Explicit pad id avoids the per-call fallback warning during generation
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full text on "Answer:"
    # picks the wrong segment whenever the context or the generation repeats that marker.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Cut off any follow-up "Question:" turn the model may invent after its answer.
    return answer.split("\nQuestion:")[0].strip()


In [None]:
#Interface with gradio

def process_pdf(pdf_bytes):
    """Index an uploaded PDF and return a status message for the UI.

    Args:
        pdf_bytes: Raw content of the uploaded file, or ``None`` when the
            upload is empty.

    Returns:
        str: The status reported by ``embed_text``, or a hint to upload a file.
    """
    import hashlib  # local import: only needed to label the upload

    if not pdf_bytes:
        return "Please upload a PDF file."
    # A binary upload carries no file name, so derive a stable label from the
    # content: different PDFs get different chunk ids, the same PDF reuses its own.
    file_name = f"upload_{hashlib.sha256(pdf_bytes).hexdigest()[:12]}.pdf"
    return embed_text(extract_text(pdf_bytes), file_name=file_name)


# Custom dark theme for the app
CUSTOM_CSS = """
/* 🌙 Dark Mode Background */
.gradio-container {
    background-color: #18181B;
    color: #FFFFFF;
    font-family: 'Poppins', sans-serif;
    padding: 20px;
    display: flex;
    justify-content: center;
}

/* Stack content vertically and center */
#main-column {
    display: flex;
    flex-direction: column;
    align-items: center;
    max-width: 800px;
    width: 100%;
    margin: auto;
}

/* 🖼️ Image Styling */
.gr-image {
    border-radius: 12px;
    box-shadow: 4px 4px 10px rgba(255, 255, 255, 0.2);
}

/* ✏️ Textbox Enhancements */
.gr-textbox {
    width: 90%;
    font-size: 18px;
    padding: 10px;
    border: 2px solid #4A4A4D;
}

/* 🎨 Button Customization */
.gr-button {
    background-color: #5A67D8;
    color: pink;
    font-size: 16px;
    padding: 12px 18px;
    border-radius: 8px;
    transition: 0.2s ease-in-out;
}

/* ✨ Refine Labels */
label {
    font-weight: bold;
    color: #D1D5DB;
}
"""

with gr.Blocks(css=CUSTOM_CSS) as demo:
    with gr.Column(elem_id="main-column"):
        gr.Markdown("<h2 style='color: #EAB308;'>📄 Upload your PDF</h2><p style='color: #9CA3AF;'>Then ask questions about it</p>")

        file_input = gr.File(label="Upload a file", type="binary")
        upload_status = gr.Textbox(label="Upload status", interactive=False)
        style_input = gr.Radio(["Academic", "Friendly", "Explain like I am 5"], label="Choose a style")

        gr.Markdown("<h2 style='color: #34D399;'>💬 Ask a question</h2>")
        query_input = gr.Textbox(label="Your question")
        query_btn = gr.Button("Ask")
        query_output = gr.Textbox(label="Answer")

        # Index the PDF as soon as it is uploaded; without this wiring the
        # collection stays empty and no question can be answered.
        file_input.upload(fn=process_pdf,
                          inputs=[file_input],
                          outputs=[upload_status])

        query_btn.click(fn=chat_with_pdf,
                        inputs=[query_input, style_input],
                        outputs=[query_output])

# Launch after the layout is fully defined (the conventional placement)
demo.launch(debug=True)

