<a href="https://colab.research.google.com/github/Alina-89/Academic_RAG/blob/main/Academic_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# Install every dependency in a single resolver pass. %pip (instead of !pip)
# targets the environment of the running kernel, and -q keeps the cell output short.
%pip install -q gradio pymupdf sentence-transformers nltk chromadb



In [36]:
import os
import gradio as gr
import fitz
from sentence_transformers import SentenceTransformer
import re





In [37]:
def naive_sent_tokenize(text):
    """Split ``text`` into sentences using a lightweight regex heuristic.

    A sentence boundary is any run of whitespace (spaces, tabs or line
    breaks) that directly follows ``.``, ``!`` or ``?``. The terminating
    punctuation stays attached to its sentence.

    Args:
        text: The string to split.

    Returns:
        list[str]: The sentences in order, each stripped of surrounding
        whitespace. Empty pieces are dropped, so ``""`` yields ``[]``.
    """
    # The lookbehind keeps the terminator with its sentence. Matching \s+
    # instead of " +" also splits sentences that are separated by a line
    # break, which is the common case for text extracted from a PDF.
    pieces = re.split(r'(?<=[.!?])\s+', text)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]


*Extract the text from the PDF*


In [38]:
#Function that extracts text from the pdf
def extract_text(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: In-memory PDF content, passed to ``fitz.open`` as its
            ``stream`` argument.

    Returns:
        str: The page texts joined without a separator; ``""`` when the
        document has no pages or no extractable text.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises for unreadable
        input; callers in this notebook catch ``Exception``.
    """
    # The context manager closes the document even when a page fails to
    # parse; the original never released it.
    with fitz.open(stream=pdf_file, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

# Chunk the text into sentences

In [39]:

#Chunk the text into sentences

def chunk_text_by_sentences(text, max_tokens=100):
    """Group the sentences of ``text`` into chunks of at most ``max_tokens`` words.

    Sentences come from ``naive_sent_tokenize`` and are never split: a
    single sentence longer than ``max_tokens`` becomes a chunk of its own.
    "Tokens" are whitespace-separated words, not model tokens.

    Args:
        text: The text to chunk.
        max_tokens: Word budget per chunk.

    Returns:
        list[str]: Non-empty chunks in document order; ``[]`` when ``text``
        contains no words.
    """
    chunks = []
    current_sentences = []
    current_length = 0
    for sentence in naive_sent_tokenize(text):
        sentence_length = len(sentence.split())
        if sentence_length == 0:
            # Whitespace-only pieces carry no content and must not
            # produce (or pad) a chunk.
            continue
        # Flush only when something has been collected; otherwise an
        # over-long first sentence would emit an empty leading chunk.
        if current_sentences and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_sentences).strip())
            current_sentences = []
            current_length = 0
        current_sentences.append(sentence)
        current_length += sentence_length
    if current_sentences:
        chunks.append(" ".join(current_sentences).strip())
    return chunks

#Load the embedding model
# A single SentenceTransformer instance is created here under the module-level
# name `embedding_model`; embed_text() calls its encode() method on the text chunks.

embedding_model=SentenceTransformer('all-MiniLM-L6-v2')

#Embed the text chunks
def embed_text(text, file_name="document.pdf"):
    """Chunk ``text``, embed every chunk and store the result in Chroma.

    Uses the module-level ``embedding_model`` and ``collection`` objects,
    which are created in other cells and must exist when this is called.

    Args:
        text: Full document text.
        file_name: Label for the document. It prefixes every chunk id
            (``"<file_name>_<chunk index>"``) and is stored as the
            ``"source"`` metadata value.

    Returns:
        str: A status message with the number of chunks stored.
    """
    chunks = chunk_text_by_sentences(text)
    if chunks:  # nothing to encode or write for an empty document
        embeddings = embedding_model.encode(chunks)
        # Plain nested lists are accepted by Chroma regardless of whether
        # encode() returned an array or already a list.
        vectors = embeddings.tolist() if hasattr(embeddings, "tolist") else list(embeddings)
        indices = range(len(chunks))
        # One batched write instead of one round-trip per chunk. upsert
        # (rather than add) makes re-processing the same file replace the
        # chunks stored under the same ids instead of leaving stale ones.
        collection.upsert(
            documents=chunks,
            embeddings=vectors,
            ids=[f"{file_name}_{i}" for i in indices],
            metadatas=[{"chunk_index": i, "source": file_name} for i in indices],
        )

    return f"Stored {len(chunks)} chunks in Chroma for {file_name}"



#Upload embeddings to the database

In [40]:
def process_pdf_and_embed(pdf_file, style):
    """Gradio callback: extract the text of an uploaded PDF and store its embeddings.

    Args:
        pdf_file: The uploaded file, or ``None`` when nothing was uploaded.
            Either an object with a ``name`` attribute or raw bytes.
        style: Selected summary style. Accepted for the UI wiring but not
            used by this function.

    Returns:
        str: The status message from ``embed_text``, a prompt to upload a
        file, or an error description. Exceptions are reported as text
        instead of being raised so the message shows up in the UI.
    """
    if pdf_file is None:
        return "Please upload a PDF file"
    try:
        text = extract_text(pdf_file)
        if hasattr(pdf_file, 'name'):
            file_name = pdf_file.name
        elif isinstance(pdf_file, (bytes, bytearray)):
            # Raw bytes carry no file name. A content hash gives every
            # distinct document its own label, so their chunk ids no
            # longer collide under the shared "uploaded_file" prefix,
            # while the same document still maps to the same ids.
            import hashlib
            file_name = f"uploaded_file_{hashlib.sha256(pdf_file).hexdigest()[:12]}"
        else:
            file_name = "uploaded_file"
        return embed_text(text, file_name=file_name)
    except Exception as e:
        return f"Error processing PDF: {str(e)}"



In [41]:
#This will help with generating summary; in progress
def generate_summary(pdf_file, style):
    """Return a preview of the text extracted from an uploaded PDF.

    Placeholder for the future summariser: it only reports how many
    characters were extracted and echoes the first 2000 of them.

    Args:
        pdf_file: The uploaded PDF, or ``None`` when nothing was uploaded.
        style: Requested summary style; not used yet.

    Returns:
        str: The preview, a prompt to upload a file, or an extraction
        error message.
    """
    if pdf_file is None:
        return "Please upload a PDF file"

    try:
        extracted = extract_text(pdf_file)
    except Exception as err:
        return f"Error extracting text: {str(err)}"
    else:
        char_count = len(extracted)
        preview = extracted[:2000]
        return f"Extracted {char_count} characters. \n\n{preview}"

In [42]:
#Import Chroma and create a collection

import chromadb
from chromadb.config import Settings

# Persistent, on-disk client. With current chromadb releases,
# chromadb.Client(Settings(persist_directory=...)) builds an in-memory client
# and nothing is written to "chroma_db"; PersistentClient is the documented
# way to keep the stored chunks across kernel restarts.
client = chromadb.PersistentClient(
    path="chroma_db",  # where to store your DB files
    settings=Settings(anonymized_telemetry=False),
)

# Create or get your collection
collection = client.get_or_create_collection(name="pdf_chunks")



In [43]:
#Interface with gradio
# Builds the Gradio Blocks app bound to the name `demo`. The css argument is a
# stylesheet string that restyles the page: dark container background, a centered
# column (#main-column), and custom image, textbox, button and label styles.
with gr.Blocks(css="""
/* 🌙 Dark Mode Background */
.gradio-container {
    background-color: #18181B;
    color: #FFFFFF;
    font-family: 'Poppins', sans-serif;
    padding: 20px;
    display: flex;
    justify-content: center;
}

/* Stack content vertically and center */
#main-column {
    display: flex;
    flex-direction: column;
    align-items: center;
    max-width: 800px;
    width: 100%;
    margin: auto;
}

/* 🖼️ Image Styling */
.gr-image {
    border-radius: 12px;
    box-shadow: 4px 4px 10px rgba(255, 255, 255, 0.2);
}

/* ✏️ Textbox Enhancements */
.gr-textbox {
    width: 90%;
    font-size: 18px;
    padding: 10px;
    border: 2px solid #4A4A4D;
}

/* 🎨 Button Customization */
.gr-button {
    background-color: #5A67D8;
    color: pink;
    font-size: 16px;
    padding: 12px 18px;
    border-radius: 8px;
    transition: 0.2s ease-in-out;
}

/* ✨ Refine Labels */
label {
    font-weight: bold;
    color: #D1D5DB;
}
""") as demo:

    # elem_id ties this column to the #main-column rule in the stylesheet above.
    with gr.Column(elem_id="main-column"):
        gr.Markdown("<h2 style='color: #EAB308;'>✏️ Create your summary</h2><p style='color: #9CA3AF;'>Upload your document</p>")

        # Inputs handed to the click handler, in this order: the uploaded file
        # (type="binary" — presumably delivered as raw bytes; confirm against the
        # Gradio docs) and the selected style.
        file_input = gr.File(label="Upload a file", type="binary")
        style_input = gr.Radio(["Academic", "Robotic", "Explain like I am 5"], label="Choose a style")
        generate_btn = gr.Button("Generate")

        # Displays the string returned by process_pdf_and_embed. NOTE(review): that
        # is currently a storage status or error message, not a generated summary.
        summary_output = gr.Textbox(label="Generated Summary")


        # process_pdf_and_embed receives (file_input, style_input); it accepts the
        # style argument but does not use it.
        generate_btn.click(fn=process_pdf_and_embed,
                           inputs=[file_input, style_input],
                           outputs=[summary_output])

# share=True asks Gradio for a public URL (see the captured output of this cell).
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5616d575673b75ab6b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


