##RAG

#🔧 Setup

In [None]:
# Mount Google Drive at /content/drive; the PDF and vector-store paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!pip install chromadb

In [None]:

!pip install rouge_score

In [None]:
!pip install -U langchain-community

In [None]:
!pip install --upgrade chromadb

In [None]:
!pip install PyMuPDF

In [None]:
import fitz  # PyMuPDF
import chromadb
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import time

# Same mount call as in the first setup cell, repeated here.
drive.mount('/content/drive')

import nltk
# Download the NLTK 'punkt_tab' resource.
# NOTE(review): no NLTK tokenizer call is visible in this file (BLEU inputs are
# split with str.split) — confirm this download is still needed.
nltk.download('punkt_tab')





#📄 Text Processing

In [None]:
def extract_text_from_pdf(pdf_path, chars=1000):
    """Extract the first `chars` characters of text from a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        chars: Maximum number of characters to return. Pass ``None`` to get
            the text of the whole document.

    Returns:
        The text of all pages joined with newlines, cut to `chars` characters.
    """
    # Open the document as a context manager so the underlying file handle is
    # released even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text[:chars]


#RAG Pipeline

1) Document Chunking

In [None]:
def split_text_into_chunks(text, chunk_size=512, overlap=50):
    """Split `text` into overlapping chunks with a recursive character splitter.

    Args:
        text: The string to split.
        chunk_size: Passed to the splitter as its ``chunk_size``.
        overlap: Passed to the splitter as its ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for `text`.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=overlap,
        chunk_size=chunk_size,
    )
    pieces = text_splitter.split_text(text)
    return pieces


2) Initialize ChromaDB with Embeddings

In [None]:
def initialize_chromadb(persist_dir, collection_name):
    """Open a persistent Chroma vector store backed by MiniLM embeddings.

    Args:
        persist_dir: Directory in which Chroma persists the vector store.
        collection_name: Name of the collection to open inside that store.

    Returns:
        A ``(vector_store, embeddings)`` tuple: the LangChain ``Chroma``
        wrapper and the ``HuggingFaceEmbeddings`` instance it uses.
    """
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # Forward the collection name. Previously it was accepted but dropped, so
    # every caller silently shared the wrapper's default collection no matter
    # which name was requested.
    # NOTE(review): data written before this fix lives in that default
    # collection and will not be visible under `collection_name`.
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=persist_dir,
    )
    return vector_store, embeddings


3) Add Chunks to Chroma


In [None]:
def store_chunks_in_chroma(chroma_db, chunks, batch_size=256):
    """Add text chunks to the vector store, tagging each with its position.

    Args:
        chroma_db: Vector store exposing ``add_texts(texts=..., metadatas=...)``.
        chunks: Iterable of chunk strings, in document order.
        batch_size: Maximum number of chunks sent per ``add_texts`` call.

    Raises:
        ValueError: If `batch_size` is not a positive integer.

    Each chunk is stored with metadata ``{"chunk_id": i}``, where ``i`` is the
    chunk's index in `chunks`. Nothing is sent when `chunks` is empty.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(chunks)
    # Send chunks in batches instead of one call per chunk, so the store can
    # embed and insert several texts per round-trip. Batches stay bounded to
    # keep a very large document from producing one oversized insert.
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        metadatas = [{"chunk_id": start + offset} for offset in range(len(batch))]
        chroma_db.add_texts(texts=batch, metadatas=metadatas)


4) Retrieve Relevant Chunks

In [None]:
def retrieve_chunks(chroma_db, query, top_k=5):
    """Run a similarity search for `query` against the vector store.

    Args:
        chroma_db: Vector store exposing ``similarity_search(query, k=...)``.
        query: Query text to search with.
        top_k: Passed to the search as ``k`` (the number of results requested).

    Returns:
        The result of ``chroma_db.similarity_search`` unchanged.
    """
    matches = chroma_db.similarity_search(query, k=top_k)
    return matches


#Summarization

In [None]:

# Hugging Face checkpoint ids accepted by get_summarizer, each mapped to a
# human-readable label (the labels are not read anywhere in this file).
# NOTE(review): confirm every checkpoint listed here actually loads under the
# "summarization" pipeline task before relying on it.
models = {
    "facebook/bart-large-cnn": "BART",
    "google/pegasus-large": "PEGASUS",
    "allenai/led-base-16384": "LED",
    "facebook/bart-large-xsum": "BART + Longformer hybrid",
    "Salesforce/ctrl": "CTRL",
    "microsoft/prophetnet-large-uncased": "ProphetNet"
}

def get_summarizer(model_name):
    """Build a summarization pipeline for one of the registered models.

    Args:
        model_name: Checkpoint id; must be present in ``models``.

    Returns:
        A ``transformers`` pipeline created for the "summarization" task.

    Raises:
        ValueError: If `model_name` is not present in ``models``.
    """
    # Guard clause: reject unknown checkpoints before attempting to load anything.
    if model_name not in models:
        raise ValueError(f"Model {model_name} not found in available models.")
    return pipeline("summarization", model=model_name)

# Summarization with dynamic model selection
def summarize_with_model(query, context_text, model_name="facebook/bart-large-cnn"):
    """Summarize retrieved context for a query with the selected model.

    Args:
        query: The user query; prepended to the model input as ``Query: ...``.
        context_text: Retrieved text; appended as ``Document: ...``.
        model_name: Checkpoint id passed to ``get_summarizer``.

    Returns:
        The ``summary_text`` of the first result produced by the pipeline.

    Raises:
        ValueError: Propagated from ``get_summarizer`` for an unknown model.
    """
    summarizer = get_summarizer(model_name)
    input_text = f"Query: {query}\nDocument: {context_text}"
    # truncation=True clips inputs longer than the model's maximum input length
    # instead of letting generation fail on over-long context.
    summary = summarizer(
        input_text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return summary[0]["summary_text"]

#Evaluation

In [None]:
def compute_bleu(reference, candidate):
    """Score `candidate` against a single `reference` with sentence-level BLEU.

    Both strings are tokenized by whitespace (``str.split``) before scoring.

    Args:
        reference: Reference text.
        candidate: Generated text to evaluate.

    Returns:
        The value returned by NLTK's ``sentence_bleu``.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    # sentence_bleu expects a list of references, hence the one-element list.
    return sentence_bleu([reference_tokens], candidate_tokens)

def compute_rouge(reference, candidate):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L for `candidate` against `reference`.

    Args:
        reference: Reference text (passed as the scorer's first argument).
        candidate: Generated text to evaluate.

    Returns:
        The scorer's result, indexable by 'rouge1', 'rouge2' and 'rougeL'.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores


#Full Evaluation Pipeline

In [None]:
def run_rag_pipeline(pdf_path, query, chroma_path, collection_name,
                     model_name="facebook/bart-large-cnn", top_k=3):
    """Retrieve context for `query`, summarize it, and print BLEU/ROUGE scores.

    Args:
        pdf_path: PDF whose leading text (as returned by
            ``extract_text_from_pdf`` with its default length) is used as the
            reference for the metrics.
        query: Query used both for retrieval and as part of the model input.
        chroma_path: Directory of the persisted Chroma store.
        collection_name: Collection name handed to ``initialize_chromadb``.
        model_name: Summarization checkpoint handed to ``summarize_with_model``.
        top_k: Number of chunks to retrieve as context.

    Returns:
        The generated summary, or the string "No relevant content retrieved."
        when retrieval yields no non-blank text.

    NOTE(review): retrieval is not restricted to chunks of `pdf_path`; if the
    store holds several documents the context may come from any of them.
    """
    reference_summary = extract_text_from_pdf(pdf_path)

    chroma_db, _ = initialize_chromadb(chroma_path, collection_name)
    # Go through the shared retrieval helper rather than duplicating the call.
    rag_results = retrieve_chunks(chroma_db, query, top_k=top_k)
    context = " ".join(doc.page_content for doc in rag_results)

    if not context.strip():
        return "No relevant content retrieved."

    summary = summarize_with_model(query, context, model_name)

    bleu = compute_bleu(reference_summary, summary)
    rouge = compute_rouge(reference_summary, summary)

    print("\n🔷 RAG Model Performance:")
    print(f"   🟢 BLEU: {bleu:.4f}")
    print(f"   🟢 ROUGE-1: {rouge['rouge1'].fmeasure:.4f}")
    print(f"   🟢 ROUGE-2: {rouge['rouge2'].fmeasure:.4f}")
    print(f"   🟢 ROUGE-L: {rouge['rougeL'].fmeasure:.4f}")

    return summary

#Testing

In [None]:
# ==== Parameters ====
# These names are notebook globals; the later pipeline cells read them as-is.
pdf_path = "/content/drive/MyDrive/Data/38_1612851.pdf"  # Example path
query = "Summarize this paper in 200 words."
chroma_path = "/content/drive/MyDrive/VectorDB_Embeddings"  # Where to persist Chroma
collection_name = "ties_collection_emb"

# ==== Step 1: Extract + Chunk Text ====
# NOTE(review): extract_text_from_pdf defaults to chars=1000, so only the first
# 1000 characters of the PDF are chunked and indexed here — confirm intended.
raw_text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(raw_text)

# ==== Step 2: Create Vector Store & Store Chunks ====
# NOTE(review): chunks are added without explicit ids, so re-running this cell
# presumably stores the same chunks again — verify against the Chroma wrapper.
chroma_db, _ = initialize_chromadb(chroma_path, collection_name)
store_chunks_in_chroma(chroma_db, chunks)



In [None]:
# ==== Step 3: RAG Retrieval + Summarization + Evaluation ====
# Run the pipeline with the PEGASUS checkpoint; metrics are printed by
# run_rag_pipeline and the returned text is kept in `summary`.

summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="google/pegasus-large")

In [None]:
# Same pipeline run with the LED checkpoint; overwrites `summary`.
summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="allenai/led-base-16384")

In [None]:
# ==== Step 3: RAG Retrieval + Summarization + Evaluation ====
# Same pipeline run with the BART-large-CNN checkpoint; overwrites `summary`.

summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="facebook/bart-large-cnn")


#Processing Multiple PDFs

In [None]:
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import pipeline
from transformers import pipeline
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch


# ==== Parameters ====
# PDFs to index into the shared vector store.
pdf_files = [
    "/content/drive/MyDrive/Data/38_1612851.pdf",
    "/content/drive/MyDrive/Data/39_3323796.pdf",
    "/content/drive/MyDrive/Data/40_568165.pdf"
]

query = "Summarize this paper in 200 words."
chroma_path = "/content/drive/MyDrive/VectorDB_Embeddings"
collection_name = "ties_collection_emb"
# NOTE(review): not read by this cell; the later cells pass model names as literals.
model_name = "google/pegasus-large"

# ==== Processing Multiple PDFs ====
# Open the vector store once: loading the embedding model and creating the
# Chroma wrapper does not depend on the PDF, so it is hoisted out of the loop.
chroma_db, _ = initialize_chromadb(chroma_path, collection_name)

# NOTE(review): `pdf_path` stays bound to the last entry after the loop, and
# the cells that follow read it — so they summarize only that PDF.
for pdf_path in pdf_files:
    print(f"\nProcessing: {pdf_path}")

    # Step 1: Extract + Chunk Text
    raw_text = extract_text_from_pdf(pdf_path)
    chunks = split_text_into_chunks(raw_text)

    # Step 2: Store Chunks in the shared vector store
    store_chunks_in_chroma(chroma_db, chunks)






In [None]:
# Step 3: RAG Retrieval + Summarization
# NOTE(review): `pdf_path` is whatever the preceding `for pdf_path in pdf_files`
# loop left bound (the last entry if cells run in order), so only that PDF is
# used as the reference here.
summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="google/pegasus-large")

In [None]:
# Same pipeline run with the LED checkpoint, using the current global
# `pdf_path`; overwrites `summary`.
summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="allenai/led-base-16384")

In [None]:
# Same pipeline run with the BART-large-CNN checkpoint, using the current
# global `pdf_path`; overwrites `summary`.
summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="facebook/bart-large-cnn")

# Baseline Model

In [None]:
from transformers import pipeline
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch


# ✅ Metric functions
def compute_bleu(reference, candidate):
    """Return the sentence-level BLEU of `candidate` against `reference`.

    Tokenization is a plain whitespace split of both strings.
    """
    references = [reference.split()]  # sentence_bleu takes a list of references
    hypothesis = candidate.split()
    return sentence_bleu(references, hypothesis)

def compute_rouge(reference, candidate):
    """Return ROUGE-1/ROUGE-2/ROUGE-L scores of `candidate` against `reference`.

    The result is indexable by 'rouge1', 'rouge2' and 'rougeL'.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, candidate)

# 📌 Summarization function
def summarize_with_model(query, document_text, model_name="t5-small"):
    """Summarize `document_text` piecewise and join the partial summaries.

    The text is cut into consecutive 512-character slices, at most the first
    10 slices are summarized one by one, and the results are joined with
    single spaces.

    Args:
        query: Accepted for interface compatibility; not used in the body.
        document_text: Text to summarize.
        model_name: Checkpoint id passed to the summarization pipeline.

    Returns:
        The space-joined summaries of the processed slices.
    """
    print(f"\n🔍 Running summarization with model: {model_name}")

    # ✅ Use device 0 when CUDA is available, otherwise -1 (CPU)
    if torch.cuda.is_available():
        device = 0
    else:
        device = -1
    print(f"Device set to use {'cuda' if device == 0 else 'cpu'}")

    summarizer = pipeline("summarization", model=model_name, device=device)

    # Slicing and generation settings.
    chunk_size, max_chunks = 512, 10
    max_length, min_length = 128, 30

    offsets = range(0, len(document_text), chunk_size)
    pieces = [document_text[start:start + chunk_size] for start in offsets][:max_chunks]
    total = len(pieces)

    partial_summaries = []
    for position, piece in enumerate(pieces, start=1):
        print(f"   📎 Summarizing chunk {position}/{total}")
        result = summarizer(piece, max_length=max_length, min_length=min_length, do_sample=False)
        partial_summaries.append(result[0]['summary_text'])

    return " ".join(partial_summaries)

# 📝 Sample input
# NOTE: rebinds the notebook-global `query` set in earlier cells. It is passed
# to summarize_with_model below, which does not use it.
query = "Summarize the main idea of this document."
document_text = """
Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence
based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention.
The iterative aspect of machine learning is important because as models are exposed to new data, they are able to independently adapt.
They learn from previous computations to produce reliable, repeatable decisions and results. It’s a science that’s not new – but one
that has gained fresh momentum.
"""

# ✅ Reference summary for evaluation
reference_summary = "Machine learning is a type of AI that enables systems to learn and improve from experience without being explicitly programmed."

# 🧪 Baseline & Large Models to Try
# NOTE: rebinds the global `models`, which an earlier cell defined as a dict
# consulted by get_summarizer; after this cell that membership check runs
# against this list instead.
models = [
    "allenai/led-base-16384",
    "facebook/bart-large-cnn",
    "google/pegasus-large",
    #other models
    "google/flan-t5-base",
    "google/pegasus-cnn_dailymail",
    "google/flan-t5-large"

]

# 🚀 Execute & evaluate
# Summarize the sample document with each model in turn and print BLEU/ROUGE
# against `reference_summary`. `summary`, `bleu` and `rouge` are overwritten on
# every iteration.
for model in models:
    try:
        summary = summarize_with_model(query, document_text, model_name=model)

        # 🔍 Evaluation
        bleu = compute_bleu(reference_summary, summary)
        rouge = compute_rouge(reference_summary, summary)

        print("\n🔷 Model Performance:")
        print(f"   🟢 BLEU: {bleu:.4f}")
        print(f"   🟢 ROUGE-1: {rouge['rouge1'].fmeasure:.4f}")
        print(f"   🟢 ROUGE-2: {rouge['rouge2'].fmeasure:.4f}")
        print(f"   🟢 ROUGE-L: {rouge['rougeL'].fmeasure:.4f}")

    except Exception as e:
        # Broad catch so that one failing model is reported and the loop
        # continues with the remaining models.
        print(f"\n❌ Error with model {model}: {e}")
