##RAG

#🔧 Setup

In [27]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
!pip install chromadb



In [29]:

!pip install rouge_score



In [30]:
!pip install -U langchain-community



In [31]:
!pip install --upgrade chromadb



In [32]:
!pip install PyMuPDF



In [33]:
import fitz  # PyMuPDF
import chromadb
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import time

drive.mount('/content/drive')

import nltk
nltk.download('punkt_tab')





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

#📄 Text Processing

In [34]:
def extract_text_from_pdf(pdf_path, chars=1000):
    """Extract the first `chars` characters from a PDF."""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text[:chars]


#RAG Pipeline

1)Document Chunking

In [35]:
def split_text_into_chunks(text, chunk_size=512, overlap=50):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_text(text)


2) Initialize ChromaDB with Embeddings

In [36]:
def initialize_chromadb(persist_dir, collection_name):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return Chroma(persist_directory=persist_dir, embedding_function=embeddings), embeddings


3)Add Chunks to Chroma


In [37]:
def store_chunks_in_chroma(chroma_db, chunks):
    for i, chunk in enumerate(chunks):
        chroma_db.add_texts(texts=[chunk], metadatas=[{"chunk_id": i}])


4) retrieve Relevant Chunks

In [38]:
def retrieve_chunks(chroma_db, query, top_k=5):
    return chroma_db.similarity_search(query, k=top_k)


#Summarization

In [39]:

# List of models
models = {
    "facebook/bart-large-cnn": "BART",
    "google/pegasus-large": "PEGASUS",
    "allenai/led-base-16384": "LED",
    "facebook/bart-large-xsum": "BART + Longformer hybrid",
    "Salesforce/ctrl": "CTRL",
    "microsoft/prophetnet-large-uncased": "ProphetNet"
}

# Function to select model
def get_summarizer(model_name):
    if model_name in models:
        return pipeline("summarization", model=model_name)
    else:
        raise ValueError(f"Model {model_name} not found in available models.")

# Summarization with dynamic model selection
def summarize_with_model(query, context_text, model_name="facebook/bart-large-cnn"):
    summarizer = get_summarizer(model_name)
    input_text = f"Query: {query}\nDocument: {context_text}"
    summary = summarizer(input_text, max_length=150, min_length=50, do_sample=False)
    return summary[0]["summary_text"]

#Evaluation

In [40]:
def compute_bleu(reference, candidate):
    return sentence_bleu([reference.split()], candidate.split())

def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return scorer.score(reference, candidate)


#Full Evaluation Pipeline

In [41]:
def run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="facebook/bart-large-cnn"):
    reference_summary = extract_text_from_pdf(pdf_path)

    chroma_db, _ = initialize_chromadb(chroma_path, collection_name)
    rag_results = chroma_db.similarity_search(query, k=3)
    context = " ".join([doc.page_content for doc in rag_results])

    if not context.strip():
        return "No relevant content retrieved."

    summary = summarize_with_model(query, context, model_name)

    bleu = compute_bleu(reference_summary, summary)
    rouge = compute_rouge(reference_summary, summary)

    print("\n🔷 RAG Model Performance:")
    print(f"   🟢 BLEU: {bleu:.4f}")
    print(f"   🟢 ROUGE-1: {rouge['rouge1'].fmeasure:.4f}")
    print(f"   🟢 ROUGE-2: {rouge['rouge2'].fmeasure:.4f}")
    print(f"   🟢 ROUGE-L: {rouge['rougeL'].fmeasure:.4f}")

    return summary

#Testing

In [42]:
# ==== Parameters ====
pdf_path = "/content/drive/MyDrive/Data/38_1612851.pdf"  # Example path
query = "Summarize this paper in 200 words."
chroma_path = "/content/drive/MyDrive/VectorDB_Embeddings"  # Where to persist Chroma
collection_name = "ties_collection_emb"

# ==== Step 1: Extract + Chunk Text ====
raw_text = extract_text_from_pdf(pdf_path)
chunks = split_text_into_chunks(raw_text)

# ==== Step 2: Create Vector Store & Store Chunks ====
chroma_db, _ = initialize_chromadb(chroma_path, collection_name)
store_chunks_in_chroma(chroma_db, chunks)



In [43]:
# ==== Step 3: RAG Retrieval + Summarization + Evaluation ====

summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="google/pegasus-large")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu



🔷 RAG Model Performance:
   🟢 BLEU: 0.2140
   🟢 ROUGE-1: 0.5650
   🟢 ROUGE-2: 0.5158
   🟢 ROUGE-L: 0.4574


In [44]:
summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="allenai/led-base-16384")

Device set to use cpu



🔷 RAG Model Performance:
   🟢 BLEU: 0.1000
   🟢 ROUGE-1: 0.4211
   🟢 ROUGE-2: 0.3617
   🟢 ROUGE-L: 0.4105


In [45]:
# ==== Step 3: RAG Retrieval + Summarization + Evaluation ====

summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="facebook/bart-large-cnn")


Device set to use cpu



🔷 RAG Model Performance:
   🟢 BLEU: 0.0640
   🟢 ROUGE-1: 0.3400
   🟢 ROUGE-2: 0.2121
   🟢 ROUGE-L: 0.2600


#Processing Multiple PDFs

In [46]:
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import pipeline
from transformers import pipeline
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch


# ==== Parameters ====
pdf_files = [
    "/content/drive/MyDrive/Data/38_1612851.pdf",
    "/content/drive/MyDrive/Data/39_3323796.pdf",
    "/content/drive/MyDrive/Data/40_568165.pdf"
]

query = "Summarize this paper in 200 words."
chroma_path = "/content/drive/MyDrive/VectorDB_Embeddings"
collection_name = "ties_collection_emb"
model_name = "google/pegasus-large"

# ==== Processing Multiple PDFs ====
for pdf_path in pdf_files:
    print(f"\nProcessing: {pdf_path}")

    # Step 1: Extract + Chunk Text
    raw_text = extract_text_from_pdf(pdf_path)
    chunks = split_text_into_chunks(raw_text)

    # Step 2: Create Vector Store & Store Chunks
    chroma_db, _ = initialize_chromadb(chroma_path, collection_name)
    store_chunks_in_chroma(chroma_db, chunks)







Processing: /content/drive/MyDrive/Data/38_1612851.pdf

Processing: /content/drive/MyDrive/Data/39_3323796.pdf

Processing: /content/drive/MyDrive/Data/40_568165.pdf


In [47]:
# Step 3: RAG Retrieval + Summarization
summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="google/pegasus-large")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu



🔷 RAG Model Performance:
   🟢 BLEU: 0.0000
   🟢 ROUGE-1: 0.1111
   🟢 ROUGE-2: 0.0345
   🟢 ROUGE-L: 0.0855


In [48]:
summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="allenai/led-base-16384")

Device set to use cpu



🔷 RAG Model Performance:
   🟢 BLEU: 0.0000
   🟢 ROUGE-1: 0.1294
   🟢 ROUGE-2: 0.0402
   🟢 ROUGE-L: 0.0597


In [49]:
summary = run_rag_pipeline(pdf_path, query, chroma_path, collection_name, model_name="facebook/bart-large-cnn")

Device set to use cpu



🔷 RAG Model Performance:
   🟢 BLEU: 0.0000
   🟢 ROUGE-1: 0.1517
   🟢 ROUGE-2: 0.0670
   🟢 ROUGE-L: 0.1137


#baseline model

In [26]:
from transformers import pipeline
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch


# ✅ Metric functions
def compute_bleu(reference, candidate):
    return sentence_bleu([reference.split()], candidate.split())

def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return scorer.score(reference, candidate)

# 📌 Summarization function
def summarize_with_model(query, document_text, model_name="t5-small"):
    print(f"\n🔍 Running summarization with model: {model_name}")

    # ✅ Auto-detect device (GPU/CPU)
    device = 0 if torch.cuda.is_available() else -1
    print(f"Device set to use {'cuda' if device == 0 else 'cpu'}")

    summarizer = pipeline("summarization", model=model_name, device=device)

    def chunked_summarization(text, chunk_size=512, max_chunks=10, max_length=128, min_length=30):
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)][:max_chunks]
        summaries = []

        for i, chunk in enumerate(chunks):
            print(f"   📎 Summarizing chunk {i+1}/{len(chunks)}")
            summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
            summaries.append(summary[0]['summary_text'])

        return " ".join(summaries)

    return chunked_summarization(document_text)

# 📝 Sample input
query = "Summarize the main idea of this document."
document_text = """
Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence
based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention.
The iterative aspect of machine learning is important because as models are exposed to new data, they are able to independently adapt.
They learn from previous computations to produce reliable, repeatable decisions and results. It’s a science that’s not new – but one
that has gained fresh momentum.
"""

# ✅ Reference summary for evaluation
reference_summary = "Machine learning is a type of AI that enables systems to learn and improve from experience without being explicitly programmed."

# 🧪 Baseline & Large Models to Try
models = [
    "allenai/led-base-16384",
    "facebook/bart-large-cnn",
    "google/pegasus-large",
    #other models
    "google/flan-t5-base",
    "google/pegasus-cnn_dailymail",
    "google/flan-t5-large"

]

# 🚀 Execute & evaluate
for model in models:
    try:
        summary = summarize_with_model(query, document_text, model_name=model)

        # 🔍 Evaluation
        bleu = compute_bleu(reference_summary, summary)
        rouge = compute_rouge(reference_summary, summary)

        print("\n🔷 Model Performance:")
        print(f"   🟢 BLEU: {bleu:.4f}")
        print(f"   🟢 ROUGE-1: {rouge['rouge1'].fmeasure:.4f}")
        print(f"   🟢 ROUGE-2: {rouge['rouge2'].fmeasure:.4f}")
        print(f"   🟢 ROUGE-L: {rouge['rougeL'].fmeasure:.4f}")

    except Exception as e:
        print(f"\n❌ Error with model {model}: {e}")



🔍 Running summarization with model: allenai/led-base-16384
Device set to use cpu


Device set to use cpu
Your max_length is set to 128, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)
Input ids are automatically padded from 103 to 1024 to be a multiple of `config.attention_window`: 1024


   📎 Summarizing chunk 1/2


Your max_length is set to 128, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Input ids are automatically padded from 12 to 1024 to be a multiple of `config.attention_window`: 1024


   📎 Summarizing chunk 2/2

🔷 Model Performance:
   🟢 BLEU: 0.0292
   🟢 ROUGE-1: 0.1967
   🟢 ROUGE-2: 0.0667
   🟢 ROUGE-L: 0.1803

🔍 Running summarization with model: facebook/bart-large-cnn
Device set to use cpu


Device set to use cpu
Your max_length is set to 128, but your input_length is only 104. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


   📎 Summarizing chunk 1/2


Your max_length is set to 128, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


   📎 Summarizing chunk 2/2

🔷 Model Performance:
   🟢 BLEU: 0.0476
   🟢 ROUGE-1: 0.2410
   🟢 ROUGE-2: 0.0741
   🟢 ROUGE-L: 0.2410

🔍 Running summarization with model: google/pegasus-large
Device set to use cpu


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Your max_length is set to 128, but your input_length is only 92. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)


   📎 Summarizing chunk 1/2


Your max_length is set to 128, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


   📎 Summarizing chunk 2/2

🔷 Model Performance:
   🟢 BLEU: 0.0000
   🟢 ROUGE-1: 0.1818
   🟢 ROUGE-2: 0.0504
   🟢 ROUGE-L: 0.1157

🔍 Running summarization with model: google/flan-t5-base
Device set to use cpu


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 128, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


   📎 Summarizing chunk 1/2


Your max_length is set to 128, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


   📎 Summarizing chunk 2/2

🔷 Model Performance:
   🟢 BLEU: 0.0487
   🟢 ROUGE-1: 0.2651
   🟢 ROUGE-2: 0.0741
   🟢 ROUGE-L: 0.2169

🔍 Running summarization with model: google/pegasus-cnn_dailymail
Device set to use cpu


config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 128, but your input_length is only 92. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)


   📎 Summarizing chunk 1/2


Your max_length is set to 128, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


   📎 Summarizing chunk 2/2

🔷 Model Performance:
   🟢 BLEU: 0.0330
   🟢 ROUGE-1: 0.1849
   🟢 ROUGE-2: 0.0513
   🟢 ROUGE-L: 0.1513

🔍 Running summarization with model: google/flan-t5-large
Device set to use cpu


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 128, but your input_length is only 104. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


   📎 Summarizing chunk 1/2


Your max_length is set to 128, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


   📎 Summarizing chunk 2/2

🔷 Model Performance:
   🟢 BLEU: 0.0000
   🟢 ROUGE-1: 0.1622
   🟢 ROUGE-2: 0.0556
   🟢 ROUGE-L: 0.1351

🔍 Running summarization with model: google/flan-t5-xl
Device set to use cpu


config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 128, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)


   📎 Summarizing chunk 1/2


KeyboardInterrupt: 