In [1]:
import os
import nltk
import chromadb
import PyPDF2
import torch
from chromadb.utils.embedding_functions import HuggingFaceEmbeddingFunction
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from google.colab import drive
import time
from httpx import ReadTimeout
from huggingface_hub import login
login(token='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')

nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("averaged_perceptron_tagger")

os.environ["HF_TOKEN"] = "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"

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])

def store_text_in_chromadb(text,client, collection_name="ties_collection"):
    embed_func = HuggingFaceEmbeddingFunction(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        api_key=os.environ.get("HF_TOKEN")
    )
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embed_func
    )
    docs = nltk.sent_tokenize(text)

    batch_size = 10
    for i in range(0, len(docs), batch_size):
        batch_docs = docs[i:i + batch_size]
        ids = [str(i) for i in range(len(batch_docs))]
        try:
            collection.add(documents=batch_docs, ids=ids)
            # print(f"Batch {i // batch_size + 1} added successfully.")
        except ReadTimeout as e:
            # print(f"Timeout while adding batch {i // batch_size + 1}. Retrying...")
            time.sleep(2)  # Add retry logic


            # print(f"Successfully added {len(docs)} documents to collection.")
        except Exception as e:
            # print(f"Error adding documents: {e}")
            raise

def retrieve_relevant_docs(query,client,collection_name="ties_collection", top_k=5):
    collection = client.get_collection(name=collection_name)
    return collection.query(query_texts=[query], n_results=top_k)["documents"]

def summarize_text(text, model_name="google/flan-t5-small"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=50)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def agentic_rag_summary(query, retrieval_docs):
    model = pipeline("text-generation", model="google/gemma-3-1b-pt" ,token = os.environ.get("HF_TOKEN"))
    prompt = f"Based on the retrieved documents, summarize concisely: {retrieval_docs}"
    return model(prompt, max_new_tokens=50, do_sample=True)[0]['generated_text']

# Example workflow
drive.mount('/content/drive', force_remount=True)
chroma_path = "/content/drive/MyDrive/VectorDB_Embeddings"
client = chromadb.PersistentClient(path=chroma_path)
pdf_text = extract_text_from_pdf("my_pdf.pdf")
store_text_in_chromadb(pdf_text,client)
query = "Summarize key findings"
retrieved_docs = " ".join([doc for sublist in retrieve_relevant_docs(query, client) for doc in sublist])
summary = summarize_text(retrieved_docs)
agentic_summary = agentic_rag_summary(query, retrieved_docs)
print("Standard Summary:", summary)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Mounted at /content/drive


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 72.7MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Device set to use cpu


Standard Summary: 66.7%, published by authors from North America (27.3%), conducted in teams (89.4%) in mostly domestic-only col- laborations (71.2%). Given the already rapidly grow- ing AIEd literature base in higher education, now is the time to ensure that the field has a solid research and conceptual grounding.


In [2]:
print("Agentic summary:", agentic_summary)

Agentic summary: Based on the retrieved documents, summarize concisely: Reviews were included if they synthesised applications of AI solely 
in formal higher or continuing education, were published in English between 2018 
and July 2023, were journal articles or full conference papers, and if they had a method 
section 66 publications were included for data extraction and synthesis in EPPI 
Reviewer, which were predominantly systematic reviews (66.7%), published by authors 
from North America (27.3%), conducted in teams (89.4%) in mostly domestic-only col-
laborations (71.2%). Given the already rapidly grow-
ing AIEd literature base in higher education, now is the time to ensure that the field 
has a solid research and conceptual grounding. Findings show that these reviews mostly focused on AIHEd gener -
ally (47.0%) or Profiling and Prediction (28.8%) as thematic foci, however key findings 
indicated a predominance of the use of Adaptive Systems and Personalisation in higher 
educatio

In [9]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Compute BLEU score
def compute_bleu(reference, candidate):
    reference_tokens = [reference.split()]
    candidate_tokens = candidate.split()
    smoothing = SmoothingFunction().method1
    return sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing)

# Compute ROUGE scores
def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return {key: scores[key].fmeasure for key in scores}

# Initialize score dictionaries
bleu_scores = {"flan-t5-small": [], "gemma-3-1b-pt": []}
rouge_scores = {"flan-t5-small": {"rouge1": [], "rouge2": [], "rougeL": []},
                "gemma-3-1b-pt": {"rouge1": [], "rouge2": [], "rougeL": []}}

# Example reference and candidate summaries
reference_summary = "This document discusses key insights on X, Y, and Z."
summary = "Key findings about X, Y, and Z are included in this document."
agentic_summary = "The document explores X, Y, and Z in great depth."

# Compute BLEU and ROUGE
bleu_rag = compute_bleu(reference_summary, summary)
rouge_rag = compute_rouge(reference_summary, summary)

bleu_gemma = compute_bleu(reference_summary, agentic_summary)
rouge_gemma = compute_rouge(reference_summary, agentic_summary)

# Store results
bleu_scores["flan-t5-small"].append(bleu_rag)
bleu_scores["gemma-3-1b-pt"].append(bleu_gemma)

for key in ["rouge1", "rouge2", "rougeL"]:
    rouge_scores["flan-t5-small"][key].append(rouge_rag[key])
    rouge_scores["gemma-3-1b-pt"][key].append(rouge_gemma[key])

# Compute average scores
avg_bleu_rag = sum(bleu_scores["flan-t5-small"]) / len(bleu_scores["flan-t5-small"])
avg_bleu_gemma = sum(bleu_scores["gemma-3-1b-pt"]) / len(bleu_scores["gemma-3-1b-pt"])

avg_rouge_rag = {key: sum(rouge_scores["flan-t5-small"][key]) / len(rouge_scores["flan-t5-small"][key]) for key in ["rouge1", "rouge2", "rougeL"]}
avg_rouge_gemma = {key: sum(rouge_scores["gemma-3-1b-pt"][key]) / len(rouge_scores["gemma-3-1b-pt"][key]) for key in ["rouge1", "rouge2", "rougeL"]}

# Print results
print("\n🔷 flan-t5-small Model Performance:")
print(f"   🟢 Avg BLEU: {avg_bleu_rag:.4f}")
print(f"   🟢 Avg ROUGE-1: {avg_rouge_rag['rouge1']:.4f}")
print(f"   🟢 Avg ROUGE-2: {avg_rouge_rag['rouge2']:.4f}")
print(f"   🟢 Avg ROUGE-L: {avg_rouge_rag['rougeL']:.4f}")

print("\n🔷 gemma-3-1b-pt Model Performance:")
print(f"   🟢 Avg BLEU: {avg_bleu_gemma:.4f}")
print(f"   🟢 Avg ROUGE-1: {avg_rouge_gemma['rouge1']:.4f}")
print(f"   🟢 Avg ROUGE-2: {avg_rouge_gemma['rouge2']:.4f}")
print(f"   🟢 Avg ROUGE-L: {avg_rouge_gemma['rougeL']:.4f}")



🔷 flan-t5-small Model Performance:
   🟢 Avg BLEU: 0.0843
   🟢 Avg ROUGE-1: 0.6364
   🟢 Avg ROUGE-2: 0.4000
   🟢 Avg ROUGE-L: 0.4545

🔷 gemma-3-1b-pt Model Performance:
   🟢 Avg BLEU: 0.1122
   🟢 Avg ROUGE-1: 0.5000
   🟢 Avg ROUGE-2: 0.3333
   🟢 Avg ROUGE-L: 0.5000
