<a href="https://colab.research.google.com/github/BillyMuthiani/Ultra_fast-Book_summarizer/blob/main/COMMAND_R_SUMMARY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install cohere PyMuPDF fpdf2 nltk tqdm PyPDF2



In [6]:
# Step 2: Import libraries
import os
import fitz  # PyMuPDF
import nltk
import unicodedata
import cohere
from fpdf import FPDF
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from time import time
from getpass import getpass

# Sentence-tokenizer data. Newer NLTK releases load "punkt_tab" rather than the
# pickled "punkt" models, so fetch both to stay compatible across versions.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# Step 3: Enter Cohere API Key (never share this!)
# getpass hides the key as it is typed so it is not echoed into the cell output.
COHERE_API_KEY = getpass(" Enter your Cohere API Key: ")
co = cohere.Client(COHERE_API_KEY)

# Step 4: Upload your file
from google.colab import files
uploaded = files.upload()
# An empty result would otherwise surface as a bare StopIteration from next()
# below, so fail with a readable message instead.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a PDF or TXT file.")
# Only the first uploaded file is processed.
input_file = next(iter(uploaded))

# Step 5: Extract text from PDF or TXT
def extract_text(file_path):
    """Read a .txt or .pdf file and return its cleaned text.

    Args:
        file_path: Path of the file to read. The extension is matched
            case-insensitively.

    Returns:
        The file's text with U+FFFD replacement characters removed and
        Unicode NFKC normalization applied. An empty string is returned when
        the file cannot be read or its extension is neither .txt nor .pdf.
    """
    try:
        lowered = file_path.lower()
        if lowered.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                raw = f.read()
        elif lowered.endswith(".pdf"):
            doc = fitz.open(file_path)
            try:
                raw = "".join(page.get_text() for page in doc)
            finally:
                # Release the document even if a page fails to extract.
                doc.close()
        else:
            # Always hand back a string so callers can tokenize the result.
            print(f" Unsupported file type: {file_path}")
            return ""
        return unicodedata.normalize("NFKC", raw.replace("\ufffd", ""))
    except Exception as e:
        # Best effort: report the problem and return no text.
        print(f" Error reading file: {e}")
        return ""

# Coerce a missing result to "" so the tokenizer always receives a string.
text = extract_text(input_file) or ""
if not text.strip():
    # Extraction can come back empty (read error, unsupported extension, or a
    # PDF with no text layer); say so rather than silently summarizing nothing.
    print(" Warning: no text was extracted; the summary will be empty.")
sentences = sent_tokenize(text)

# Step 6: Chunking
# NOTE: despite the name, this budget is counted in whitespace-separated words
# by chunk_sentences, not in model tokens.
MAX_TOKENS = 1500  # adjust for context window

def chunk_sentences(sentences, max_words=MAX_TOKENS):
    """Group consecutive sentences into chunks of at most ``max_words`` words.

    Sentences are never split: a single sentence longer than ``max_words``
    becomes a chunk of its own, so that chunk may exceed the budget.

    Args:
        sentences: Iterable of sentence strings, in document order.
        max_words: Word budget per chunk, counted with ``str.split()``.

    Returns:
        A list of chunk strings, each the space-joined sentences of one
        group. No empty chunk is produced; an empty input gives ``[]``.
    """
    chunks = []
    current = []
    total_words = 0
    for sentence in sentences:
        word_count = len(sentence.split())
        # Flush only a non-empty group; otherwise an over-long first sentence
        # would emit an empty chunk ahead of itself.
        if current and total_words + word_count > max_words:
            chunks.append(" ".join(current))
            current = []
            total_words = 0
        current.append(sentence)
        total_words += word_count
    if current:
        chunks.append(" ".join(current))
    return chunks

# Split the tokenized sentences into word-budgeted chunks and report the count.
chunks = chunk_sentences(sentences)
print(" Total Chunks: " + str(len(chunks)))

# Step 7: Summarization with Cohere
# One chat request is made per chunk; the replies are collected in order.
summaries = []
start_time = time()

for i, chunk in enumerate(tqdm(chunks, desc="Summarizing")):
    # A blank chunk has nothing to summarize; skip the API call.
    if not chunk.strip():
        continue
    try:
        # Send the whole chunk. Chunks are built on a word budget and run to
        # several thousand characters, so cutting the prompt to the first
        # 3000 characters silently left most of each chunk unsummarized.
        response = co.chat(
            model="command-r-plus",
            message=f"Summarize this text in simple, concise language:\n\n{chunk}",
            temperature=0.3,
        )
        summaries.append(response.text.strip())
    except Exception as e:
        # Best effort: keep going and leave a visible placeholder for this chunk.
        print(f" Error at chunk {i}: {e}")
        summaries.append("[Summary unavailable]")

# Chunk summaries are separated by a blank line in the final text.
final_summary = "\n\n".join(summaries)
elapsed = time() - start_time
print(f" Done summarizing in {elapsed:.2f} seconds")

# Step 8: Cap the summary at roughly two pages (about 7000 characters)
if len(final_summary) > 7000:
    clipped = final_summary[:7000]
    # Cut back to the last space so the text does not end mid-word.
    last_space = clipped.rfind(" ")
    if last_space != -1:
        clipped = clipped[:last_space]
    final_summary = clipped + "..." + "\n\n[Note: Summary truncated to fit 2 pages.]"

# Step 9: Generate PDF
# The built-in Arial core font can only encode Latin-1 text. Map common
# typographic punctuation to ASCII, then replace anything still outside
# Latin-1 with "?" so rendering cannot fail on a stray character.
_PUNCT_MAP = str.maketrans({
    "\u2014": "-",
    "\u2013": "-",
    "\u2019": "'",
    "\u2018": "'",
    "\u201c": '"',
    "\u201d": '"',
})

def _to_latin1(value):
    """Return ``value`` reduced to characters the core PDF font can encode."""
    value = unicodedata.normalize("NFKC", value).translate(_PUNCT_MAP)
    return value.encode("latin-1", errors="replace").decode("latin-1")

pdf = FPDF()
pdf.add_page()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.set_font("Arial", "B", 14)
# The uploaded file name is used as the title; it needs the same cleaning.
pdf.cell(0, 10, _to_latin1(input_file), ln=True, align="C")
pdf.set_font("Arial", size=10)

# Clean text for encoding
final_summary = _to_latin1(final_summary)
pdf.ln(10)
pdf.multi_cell(0, 6, final_summary)

# Build the output name from the file stem. Chained str.replace on ".pdf" and
# ".txt" is case-sensitive, so an upper-case extension left the name unchanged
# and the summary was written over the uploaded file.
output_path = os.path.splitext(input_file)[0] + "_summary.pdf"
pdf.output(output_path)
print(f" Saved summary to {output_path}")
files.download(output_path)

 Enter your Cohere API Key: ··········


Saving The Alchemist (Paulo Coelho) (Z-Library).pdf to The Alchemist (Paulo Coelho) (Z-Library) (2).pdf
 Total Chunks: 28


Summarizing: 100%|██████████| 28/28 [01:25<00:00,  3.06s/it]

 Done summarizing in 85.75 seconds
 Saved summary to The Alchemist (Paulo Coelho) (Z-Library) (2)_summary.pdf





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>