<a href="https://colab.research.google.com/github/BillyMuthiani/Ultra_fast-Book_summarizer/blob/main/COMMAND_R_SUMMARY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:

!pip install together PyMuPDF fpdf2 nltk tqdm PyPDF2



In [22]:
# Step 2: Import libraries
import os
import fitz  # PyMuPDF
import nltk
import unicodedata
import cohere
from fpdf import FPDF,XPos ,YPos
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import time
from getpass import getpass

# Sentence-tokenizer data for sent_tokenize. Recent NLTK releases look up the
# "punkt_tab" resource instead of the older pickled "punkt" models, so fetch
# both; a failed download returns False rather than raising by default.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# Step 3: Enter Cohere API Key
# SECURITY: never hard-code the key in the notebook -- anything committed here
# is public. A key that was previously embedded in this cell must be treated as
# leaked and revoked in the Cohere dashboard.
# The key is taken from the COHERE_API_KEY environment variable when set,
# otherwise it is prompted for without echoing it to the cell output.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY") or getpass("Enter your Cohere API key: ")
if not COHERE_API_KEY.strip():
    raise ValueError("A Cohere API key is required to run the summarizer.")
co = cohere.Client(COHERE_API_KEY.strip(), client_name="summarizer-notebook")

# Step 4: Upload your file
from google.colab import files

uploaded = files.upload()
# An empty result (e.g. the upload dialog was cancelled) would otherwise
# surface as a bare StopIteration from next(); fail with a clear message.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a .pdf or .txt file.")
# Only the first uploaded file is summarized.
input_file = next(iter(uploaded))

# Step 5: Extract text from PDF or TXT
def extract_text(file_path):
    """Return the text content of a ``.txt`` or ``.pdf`` file.

    The text is stripped of U+FFFD replacement characters and NFKC-normalized.

    Args:
        file_path: Path to the file; the type is chosen from the extension
            (case-insensitive).

    Returns:
        The cleaned text, or ``""`` when the extension is unsupported or the
        file cannot be read (a message is printed in both cases). A string is
        always returned so callers can tokenize the result unconditionally.
    """
    lowered = file_path.lower()
    try:
        if lowered.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                raw = f.read()
        elif lowered.endswith(".pdf"):
            doc = fitz.open(file_path)
            try:
                raw = "".join(page.get_text() for page in doc)
            finally:
                # Release the document even if a page fails to extract.
                doc.close()
        else:
            print(f" Unsupported file type: {file_path}")
            return ""
    except Exception as e:
        # Best-effort: report the problem and hand back an empty document.
        print(f" Error reading file: {e}")
        return ""
    return unicodedata.normalize("NFKC", raw.replace("\ufffd", ""))

text = extract_text(input_file)
# Stop here when nothing usable was extracted (unreadable file, unsupported
# type, or a PDF with no text layer) instead of silently producing a blank PDF.
if not text or not text.strip():
    raise ValueError(
        f"No text could be extracted from {input_file!r}; "
        "expected a readable .pdf or .txt file."
    )
sentences = sent_tokenize(text)

# Step 6: Chunking
# NOTE: despite the name, this limit is applied to whitespace-separated words,
# not model tokens.
MAX_TOKENS = 1500  # adjust for context window

def chunk_sentences(sentences, max_words=MAX_TOKENS):
    """Group consecutive sentences into chunks of at most ``max_words`` words.

    Sentences are never split: a single sentence longer than ``max_words``
    becomes its own (oversized) chunk.

    Args:
        sentences: Iterable of sentence strings, in document order.
        max_words: Word budget per chunk, counted with ``str.split()``.

    Returns:
        A list of chunk strings (sentences joined by single spaces). No empty
        chunk is ever produced; an empty input yields ``[]``.
    """
    chunks = []
    current = []
    total_words = 0
    for sentence in sentences:
        word_count = len(sentence.split())
        # Flush only when there is something buffered; otherwise an oversized
        # sentence arriving at an empty buffer would emit an empty "" chunk.
        if current and total_words + word_count > max_words:
            chunks.append(" ".join(current))
            current = []
            total_words = 0
        current.append(sentence)
        total_words += word_count
    if current:
        chunks.append(" ".join(current))
    return chunks

# Pack the tokenized sentences into word-bounded chunks and report the count.
chunks = chunk_sentences(sentences, max_words=MAX_TOKENS)
chunk_total = len(chunks)
print(f" Total Chunks: {chunk_total}")

# Step 7: Summarization with Cohere (with batching and delays)
summaries = []
start_time = time.time()
batch_size = 10
pause_seconds = 6
max_attempts = 4  # the first call plus up to three rate-limit retries


def summarize_chunk(chunk):
    """Return the model's summary of ``chunk``, retrying on rate limits.

    A rate-limit error triggers a wait that grows with each attempt
    (``pause_seconds`` * attempt number) before the request is re-sent.

    Raises:
        cohere.errors.TooManyRequestsError: if every attempt is rate-limited.
        Exception: any other error raised by the client is propagated.
    """
    # NOTE(review): chunks are built up to MAX_TOKENS *words*, but only the
    # first 3000 *characters* are sent, so the tail of a long chunk is never
    # summarized -- confirm whether this cap is intentional.
    prompt = f"Summarize this text in simple, concise language:\n\n{chunk[:3000]}"
    for attempt in range(1, max_attempts + 1):
        try:
            response = co.chat(
                model="command-r-plus",
                message=prompt,
                temperature=0.3,
            )
            return response.text.strip()
        except cohere.errors.TooManyRequestsError:
            if attempt == max_attempts:
                raise
            wait = pause_seconds * attempt
            print(f"Rate limit exceeded, waiting {wait} seconds...")
            time.sleep(wait)


for i in tqdm(range(0, len(chunks), batch_size), desc="Summarizing"):
    batch = chunks[i:i + batch_size]
    for j, chunk in enumerate(batch):
        try:
            summaries.append(summarize_chunk(chunk))
        except Exception as e:
            # Keep going: one failed chunk (including exhausted retries) must
            # not discard the summaries already collected.
            print(f" Error at chunk {i + j}: {e}")
            summaries.append("[Summary unavailable]")
        time.sleep(2)  # Add a 2-second pause between each API call
    # Extra pause between batches, skipped after the final one.
    if i + batch_size < len(chunks):
        time.sleep(pause_seconds)

final_summary = "\n\n".join(summaries)
elapsed = time.time() - start_time
print(f" Done summarizing in {elapsed:.2f} seconds")

# Step 8: Truncate to 2 pages
# Cap the summary at a fixed character budget, cut back to the last space so
# the text does not end mid-word, then append an ellipsis and a notice.
summary_char_limit = 7000
if len(final_summary) > summary_char_limit:
    clipped = final_summary[:summary_char_limit]
    clipped = clipped.rsplit(" ", 1)[0]
    final_summary = clipped + "..." + "\n\n[Note: Summary truncated to fit 2 pages.]"

# Typographic punctuation that has no Latin-1 code point, mapped to ASCII so it
# is not turned into "?" by the Latin-1 round trip below.
_PDF_PUNCTUATION = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'",
    "\u201c": '"', "\u201d": '"', "\u201e": '"',
    "\u2013": "-", "\u2014": "-", "\u2212": "-",
    "\u2022": "*",
})


def _to_latin1_text(value):
    """Return ``value`` reduced to characters representable in Latin-1."""
    cleaned = unicodedata.normalize("NFKC", value.replace("\ufffd", ""))
    cleaned = cleaned.translate(_PDF_PUNCTUATION)
    # Anything still outside Latin-1 becomes "?" rather than raising.
    return cleaned.encode("latin-1", errors="replace").decode("latin-1")


def generate_pdf(text, output_path, title="Summary"):
    """Write ``text`` under a centered ``title`` to a PDF at ``output_path``.

    Both strings are first reduced to Latin-1, because the PDF is set in the
    built-in Helvetica font.

    Args:
        text: Body text of the document.
        output_path: Destination path of the PDF file.
        title: Heading printed at the top of the first page.
    """
    text = _to_latin1_text(text)
    title = _to_latin1_text(title)

    # Initialize PDF
    pdf = FPDF()
    pdf.set_margins(10, 10, 10)
    pdf.add_page()
    pdf.set_font("Helvetica", "B", 14)
    pdf.cell(0, 10, title, new_x=XPos.LMARGIN, new_y=YPos.NEXT, align="C")
    pdf.ln(5)
    pdf.set_font("Helvetica", size=9)
    pdf.multi_cell(0, 5, text)
    # Pad short output to two pages. This does not cap the page count: longer
    # text can still flow past page two.
    if pdf.page_no() < 2:
        pdf.add_page()
    pdf.output(output_path)

#  Save PDF
# The output sits next to the input: same name, extension swapped for
# "_summary.pdf"; the input's base name doubles as the PDF title.
source_stem, _source_ext = os.path.splitext(input_file)
output_path = source_stem + "_summary.pdf"
print(" Generating PDF...")
pdf_title = os.path.basename(input_file)
generate_pdf(final_summary, output_path, title=pdf_title)
print(f" PDF saved as: {output_path}")
print(f" Summarization done in {elapsed:.2f} seconds")

Saving The Alchemist (Paulo Coelho) (Z-Library).pdf to The Alchemist (Paulo Coelho) (Z-Library) (5).pdf
 Total Chunks: 28


Summarizing:  33%|███▎      | 1/3 [00:55<01:51, 55.87s/it]

Rate limit exceeded, waiting 6 seconds...
Rate limit exceeded, waiting 6 seconds...


Summarizing:  67%|██████▋   | 2/3 [01:57<00:59, 59.39s/it]

Rate limit exceeded, waiting 6 seconds...
Rate limit exceeded, waiting 6 seconds...


Summarizing: 100%|██████████| 3/3 [02:48<00:00, 56.02s/it]

 Done summarizing in 168.08 seconds
 Generating PDF...
 PDF saved as: The Alchemist (Paulo Coelho) (Z-Library) (5)_summary.pdf
 Summarization done in 168.08 seconds



