In [1]:

%pip install --upgrade pip
%pip install transformers torch sentencepiece datasets newspaper3k textstat flask accelerate

import os
import json
import datetime
from typing import List

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from newspaper import Article
import textstat

# Device index handed to the summarization pipeline: 0 when CUDA is available, otherwise -1 (CPU).
device = -1
if torch.cuda.is_available():
    device = 0
print('Using device:', 'cpu' if device != 0 else 'cuda')

def chunk_text_by_tokens(text: str, tokenizer: "AutoTokenizer", max_tokens: int = 1024, overlap: int = 128) -> List[str]:
    """Split `text` into overlapping chunks of at most `max_tokens` tokens each.

    The text is tokenized once with `tokenizer.encode`, cut into windows of
    `max_tokens` token ids that advance by `max_tokens - overlap`, and every
    window is decoded back to a string with `tokenizer.decode`.

    Args:
        text: The text to split.
        tokenizer: Any object exposing `encode(text, truncation=...)` and
            `decode(ids, skip_special_tokens=..., clean_up_tokenization_spaces=...)`.
        max_tokens: Maximum number of token ids per window; must be positive.
        overlap: Number of token ids shared by consecutive windows; must
            satisfy `0 <= overlap < max_tokens`.

    Returns:
        The decoded chunks in document order. The list is empty when the
        tokenizer produces no token ids for `text`.

    Raises:
        ValueError: If `max_tokens` is not positive, or `overlap` is negative
            or not smaller than `max_tokens` (the window could never advance).
    """
    if max_tokens <= 0:
        raise ValueError('max_tokens must be a positive integer')
    if not 0 <= overlap < max_tokens:
        raise ValueError('overlap must satisfy 0 <= overlap < max_tokens')

    token_ids = tokenizer.encode(text, truncation=False)
    total = len(token_ids)
    step = max_tokens - overlap

    chunks = []
    for start in range(0, total, step):
        window = token_ids[start:start + max_tokens]
        chunks.append(
            tokenizer.decode(window, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        )
        # This window already reaches the last token; a further window would
        # only repeat the tail that was just emitted.
        if start + max_tokens >= total:
            break
    return chunks


def read_article_from_url(url: str) -> str:
    """Return the `text` attribute of an `Article` built from `url`, after downloading and parsing it."""
    article = Article(url)
    # Fetch the page first, then parse it; `text` is read only after both steps.
    article.download()
    article.parse()
    extracted_text = article.text
    return extracted_text


def readability_stats(text: str) -> dict:
    """Return a dict with the whitespace-delimited word count of `text` and two textstat scores.

    Keys, in order: 'word_count', 'flesch_reading_ease', 'flesch_kincaid_grade'.
    """
    stats = {}
    stats['word_count'] = len(text.split())
    stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    return stats

MODEL_NAME = 'facebook/bart-large-cnn'

print('Loading tokenizer and model (this may take a minute)')

# Load the tokenizer and seq2seq model for MODEL_NAME, then wrap both in a
# summarization pipeline running on the device chosen earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
summarizer = pipeline(
    'summarization',
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Start from 1024 and replace it with the model config's
# max_position_embeddings whenever that value can be read.
max_model_input_tokens = 1024
try:
    max_model_input_tokens = model.config.max_position_embeddings
except Exception:
    pass
print('Model max input tokens (approx):', max_model_input_tokens)


def summarize_long_text(text: str,
                        tokenizer: "AutoTokenizer",
                        summarizer_pipeline,
                        chunk_max_tokens: int = 1024,
                        chunk_overlap: int = 128,
                        chunk_min_length: int = 30,
                        chunk_max_length: int = 200,
                        final_min_length: int = 30,
                        final_max_length: int = 200) -> str:
    """Summarize text of arbitrary length by chunking, summarizing and merging.

    The text is split with `chunk_text_by_tokens`, every chunk is summarized,
    and the chunk summaries are joined with ' \\n'. If the joined text fits in
    `chunk_max_tokens` tokens it is returned as is. Otherwise it is condensed
    by further rounds of chunk-level summarization until it fits, and a final
    summarization pass produces the result. Condensing first means the final
    pass sees material from the whole document instead of relying on
    `truncation=True` to cut off the later chunk summaries.

    Args:
        text: The document to summarize.
        tokenizer: Tokenizer used for chunking and for measuring token counts.
        summarizer_pipeline: Callable invoked as
            `summarizer_pipeline(text, min_length=..., max_length=..., truncation=True)`
            and returning a list whose first item has a 'summary_text' key.
        chunk_max_tokens: Window size passed to the chunker; also the
            threshold above which the joined summaries are condensed further.
        chunk_overlap: Token overlap between consecutive chunks.
        chunk_min_length: `min_length` for chunk-level summaries.
        chunk_max_length: `max_length` for chunk-level summaries.
        final_min_length: `min_length` for the final pass.
        final_max_length: `max_length` for the final pass.

    Returns:
        The summary text, or '' when `text` is empty or whitespace-only.
    """
    # Nothing to summarize: skip the model call entirely.
    if not text or not text.strip():
        return ''

    def _summarize_chunks(source: str) -> str:
        """Summarize each chunk of `source` and join the results with ' \\n'."""
        pieces = chunk_text_by_tokens(source, tokenizer, max_tokens=chunk_max_tokens, overlap=chunk_overlap)
        return ' \n'.join(
            # The pipeline returns a list of dicts; the summary is in the first one.
            summarizer_pipeline(
                piece,
                min_length=chunk_min_length,
                max_length=chunk_max_length,
                truncation=True,
            )[0]['summary_text']
            for piece in pieces
        )

    combined = _summarize_chunks(text)
    combined_len = len(tokenizer.encode(combined))
    if combined_len <= chunk_max_tokens:
        return combined

    # The joined summaries are still longer than one window. Condense them
    # with further chunk-level rounds so the final pass does not have to
    # truncate away the later part of the document.
    while combined_len > chunk_max_tokens:
        reduced = _summarize_chunks(combined)
        reduced_len = len(tokenizer.encode(reduced))
        if reduced_len >= combined_len:
            # No progress; stop condensing and let the final pass truncate.
            break
        combined, combined_len = reduced, reduced_len

    out = summarizer_pipeline(combined, min_length=final_min_length, max_length=final_max_length, truncation=True)
    return out[0]['summary_text']

# Demo run: summarize example_article.txt when it exists, otherwise a placeholder string.
example_path = 'example_article.txt'
if not os.path.exists(example_path):
    article_text = """
    Replace this placeholder with your article text or save a real article to example_article.txt
    """
else:
    with open(example_path, mode='r', encoding='utf-8') as f:
        article_text = f.read()

summary = summarize_long_text(article_text, tokenizer, summarizer)
print('--- SUMMARY ---\n', summary, sep='\n')

# Readability figures for the source text and for its summary.
print('\n--- STATS ---')
print('Original:', readability_stats(article_text))
print('Summary :', readability_stats(summary))

HISTORY_PATH = 'summaries_history.json'

def save_summary(original: str, summary: str, source: str = None, path: str = HISTORY_PATH):
    """Append one summary record to the JSON history file at `path`.

    The file holds a JSON list of entries. It is read if present, the new
    entry is appended, and the whole list is written back.

    Args:
        original: The source text. Only its first 2000 characters are stored.
        summary: The generated summary, stored in full.
        source: Optional label for where the text came from (e.g. a URL).
        path: Location of the history file.

    Notes:
        History loading is best-effort: if the existing file cannot be read,
        is not valid JSON, or does not contain a list, the history restarts
        from an empty list and the file is overwritten.
    """
    # Naive UTC timestamp in the same format utcnow().isoformat() produced,
    # built without the deprecated datetime.utcnow().
    timestamp = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()
    entry = {
        'timestamp_utc': timestamp,
        'source': source,
        'original_word_count': len(original.split()),
        'summary_word_count': len(summary.split()),
        'original_text': original[:2000],  # store a truncated preview; remove truncation if you want full
        'summary_text': summary
    }

    history = []
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # Unreadable or malformed (JSONDecodeError / UnicodeDecodeError
            # are ValueError subclasses): start over with an empty history.
            loaded = None
        # Valid JSON that is not a list (e.g. an object) cannot be appended to.
        if isinstance(loaded, list):
            history = loaded

    history.append(entry)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(history, f, ensure_ascii=False, indent=2)

from flask import Flask, request, render_template_string, jsonify

# Flask application object; the '/' route defined further down is registered on it.
app = Flask(__name__)

# Template for the single page served by the app: a POST form with a `text`
# textarea and an optional `url` field, followed by a <pre> block that shows
# whatever is passed as `result` to render_template_string.
INDEX_HTML = '''
<!doctype html>
<title>AI Writer - Summarizer</title>
<h2>Paste article text or provide a URL</h2>
<form method=post>
  <textarea name=text rows=20 cols=90 placeholder="Paste text here"></textarea><br>
  <input type=text name=url placeholder="Optional: article URL"><br>
  <input type=submit value="Summarize">
</form>
<pre id=out>{{result}}</pre>
'''

@app.route('/', methods=['GET','POST'])
def index():
    """Serve the summarizer page and handle form submissions.

    GET renders the empty form. POST reads the `text` and `url` form fields:
    pasted text takes priority, and the URL is fetched only when no text was
    supplied. The page is re-rendered with either the summary and its
    readability stats or an error message in `result`.
    """
    result = ''
    if request.method == 'POST':
        # Strip up front so a whitespace-only field counts as empty; a stray
        # newline in the textarea must not block the URL fallback.
        text = (request.form.get('text') or '').strip()
        url = (request.form.get('url') or '').strip()
        if url and not text:
            try:
                text = read_article_from_url(url).strip()
            except Exception as e:
                result = 'Failed to fetch URL: ' + str(e)
                return render_template_string(INDEX_HTML, result=result)
        if not text:
            result = 'No text provided.'
        else:
            # Report model failures on the page, the same way fetch failures
            # are reported, instead of letting them surface as an HTTP 500.
            try:
                summary = summarize_long_text(text, tokenizer, summarizer)
                stats = readability_stats(summary)
            except Exception as e:
                result = 'Failed to summarize: ' + str(e)
            else:
                result = f"SUMMARY:\n{summary}\n\nSTATS:\n{stats}"
    return render_template_string(INDEX_HTML, result=result)


# Write requirements.txt before starting the server: app.run() blocks until
# the server is stopped, so anything placed after it would not run while
# the app is being served.
reqs = '''
transformers
torch
sentencepiece
newspaper3k
lxml_html_clean
textstat
flask
accelerate
'''
with open('requirements.txt', 'w', encoding='utf-8') as f:
    f.write(reqs.strip() + '\n')
print('requirements.txt written')


if __name__ == '__main__':
    # debug stays off: the server listens on all interfaces, and debug mode
    # would expose the interactive debugger to every client and start the
    # reloader, which does not work from inside a notebook kernel.
    app.run(host='0.0.0.0', port=5000, debug=False)





Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Using cached transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting torch
  Using cached torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.2.1-cp313-cp313-win_amd64.whl.metadata (10 kB)
Collecting datasets
  Using cached datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting newspaper3k
  Using cached newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting textstat
  Using cached textstat-0.7.8-py3-none-any.whl.metadata (15 kB)
Collecting flask
  Using cached flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting accelerate
  Using cached accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.meta

ImportError: lxml.html.clean module is now a separate project lxml_html_clean.
Install lxml[html_clean] or lxml_html_clean directly.