In [7]:
# summarization_assignment.py
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from transformers import pipeline
import evaluate

# ==============================
# 1. Load Dataset
# ==============================
print("üì• Loading CNN/DailyMail dataset...")
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Use smaller subsets for faster training (adjust as needed)
train_dataset = dataset["train"].select(range(2000))      # 2K samples for demo
eval_dataset = dataset["validation"].select(range(500))   # 500 for eval

# ==============================
# 2. Model & Tokenizer Setup
# ==============================
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocessing function
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("üßπ Preprocessing datasets...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

# ==============================
# 3. ROUGE Metric
# ==============================
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 2) for k, v in result.items()}

# ==============================
# 4. Training Setup
# ==============================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-cnn-finetuned",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_strategy="epoch",
    num_train_epochs=2,
    predict_with_generate=True,
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    report_to="none",  # disable W&B etc.
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ==============================
# 5. Fine-tune the Model
# ==============================
print("üöÄ Starting fine-tuning...")
trainer.train()

# Save final model
trainer.save_model("./t5-cnn-finetuned")
tokenizer.save_pretrained("./t5-cnn-finetuned")

print("‚úÖ Model fine-tuned and saved to './t5-cnn-finetuned'")

# ==============================
# 6. Load Fine-tuned Model for Inference
# ==============================
print("üîÅ Loading fine-tuned model for inference...")
summarizer = pipeline(
    "summarization",
    model="./t5-cnn-finetuned",
    tokenizer="./t5-cnn-finetuned",
    device=0 if torch.cuda.is_available() else -1,
    framework="pt"
)

# ==============================
# 7. Infinite Interactive Loop
# ==============================
print("\n" + "="*60)
print("üí¨ INTERACTIVE SUMMARIZATION MODE")
print("Paste any article below. Type 'quit' to exit.")
print("="*60)

while True:
    print("\nüìù Enter your article (or 'quit' to exit):")
    user_input = input("> ").strip()

    if user_input.lower() in ["quit", "exit", "q"]:
        print("üëã Goodbye!")
        break

    if not user_input:
        print("‚ö†Ô∏è Please enter non-empty text.")
        continue

    try:
        # Generate summary
        result = summarizer(
            user_input,
            max_length=120,
            min_length=30,
            do_sample=False,
            num_beams=4,
            early_stopping=True
        )
        summary = result[0]["summary_text"]

        print("\nüîç ORIGINAL TEXT (first 500 chars):")
        print(user_input[:500] + "..." if len(user_input) > 500 else user_input)
        print("\n‚ú® GENERATED SUMMARY:")
        print(summary)

    except Exception as e:
        print(f"‚ùå Error during summarization: {e}")

üì• Loading CNN/DailyMail dataset...
üßπ Preprocessing datasets...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


üöÄ Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.881554,1.56,0.78,1.35,1.34
2,1.715500,0.857412,7.21,3.08,6.11,6.06


Device set to use cuda:0


‚úÖ Model fine-tuned and saved to './t5-cnn-finetuned'
üîÅ Loading fine-tuned model for inference...

üí¨ INTERACTIVE SUMMARIZATION MODE
Paste any article below. Type 'quit' to exit.

üìù Enter your article (or 'quit' to exit):
> It‚Äôs not possible to make a single Python function like that 100% accurate at converting C++ code into Python, because:  C++ and Python are fundamentally different languages (syntax, typing, memory model, OOP semantics, templates, etc.).  There‚Äôs no direct one-to-one mapping for many constructs (like pointers, references, templates, manual memory management, STL containers, etc.).  A ‚Äúperfect‚Äù translation would require a full compiler front-end for C++ that parses its AST (Abstract Syntax Tree) and converts it to an equivalent Python AST ‚Äî not simple string replacements.


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



üîç ORIGINAL TEXT (first 500 chars):
It‚Äôs not possible to make a single Python function like that 100% accurate at converting C++ code into Python, because:  C++ and Python are fundamentally different languages (syntax, typing, memory model, OOP semantics, templates, etc.).  There‚Äôs no direct one-to-one mapping for many constructs (like pointers, references, templates, manual memory management, STL containers, etc.).  A ‚Äúperfect‚Äù translation would require a full compiler front-end for C++ that parses its AST (Abstract Syntax Tree) ...

‚ú® GENERATED SUMMARY:
C++ and Python are fundamentally different languages . There‚Äôs no direct one-to-one mapping for many constructs . A ‚Äúperfect‚Äù translation would require a full compiler front-end for C++ .

üìù Enter your article (or 'quit' to exit):
> quit
üëã Goodbye!
