<a href="https://colab.research.google.com/github/Allraindrop/gt-nlp-summarization/blob/main/week0_set_up.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===========================
# Step 0: Install required packages
# ===========================
!pip install transformers datasets tqdm pandas

# ===========================
# Step 1: Load dataset and clean text
# ===========================
from datasets import load_dataset
import pandas as pd
from tqdm.notebook import tqdm

# Fetch the CNN/DailyMail corpus, configuration "3.0.0".
# The result is indexed by split name further down ('train', 'test').
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

def clean_text(text: str) -> str:
    """Normalize the whitespace of *text*.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space, and leading/trailing whitespace is dropped. No other
    characters are removed or altered.
    """
    # str.split() with no separator splits on any whitespace run and never
    # yields empty tokens, so re-joining gives single-space separation.
    tokens = text.split()
    return " ".join(tokens)

# Report how many examples the train and test splits contain.
print("Training set size:", len(dataset["train"]))
print("Test set size:", len(dataset["test"]))

# ===========================
# Step 2: Initialize summarization pipeline
# ===========================
from transformers import pipeline

# Build a summarization pipeline backed by the facebook/bart-large-cnn
# checkpoint.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

# Maximum number of input tokens for BART.
# NOTE(review): not referenced anywhere in the visible code; the summarizer
# call relies on truncation=True instead -- confirm whether it is still needed.
MAX_INPUT_LENGTH = 1024

# ===========================
# Step 3: Generate summaries for batch
# ===========================
# How many training examples to summarize (adjustable).
NUM_EXAMPLES = 50
results = []

train_split = dataset['train']
# Never index past the end of the split if NUM_EXAMPLES is set too high.
num_to_process = min(NUM_EXAMPLES, len(train_split))

for i in tqdm(range(num_to_process), desc="Generating summaries"):
    # Look the row up once and reuse it for both fields.
    example = train_split[i]
    article = clean_text(example['article'])
    reference_summary = clean_text(example['highlights'])

    # do_sample=False disables sampling; truncation=True asks the pipeline to
    # truncate inputs that exceed the model's maximum input length.
    summary = summarizer(
        article,
        max_length=80,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    generated_summary = summary[0]['summary_text']

    # One record per example: source text, model output, and gold summary.
    results.append({
        "article": article,
        "generated_summary": generated_summary,
        "reference_summary": reference_summary,
    })

# Convert to a DataFrame for easier handling. Passing the columns explicitly
# keeps the schema stable even when no examples were processed.
df = pd.DataFrame(
    results,
    columns=["article", "generated_summary", "reference_summary"],
)

# ===========================
# Step 4: Save results
# ===========================
# Write the results as a UTF-8 CSV (no index column) so the file is easy to
# upload to GitHub.
output_csv = "generated_summaries.csv"
df.to_csv(output_csv, index=False, encoding="utf-8")
print("Summaries saved to", output_csv)

# ===========================
# Step 5: Visualize some examples
# ===========================
# Show up to the first 3 examples. head() caps the preview at however many
# rows exist, so this still works when fewer than 3 summaries were generated.
for idx, row in enumerate(df.head(3).itertuples(index=False)):
    print(f"--- Example {idx+1} ---")
    print("Original Article (first 200 chars):", row.article[:200], "...")
    print("Generated Summary:", row.generated_summary)
    print("Reference Summary:", row.reference_summary)
    print("---\n")

# ===========================
# Step 6 (Optional): Push to GitHub
# ===========================
# Make sure you've connected Colab to your GitHub account
# and set up a personal access token if needed.
# Uncomment the lines below if you want to push automatically.

# !git config --global user.email "your_email@example.com"
# !git config --global user.name "YourName"
# !git init
# !git add generated_summaries.csv
# !git commit -m "Add generated summaries"
# !git branch -M main
# !git remote add origin https://github.com/yourusername/yourrepo.git
# !git push -u origin main
