In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

!pip install transformers
!pip install sentencepiece


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m101.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
text = """
Former U.S. President Barack Obama addressed climate change at the UN summit this week, emphasizing the urgent need for coordinated international action. In his speech, he acknowledged the progress made under the Paris Agreement but warned that much more needed to be done. He also highlighted the impact of wildfires, rising sea levels, and extreme weather events as clear indicators of a warming planet. World leaders responded with a renewed commitment to lowering emissions and investing in green technology.
"""


In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Process the text
doc = nlp(text)

# Score sentences
word_frequencies = {}
for word in doc:
    if word.text.lower() not in STOP_WORDS and word.text.lower() not in punctuation:
        if word.text.lower() not in word_frequencies:
            word_frequencies[word.text.lower()] = 1
        else:
            word_frequencies[word.text.lower()] += 1

# Normalize
max_freq = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / max_freq

# Score sentences
sentence_scores = {}
for sent in doc.sents:
    for word in sent:
        if word.text.lower() in word_frequencies:
            if sent not in sentence_scores:
                sentence_scores[sent] = word_frequencies[word.text.lower()]
            else:
                sentence_scores[sent] += word_frequencies[word.text.lower()]

# Get top 2 sentences
summary_sentences = nlargest(2, sentence_scores, key=sentence_scores.get)

extractive_summary = ' '.join([sent.text for sent in summary_sentences])
print("🔹 Extractive Summary:\n", extractive_summary)


🔹 Extractive Summary:
 
Former U.S. President Barack Obama addressed climate change at the UN summit this week, emphasizing the urgent need for coordinated international action. He also highlighted the impact of wildfires, rising sea levels, and extreme weather events as clear indicators of a warming planet.


In [4]:
from transformers import pipeline

# Load pre-trained summarization pipeline (BART model)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Generate summary
abstractive_summary = summarizer(text, max_length=60, min_length=20, do_sample=False)

# Show result
print("🔹 Abstractive Summary:\n", abstractive_summary[0]['summary_text'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


🔹 Abstractive Summary:
 Former U.S. President Barack Obama addressed climate change at the UN summit. He acknowledged the progress made under the Paris Agreement but warned that much more needed to be done.


In [5]:
!pip install transformers datasets rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=5996f42a12232eaedd57825a3e027b4be64f2492ad06f415611d52b3bb8b5aac
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [6]:
from datasets import Dataset

# Define a few long texts and their ideal summaries
data = {
    "article": [
        "The moon is Earth's only natural satellite and is the fifth largest moon in the solar system. It affects Earth's tides and has inspired countless myths and stories.",
        "Barack Obama served as the 44th President of the United States. He was the first African-American to hold the office and introduced key reforms like Obamacare.",
        "Climate change refers to long-term shifts in temperatures and weather patterns. It is mainly caused by human activities like burning fossil fuels and deforestation."
    ],
    "summary": [
        "The moon affects tides and is Earth's only natural satellite.",
        "Obama was the 44th U.S. President and introduced reforms like Obamacare.",
        "Climate change is caused by human activity and affects weather patterns."
    ]
}

dataset = Dataset.from_dict(data)


In [7]:
from transformers import AutoTokenizer

model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    inputs = tokenizer(examples["article"], padding="max_length", truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=64)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]



In [8]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=1,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [9]:
!pip install --upgrade transformers




In [1]:
!pip install --upgrade transformers
!pip install datasets rouge_score




In [2]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs"
)


In [3]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()


NameError: name 'model' is not defined

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
from datasets import Dataset

data = {
    "article": [
        "The moon is Earth's only natural satellite and is the fifth largest moon in the solar system. It affects Earth's tides and has inspired countless myths and stories.",
        "Barack Obama served as the 44th President of the United States. He was the first African-American to hold the office and introduced key reforms like Obamacare.",
        "Climate change refers to long-term shifts in temperatures and weather patterns. It is mainly caused by human activities like burning fossil fuels and deforestation."
    ],
    "summary": [
        "The moon affects tides and is Earth's only natural satellite.",
        "Obama was the 44th U.S. President and introduced reforms like Obamacare.",
        "Climate change is caused by human activity and affects weather patterns."
    ]
}

dataset = Dataset.from_dict(data)


In [6]:
def preprocess_function(examples):
    inputs = tokenizer(examples["article"], padding="max_length", truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=64)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/3 [00:00<?, ? examples/s]



In [8]:
from transformers import TrainingArguments, Trainer
import os

# 🔇 Disable wandb (so it stops asking for login)
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# 🚀 Now train the model without wandb interference
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss




TrainOutput(global_step=6, training_loss=11.870185852050781, metrics={'train_runtime': 66.3256, 'train_samples_per_second': 0.136, 'train_steps_per_second': 0.09, 'total_flos': 2743814062080.0, 'train_loss': 11.870185852050781, 'epoch': 3.0})

In [9]:
# Sample article to summarize
text = """
Artificial intelligence (AI) is rapidly changing the way we live and work. From self-driving cars to smart assistants, AI is becoming part of everyday life. While it offers convenience and efficiency, experts warn about ethical concerns and job displacement.
"""

# Tokenize the input
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

# Generate summary
summary_ids = model.generate(inputs["input_ids"], max_length=64, min_length=20, length_penalty=2.0, num_beams=4, early_stopping=True)

# Decode and print summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("🔹 Summary:\n", summary)


🔹 Summary:
 Artificial intelligence (AI) is rapidly changing the way we live and work. From self-driving cars to smart assistants, AI is becoming part of everyday life. While it offers convenience and efficiency, experts warn about ethical concerns and job displacement.Started by:
