In [1]:
import polars as pl
import warnings

# NOTE(review): this silences every warning for the rest of the kernel
# session, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# Scan the CSV lazily so the column selection and the row limit are part of
# the query plan that reads the file, instead of parsing the entire file
# eagerly with read_csv() and only afterwards converting it to a lazy frame.
df_polars = (
    pl.scan_csv("/kaggle/input/news-summarization/data.csv", ignore_errors=True)
    .select(["Content", "Summary"])
    .limit(50000)
    .collect()
)
# Later cells operate on a pandas DataFrame.
df = df_polars.to_pandas()

In [2]:
# `df` (the pandas conversion made in the previous cell) is what the later
# cells use, so the reference to the Polars frame is dropped here.
del df_polars

In [3]:
# Preview the first rows of the two selected columns.
df.head()

Unnamed: 0,Content,Summary
0,New York police are concerned drones could bec...,Police have investigated criminals who have ri...
1,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...
2,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...
3,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...
4,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...


In [4]:
# Preview the last rows; the final index label shows how many rows were loaded.
df.tail()

Unnamed: 0,Content,Summary
49995,"Reid, 22, scored four goals during a season-lo...",Torquay United have re-signed Exeter forward J...
49996,By . Valerie Elliott . A surge in obesity amon...,Experts believe growing restriction on dogs ex...
49997,They were alleged to have called it quits when...,Duchess of Cambridge's sister attended The Bri...
49998,These crawls are part of an effort to archive ...,– Police say a man who delivered a bag of mari...
49999,A toddler drowned in the swimming pool of his ...,Family were throwing a summer party at the pro...


In [5]:
# Per-column count of missing values.
df.isna().sum()

Content    3
Summary    0
dtype: int64

In [6]:
# Discard rows missing either text field, then renumber the index from zero.
required_columns = ['Content', 'Summary']
df = df.dropna(subset=required_columns).reset_index(drop=True)

In [7]:
# Prepend the task prefix to every article to form the model input column.
df = df.assign(input_text='summarize: ' + df['Content'])

In [8]:
import torch

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The tokenizer and the weights are loaded from the same pretrained checkpoint.
checkpoint = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

# Wrap the cleaned pandas frame as a `datasets.Dataset` for batched mapping.
dataset = Dataset.from_pandas(df)

def preprocess(examples, max_input_length=412, max_target_length=100):
    """Tokenize a batch of articles and summaries for seq2seq fine-tuning.

    Args:
        examples: Batch mapping that provides "input_text" (prefixed article
            texts) and "Summary" (reference summaries).
        max_input_length: Length every tokenized input is truncated/padded to.
        max_target_length: Length every tokenized summary is truncated/padded to.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the summary token ids in which every padding id has been
        replaced by -100.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    # `tokenizer.as_target_tokenizer()` is deprecated in transformers; passing
    # the summaries through `text_target=` is the supported replacement.
    target_ids = tokenizer(
        text_target=examples["Summary"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )["input_ids"]

    # -100 is the label value the Hugging Face seq2seq loss skips, so padded
    # positions do not contribute to training.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in target_ids
    ]
    return model_inputs

# All original columns are removed so only what `preprocess` returns is kept;
# the on-disk cache is bypassed so the mapping is recomputed on every run.
source_columns = dataset.column_names
processed_dataset = dataset.map(
    function=preprocess,
    batched=True,
    batch_size=8,
    remove_columns=source_columns,
    load_from_cache_file=False,
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

2025-06-27 15:41:53.828644: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751038914.281436      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751038914.404167      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]

In [10]:
# `dataset` / `processed_dataset` were built from `df` in the previous cell and
# no later cell in this notebook reads `df`, so the reference is dropped.
del df

In [11]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./t5-finetuned-feedflash",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_steps=250,
    # No intermediate checkpoints are written. The former `save_steps=500`
    # was removed because it has no effect when save_strategy is "no".
    save_strategy="no",
    learning_rate=3e-4,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="linear",
    max_grad_norm=1.0,
    # Mixed precision is only enabled when CUDA is present; the device cell
    # allows a CPU fallback, and TrainingArguments rejects fp16 on CPU.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

In [12]:
from transformers import Trainer

# Bind the model, the training configuration and the tokenized dataset;
# no evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
)

In [13]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
250,2.0049
500,1.9267
750,1.9435
1000,1.9509
1250,1.9406
1500,1.9197
1750,1.9219
2000,1.9216
2250,1.9341
2500,1.9236


TrainOutput(global_step=9375, training_loss=1.74511283203125, metrics={'train_runtime': 11811.6362, 'train_samples_per_second': 12.699, 'train_steps_per_second': 0.794, 'total_flos': 7.349871019677696e+16, 'train_loss': 1.74511283203125, 'epoch': 3.0})

In [14]:
# Write the fine-tuned weights and the tokenizer files into one local folder.
export_dir = "feedflash-t5"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('feedflash-t5/tokenizer_config.json',
 'feedflash-t5/special_tokens_map.json',
 'feedflash-t5/spiece.model',
 'feedflash-t5/added_tokens.json',
 'feedflash-t5/tokenizer.json')

In [17]:
from huggingface_hub import login
# Called without arguments; in this notebook it rendered an interactive
# widget (see the saved output), so this cell needs manual input to complete.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
# Upload the weights and the tokenizer to the same Hub repository.
hub_repo = "Arihant-Bhandari/feedflash-t5"
model.push_to_hub(hub_repo)
tokenizer.push_to_hub(hub_repo)

Uploading...:   0%|          | 0.00/892M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Uploading...:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Arihant-Bhandari/feedflash-t5/commit/e8e0c6bff98e88939b61d60b64d2e2f3f1ea920b', commit_message='Upload tokenizer', commit_description='', oid='e8e0c6bff98e88939b61d60b64d2e2f3f1ea920b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Arihant-Bhandari/feedflash-t5', endpoint='https://huggingface.co', repo_type='model', repo_id='Arihant-Bhandari/feedflash-t5'), pr_revision=None, pr_num=None)