# **Setup**

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install -U datasets huggingface_hub fsspec

Collecting fsspec
  Using cached fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)


In [None]:
from datasets import load_dataset

# **Load Data**

In [None]:
# Identifier of the dataset to fetch; kept in a name so it is easy to spot and swap.
dataset_id = "azrai99/the-star-news-articles"
ds = load_dataset(dataset_id)
# Display the loaded splits and their columns.
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['content_id', 'title', 'text', 'section', 'category', 'content_tier', 'content_length', 'authors', 'published_date', 'keywords', 'summary', 'url', 'top_image'],
        num_rows: 21709
    })
})

# **Data Preparation**

In [None]:
# Hold out 20% of the articles; the remaining 80% becomes the training split.
# A fixed seed makes the shuffle-and-split reproducible across kernel restarts.
ds = ds["train"].train_test_split(test_size=0.2, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['content_id', 'title', 'text', 'section', 'category', 'content_tier', 'content_length', 'authors', 'published_date', 'keywords', 'summary', 'url', 'top_image'],
        num_rows: 17367
    })
    test: Dataset({
        features: ['content_id', 'title', 'text', 'section', 'category', 'content_tier', 'content_length', 'authors', 'published_date', 'keywords', 'summary', 'url', 'top_image'],
        num_rows: 4342
    })
})

In [None]:
# Split the held-out portion in half (50/50) into two equally sized parts.
# A fixed seed makes the shuffle-and-split reproducible across kernel restarts.
ds_tst = ds["test"].train_test_split(test_size=0.5, seed=42)
ds_tst

DatasetDict({
    train: Dataset({
        features: ['content_id', 'title', 'text', 'section', 'category', 'content_tier', 'content_length', 'authors', 'published_date', 'keywords', 'summary', 'url', 'top_image'],
        num_rows: 2171
    })
    test: Dataset({
        features: ['content_id', 'title', 'text', 'section', 'category', 'content_tier', 'content_length', 'authors', 'published_date', 'keywords', 'summary', 'url', 'top_image'],
        num_rows: 2171
    })
})

In [None]:
# Expose the two halves of `ds_tst` as the final splits: its "train" half
# becomes the validation set and its "test" half replaces the existing test set.
ds["validation"], ds["test"] = ds_tst["train"], ds_tst["test"]
ds

DatasetDict({
    train: Dataset({
        features: ['content_id', 'title', 'text', 'section', 'category', 'content_tier', 'content_length', 'authors', 'published_date', 'keywords', 'summary', 'url', 'top_image'],
        num_rows: 17367
    })
    test: Dataset({
        features: ['content_id', 'title', 'text', 'section', 'category', 'content_tier', 'content_length', 'authors', 'published_date', 'keywords', 'summary', 'url', 'top_image'],
        num_rows: 2171
    })
    validation: Dataset({
        features: ['content_id', 'title', 'text', 'section', 'category', 'content_tier', 'content_length', 'authors', 'published_date', 'keywords', 'summary', 'url', 'top_image'],
        num_rows: 2171
    })
})

In [None]:
# Columns that are not needed downstream: only the headline ('title') and the
# article body ('text') are used, so everything listed here is slated for removal.
rem_cols = [
  'content_id',
  'section',
  'category',
  'content_tier',
  'content_length',
  'authors',
  'published_date',
  'keywords',
  'summary',
  'url',
  'top_image',
]

In [None]:
# Drop the unused columns from every split in one call.
# DatasetDict.remove_columns applies the removal to each split and returns a new
# DatasetDict, so no manual loop over the split names is needed.
ds = ds.remove_columns(rem_cols)
ds

DatasetDict({
    train: Dataset({
        features: ['title', 'text'],
        num_rows: 17367
    })
    test: Dataset({
        features: ['title', 'text'],
        num_rows: 2171
    })
    validation: Dataset({
        features: ['title', 'text'],
        num_rows: 2171
    })
})

In [None]:
def show_samples(dataset, num_samples=3, seed=42, split="train"):
  """Prints randomly chosen title/article pairs from one split of the dataset.

  Args:
      dataset: Mapping from split name to a dataset whose rows have
          'title' and 'text' fields.
      num_samples (int): The number of samples to show. Defaults to 3.
          If the split holds fewer rows, all of its rows are shown.
      seed (int): Seed passed to the shuffle so the same rows are shown
          on every call. Defaults to 42.
      split (str): Name of the split to sample from. Defaults to "train".
  """
  shuffled = dataset[split].shuffle(seed=seed)
  # Clamp the request so asking for more rows than exist does not raise.
  count = min(num_samples, len(shuffled))
  for example in shuffled.select(range(count)):
    print(f"\n'>> Target Title: {example['title']}'")
    print(f"'>> Article: {example['text']}'")


# Preview three randomly chosen training examples (the function's defaults).
show_samples(ds)


'>> Target Title: Businessman loses RM200,000 in cruise to nowhere'
'>> Article: JOHOR BARU: A businessman who was planning a cruise trip with his family lost about RM200,000 when he downloaded a malicious application onto his phone.

The victim, who only wanted to be known as Lee, said he came across the app on March 8 while browsing for a holiday to Singapore with his wife and three children.

“I wanted to take my family on a three-day, two-night cruise to Singapore during the Hari Raya holiday,” he said.

Lee, 47, said he was looking for information about cruises when he came across a Facebook advertisement.

“I was then called by a Singapore-registered number asking for my personal details and informing me that no payment was needed until the day I board the ship.

“The person then gave me a link to an app to verify my booking.

“However, after downloading the app, my phone suddenly shut down,” the businessman said during a press conference organised by Tebrau MP Jimmy Puah yester

In [None]:
# Keep only examples whose title has more than two whitespace-separated words,
# i.e. drop examples with one- or two-word titles.
MIN_TITLE_WORDS = 3
# A missing (None) title is treated as empty, so the row is dropped instead of raising.
ds = ds.filter(lambda x: len((x["title"] or "").split()) >= MIN_TITLE_WORDS)
ds

Filter:   0%|          | 0/17367 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2171 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2171 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'text'],
        num_rows: 17336
    })
    test: Dataset({
        features: ['title', 'text'],
        num_rows: 2168
    })
    validation: Dataset({
        features: ['title', 'text'],
        num_rows: 2169
    })
})

# **Save Data**

In [None]:
# Write every split of the prepared DatasetDict to the "Data" directory
# (a relative path, so it is resolved against the current working directory).
ds.save_to_disk("Data")

Saving the dataset (0/1 shards):   0%|          | 0/17336 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2168 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2169 [00:00<?, ? examples/s]

In [None]:
# If using Colab, uncomment the following lines to mount your Google Drive so the saved data can be copied to it.
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#!cp -r /content/Data "/content/drive/My Drive/Projects/Title Generator/"