Loading Datasets

In [1]:
# Rows sampled from the CNN/DailyMail train, validation and test CSVs respectively.
train_n, valid_n, test_n = 2000, 500, 500

In [2]:
import kagglehub

# Download the latest version of the BBC News Summary dataset; the returned
# value is the local directory the later cells read the files from.
bbc_handle = "pariza/bbc-news-summary"
path = kagglehub.dataset_download(bbc_handle)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\Fady\.cache\kagglehub\datasets\pariza\bbc-news-summary\versions\2


BBC Dataset

In [3]:
import os
import pandas as pd

In [4]:
# Locate the article and summary trees inside the downloaded dataset.
# Every component is passed separately so os.path.join inserts the platform's
# own separator instead of relying on a hard-coded "/" inside a component.
articles_path = os.path.join(path, "BBC News Summary", "News Articles")
summaries_path = os.path.join(path, "BBC News Summary", "Summaries")

In [5]:
# Collect one {"Article", "Summary"} record for every article file that has a
# same-named summary file under the same category folder.
data = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore the CSV written from it) reproducible across machines and runs.
for category in sorted(os.listdir(articles_path)):
    article_dir = os.path.join(articles_path, category)
    summary_dir = os.path.join(summaries_path, category)

    # Ignore stray files and categories that exist in only one of the two trees.
    if not (os.path.isdir(article_dir) and os.path.isdir(summary_dir)):
        continue

    for filename in sorted(os.listdir(article_dir)):
        article_file = os.path.join(article_dir, filename)
        summary_file = os.path.join(summary_dir, filename)

        # Articles without a matching summary are skipped rather than raising.
        if not os.path.exists(summary_file):
            continue

        # ISO-8859-1 maps every byte to a character, so decoding cannot fail.
        # NOTE(review): presumably the files really are Latin-1 — confirm.
        with open(article_file, "r", encoding="ISO-8859-1") as f:
            article_text = f.read().strip()
        with open(summary_file, "r", encoding="ISO-8859-1") as f:
            summary_text = f.read().strip()

        # The category is deliberately not stored; only the text pair is kept.
        data.append(
            {
                "Article": article_text,
                "Summary": summary_text,
            }
        )

In [6]:
# Turn the collected records into a DataFrame and persist it as CSV.
bbc_df = pd.DataFrame(data)

# to_csv does not create missing parent directories, so make sure the output
# folder exists; exist_ok keeps the cell safe to re-run.
os.makedirs("../data/bbc", exist_ok=True)
bbc_df.to_csv("../data/bbc/bbc_dataset.csv", index=False)

In [7]:
# Rebind bbc_df to the contents of the saved CSV, so the cells below inspect
# what was actually written to disk.
bbc_df = pd.read_csv("../data/bbc/bbc_dataset.csv")

In [8]:
# Preview the first rows of the BBC frame.
bbc_df.head()

Unnamed: 0,Article,Summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...


In [9]:
# Count missing values per column.
bbc_df.isnull().sum()

Article    0
Summary    0
dtype: int64

In [10]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
bbc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  2225 non-null   object
 1   Summary  2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


CNN/DailyMail Dataset

In [11]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Kaggle handle shared by all three splits, and the seed that fixes the sample.
CNN_HANDLE = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
CNN_SEED = 42


def _load_cnn_sample(file_path, n_rows):
    """Load one CSV of the CNN/DailyMail dataset and return a seeded random sample.

    Args:
        file_path: Path of the CSV file inside the Kaggle dataset.
        n_rows: Number of rows to draw from the file.

    Returns:
        A pandas DataFrame with ``n_rows`` sampled rows and a fresh 0-based index.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when ``n_rows`` is
            larger than the number of rows in the file.
    """
    # dataset_load is the supported replacement for the deprecated
    # kagglehub.load_dataset and takes the same arguments.
    full_df = kagglehub.dataset_load(
        KaggleDatasetAdapter.PANDAS,
        CNN_HANDLE,
        file_path,
    )
    return full_df.sample(n=n_rows, random_state=CNN_SEED).reset_index(drop=True)


# Train Dataset
train_path = "cnn_dailymail/train.csv"
train_cnn_df = _load_cnn_sample(train_path, train_n)

# Validation Dataset
valid_path = "cnn_dailymail/validation.csv"
valid_cnn_df = _load_cnn_sample(valid_path, valid_n)

# Test Dataset
test_path = "cnn_dailymail/test.csv"
test_cnn_df = _load_cnn_sample(test_path, test_n)


print("First 5 records:", train_cnn_df.head())

  train_cnn_df = kagglehub.load_dataset(
  valid_cnn_df = kagglehub.load_dataset(
  test_cnn_df = kagglehub.load_dataset(


First 5 records:                                          id  \
0  ed0fed726929c1eeabe6c390e47128dbb7d7a055   
1  023cd84001b33aed4ff0f3f5ecb0fdd2151cf543   
2  6a70a0d8d3ed365fe1df6d35f1587a8b9b298618   
3  b37204c13ea38b511265e41ac69fb12acfb63f85   
4  c24e5805afd5145bc48410e876db91d44a06be5e   

                                             article  \
0  By . Mia De Graaf . Britons flocked to beaches...   
1  A couple who weighed a combined 32st were sham...   
2  Video footage shows the heart stopping moment ...   
3  Istanbul, Turkey (CNN) -- About 250 people rac...   
4  By . Daily Mail Reporter . PUBLISHED: . 12:53 ...   

                                          highlights  
0  People enjoyed temperatures of 17C at Brighton...  
1  Couple started piling on pounds after the birt...  
2  A 17-year-old boy suffering lacerations to his...  
3  Syrians citizens hightail it to Turkey .\nMost...  
4  The Xue Long had provided the helicopter that ...  


In [12]:
# Convert the three CNN/DailyMail samples to the Article/Summary column names
# used for the BBC data and save each split as its own CSV.
# The frames are still modified in place, but each step tolerates a frame that
# has already been converted, so the cell can be re-run without a KeyError.

# to_csv does not create missing parent directories.
os.makedirs("../data/cnn", exist_ok=True)

for name, df in zip(
    ["train", "valid", "test"], [train_cnn_df, valid_cnn_df, test_cnn_df]
):
    # errors="ignore": "id" is already gone on a second run.
    df.drop(columns=["id"], errors="ignore", inplace=True)
    # rename skips labels that are absent, so it is a no-op on a second run.
    df.rename(columns={"article": "Article", "highlights": "Summary"}, inplace=True)
    df.to_csv(f"../data/cnn/cnn_dailymail_{name}.csv", index=False)