In [1]:
import os
import pandas as pd
from utils import prepare_labeled_sentences, prepare_labeled_sentences_spacy

Read Datasets

In [2]:
# BBC dataset (single CSV)
bbc_df = pd.read_csv("data/bbc/bbc_dataset.csv")

# CNN/DailyMail splits, read in train -> valid -> test order
cnn_train_df, cnn_valid_df, cnn_test_df = map(
    pd.read_csv,
    (
        "data/cnn/cnn_dailymail_train.csv",
        "data/cnn/cnn_dailymail_valid.csv",
        "data/cnn/cnn_dailymail_test.csv",
    ),
)

# IMDB dataset (single CSV)
imdb_df = pd.read_csv("data/imdb/imdb.csv")

In [3]:
# Eyeball the first five BBC rows to check the frame loaded with the expected columns
print("BBC Sample:")
display(bbc_df.head(5))

BBC Sample:


Unnamed: 0,Article,Summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...


In [4]:
# Eyeball the first five rows of the CNN training split
print("CNN Sample:")
display(cnn_train_df.head(5))

CNN Sample:


Unnamed: 0,Article,Summary
0,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...
1,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...
2,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...
3,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...
4,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...


In [12]:
# Eyeball the first five IMDB rows
print("IMDB Sample:")
display(imdb_df.head(5))

IMDB Sample:


Unnamed: 0,Article,Summary
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...
1,A wonderful little production The filming tech...,A wonderful little production The filming tech...
2,I thought this was wonderful way to spend time...,I thought it was proof that Woody Allen is sti...
3,Basically there a family where little boy Jake...,Basically there a family where little boy Jake...
4,Petter Mattei Love in the Time of Money is vis...,Petter Mattei Love in the Time of Money is vis...


Preprocess BBC Dataset

In [None]:
# Run the `prepare_labeled_sentences_spacy` helper over the whole BBC frame
bbc_labeled_data = prepare_labeled_sentences_spacy(bbc_df)

# Build the modelling frame: one row per labelled item.
# Each pair below is (output column, key read from the labelled item).
# NOTE(review): the CNN cells in this notebook name the same two text fields
# "raw_sentence" / "sentence" — confirm which schema downstream code expects.
bbc_processed_df = pd.DataFrame(
    [
        {
            column: item[key]
            for column, key in (
                ("article_id", "article_id"),
                ("article_sentences", "raw_sentence"),
                ("preprocessed_sentence", "preprocessed_sentence"),
                ("label", "label"),
            )
        }
        for item in bbc_labeled_data
    ]
)

Preprocessing articles: 100%|██████████| 2225/2225 [07:45<00:00,  4.78it/s]


In [7]:
# (rows, columns) of the processed BBC frame; one row per labelled item
bbc_processed_df.shape

(41677, 4)

In [8]:
# Tally the label column against the total number of rows
total_count = len(bbc_processed_df)
summary_count = bbc_processed_df["label"].sum()
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Display the first three rows whose label equals 1
print("\nExample summary sentences:")
display(bbc_processed_df.loc[lambda frame: frame["label"] == 1].head(3))

Summary sentences: 16543 out of 41677 (39.69%)

Example summary sentences:


Unnamed: 0,article_id,raw_sentence,sentence,label
0,0,Ad sales boost Time Warner profit Quarterly p...,ad sale boost time warner profit quarterly pro...,1
2,0,TimeWarner said fourth quarter sales rose 2% t...,timewarner say fourth quarter sale rise 11.1bn...,1
6,0,"It lost 464,000 subscribers in the fourth quar...",lose subscriber fourth quarter profit low prec...,1


In [9]:
# Inspect the first 60 rows of the processed BBC frame
bbc_processed_df.head(60)

Unnamed: 0,article_id,raw_sentence,sentence,label
0,0,Ad sales boost Time Warner profit Quarterly p...,ad sale boost time warner profit quarterly pro...,1
1,0,"The firm, which is now one of the biggest inve...",firm one big investor google benefit sale inte...,0
2,0,TimeWarner said fourth quarter sales rose 2% t...,timewarner say fourth quarter sale rise 11.1bn...,1
3,0,Its profits were buoyed by one-off gains which...,profit buoy gain offset profit dip warner bros...,0
4,0,Time Warner said on Friday that it now owns 8%...,time warner say friday google,0
5,0,"But its own internet business, AOL, had has mi...",internet business aol mix fortune,0
6,0,"It lost 464,000 subscribers in the fourth quar...",lose subscriber fourth quarter profit low prec...,1
7,0,"However, the company said AOL's underlying pro...",however company say aol underlying profit exce...,1
8,0,It hopes to increase subscribers by offering t...,hop increase subscriber offer online service f...,0
9,0,TimeWarner also has to restate 2000 and 2003 r...,timewarner also restate result follow probe u ...,0


Preprocess IMDB Dataset

In [None]:
# Process the IMDB dataset — only the first 4000 rows are passed to the helper.
# Despite the `_df` suffix, the result is iterated below as a sequence of
# dict-like records, not used as a DataFrame.
imdb_labeled_df = prepare_labeled_sentences_spacy(imdb_df.iloc[:4000])

# Build the modelling frame: one row per labelled item.
# Each pair below is (output column, key read from the labelled item).
# NOTE(review): the CNN cells in this notebook name the same two text fields
# "raw_sentence" / "sentence" — confirm which schema downstream code expects.
imdb_processed_df = pd.DataFrame(
    [
        {
            column: item[key]
            for column, key in (
                ("article_id", "article_id"),
                ("article_sentences", "raw_sentence"),
                ("preprocessed_sentence", "preprocessed_sentence"),
                ("label", "label"),
            )
        }
        for item in imdb_labeled_df
    ]
)

Preprocessing articles: 100%|██████████| 4000/4000 [04:26<00:00, 14.99it/s]


In [13]:
# (rows, columns) of the processed IMDB frame; one row per labelled item
imdb_processed_df.shape

(13024, 4)

In [15]:
# Tally the label column against the total number of rows
total_count = len(imdb_processed_df)
summary_count = imdb_processed_df["label"].sum()
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Display the first three rows whose label equals 1
print("\nExample summary sentences:")
display(imdb_processed_df.loc[lambda frame: frame["label"] == 1].head(3))

Summary sentences: 2934 out of 13024 (22.53%)

Example summary sentences:


Unnamed: 0,article_id,raw_sentence,sentence,label
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1
11,4,Petter Mattei Love in the Time of Money is vis...,petter mattei love time money visually stunnin...,1


In [19]:
# Print the untruncated raw text of the row labelled 2.
# The IMDB frame is built with the raw text under "article_sentences"; looking
# it up as "raw_sentence" raises KeyError on a fresh Restart & Run All.
# `.loc[row, column]` replaces the chained `df[column][row]` lookup.
print(imdb_processed_df.loc[2, "article_sentences"])

A wonderful little production The filming technique is very unassuming very old time BBC fashion and gives comforting and sometimes discomforting sense of realism to the entire piece The actors are extremely well chosen Michael Sheen not only has got all the polari


In [16]:
# Inspect the first 60 rows of the processed IMDB frame
imdb_processed_df.head(60)

Unnamed: 0,article_id,raw_sentence,sentence,label
0,0,One of the other reviewers has mentioned that ...,one reviewer mention watch oz episode hook rig...,0
1,0,This show pulls no punches with regards to dru...,show pull punch regard drug sex violence hardc...,0
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
3,1,but he has all the voices down pat too You can...,voice pat truly see seamless edit guide refere...,0
4,1,but it is terrificly written and performed pie...,terrificly write perform piece masterful produ...,0
5,1,The realism really comes home with the little ...,realism really come home little thing fantasy ...,0
6,2,I thought this was wonderful way to spend time...,think wonderful way spend time hot summer week...,0
7,2,The plot is simplistic but the dialogue is wit...,plot simplistic dialogue witty character likab...,0
8,2,While some may be disappointed when they reali...,may disappoint realize match point risk addict...,0
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1


Preprocess CNN Datasets

In [None]:
def _labeled_sentences_to_frame(labeled_sentences):
    """Convert labelled sentence records into a modelling DataFrame.

    Parameters
    ----------
    labeled_sentences : iterable of mappings
        Each item must provide the keys "article_id", "raw_sentence",
        "preprocessed_sentence" and "label".

    Returns
    -------
    pd.DataFrame
        One row per item with columns "article_id", "raw_sentence",
        "sentence" (taken from "preprocessed_sentence") and "label".
    """
    return pd.DataFrame(
        [
            {
                "article_id": item["article_id"],
                "raw_sentence": item["raw_sentence"],
                "sentence": item["preprocessed_sentence"],
                "label": item["label"],
            }
            for item in labeled_sentences
        ]
    )


# NOTE(review): these splits go through `prepare_labeled_sentences`, while the
# BBC and IMDB cells call `prepare_labeled_sentences_spacy` — confirm intended.

# Process the CNN Train dataset, then convert to a DataFrame for modeling
cnn_train_labeled_data = prepare_labeled_sentences(cnn_train_df)
cnn_train_processed_df = _labeled_sentences_to_frame(cnn_train_labeled_data)

# Process the CNN Validation dataset, then convert to a DataFrame for modeling
cnn_valid_labeled_data = prepare_labeled_sentences(cnn_valid_df)
cnn_valid_processed_df = _labeled_sentences_to_frame(cnn_valid_labeled_data)

# Process the CNN Test dataset, then convert to a DataFrame for modeling
cnn_test_labeled_data = prepare_labeled_sentences(cnn_test_df)
cnn_test_processed_df = _labeled_sentences_to_frame(cnn_test_labeled_data)

In [None]:
# (rows, columns) of the train, validation and test frames, one per line
print(
    cnn_train_processed_df.shape,
    cnn_valid_processed_df.shape,
    cnn_test_processed_df.shape,
    sep="\n",
)

In [None]:
# Inspect the first 60 rows of the processed CNN training frame
cnn_train_processed_df.head(60)

kNN