In [1]:
import os
import pandas as pd
from utils import prepare_labeled_sentences, prepare_labeled_sentences_spacy

Read Datasets

In [2]:
# BBC Dataset
bbc_df = pd.read_csv("data/bbc/bbc_dataset.csv")

#IMDB Dataset
imdb_df = pd.read_csv("data/imdb/imdb.csv")

In [3]:
# Preview to confirm structure
print("BBC Sample:")
display(bbc_df.head())

BBC Sample:


Unnamed: 0,Article,Summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...


In [4]:
print("IMDB Sample:")
display(imdb_df.head())

IMDB Sample:


Unnamed: 0,Article,Summary
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...
1,A wonderful little production The filming tech...,A wonderful little production The filming tech...
2,I thought this was wonderful way to spend time...,I thought it was proof that Woody Allen is sti...
3,Basically there a family where little boy Jake...,Basically there a family where little boy Jake...
4,Petter Mattei Love in the Time of Money is vis...,Petter Mattei Love in the Time of Money is vis...


Preprocess BBC Datasets

In [5]:
# Process the BBC dataset
bbc_labeled_data = prepare_labeled_sentences_spacy(bbc_df)

# Convert to DataFrame for modeling
bbc_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in bbc_labeled_data
    ]
)

Preprocessing articles: 100%|██████████| 2225/2225 [05:58<00:00,  6.21it/s]


In [6]:
bbc_processed_df.shape

(41677, 4)

In [7]:
# Count how many sentences are labeled as summary sentences
summary_count = bbc_processed_df['label'].sum()
total_count = len(bbc_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(bbc_processed_df[bbc_processed_df['label'] == 1].head(3))

Summary sentences: 16543 out of 41677 (39.69%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Ad sales boost Time Warner profit Quarterly p...,ad sale boost time warner profit quarterly pro...,1
2,0,TimeWarner said fourth quarter sales rose 2% t...,timewarner say fourth quarter sale rise 11.1bn...,1
6,0,"It lost 464,000 subscribers in the fourth quar...",lose subscriber fourth quarter profit low prec...,1


In [8]:
bbc_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Ad sales boost Time Warner profit Quarterly p...,ad sale boost time warner profit quarterly pro...,1
1,0,"The firm, which is now one of the biggest inve...",firm one big investor google benefit sale inte...,0
2,0,TimeWarner said fourth quarter sales rose 2% t...,timewarner say fourth quarter sale rise 11.1bn...,1
3,0,Its profits were buoyed by one-off gains which...,profit buoy gain offset profit dip warner bros...,0
4,0,Time Warner said on Friday that it now owns 8%...,time warner say friday google,0
5,0,"But its own internet business, AOL, had has mi...",internet business aol mix fortune,0
6,0,"It lost 464,000 subscribers in the fourth quar...",lose subscriber fourth quarter profit low prec...,1
7,0,"However, the company said AOL's underlying pro...",however company say aol underlying profit exce...,1
8,0,It hopes to increase subscribers by offering t...,hop increase subscriber offer online service f...,0
9,0,TimeWarner also has to restate 2000 and 2003 r...,timewarner also restate result follow probe u ...,0


Preprocessed IMDB Dataset

In [9]:
# Process the BBC dataset
imdb_labeled_df = prepare_labeled_sentences_spacy(imdb_df[:4000])

# Convert to DataFrame for modeling
imdb_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in imdb_labeled_df
    ]
)

Preprocessing articles: 100%|██████████| 4000/4000 [04:44<00:00, 14.08it/s]


In [10]:
imdb_processed_df.shape

(13024, 4)

In [11]:
# Count how many sentences are labeled as summary sentences
summary_count = imdb_processed_df['label'].sum()
total_count = len(imdb_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(imdb_processed_df[imdb_processed_df['label'] == 1].head(3))

Summary sentences: 2934 out of 13024 (22.53%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1
11,4,Petter Mattei Love in the Time of Money is vis...,petter mattei love time money visually stunnin...,1


In [12]:
print(imdb_processed_df["article_sentences"][2])

A wonderful little production The filming technique is very unassuming very old time BBC fashion and gives comforting and sometimes discomforting sense of realism to the entire piece The actors are extremely well chosen Michael Sheen not only has got all the polari


In [13]:
imdb_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,One of the other reviewers has mentioned that ...,one reviewer mention watch oz episode hook rig...,0
1,0,This show pulls no punches with regards to dru...,show pull punch regard drug sex violence hardc...,0
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
3,1,but he has all the voices down pat too You can...,voice pat truly see seamless edit guide refere...,0
4,1,but it is terrificly written and performed pie...,terrificly write perform piece masterful produ...,0
5,1,The realism really comes home with the little ...,realism really come home little thing fantasy ...,0
6,2,I thought this was wonderful way to spend time...,think wonderful way spend time hot summer week...,0
7,2,The plot is simplistic but the dialogue is wit...,plot simplistic dialogue witty character likab...,0
8,2,While some may be disappointed when they reali...,may disappoint realize match point risk addict...,0
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1


kNN

In [14]:
from ML_models.knn import KNNExtractiveSummarizer
from sklearn.model_selection import train_test_split

# Prepare data
X = bbc_processed_df["preprocessed_sentence"]
y = bbc_processed_df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and tune model
knn_bbc = KNNExtractiveSummarizer()

# Fine-tune the model on the training set
knn_bbc.tune(X_train, y_train, n_iter=10, scoring="f1")  # You can increase n_iter

# Evaluate
print("BBC Dataset Evaluation (KNN):")
knn_bbc.evaluate(X_test, y_test)

# Generate summaries for a few articles
sample_article_ids = bbc_processed_df["article_id"].unique()[:5]

for article_id in sample_article_ids:
    article_df = bbc_processed_df[bbc_processed_df["article_id"] == article_id]
    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = knn_bbc.summarize(
        article_df["article_sentences"].tolist(),
        article_df["preprocessed_sentence"].tolist(),
    )

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = knn_bbc.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END clf__metric=cosine, clf__n_neighbors=3, clf__weights=distance, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=  11.5s
[CV] END clf__metric=cosine, clf__n_neighbors=3, clf__weights=distance, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   6.7s
[CV] END clf__metric=cosine, clf__n_neighbors=3, clf__weights=distance, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   6.3s
[CV] END clf__metric=euclidean, clf__n_neighbors=7, clf__weights=distance, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   6.1s
[CV] END clf__metric=euclidean, clf__n_neighbors=7, clf__weights=distance, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   6.0s
[CV] END clf__metric=euclidean, clf__n_neighbors=7, clf__weights=distance, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngr

In [15]:
from ML_models.knn import KNNExtractiveSummarizer
from sklearn.model_selection import train_test_split

# Prepare data
X = imdb_processed_df["preprocessed_sentence"]
y = imdb_processed_df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and tune model
knn_bbc = KNNExtractiveSummarizer()

# Fine-tune the model on the training set
knn_bbc.tune(X_train, y_train, n_iter=10, scoring="f1")  # You can increase n_iter

# Evaluate
print("BBC Dataset Evaluation (KNN):")
knn_bbc.evaluate(X_test, y_test)

# Generate summaries for a few articles
sample_article_ids = imdb_processed_df["article_id"].unique()[:5]

for article_id in sample_article_ids:
    article_df = imdb_processed_df[imdb_processed_df["article_id"] == article_id]
    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = knn_bbc.summarize(
        article_df["article_sentences"].tolist(),
        article_df["preprocessed_sentence"].tolist(),
    )

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = knn_bbc.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END clf__metric=cosine, clf__n_neighbors=3, clf__weights=distance, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   2.2s
[CV] END clf__metric=cosine, clf__n_neighbors=3, clf__weights=distance, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   2.2s
[CV] END clf__metric=cosine, clf__n_neighbors=3, clf__weights=distance, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   2.1s
[CV] END clf__metric=euclidean, clf__n_neighbors=7, clf__weights=distance, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   1.2s
[CV] END clf__metric=euclidean, clf__n_neighbors=7, clf__weights=distance, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   1.2s
[CV] END clf__metric=euclidean, clf__n_neighbors=7, clf__weights=distance, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngr

Logistic Regression

In [16]:
from ML_models.logistic_reg import LogisticRegressionSummarizer
from sklearn.model_selection import train_test_split

X = bbc_processed_df["preprocessed_sentence"]
y = bbc_processed_df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lr_bbc = LogisticRegressionSummarizer()

# Fine-tune the model on the training set
lr_bbc.tune(
    X_train, y_train, n_iter=10, scoring="f1"
)  # You can increase n_iter if desired

print("BBC Dataset Evaluation:")
lr_bbc.evaluate(X_test, y_test)

sample_article_ids = bbc_processed_df["article_id"].unique()[:5]

for article_id in sample_article_ids:
    article_df = bbc_processed_df[bbc_processed_df["article_id"] == article_id]
    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = lr_bbc.summarize(
        article_df["article_sentences"].tolist(),
        article_df["preprocessed_sentence"].tolist(),
    )

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = lr_bbc.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best params: {'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 3, 'tfidf__max_features': 5000, 'clf__max_iter': 1500, 'clf__C': 1}
Best score: 0.5626295285604364
BBC Dataset Evaluation:
              precision    recall  f1-score   support

           0       0.73      0.66      0.69      5062
           1       0.54      0.62      0.58      3274

    accuracy                           0.65      8336
   macro avg       0.64      0.64      0.64      8336
weighted avg       0.66      0.65      0.65      8336


Article ID: 0
Reference Summary: Ad sales boost Time Warner profit  Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier. TimeWarner said fourth quart...
Generated Summary: The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quar

In [17]:
from ML_models.logistic_reg import LogisticRegressionSummarizer
from sklearn.model_selection import train_test_split

X = imdb_processed_df["preprocessed_sentence"]
y = imdb_processed_df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lr_imdb = LogisticRegressionSummarizer()

# Fine-tune the model on the training set
lr_imdb.tune(X_train, y_train, n_iter=10, scoring="f1")

print("IMDB Dataset Evaluation:")
lr_imdb.evaluate(X_test, y_test)

sample_article_ids = imdb_processed_df["article_id"].unique()[:5]

for article_id in sample_article_ids:
    article_df = imdb_processed_df[imdb_processed_df["article_id"] == article_id]
    article_sents = article_df["article_sentences"].tolist()
    preprocessed_sents = article_df["preprocessed_sentence"].tolist()

    if not preprocessed_sents or not article_sents:
        print(f"\nArticle ID: {article_id}")
        print("Empty input. Skipping...")
        continue

    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = lr_imdb.summarize(article_sents, preprocessed_sents)

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = lr_imdb.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best params: {'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 2, 'tfidf__max_features': 3000, 'clf__max_iter': 500, 'clf__C': 0.1}
Best score: 0.40721609299234146
IMDB Dataset Evaluation:
              precision    recall  f1-score   support

           0       0.85      0.69      0.76      2021
           1       0.35      0.57      0.43       584

    accuracy                           0.67      2605
   macro avg       0.60      0.63      0.60      2605
weighted avg       0.74      0.67      0.69      2605


Article ID: 0
Reference Summary: ...
Generated Summary: One of the other reviewers has mentioned that after watching just Oz episode you ll be hooked They are right as this is exactly what happened with me The first thing that struck me about Oz was its br...
Error computing ROUGE: Reference is empty.

Article ID: 1
Reference Summary: A wonderful little production The filming technique is very unassuming very old time BB

Decision Trees

In [18]:
from ML_models.decision_tree import DecisionTreeSummarizer

bbc_summarizer = DecisionTreeSummarizer("BBC", bbc_df)
bbc_summarizer.run()
bbc_summarizer.show_samples()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



=== Running on BBC Dataset ===
Train Accuracy: 0.8961
Test Accuracy: 0.8686

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.88      0.93     12226
           1       0.05      0.43      0.10       200

    accuracy                           0.87     12426
   macro avg       0.52      0.65      0.51     12426
weighted avg       0.97      0.87      0.92     12426


ROUGE Scores:
rouge-1: 0.3711
rouge-2: 0.2659
rouge-l: 0.3639

--- Article 1665 ---
Predicted: Despite being without flanker Keith Gleeson, coach Eddie O'Sullivan has not had to contend with the sort of casualty lists that have hit England and Scotland in particular prior to the tournament. "For Ireland to win it we need to stay relatively injury free, and fortunately we are one of the few teams that have done that so far," Wood added. But despite their traditional hospitality when the Irish are visiting, Wood believes Wales might end their four-match losing run ag

In [19]:
bbc_summarizer = DecisionTreeSummarizer("IMDB", imdb_df)
bbc_summarizer.run()
bbc_summarizer.show_samples()


=== Running on IMDB Dataset ===
Train Accuracy: 0.9771
Test Accuracy: 0.9684

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     11650
           1       0.05      0.34      0.08        50

    accuracy                           0.97     11700
   macro avg       0.52      0.66      0.53     11700
weighted avg       0.99      0.97      0.98     11700


ROUGE Scores:
rouge-1: 0.5617
rouge-2: 0.4881
rouge-l: 0.5612

--- Article 13209 ---
Predicted: Two old men sitting on park bench don really have problem with this scene Only problem is that it not scene it the entire movieYup movies don get anymore low concept than this They also don get anymore boring than this either but there worse to come because these two old men are chalk and cheese One is Nat Moyer who is Yiddish communist while the other is Midge Carter former golden gloves champion who also black Let me see now Jew and black man sitting on park bench get

In [20]:
from ML_models.random_forest import RandomForestSummarizer

bbc_summarizer = RandomForestSummarizer("BBC", bbc_df)
bbc_summarizer.run()
bbc_summarizer.show_samples()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



=== Running on BBC Dataset ===
Train Accuracy: 0.9407
Test Accuracy: 0.9315

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.94      0.96     12226
           1       0.07      0.27      0.11       200

    accuracy                           0.93     12426
   macro avg       0.53      0.60      0.54     12426
weighted avg       0.97      0.93      0.95     12426


ROUGE Scores:
rouge-1: 0.3028
rouge-2: 0.2067
rouge-l: 0.2960

--- Article 1665 ---
Predicted: "Things have been building up over the past few years and I think this is the year for Ireland," he told BBC Sport. A lot of things are in our favour with England and France at home." "For Ireland to win it we need to stay relatively injury free, and fortunately we are one of the few teams that have done that so far," Wood added. "It is going to be tough and we need to take all the luck and opportunities that come our way."
Reference: "So many of the major England players

In [21]:
from ML_models.random_forest import RandomForestSummarizer

bbc_summarizer = RandomForestSummarizer("IMDB", imdb_df)
bbc_summarizer.run()
bbc_summarizer.show_samples()


=== Running on IMDB Dataset ===
Train Accuracy: 0.9968
Test Accuracy: 0.9924

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11650
           1       0.15      0.16      0.15        50

    accuracy                           0.99     11700
   macro avg       0.57      0.58      0.57     11700
weighted avg       0.99      0.99      0.99     11700


ROUGE Scores:
rouge-1: 0.5617
rouge-2: 0.4881
rouge-l: 0.5612

--- Article 13209 ---
Predicted: Two old men sitting on park bench don really have problem with this scene Only problem is that it not scene it the entire movieYup movies don get anymore low concept than this They also don get anymore boring than this either but there worse to come because these two old men are chalk and cheese One is Nat Moyer who is Yiddish communist while the other is Midge Carter former golden gloves champion who also black Let me see now Jew and black man sitting on park bench get

CNN

In [22]:
from DL_models.cnn import CNNExtractiveSummarizer
from sklearn.model_selection import train_test_split

# Prepare data
X = bbc_processed_df["preprocessed_sentence"]
y = bbc_processed_df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
cnn_bbc = CNNExtractiveSummarizer()

# Train/tune the model
cnn_bbc.tune(X_train, y_train, X_val_raw=X_test, y_val=y_test, epochs=30)

# Evaluate
print("BBC Dataset Evaluation (CNN):")
cnn_bbc.evaluate(X_test, y_test)

# Generate summaries for a few articles
sample_article_ids = bbc_processed_df["article_id"].unique()[:5]

for article_id in sample_article_ids:
    article_df = bbc_processed_df[bbc_processed_df["article_id"] == article_id]
    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = cnn_bbc.summarize(
        article_df["article_sentences"].tolist(),
        article_df["preprocessed_sentence"].tolist()
    )

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = cnn_bbc.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])




Epoch 1/30
[1m1042/1042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 9ms/step - accuracy: 0.6324 - loss: 0.6397 - val_accuracy: 0.6657 - val_loss: 0.6049
Epoch 2/30
[1m1042/1042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.7508 - loss: 0.5109 - val_accuracy: 0.6738 - val_loss: 0.6176
Epoch 3/30
[1m1042/1042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.8895 - loss: 0.2890 - val_accuracy: 0.6610 - val_loss: 0.7697
Epoch 4/30
[1m1042/1042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.9689 - loss: 0.1050 - val_accuracy: 0.6642 - val_loss: 1.1222
Epoch 5/30
[1m1042/1042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9870 - loss: 0.0557 - val_accuracy: 0.6623 - val_loss: 1.3481
Epoch 6/30
[1m1042/1042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.9928 - loss: 0.0350 - val_accuracy: 0.6720 - val_loss: 1.5404
Epoch 7/30


In [23]:
from DL_models.cnn import CNNExtractiveSummarizer
from sklearn.model_selection import train_test_split

# Prepare data
X = imdb_processed_df["preprocessed_sentence"]
y = imdb_processed_df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
cnn_bbc = CNNExtractiveSummarizer()

# Train/tune the model
cnn_bbc.tune(X_train, y_train, X_val_raw=X_test, y_val=y_test, epochs=30)

# Evaluate
print("IMDB Dataset Evaluation (CNN):")
cnn_bbc.evaluate(X_test, y_test)

# Generate summaries for a few articles
sample_article_ids = imdb_processed_df["article_id"].unique()[:5]

for article_id in sample_article_ids:
    article_df = imdb_processed_df[imdb_processed_df["article_id"] == article_id]
    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = cnn_bbc.summarize(
        article_df["article_sentences"].tolist(),
        article_df["preprocessed_sentence"].tolist()
    )

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = cnn_bbc.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])


Epoch 1/30




[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.7621 - loss: 0.5409 - val_accuracy: 0.7758 - val_loss: 0.4894
Epoch 2/30
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7902 - loss: 0.4302 - val_accuracy: 0.7731 - val_loss: 0.5166
Epoch 3/30
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9105 - loss: 0.2288 - val_accuracy: 0.7052 - val_loss: 0.6530
Epoch 4/30
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9782 - loss: 0.0710 - val_accuracy: 0.7459 - val_loss: 0.8751
Epoch 5/30
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9894 - loss: 0.0333 - val_accuracy: 0.7582 - val_loss: 1.1882
Epoch 6/30
[1m326/326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9945 - loss: 0.0169 - val_accuracy: 0.7486 - val_loss: 1.2912
Epoch 7/30
[1m326/326[0m [32m━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Article ID: 1
Reference Summary: A wonderful little production The filming technique is very unassuming very old time BBC fashion and gives comforting and sometimes discomforting sense of realism to the entire piece The actors are ex...
Generated Summary: A wonderful little production The filming technique is very unassuming very old time BBC fashion and gives comforting and sometimes discomforting sense of realism to the entire piece The actors are ex...
ROUGE Scores: {'rouge-1': {'r': 1.0, 'p': 0.5492957746478874, 'f': 0.7090909045140497}, 'rouge-2': {'r': 1.0, 'p': 0.45161290322580644, 'f': 0.6222222179358025}, 'rouge-l': {'r': 1.0, 'p': 0.5492957746478874, 'f': 0.7090909045140497}}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step

Article ID: 2
Reference Summary: ...
Generated Summary: I thought this was wonderful way to spend time on too hot summer weekend sitting in the air conditioned theater and watching light hearted comedy The plot is simplistic but th

In [24]:
from DL_models.bilstm_attention import BiLSTMSummarizer

bilstm = BiLSTMSummarizer("IMDB", imdb_processed_df)
bilstm.train()
bilstm.evaluate()
bilstm.show_samples(n=5)


=== Training on IMDB ===
Epoch 1 Loss: 0.9270
Epoch 2 Loss: 0.8734
Epoch 3 Loss: 0.8536
Epoch 4 Loss: 0.8417
Epoch 5 Loss: 0.7946
Training completed in 109.01s
Best Threshold: 0.40, F1: 0.4372

ROUGE Scores:
rouge-1: 0.4298
rouge-2: 0.3811
rouge-l: 0.4177

--- Article ID: 3460 ---
Predicted Summary:
 freddy annoyance see many time one nothing different lot time want take awful one liner get tv screen
Reference Summary:
 hit rock bottom right begin bad act jumbled sequence event mean sure freddy movie suppose dreamlike creepy one like train wreck poor sequence event awful plot setup feel like come terrible headache like get scar directing totally fail none suspense well craft horror previous sequel find even death scene mostly crass moronic death food especially except one cool scene craft like comic book battle movie get point storyline lame lame lame lame
--------------------------------------------------------------------------------

--- Article ID: 1213 ---
Predicted Summary:
 hig