In [1]:
import os
import pandas as pd
from utils import prepare_labeled_sentences, prepare_labeled_sentences_spacy

Read Datasets

In [2]:
# BBC Dataset
bbc_df = pd.read_csv("data/bbc/bbc_dataset.csv")

#IMDB Dataset
imdb_df = pd.read_csv("data/imdb/imdb.csv")

In [3]:
# Preview to confirm structure
print("BBC Sample:")
display(bbc_df.head())

BBC Sample:


Unnamed: 0,Article,Summary
0,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...
1,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...
2,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo..."
3,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...
4,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin..."


In [4]:
print("IMDB Sample:")
display(imdb_df.head())

IMDB Sample:


Unnamed: 0,Article,Summary
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...
1,A wonderful little production The filming tech...,A wonderful little production The filming tech...
2,I thought this was wonderful way to spend time...,I thought it was proof that Woody Allen is sti...
3,Basically there a family where little boy Jake...,Basically there a family where little boy Jake...
4,Petter Mattei Love in the Time of Money is vis...,Petter Mattei Love in the Time of Money is vis...


Preprocess BBC Datasets

In [5]:
# Process the BBC dataset
bbc_labeled_data = prepare_labeled_sentences_spacy(bbc_df)

# Convert to DataFrame for modeling
bbc_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in bbc_labeled_data
    ]
)

Preprocessing articles: 100%|██████████| 2225/2225 [05:07<00:00,  7.24it/s]


In [28]:
bbc_processed_df.shape

(41677, 4)

In [7]:
# Count how many sentences are labeled as summary sentences
summary_count = bbc_processed_df['label'].sum()
total_count = len(bbc_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(bbc_processed_df[bbc_processed_df['label'] == 1].head(3))

Summary sentences: 16543 out of 41677 (39.69%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Musicians to tackle US red tape Musicians' gr...,musician tackle u red tape musician group tack...,1
1,0,A singer hoping to perform in the US can expec...,singer hop perform u expect pay simply obtain ...,1
4,0,Nigel McCune from the Musicians' Union said Br...,nigel mccune musician union say british musici...,1


In [8]:
bbc_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,Musicians to tackle US red tape Musicians' gr...,musician tackle u red tape musician group tack...,1
1,0,A singer hoping to perform in the US can expec...,singer hop perform u expect pay simply obtain ...,1
2,0,Groups including the Musicians' Union are call...,group include musician union call end raw deal...,0
3,0,US acts are not faced with comparable expense ...,u act face comparable expense bureaucracy visi...,0
4,0,Nigel McCune from the Musicians' Union said Br...,nigel mccune musician union say british musici...,1
5,0,A sponsor has to make a petition on their beha...,sponsor make petition behalf form amount nearl...,0
6,0,"""If you make a mistake on your form, you risk ...",make mistake form risk ban thus ability career...,0
7,0,"""The US is the world's biggest music market, w...",u world big music market mean something creaky...,1
8,0,"""The current situation is preventing British a...",current situation prevent british act maintain...,1
9,0,The Musicians' Union stance is being endorsed ...,musician union stance endorse music manager fo...,1


Preprocessed IMDB Dataset

In [6]:
# Process the BBC dataset
imdb_labeled_df = prepare_labeled_sentences_spacy(imdb_df[:4000])

# Convert to DataFrame for modeling
imdb_processed_df = pd.DataFrame(
    [
        {
            "article_id": item["article_id"],
            "article_sentences": item["raw_sentence"],
            "preprocessed_sentence": item["preprocessed_sentence"],
            "label": item["label"],
        }
        for item in imdb_labeled_df
    ]
)

Preprocessing articles: 100%|██████████| 4000/4000 [03:45<00:00, 17.71it/s]


In [10]:
imdb_processed_df.shape

(13024, 4)

In [11]:
# Count how many sentences are labeled as summary sentences
summary_count = imdb_processed_df['label'].sum()
total_count = len(imdb_processed_df)
print(f"Summary sentences: {summary_count} out of {total_count} ({summary_count/total_count:.2%})")

# Show some examples of sentences included in summaries
print("\nExample summary sentences:")
display(imdb_processed_df[imdb_processed_df['label'] == 1].head(3))

Summary sentences: 2934 out of 13024 (22.53%)

Example summary sentences:


Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1
11,4,Petter Mattei Love in the Time of Money is vis...,petter mattei love time money visually stunnin...,1


In [12]:
print(imdb_processed_df["article_sentences"][2])

A wonderful little production The filming technique is very unassuming very old time BBC fashion and gives comforting and sometimes discomforting sense of realism to the entire piece The actors are extremely well chosen Michael Sheen not only has got all the polari


In [13]:
imdb_processed_df.head(60)

Unnamed: 0,article_id,article_sentences,preprocessed_sentence,label
0,0,One of the other reviewers has mentioned that ...,one reviewer mention watch oz episode hook rig...,0
1,0,This show pulls no punches with regards to dru...,show pull punch regard drug sex violence hardc...,0
2,1,A wonderful little production The filming tech...,wonderful little production filming technique ...,1
3,1,but he has all the voices down pat too You can...,voice pat truly see seamless edit guide refere...,0
4,1,but it is terrificly written and performed pie...,terrificly write perform piece masterful produ...,0
5,1,The realism really comes home with the little ...,realism really come home little thing fantasy ...,0
6,2,I thought this was wonderful way to spend time...,think wonderful way spend time hot summer week...,0
7,2,The plot is simplistic but the dialogue is wit...,plot simplistic dialogue witty character likab...,0
8,2,While some may be disappointed when they reali...,may disappoint realize match point risk addict...,0
9,3,Basically there a family where little boy Jake...,basically family little boy jake think zombie ...,1


In [14]:
from sklearn.utils import resample

def balance_dataset(df):
    df_majority = df[df.label == 0]
    df_minority = df[df.label == 1]

    df_minority_upsampled = resample(
        df_minority, replace=True, n_samples=len(df_majority), random_state=42
    )

    return pd.concat([df_majority, df_minority_upsampled])


# Balance both datasets
bbc_balanced = balance_dataset(bbc_processed_df)
imdb_balanced = balance_dataset(imdb_processed_df)


#### Machine Learning Models

kNN

In [None]:
from ML_models.knn import KNNExtractiveSummarizer
from sklearn.model_selection import train_test_split

# Prepare data
X = bbc_balanced["preprocessed_sentence"]
y = bbc_balanced["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and tune model
knn_bbc = KNNExtractiveSummarizer()

# Fine-tune the model on the training set
knn_bbc.tune(X_train, y_train, n_iter=10, scoring="f1")  # You can increase n_iter

# Evaluate
print("BBC Dataset Evaluation (KNN):")
knn_bbc.evaluate(X_test, y_test)

# Generate summaries for a few articles
sample_article_ids = bbc_balanced["article_id"].unique()[:5]

for article_id in sample_article_ids:
    article_df = bbc_balanced[bbc_balanced["article_id"] == article_id]
    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = knn_bbc.summarize(
        article_df["article_sentences"].tolist(),
        article_df["preprocessed_sentence"].tolist(),
    )

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = knn_bbc.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])

Best params: {'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 3, 'tfidf__max_features': 3000, 'clf__weights': 'distance', 'clf__n_neighbors': 7, 'clf__metric': 'cosine'}
Best score: 0.7530351372268077
BBC Dataset Evaluation (KNN):
              precision    recall  f1-score   support

           0       0.84      0.74      0.79      5043
           1       0.77      0.86      0.81      5011

    accuracy                           0.80     10054
   macro avg       0.80      0.80      0.80     10054
weighted avg       0.80      0.80      0.80     10054


Article ID: 0
Reference Summary: TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. For the full-year, Time...
Generated Summary: TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4%

In [None]:
from ML_models.knn import KNNExtractiveSummarizer
from sklearn.model_selection import train_test_split

# Prepare data
X = imdb_balanced["preprocessed_sentence"]
y = imdb_balanced["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and tune model
knn_bbc = KNNExtractiveSummarizer()

# Fine-tune the model on the training set
knn_bbc.tune(X_train, y_train, n_iter=10, scoring="f1")  # You can increase n_iter

# Evaluate
print("IMDB Dataset Evaluation (KNN):")
knn_bbc.evaluate(X_test, y_test)

# Generate summaries for a few articles
sample_article_ids = imdb_balanced["article_id"].unique()[:5]

for article_id in sample_article_ids:
    article_df = imdb_balanced[imdb_balanced["article_id"] == article_id]
    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = knn_bbc.summarize(
        article_df["article_sentences"].tolist(),
        article_df["preprocessed_sentence"].tolist(),
    )

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = knn_bbc.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])

Best params: {'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 1, 'tfidf__max_features': 5000, 'clf__weights': 'distance', 'clf__n_neighbors': 7, 'clf__metric': 'euclidean'}
Best score: 0.8969881150466228
IMDB Dataset Evaluation (KNN):
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      2013
           1       0.94      0.92      0.93      2023

    accuracy                           0.93      4036
   macro avg       0.93      0.93      0.93      4036
weighted avg       0.93      0.93      0.93      4036


Article ID: 0
Reference Summary: ...
Generated Summary: One of the other reviewers has mentioned that after watching just Oz episode you ll be hooked They are right as this is exactly what happened with me The first thing that struck me about Oz was its br...
Error computing ROUGE: Reference is empty.

Article ID: 1
Reference Summary: A wonderful little production The filming technique is very unassuming very old time BBC fashion and 

Logistic Regression

In [16]:
from ML_models.logistic_reg import LogisticRegressionSummarizer
from sklearn.model_selection import train_test_split

X = bbc_balanced["preprocessed_sentence"]
y = bbc_balanced["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lr_bbc = LogisticRegressionSummarizer()

# Fine-tune the model on the training set
lr_bbc.tune(
    X_train, y_train, n_iter=10, scoring="f1"
)  # You can increase n_iter if desired

print("BBC Dataset Evaluation:")
lr_bbc.evaluate(X_test, y_test)

sample_article_ids = bbc_balanced["article_id"].unique()[:3]

for article_id in sample_article_ids:
    article_df = bbc_balanced[bbc_balanced["article_id"] == article_id]
    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = lr_bbc.summarize(
        article_df["article_sentences"].tolist(),
        article_df["preprocessed_sentence"].tolist(),
    )

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = lr_bbc.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])

<<<<<<< local


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best params: {'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 3, 'tfidf__max_features': 5000, 'clf__max_iter': 1500, 'clf__C': 1}
Best score: 0.5626295285604364
BBC Dataset Evaluation:
              precision    recall  f1-score   support

           0       0.73      0.66      0.69      5062
           1       0.54      0.62      0.58      3274

    accuracy                           0.65      8336
   macro avg       0.64      0.64      0.64      8336
weighted avg       0.66      0.65      0.65      8336


Article ID: 0
Reference Summary: TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. For the full-year, Time...
Generated Summary: The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quar

In [17]:
from ML_models.logistic_reg import LogisticRegressionSummarizer
from sklearn.model_selection import train_test_split

X = imdb_balanced["preprocessed_sentence"]
y = imdb_balanced["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lr_imdb = LogisticRegressionSummarizer()

# Fine-tune the model on the training set
lr_imdb.tune(X_train, y_train, n_iter=10, scoring="f1")

print("IMDB Dataset Evaluation:")
lr_imdb.evaluate(X_test, y_test)

sample_article_ids = imdb_balanced["article_id"].unique()[:3]

for article_id in sample_article_ids:
    article_df = imdb_balanced[imdb_balanced["article_id"] == article_id]
    article_sents = article_df["article_sentences"].tolist()
    preprocessed_sents = article_df["preprocessed_sentence"].tolist()

    if not preprocessed_sents or not article_sents:
        print(f"\nArticle ID: {article_id}")
        print("Empty input. Skipping...")
        continue

    reference_summary = " ".join(
        article_df[article_df["label"] == 1]["article_sentences"]
    )
    generated_summary = lr_imdb.summarize(article_sents, preprocessed_sents)

    print(f"\nArticle ID: {article_id}")
    print("Reference Summary:", reference_summary[:200] + "...")
    print("Generated Summary:", generated_summary[:200] + "...")

    rouge_scores = lr_imdb.compute_rouge(generated_summary, reference_summary)
    if rouge_scores is not None:
        print("ROUGE Scores:", rouge_scores[0])

<<<<<<< local


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best params: {'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 2, 'tfidf__max_features': 3000, 'clf__max_iter': 500, 'clf__C': 0.1}
Best score: 0.40721609299234146
IMDB Dataset Evaluation:
              precision    recall  f1-score   support

           0       0.85      0.69      0.76      2021
           1       0.35      0.57      0.43       584

    accuracy                           0.80      4036
   macro avg       0.80      0.80      0.80      4036
weighted avg       0.80      0.80      0.80      4036


Article ID: 0
Reference Summary: ...
Generated Summary: One of the other reviewers has mentioned that after watching just Oz episode you ll be hooked They are right as this is exactly what happened with me The first thing that struck me about Oz was its br...
Error computing ROUGE: Reference is empty.

Article ID: 1
Reference Summary: A wonderful little production The filming technique is very unassuming very old time BB

>>>>>>> remote


Decision Trees

In [None]:
from ML_models.decisionTreeClassifierModel import DecisionTreeClassifierModel
bbc_model = DecisionTreeClassifierModel("BBC", bbc_processed_df)
bbc_model.run()
bbc_model.show_predictions(n=10)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



=== Running on BBC Dataset ===
Train Accuracy: 0.8802
Test Accuracy: 0.8588

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.87      0.92     12218
           1       0.06      0.47      0.10       208

    accuracy                           0.86     12426
   macro avg       0.52      0.67      0.51     12426
weighted avg       0.97      0.86      0.91     12426


ROUGE Scores:
rouge-1: 0.3920
rouge-2: 0.2834
rouge-l: 0.3817

--- Article 1665 ---
Predicted: Campaign groups including Friends of the Earth, the World Development Movement, and War on Want said UK government policy on free trade was a major barrier to fighting poverty.
Reference: Mr Brown welcomed news that the Bill Gates Foundation and Norway are joining up to put an extra Â£0.53bn ($1bn ) into the Global Alliance for Vaccines and Immunisation (Gavi).UK Chancellor Gordon Brown has offered Â£960m ($1.8bn) over 15 years to an international scheme aiming to boost v

In [None]:
from ML_models.decisionTreeClassifierModel import DecisionTreeClassifierModel
bbc_model = DecisionTreeClassifierModel("imdb", imdb_processed_df)
bbc_model.run()
bbc_model.show_predictions(n=10)


=== Running on IMDB Dataset ===
Train Accuracy: 0.9771
Test Accuracy: 0.9684

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     11650
           1       0.05      0.34      0.08        50

    accuracy                           0.97     11700
   macro avg       0.52      0.66      0.53     11700
weighted avg       0.99      0.97      0.98     11700


ROUGE Scores:
rouge-1: 0.5617
rouge-2: 0.4881
rouge-l: 0.5612

--- Article 13209 ---
Predicted: Two old men sitting on park bench don really have problem with this scene Only problem is that it not scene it the entire movieYup movies don get anymore low concept than this They also don get anymore boring than this either but there worse to come because these two old men are chalk and cheese One is Nat Moyer who is Yiddish communist while the other is Midge Carter former golden gloves champion who also black Let me see now Jew and black man sitting on park bench get

In [None]:
from ML_models.decision_tree import DecisionTreeSummarizer

bbc_summarizer = DecisionTreeSummarizer("BBC", bbc_df)
bbc_summarizer.run()
bbc_summarizer.show_samples()

In [None]:
bbc_summarizer = DecisionTreeSummarizer("IMDB", imdb_df)
bbc_summarizer.run()
bbc_summarizer.show_samples()

Random Forest

In [58]:
from ML_models.randomforest import RandomForestClassifierModel
model_bbc_rf = RandomForestClassifierModel("BBC", bbc_processed_df)
model_bbc_rf.run()
model_bbc_rf.show_predictions(n=3)


=== Running classification on BBC Dataset ===

Train Accuracy: 0.6780
Test Accuracy: 0.6516

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.68      0.66      7557
           1       0.66      0.63      0.64      7524

    accuracy                           0.65     15081
   macro avg       0.65      0.65      0.65     15081
weighted avg       0.65      0.65      0.65     15081


=== Article-wise Summary Evaluation ===

Article ID: 0
Reference Summary: Musicians to tackle US red tape  Musicians' groups are to tackle US visa regulations which are blamed for hindering British acts' chances of succeeding across the Atlantic. A singer hoping to perform in the US can expect to pay $1,300 (Â£680) simply for obtaining a visa. Nigel McCune from the Musicians' Union said British musicians are "disadvantaged" compared to their US counterparts. "The US is the world's biggest music market, which means something has to be done about the 

In [59]:
from ML_models.randomforest import RandomForestClassifierModel
model_bbc_rf = RandomForestClassifierModel("imdb", imdb_processed_df)
model_bbc_rf.run()
model_bbc_rf.show_predictions(n=3)


=== Running classification on imdb Dataset ===

Train Accuracy: 0.7743
Test Accuracy: 0.7241

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.67      0.71      3037
           1       0.70      0.78      0.74      3017

    accuracy                           0.72      6054
   macro avg       0.73      0.72      0.72      6054
weighted avg       0.73      0.72      0.72      6054


=== Article-wise Summary Evaluation ===

Article ID: 0
Reference Summary: ...
Generated Summary: One of the other reviewers has mentioned that after watching just Oz episode you ll be hooked They are right as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not show for the faint hearted or timid
Error computing ROUGE: Reference or generated summary is empty.

Article ID: 1
Reference Summary: A wonderful little prod

In [20]:
from ML_models.random_forest import RandomForestSummarizer

bbc_summarizer = RandomForestSummarizer("BBC", bbc_df)
bbc_summarizer.run()
bbc_summarizer.show_samples()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



=== Running on BBC Dataset ===
Train Accuracy: 0.9407
Test Accuracy: 0.9315

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.94      0.96     12226
           1       0.07      0.27      0.11       200

    accuracy                           0.93     12426
   macro avg       0.53      0.60      0.54     12426
weighted avg       0.97      0.93      0.95     12426


ROUGE Scores:
rouge-1: 0.3028
rouge-2: 0.2067
rouge-l: 0.2960

--- Article 1665 ---
Predicted: "Things have been building up over the past few years and I think this is the year for Ireland," he told BBC Sport. A lot of things are in our favour with England and France at home." "For Ireland to win it we need to stay relatively injury free, and fortunately we are one of the few teams that have done that so far," Wood added. "It is going to be tough and we need to take all the luck and opportunities that come our way."
Reference: "So many of the major England players

In [None]:
from ML_models.random_forest import RandomForestSummarizer

bbc_summarizer = RandomForestSummarizer("IMDB", imdb_df)
bbc_summarizer.run()
bbc_summarizer.show_samples()


=== Running on IMDB Dataset ===
Train Accuracy: 0.9968
Test Accuracy: 0.9924

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11650
           1       0.15      0.16      0.15        50

    accuracy                           0.99     11700
   macro avg       0.57      0.58      0.57     11700
weighted avg       0.99      0.99      0.99     11700


ROUGE Scores:
rouge-1: 0.5617
rouge-2: 0.4881
rouge-l: 0.5612

--- Article 13209 ---
Predicted: Two old men sitting on park bench don really have problem with this scene Only problem is that it not scene it the entire movieYup movies don get anymore low concept than this They also don get anymore boring than this either but there worse to come because these two old men are chalk and cheese One is Nat Moyer who is Yiddish communist while the other is Midge Carter former golden gloves champion who also black Let me see now Jew and black man sitting on park bench get

XGBoost

In [28]:
import pandas as pd
from ML_models.xgboost import XGBoostClassifierModel

# ✅ Use the actual DataFrame variable, not a string
# Assuming bbc_processed_df was defined earlier
df = bbc_processed_df

# Run the model
model = XGBoostClassifierModel(dataset_name="BBC", df=df)
model.run()
model.show_predictions(n=3)



=== Running classification on BBC Dataset ===

Train Accuracy: 0.7293
Test Accuracy: 0.6651

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.87      0.76      5027
           1       0.64      0.36      0.46      3309

    accuracy                           0.67      8336
   macro avg       0.66      0.61      0.61      8336
weighted avg       0.66      0.67      0.64      8336


=== Article-wise Summary Evaluation ===

Article ID: 1385
Reference Summary: Martinez sees off Vinci challenge  Veteran Spaniard Conchita Martinez came from a set down to beat Italian Roberta Vinci at the Qatar Open in Doha. Slovakian Daniela Hantuchova beat Bulgarian Magdaleena Maleeva 4-6 6-4 6-3 to set up a second round clash with Russian Elena Bovina. The veteran Martinez found herself in trouble early on against Vinci with the Italian clinching the set thanks to breaks in the third and 11th games.
Generated Summary: The veteran Martinez found h

In [32]:
from ML_models.xgboost import XGBoostClassifierModel

df = imdb_processed_df  # Your preprocessed DataFrame with 'text' and 'label' columns

model = XGBoostClassifierModel(dataset_name="imdb", df=df)
model.run()



=== Running classification on imdb Dataset ===

Train Accuracy: 0.8545
Test Accuracy: 0.775

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.97      0.87      2018
           1       0.50      0.11      0.18       587

    accuracy                           0.78      2605
   macro avg       0.65      0.54      0.53      2605
weighted avg       0.72      0.78      0.71      2605



Naive Bayes

In [None]:
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


def train_and_evaluate(df, dataset_name=""):
    X = df["preprocessed_sentence"]
    y = df["label"]

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
    X_vec = vectorizer.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X_vec, y, test_size=0.2, random_state=42
    )

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n===== {dataset_name} Dataset Results =====")
    print(classification_report(y_test, y_pred))


# Train on BBC
train_and_evaluate(bbc_balanced, dataset_name="BBC")

# Train on IMDB
train_and_evaluate(imdb_balanced, dataset_name="IMDB")


===== BBC Dataset Results =====
              precision    recall  f1-score   support

           0       0.86      0.55      0.67      5043
           1       0.67      0.91      0.77      5011

    accuracy                           0.73     10054
   macro avg       0.77      0.73      0.72     10054
weighted avg       0.77      0.73      0.72     10054


===== IMDB Dataset Results =====
              precision    recall  f1-score   support

           0       0.93      0.75      0.83      2013
           1       0.79      0.95      0.86      2023

    accuracy                           0.85      4036
   macro avg       0.86      0.85      0.85      4036
weighted avg       0.86      0.85      0.85      4036



#### Deep Learning Models

In [15]:
from DL_models.vanilla_transformer import run_pipeline

model = run_pipeline(bbc_df)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Preparing data...
Training model...
Epoch 1 - Val Accuracy: 0.9923
Epoch 2 - Val Accuracy: 0.9923
Epoch 3 - Val Accuracy: 0.9923
Epoch 4 - Val Accuracy: 0.9923
Epoch 5 - Val Accuracy: 0.9923

Sample Evaluation:

📄 Article:
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and high...

✂️ Predicted Summary:
Ad sales boost Time Warner profit  Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. "Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Pa

CNN

In [None]:
# from DL_models.cnn import CNNExtractiveSummarizer
# from sklearn.model_selection import train_test_split

# # Prepare data
# X = bbc_processed_df["preprocessed_sentence"]
# y = bbc_processed_df["label"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize model
# cnn_bbc = CNNExtractiveSummarizer()

# # Train/tune the model
# cnn_bbc.tune(X_train, y_train, X_val_raw=X_test, y_val=y_test, epochs=30)

# # Evaluate
# print("BBC Dataset Evaluation (CNN):")
# cnn_bbc.evaluate(X_test, y_test)

# # Generate summaries for a few articles
# sample_article_ids = bbc_processed_df["article_id"].unique()[:5]

# for article_id in sample_article_ids:
#     article_df = bbc_processed_df[bbc_processed_df["article_id"] == article_id]
#     reference_summary = " ".join(
#         article_df[article_df["label"] == 1]["article_sentences"]
#     )
#     generated_summary = cnn_bbc.summarize(
#         article_df["article_sentences"].tolist(),
#         article_df["preprocessed_sentence"].tolist()
#     )

#     print(f"\nArticle ID: {article_id}")
#     print("Reference Summary:", reference_summary[:200] + "...")
#     print("Generated Summary:", generated_summary[:200] + "...")

#     rouge_scores = cnn_bbc.compute_rouge(generated_summary, reference_summary)
#     if rouge_scores is not None:
#         print("ROUGE Scores:", rouge_scores[0])


In [None]:
# from DL_models.cnn import CNNExtractiveSummarizer
# from sklearn.model_selection import train_test_split

# # Prepare data
# X = imdb_processed_df["preprocessed_sentence"]
# y = imdb_processed_df["label"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize model
# cnn_bbc = CNNExtractiveSummarizer()

# # Train/tune the model
# cnn_bbc.tune(X_train, y_train, X_val_raw=X_test, y_val=y_test, epochs=30)

# # Evaluate
# print("IMDB Dataset Evaluation (CNN):")
# cnn_bbc.evaluate(X_test, y_test)

# # Generate summaries for a few articles
# sample_article_ids = imdb_processed_df["article_id"].unique()[:5]

# for article_id in sample_article_ids:
#     article_df = imdb_processed_df[imdb_processed_df["article_id"] == article_id]
#     reference_summary = " ".join(
#         article_df[article_df["label"] == 1]["article_sentences"]
#     )
#     generated_summary = cnn_bbc.summarize(
#         article_df["article_sentences"].tolist(),
#         article_df["preprocessed_sentence"].tolist()
#     )

#     print(f"\nArticle ID: {article_id}")
#     print("Reference Summary:", reference_summary[:200] + "...")
#     print("Generated Summary:", generated_summary[:200] + "...")

#     rouge_scores = cnn_bbc.compute_rouge(generated_summary, reference_summary)
#     if rouge_scores is not None:
#         print("ROUGE Scores:", rouge_scores[0])


In [None]:
from DL_models.bilstm_attention import BiLSTMSummarizer

bilstmn = BiLSTMSummarizer("BBC", bbc_processed_df)
bilstmn.train()
bilstmn.evaluate()
bilstmn.show_samples(n=5)


=== Training on BBC ===
Epoch 1 Loss: 1.0872
Epoch 2 Loss: 1.0021
Epoch 3 Loss: 0.8830
Epoch 4 Loss: 0.6982
Epoch 5 Loss: 0.4818
Training completed in 281.33s
Best Threshold: 0.25, F1: 0.6022

ROUGE Scores:
rouge-1: 0.3566
rouge-2: 0.2505
rouge-l: 0.3027

--- Article ID: 1217 ---
Predicted Summary:
 lord drayson whose company powderject win pound contract provide smallpox vaccine government september terror attack give party day christmas
Reference Summary:
 party build poll war chests labour party receive donation final quarter new figure show significant donation come retire millionaire businessman philanthropist sir christopher ondaatje give party sum refrigerator magnate william haughey obe give also donation top conservative scottish business group focus scotland institute international research world large independent conference company also among gift tory donation total bearwood corporate service liberal democrat large donor joseph rowntree reform trust ltd company promote pol

In [None]:
bilstm = BiLSTMSummarizer("IMDB", imdb_processed_df)
bilstm.train()
bilstm.evaluate()
bilstm.show_samples(n=5)


=== Training on IMDB ===
Epoch 1 Loss: 0.9286
Epoch 2 Loss: 0.8791
Epoch 3 Loss: 0.8417
Epoch 4 Loss: 0.8230
Epoch 5 Loss: 0.7406
Training completed in 88.23s
Best Threshold: 0.40, F1: 0.4321

ROUGE Scores:
rouge-1: 0.4443
rouge-2: 0.3931
rouge-l: 0.4315

--- Article ID: 3460 ---
Predicted Summary:
 hit rock bottom right begin bad act jumbled sequence event mean sure freddy movie suppose dreamlike creepy one like train wreck poor sequence event awful plot setup feel like come terrible headache like get scar freddy annoyance see many time one nothing different lot time want take awful one liner get tv screen
Reference Summary:
 hit rock bottom right begin bad act jumbled sequence event mean sure freddy movie suppose dreamlike creepy one like train wreck poor sequence event awful plot setup feel like come terrible headache like get scar directing totally fail none suspense well craft horror previous sequel find even death scene mostly crass moronic death food especially except one cool 

FeedForward Neural Network


In [None]:
# # ===== Cell 1: Imports =====
# from DL_models.FNN import (FeedForwardNet, extract_features, prepare_dataloaders,
#                            compute_class_weight, train_model)
# from sklearn.model_selection import train_test_split
# import torch

In [None]:
# # ===== Cell 2: Data Preparation =====
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# X_train_df, X_val_df, y_train, y_val = train_test_split(
#     bbc_processed_df[['preprocessed_sentence']],
#     bbc_processed_df['label'].values,
#     test_size=0.2,
#     random_state=42
# )

# X_train, X_val, vectorizer = extract_features(X_train_df, X_val_df)

# train_loader, val_loader = prepare_dataloaders(X_train, y_train, X_val, y_val, device=device)

In [None]:
# # ===== Cell 3: Model Initialization =====
# input_size = X_train.shape[1]
# model = FeedForwardNet(input_size)
# pos_weight = compute_class_weight(y_train).to(device)
# criterion = torch.nn.BCELoss(pos_weight)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# # ===== Cell 4: Training =====
# train_model(model, train_loader, val_loader, criterion, optimizer, device=device, epochs=15)

In [None]:
# from rouge_score import rouge_scorer
# import numpy as np

# # ===== Cell 6: ROUGE Evaluation =====
# def evaluate_rouge(df, model, vectorizer, top_k=3):
#     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
#     scores = []

#     model.eval()
#     with torch.no_grad():
#         for article_id in df['article_id'].unique():
#             article_df = df[df['article_id'] == article_id]
#             X = vectorizer.transform(article_df['preprocessed_sentence']).toarray()
#             preds = model(torch.tensor(X, dtype=torch.float32)).numpy()

#             top_indices = preds.argsort()[-top_k:][::-1]
#             predicted_summary = " ".join(article_df.iloc[top_indices]["article_sentences"])
#             reference_summary = imdb_df.loc[article_id]["Summary"]

#             score = scorer.score(reference_summary, predicted_summary)
#             scores.append(score)

#     return scores

# rouge_scores = evaluate_rouge(bbc_processed_df, model, vectorizer)

# avg_rouge1 = np.mean([s["rouge1"].fmeasure for s in rouge_scores])
# avg_rouge2 = np.mean([s["rouge2"].fmeasure for s in rouge_scores])
# avg_rougeL = np.mean([s["rougeL"].fmeasure for s in rouge_scores])

# print(f"Average ROUGE-1: {avg_rouge1:.4f}")
# print(f"Average ROUGE-2: {avg_rouge2:.4f}")
# print(f"Average ROUGE-L: {avg_rougeL:.4f}")

LSTM

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer

def compute_rouge_scores(pred_summary, reference_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, pred_summary)
    return scores


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

nltk.download('punkt')

# === Load your data ===

texts = bbc_df["Article"]
summaries = ['starttoken ' + s + ' endtoken' for s in bbc_df["Summary"]]

# === Tokenizer setup ===
text_tokenizer = Tokenizer(num_words=5000, oov_token='UNK')
text_tokenizer.fit_on_texts(texts)

# === Parameters ===
max_sent_len = 30
max_sents = 5

# === Preprocess articles ===
def preprocess_articles(texts):
    all_articles = []
    for article in texts:
        sents = sent_tokenize(article)[:max_sents]
        tokenized = text_tokenizer.texts_to_sequences(sents)
        padded = pad_sequences(tokenized, maxlen=max_sent_len, padding='post')
        padded = np.pad(padded, ((0, max_sents - len(padded)), (0, 0)), mode='constant')
        all_articles.append(padded)
    return np.array(all_articles)

X = preprocess_articles(texts)

# # === Generate labels for extractive summary ===
def label_sentences(texts, summaries, top_n=3):
    labels = []
    for article, summary in zip(texts, summaries):
        sents = sent_tokenize(article)[:max_sents]
        summary_text = summary.replace("starttoken ", "").replace(" endtoken", "")
        
        if not sents:
            labels.append(np.zeros(max_sents))
            continue

        # Compute TF-IDF similarity
        tfidf = TfidfVectorizer().fit(sents + [summary_text])
        sent_vecs = tfidf.transform(sents)
        summary_vec = tfidf.transform([summary_text])
        sims = cosine_similarity(summary_vec, sent_vecs).flatten()

        # Get top-N most similar sentence indices
        top_indices = sims.argsort()[-top_n:]
        label = np.zeros(len(sents))
        label[top_indices] = 1

        # Pad to max_sents
        padded_label = np.pad(label, (0, max_sents - len(label)), 'constant')
        labels.append(padded_label)
        
    return np.array(labels)


y = label_sentences(texts, summaries, top_n=3)

# === Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# === Model Definition ===
input_layer = Input(shape=(max_sents, max_sent_len))
embedding_layer = TimeDistributed(Embedding(input_dim=len(text_tokenizer.word_index)+1, output_dim=128))(input_layer)
lstm_layer = TimeDistributed(LSTM(64))(embedding_layer)
output_layer = Dense(1, activation='sigmoid')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# === Callbacks ===
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
checkpoint = ModelCheckpoint('best_extractive_model.h5', monitor='val_loss', save_best_only=True, verbose=1)

# === Train ===
model.fit(
    X_train,
    np.expand_dims(y_train, -1),
    validation_split=0.1,
    epochs=10,
    batch_size=16,
    callbacks=[early_stopping, reduce_lr, checkpoint],
    verbose=1
)

# === Inference ===

def extract_summary(article):
    sents = sent_tokenize(article)[:max_sents]
    tokenized = text_tokenizer.texts_to_sequences(sents)
    padded = pad_sequences(tokenized, maxlen=max_sent_len, padding='post')
    padded = np.pad(padded, ((0, max_sents - len(padded)), (0, 0)), mode='constant')
    
    prediction = model.predict(np.expand_dims(padded, 0))[0].flatten()
    
    # === Dynamic threshold based on mean score
    threshold = prediction.mean()
    
    summary = [s for i, s in enumerate(sents) if prediction[i] > threshold]
    # Fallback if no sentence is selected
    if not summary:
        top_idx = prediction.argmax()
        summary = [sents[top_idx]]
    
    return ' '.join(summary)

# === Example Prediction + Evaluation ===
example_idx = 0
article = texts.iloc[example_idx]
reference_summary = bbc_df["Summary"].iloc[example_idx].replace("starttoken ", "").replace(" endtoken", "")

print("\n--- Example Article ---")
print(article[:500], "...")

# Predict summary
pred_summary = extract_summary(article)
print("\n--- Extracted Summary ---")
print(pred_summary)

# Compute ROUGE
rouge_scores = compute_rouge_scores(pred_summary, reference_summary)
print("\n--- ROUGE Scores BBC ---")
for key, value in rouge_scores.items():
    print(f"{key}: Precision: {value.precision:.4f}, Recall: {value.recall:.4f}, F1: {value.fmeasure:.4f}")





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.6207 - loss: 0.6479
Epoch 1: val_loss improved from inf to 0.61103, saving model to best_extractive_model.h5




[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 82ms/step - accuracy: 0.6209 - loss: 0.6477 - val_accuracy: 0.6716 - val_loss: 0.6110 - learning_rate: 0.0010
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.7358 - loss: 0.5395
Epoch 2: val_loss did not improve from 0.61103
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 75ms/step - accuracy: 0.7358 - loss: 0.5395 - val_accuracy: 0.6587 - val_loss: 0.6332 - learning_rate: 0.0010
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.8141 - loss: 0.4359
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 3: val_loss did not improve from 0.61103
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 79ms/step - accuracy: 0.8139 - loss: 0.4361 - val_accuracy: 0.6517 - val_loss: 0.6659 - learning_rate: 0.0010
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
!pip install rouge
!pip install rouge_score
from rouge_score import rouge_scorer

def compute_rouge_scores(pred_summary, reference_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, pred_summary)
    return scores


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

nltk.download('punkt')
# === Tokenizer setup ===
text_tokenizer = Tokenizer(num_words=5000, oov_token='UNK')
text_tokenizer.fit_on_texts(texts)

# === Parameters ===
max_sent_len = 30
max_sents = 5

# === Preprocess articles ===
def preprocess_articles(texts):
    all_articles = []
    for article in texts:
        sents = sent_tokenize(article)[:max_sents]
        tokenized = text_tokenizer.texts_to_sequences(sents)
        padded = pad_sequences(tokenized, maxlen=max_sent_len, padding='post')
        padded = np.pad(padded, ((0, max_sents - len(padded)), (0, 0)), mode='constant')
        all_articles.append(padded)
    return np.array(all_articles)



# # === Generate labels for extractive summary ===
def label_sentences(texts, summaries, top_n=3):
    labels = []
    for article, summary in zip(texts, summaries):
        sents = sent_tokenize(article)[:max_sents]
        summary_text = summary.replace("starttoken ", "").replace(" endtoken", "")
        
        if not sents:
            labels.append(np.zeros(max_sents))
            continue

        # Compute TF-IDF similarity
        tfidf = TfidfVectorizer().fit(sents + [summary_text])
        sent_vecs = tfidf.transform(sents)
        summary_vec = tfidf.transform([summary_text])
        sims = cosine_similarity(summary_vec, sent_vecs).flatten()

        # Get top-N most similar sentence indices
        top_indices = sims.argsort()[-top_n:]
        label = np.zeros(len(sents))
        label[top_indices] = 1

        # Pad to max_sents
        padded_label = np.pad(label, (0, max_sents - len(label)), 'constant')
        labels.append(padded_label)
        
    return np.array(labels)



# === Model Definition ===
input_layer = Input(shape=(max_sents, max_sent_len))
embedding_layer = TimeDistributed(Embedding(input_dim=len(text_tokenizer.word_index)+1, output_dim=100,trainable=False))(input_layer)
lstm_layer = TimeDistributed(LSTM(64, dropout=0.3, recurrent_dropout=0.3))(embedding_layer)
output_layer = Dense(1, activation='sigmoid')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# === Callbacks ===
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
checkpoint = ModelCheckpoint('best_extractive_model.h5', monitor='val_loss', save_best_only=True, verbose=1)


# === Inference ===

def extract_summary(article):
    sents = sent_tokenize(article)[:max_sents]
    tokenized = text_tokenizer.texts_to_sequences(sents)
    padded = pad_sequences(tokenized, maxlen=max_sent_len, padding='post')
    padded = np.pad(padded, ((0, max_sents - len(padded)), (0, 0)), mode='constant')
    
    prediction = model.predict(np.expand_dims(padded, 0))[0].flatten()
    
    # === Dynamic threshold based on mean score
    threshold = prediction.mean()
    
    summary = [s for i, s in enumerate(sents) if prediction[i] > threshold]
    # Fallback if no sentence is selected
    if not summary:
        top_idx = prediction.argmax()
        summary = [sents[top_idx]]
    
    return ' '.join(summary)


# === Example Prediction + Evaluation ===


# === Load IMDB data ===
texts = imdb_df["Article"]
summaries = ['starttoken ' + s + ' endtoken' for s in imdb_df["Summary"]]

# === Preprocess ===
X = preprocess_articles(texts)
y = label_sentences(texts, summaries, top_n=3)

# === Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# === Train the model on IMDb dataset ===
model.fit(
    X_train,
    np.expand_dims(y_train, -1),
    validation_split=0.1,
    epochs=10,
    batch_size=16,
    callbacks=[early_stopping, reduce_lr, checkpoint],
    verbose=1
)

# === Evaluate on an IMDb example ===
example_idx = 0
article = texts.iloc[example_idx]
reference_summary = imdb_df["Summary"].iloc[example_idx].replace("starttoken ", "").replace(" endtoken", "")

print("\n--- IMDB Sample Article ---")
print(article[:500], "...")

# Predict summary
pred_summary = extract_summary(article)
print("\n--- Extracted Summary ---")
print(pred_summary)

# Compute ROUGE
rouge_scores = compute_rouge_scores(pred_summary, reference_summary)
print("\n--- ROUGE Scores ---")
for key, value in rouge_scores.items():
    print(f"{key}: Precision: {value.precision:.4f}, Recall: {value.recall:.4f}, F1: {value.fmeasure:.4f}")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.9857 - loss: 0.0331
Epoch 1: val_loss improved from inf to 0.00001, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 59ms/step - accuracy: 0.9857 - loss: 0.0331 - val_accuracy: 1.0000 - val_loss: 1.0450e-05 - learning_rate: 0.0010
Epoch 2/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 1.0000 - loss: 1.0317e-05
Epoch 2: val_loss improved from 0.00001 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 57ms/step - accuracy: 1.0000 - loss: 1.0315e-05 - val_accuracy: 1.0000 - val_loss: 2.3533e-06 - learning_rate: 0.0010
Epoch 3/10
[1m1974/1975[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 56ms/step - accuracy: 1.0000 - loss: 2.6681e-06
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 3: val_loss improved from 0.00000 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 58ms/step - accuracy: 1.0000 - loss: 2.6675e-06 - val_accuracy: 1.0000 - val_loss: 7.4317e-07 - learning_rate: 0.0010
Epoch 4/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 1.0000 - loss: 1.0023e-06
Epoch 4: val_loss improved from 0.00000 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 63ms/step - accuracy: 1.0000 - loss: 1.0023e-06 - val_accuracy: 1.0000 - val_loss: 3.8727e-07 - learning_rate: 5.0000e-04
Epoch 5/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 1.0000 - loss: 5.1363e-07
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 5: val_loss improved from 0.00000 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 64ms/step - accuracy: 1.0000 - loss: 5.1358e-07 - val_accuracy: 1.0000 - val_loss: 1.7012e-07 - learning_rate: 5.0000e-04
Epoch 6/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 1.0000 - loss: 2.4595e-07
Epoch 6: val_loss improved from 0.00000 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 64ms/step - accuracy: 1.0000 - loss: 2.4593e-07 - val_accuracy: 1.0000 - val_loss: 9.8545e-08 - learning_rate: 2.5000e-04
Epoch 7/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 1.0000 - loss: 1.3991e-07
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 7: val_loss improved from 0.00000 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 64ms/step - accuracy: 1.0000 - loss: 1.3990e-07 - val_accuracy: 1.0000 - val_loss: 4.7582e-08 - learning_rate: 2.5000e-04
Epoch 8/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 1.0000 - loss: 7.3384e-08
Epoch 8: val_loss improved from 0.00000 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 64ms/step - accuracy: 1.0000 - loss: 7.3380e-08 - val_accuracy: 1.0000 - val_loss: 2.9140e-08 - learning_rate: 1.2500e-04
Epoch 9/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 1.0000 - loss: 4.4152e-08
Epoch 9: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.

Epoch 9: val_loss improved from 0.00000 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 66ms/step - accuracy: 1.0000 - loss: 4.4149e-08 - val_accuracy: 1.0000 - val_loss: 1.5255e-08 - learning_rate: 1.2500e-04
Epoch 10/10
[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 1.0000 - loss: 2.5486e-08
Epoch 10: val_loss improved from 0.00000 to 0.00000, saving model to best_extractive_model.h5




[1m1975/1975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 64ms/step - accuracy: 1.0000 - loss: 2.5485e-08 - val_accuracy: 1.0000 - val_loss: 1.0603e-08 - learning_rate: 6.2500e-05
Restoring model weights from the end of the best epoch: 10.

--- IMDB Sample Article ---
One of the other reviewers has mentioned that after watching just Oz episode you ll be hooked They are right as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the word It is called OZ as that is the nickname given to the ...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step

--- Extracted Summary ---
One of the other reviewers has mentioned that after watching just Oz episode you ll be hooked They are right as this 