In [25]:
# # Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# # Install required packages
# !pip install datasets
# !pip install transformers[torch]
# !pip install evaluate
# !pip install -U accelerate
# !pip install rouge_score

import re

import nltk
import pandas as pd
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline

# Pick the CUDA device when torch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')


Using device: cuda


In [26]:
# Fetch the 'multi_news' dataset.
# NOTE(review): trust_remote_code=True opts in to executing the dataset's
# loading script locally - confirm this is acceptable for the environment.
ds = load_dataset('multi_news', trust_remote_code=True)

# Materialise the train and test splits as pandas DataFrames (train first)
ds_train, ds_test = (
    pd.DataFrame(ds[split_name]) for split_name in ('train', 'test')
)


In [27]:
# Download the NLTK resources used later in the notebook: the 'stopwords'
# corpus is read when building `stop_words`; 'punkt' is presumably the
# tokenizer data needed by nltk.word_tokenize (TODO confirm for the installed
# NLTK version).
nltk.download('punkt')
# Last expression of the cell, so its return value is what the cell displays.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HAL-9000\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HAL-9000\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# The 'punkt' and 'stopwords' resources are already downloaded by the cell
# directly above, so the downloads are not repeated here.

# English stopwords as a set, so the per-token membership test in
# preprocess_text is a constant-time lookup.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Matches every character that is NOT an ASCII letter, a digit, whitespace or
# one of the punctuation marks . , ! ?  Compiled once instead of per call.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s.,!?]')

def preprocess_text(text):
    """Clean one document and return it as a space-joined token string.

    Steps, in order:
      1. Delete every character other than ASCII letters, digits, whitespace
         and the punctuation marks ``. , ! ?``.
      2. Lower-case the text and tokenize it with ``nltk.word_tokenize``.
      3. Drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: The document to clean, as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (an empty string when no
        token survives).
    """
    text = _DISALLOWED_CHARS.sub('', text)
    tokens = nltk.word_tokenize(text.lower())
    # Lower-casing happens before this filter, so the comparison against
    # stop_words is effectively case-insensitive.
    return ' '.join(word for word in tokens if word not in stop_words)

def preprocess_and_save(dataset, save_path, batch_size=100):
    """Preprocess the 'document' column in batches and write the rows to a CSV.

    Each batch is a copy of a row slice of ``dataset``, so ``dataset`` itself
    is never modified. The first batch (re)creates ``save_path`` with a header
    row; later batches are appended without a header.

    Args:
        dataset: DataFrame with a 'document' column; all columns are written.
        save_path: Destination CSV path. An existing file is overwritten.
        batch_size: Number of rows processed and written per batch (> 0).

    Raises:
        ValueError: If ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently write nothing.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total_rows = len(dataset)
    if total_rows == 0:
        # Still emit a header-only file so a stale CSV from an earlier run is
        # never mistaken for this run's output.
        dataset.to_csv(save_path, index=False)
        return

    for start in tqdm(range(0, total_rows, batch_size), desc="Processing batches"):
        # iloc slicing clamps at the end of the frame, so no explicit min().
        batch = dataset.iloc[start:start + batch_size].copy()
        batch['document'] = batch['document'].map(preprocess_text)

        is_first_batch = start == 0
        batch.to_csv(
            save_path,
            mode='w' if is_first_batch else 'a',
            header=is_first_batch,
            index=False,
        )

# Run the preprocessing for both splits, training data first, each into its own CSV
for split_frame, csv_path in (
    (ds_train, 'preprocessed_train.csv'),
    (ds_test, 'preprocessed_test.csv'),
):
    preprocess_and_save(split_frame, csv_path)

print("Preprocessing completed.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HAL-9000\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HAL-9000\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Processing batches:  80%|███████▉  | 359/450 [05:03<01:22,  1.10it/s]

In [None]:
# Count documents and vocabulary size
num_documents = len(ds_train) + len(ds_test)

# Build the vocabulary one document at a time instead of first joining the
# whole corpus into a single string; the resulting set of whitespace-separated
# tokens is the same, without the extra corpus-sized copy in memory.
# NOTE(review): ds_train / ds_test are not modified by preprocess_and_save
# (it only writes CSV files), so this counts tokens of the documents as held
# in these frames - confirm whether the preprocessed CSVs were intended.
vocabulary = set()
for split_frame in (ds_train, ds_test):
    for document in split_frame['document']:
        vocabulary.update(document.split())
vocabulary_size = len(vocabulary)

print(f"Total documents: {num_documents}")
print(f"Vocabulary size: {vocabulary_size}")


In [None]:
# Tokenizer belonging to the 'facebook/bart-base' checkpoint
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The call requests padding to ``max_length``, truncation, a maximum length
    of 512 and PyTorch tensors ('pt') as the return format.

    Args:
        text: Input passed straight through to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    return tokenizer(text, **encode_options)


In [None]:
# Encode every test document, one tokenizer call per document, keeping order
tokenized_documents = list(map(tokenize_function, ds_test['document']))


In [6]:
# Load the Summarization Pipeline.
# Pass the device selected at the top of the notebook explicitly, so the model
# is placed on the GPU when one is available rather than relying on the
# pipeline's default placement.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-base",
    device=device,
)


In [7]:
# Collect the test-split documents as a plain Python list
documents_to_summarize = ds_test['document'].tolist()
print(f"Number of documents to summarize: {len(documents_to_summarize)}")

# Keep only the leading `max_input_length` items of each document.
# NOTE(review): assuming documents are str, this slice counts characters,
# not model tokens - confirm that is the intended limit.
max_input_length = 1024
leading_window = slice(0, max_input_length)
documents_to_summarize = [text[leading_window] for text in documents_to_summarize]


In [10]:
# Make predictions in batches
batch_size = 8
predicted_summaries = []

for i in tqdm(range(0, len(documents_to_summarize), batch_size)):
    batch_docs = documents_to_summarize[i:i + batch_size]
    summaries = summarizer(
        batch_docs,
        max_length=40,
        min_length=30,
        do_sample=False,
        # Token-level safety net: the earlier slice limits characters, which
        # does not by itself bound the number of tokens fed to the model.
        truncation=True,
        # Without this the pipeline handles the list one item at a time, so
        # the chunking above would not batch the model's forward passes.
        batch_size=batch_size,
    )
    predicted_summaries.extend(summaries)

# Extract the summary text from each result dict
predicted_summaries_text = [summary['summary_text'] for summary in predicted_summaries]


In [None]:
# Print every generated summary, numbered from 1
for i, summary in enumerate(predicted_summaries_text):
    print(f"Document {i+1} Summary: {summary}")

# Pair each input document (as passed to the summarizer) with its summary
summary_columns = {
    'document': documents_to_summarize,
    'summary': predicted_summaries_text,
}
results_df = pd.DataFrame(summary_columns)

# Write the pairs to disk without the DataFrame index
output_csv = 'summarization_results.csv'
results_df.to_csv(output_csv, index=False)

print("Summarization results saved to 'summarization_results.csv'.")


In [17]:
# Score the generated summaries against the test split's reference summaries
from evaluate import load

reference_summaries = ds_test['summary'].tolist()
rouge = load("rouge")
results = rouge.compute(
    predictions=predicted_summaries_text,
    references=reference_summaries,
)
print("ROUGE scores:", results)

# Repeat the scores under a "reflection" heading
print("Reflection on Summarization Performance:")
print(f"ROUGE scores: {results}")

