## Pre-Processing and Manual Tokenization

This notebook loads the 'multi_news' dataset and preprocesses its text by converting it to lowercase, removing non-alphanumeric tokens, and filtering out stop words.

In [1]:
# Install required packages
# !pip install datasets
# !pip install transformers[torch]
# !pip install evaluate
# !pip install -U accelerate
# !pip install rouge_score


In [2]:
# Import necessary libraries
import torch
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoTokenizer
from tqdm import tqdm  # Import tqdm for progress tracking
import os


In [3]:
# Select the compute device: use CUDA when PyTorch reports one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')


Using device: cuda


In [4]:
# Fetch the 'multi_news' dataset through the Hugging Face `datasets` loader
ds = load_dataset('multi_news', trust_remote_code=True)

# Show the split layout followed by the first training record, one per line
print(ds, ds['train'][0], sep='\n')


DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})
{'document': 'National Archives \n \n Yes, it’s that time again, folks. It’s the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. \n \n A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs added in February. The unemployment rate is expected to hold steady at 8.3%. \n \n Here at MarketBeat HQ, we’ll be offering

In [5]:
# Convert the train and test splits to DataFrames.
# `Dataset.to_pandas()` converts the split in one call instead of iterating it
# row by row the way `pd.DataFrame(dataset)` does, and matches how the splits
# are converted before preprocessing later in this notebook.
ds_train = ds['train'].to_pandas()
ds_test = ds['test'].to_pandas()

# Display the first few rows of each split
print(ds_train.head())
print(ds_test.head())


                                            document  \
0  National Archives \n \n Yes, it’s that time ag...   
1  LOS ANGELES (AP) — In her first interview sinc...   
2  GAITHERSBURG, Md. (AP) — A small, private jet ...   
3  Tucker Carlson Exposes His Own Sexism on Twitt...   
4  A man accused of removing another man's testic...   

                                             summary  
0  – The unemployment rate dropped to 8.2% last m...  
1  – Shelly Sterling plans "eventually" to divorc...  
2  – A twin-engine Embraer jet that the FAA descr...  
3  – Tucker Carlson is in deep doodoo with conser...  
4  – What are the three most horrifying words in ...  
                                            document  \
0  GOP Eyes Gains As Voters In 11 States Pick Gov...   
1  \n \n \n \n UPDATE: 4/19/2001 Read Richard Met...   
2  It's the Golden State's latest version of the ...   
3  The seed for this crawl was a list of every ho...   
4  After a year in which liberals scored impressi... 

In [6]:
# Wrap the two DataFrames back into a DatasetDict keyed by split name
new_ds = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (('train', ds_train), ('test', ds_test))
})

# Inspect the rebuilt DatasetDict
print(new_ds)


DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})


In [None]:
# Ensure the NLTK resources needed for preprocessing are downloaded.
# Recent NLTK releases load the 'punkt_tab' resource inside word_tokenize,
# while older releases load 'punkt'; fetching both keeps the notebook
# runnable regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words as a set so membership checks during filtering are fast
stop_words = set(stopwords.words('english'))


# Define a text preprocessing function
def preprocess_text(text):
    """Return ``text`` lowercased, tokenized, and stripped of unwanted tokens.

    The text is lowercased and split with ``word_tokenize``. Tokens that are
    not purely alphanumeric, or that appear in ``stop_words``, are discarded.
    The surviving tokens are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation and any token containing non-alphanumeric characters
        if not token.isalnum():
            continue
        # Drop stop words
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Function to preprocess in batches and save periodically
def preprocess_and_save(dataset, save_path, batch_size=100):
    """Preprocess the ``document`` column in batches and write the result to CSV.

    Rows are cleaned with ``preprocess_text`` in slices of ``batch_size`` and
    appended to a staging file next to ``save_path``. The staging file is
    renamed to ``save_path`` only after the final batch has been written, so
    an interrupted run never leaves a truncated CSV at ``save_path`` that a
    later "file already exists" check would mistake for a finished result.

    Args:
        dataset: pandas DataFrame containing a ``document`` column; other
            columns are written through unchanged.
        save_path: Destination CSV path (string or path-like).
        batch_size: Number of rows processed and written per iteration.
    """
    total_rows = len(dataset)
    # Batches accumulate here and are promoted to save_path once complete
    staging_path = f"{save_path}.partial"

    if total_rows == 0:
        # No batches will run; still produce a header-only CSV so the output
        # file exists and can be read back
        dataset.to_csv(staging_path, index=False)

    for start in tqdm(range(0, total_rows, batch_size), desc="Processing batches"):
        end = min(start + batch_size, total_rows)
        # Copy the slice so assigning the cleaned column does not touch `dataset`
        batch = dataset.iloc[start:end].copy()

        # Preprocess the current batch
        batch['document'] = batch['document'].map(preprocess_text)

        # The first batch (re)creates the staging file with a header, which
        # also discards leftovers from an earlier interrupted run; later
        # batches are appended without a header
        is_first_batch = start == 0
        batch.to_csv(
            staging_path,
            mode='w' if is_first_batch else 'a',
            header=is_first_batch,
            index=False,
        )

        # Log every 10 batches
        if (start // batch_size) % 10 == 0:
            print(f"Processed and saved rows {start} to {end}")

    # Promote the finished file to its final name in a single rename
    os.replace(staging_path, save_path)

# Output locations for the preprocessed splits
save_path_train = 'preprocessed_train.csv'
save_path_test = 'preprocessed_test.csv'

# Preprocess training data, then test data, skipping any split whose CSV already exists
for split_name, target_path in (('train', save_path_train), ('test', save_path_test)):
    if os.path.exists(target_path):
        continue
    preprocess_and_save(ds[split_name].to_pandas(), target_path)

print("Preprocessing completed.")


In [None]:
# Load the preprocessed splits from the CSV files written by the preprocessing step.
# At this point `new_ds` still holds the raw text, so exporting it to these
# paths would overwrite the cleaned files with unprocessed documents. Instead,
# rebuild `new_ds` from the cleaned CSVs so the tokenization below operates on
# preprocessed text.
# keep_default_na=False keeps empty documents as '' rather than NaN, which the
# tokenizer could not handle.
new_ds = DatasetDict({
    split_name: Dataset.from_pandas(pd.read_csv(csv_path, keep_default_na=False))
    for split_name, csv_path in (('train', save_path_train), ('test', save_path_test))
})

# Every raw row must have a preprocessed counterpart
assert len(new_ds['train']) == len(ds['train']), "preprocessed train CSV is incomplete"
assert len(new_ds['test']) == len(ds['test']), "preprocessed test CSV is incomplete"

In [None]:
# Load the pretrained tokenizer registered under the 'facebook/bart-base'
# checkpoint name; the tokenization step that follows calls it on each batch.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')


In [None]:
# Sequence length passed to the tokenizer as its `max_length`
max_length = 512


def tokenize_function(examples):
    """Tokenize the ``document`` entries of ``examples`` with the loaded tokenizer.

    The tokenizer is asked to pad to ``max_length`` and to truncate anything
    longer. Its return value is passed back unchanged.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
    }
    return tokenizer(examples['document'], **encoding_options)

# Tokenize the train split, then the test split, in batched mode
tokenized_ds_train, tokenized_ds_test = (
    new_ds[split_name].map(tokenize_function, batched=True)
    for split_name in ('train', 'test')
)






In [11]:
# Build one DataFrame per tokenized split (train first, then test)
train_df, test_df = (
    pd.DataFrame(tokenized_split)
    for tokenized_split in (tokenized_ds_train, tokenized_ds_test)
)

# Write both frames to CSV without the index column
train_df.to_csv(path_or_buf='tokenized_ds_train.csv', index=False)
test_df.to_csv(path_or_buf='tokenized_ds_test.csv', index=False)