## LLM Project Initial Steps - Summarization Model using 'multi_news'

In [9]:
# Imports
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import pipeline
import torch
import time
import logging
import os

# Download NLTK Resources
# `word_tokenize` relies on the Punkt tokenizer data. Recent NLTK releases look
# that data up under the 'punkt_tab' resource while older ones use 'punkt', so
# both are requested; with quiet=True an id unknown to the installed version
# simply returns False instead of raising.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)


True

I have always been interested in the news and the events going on around us, so building a model that summarizes news articles was a natural choice for this project.

In [None]:
# Load the Dataset for Document Summarization
# `load_dataset` is called with the dataset name 'multi_news'; the result is
# bound to `ds` and indexed by split name ('train', 'test') in later cells.
ds = load_dataset('multi_news')

In [None]:
# View Available Splits
# Print the object returned by `load_dataset` to see which splits it holds
# (the 'train' and 'test' splits are the ones used further down).
print(ds)


In [None]:
# View an Example Row from the Training Set
# Index 0 selects the first record of the 'train' split.
# NOTE(review): the whole record is printed untruncated, which may be very long
# for article text -- confirm this is the intended amount of output.
print(ds['train'][0])

In [None]:
# Understanding Features of the Dataset
# `.features` is read from the 'train' split only.
# NOTE(review): presumably the 'test' split shares the same schema -- confirm.
print(ds['train'].features)


In [None]:
# Convert to DataFrames
# Use `Dataset.to_pandas()` instead of `pd.DataFrame(dataset)`: the DataFrame
# constructor has to iterate the dataset row by row in Python, while
# `to_pandas()` is the library's own conversion. It is also the call already
# used for the null check later in this notebook, so both conversions agree.
ds_train = ds['train'].to_pandas()
ds_test = ds['test'].to_pandas()

In [None]:
# Preview the first rows of the train and test DataFrames, one after the other
print(ds_train.head(), ds_test.head(), sep='\n')


In [None]:
# Rebuild Dataset objects from the two pandas DataFrames (train first, then test)
train, test = (Dataset.from_pandas(frame) for frame in (ds_train, ds_test))


In [None]:
# Bundle both splits into a single DatasetDict keyed by split name
new_ds = DatasetDict(dict(train=train, test=test))

# Print the assembled DatasetDict to confirm its splits
print(new_ds)

## Testing Preprocessing on 'new_ds'

In [12]:
# Preprocessing Function
stop_words = set(stopwords.words('english'))  # Built once; a set keeps the membership test in the loop cheap

def preprocess_text(texts):
    """Lowercase, tokenize, and strip punctuation and stopwords from a batch of texts.

    Args:
        texts: Iterable of documents. Each item may be a string, a list of
            strings (joined with single spaces before tokenizing), or None.

    Returns:
        list[str]: One cleaned, space-joined token string per input item, in
        input order. A None item yields an empty string, so the result always
        has the same length as the input.
    """
    processed_texts = []

    for text in texts:
        if text is None:
            # Emit a placeholder rather than skipping the item: when this runs
            # inside a batched `map`, the new column must have one value per
            # row of the batch, and dropping an entry would break that.
            processed_texts.append('')
            continue

        # Some inputs arrive as a list of fragments; flatten them into one string
        if isinstance(text, list):
            text = ' '.join(text)

        # Keep only alphanumeric tokens that are not English stopwords
        tokens = [
            word
            for word in word_tokenize(text.lower())
            if word.isalnum() and word not in stop_words
        ]
        processed_texts.append(' '.join(tokens))

    return processed_texts

In [None]:
# Add a 'preprocessed' column to every split by running preprocess_text over
# each batch of the 'document' column
new_ds = new_ds.map(
    function=lambda batch: {'preprocessed': preprocess_text(batch['document'])},
    batched=True,
)

In [15]:
# Convert each split back to pandas and count null values per column
train_df, test_df = (new_ds[split].to_pandas() for split in ('train', 'test'))

print(train_df.isnull().sum(), test_df.isnull().sum(), sep='\n')

# Print the Dataset summary (feature names and row count) of a 5-row selection
# from each split; this shows the schema, not the row contents
print(
    new_ds['train'].select(range(5)),
    new_ds['test'].select(range(5)),
    sep='\n',
)

document        0
summary         0
preprocessed    0
dtype: int64
document        0
summary         0
preprocessed    0
dtype: int64
Dataset({
    features: ['document', 'summary', 'preprocessed'],
    num_rows: 5
})
Dataset({
    features: ['document', 'summary', 'preprocessed'],
    num_rows: 5
})
