Importing required libraries

In [5]:
import pandas as pd
import re
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import re
import spacy
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset, load_from_disk
import accelerate

Loading the dataset & data exploration

In [None]:
# Read the three raw splits in train / validation / test order.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

In [7]:
# Show the first rows of every split, one after another.
for split_frame in (train_data, val_data, test_data):
    print(split_frame.head())

                                         id  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   
2  00027e965c8264c35cc1bc55556db388da82b07f   
3  0002c17436637c4fe1837c935c04de47adb18e9a   
4  0003ad6ef0c37534f80b55b4235108024b407f0b   

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   

                                          highlights  
0  Bishop John Folda, of North Dakota, is taking ...  
1  Criminal complaint: Cop used his role to help ...  
2  Craig Eccleston-Todd, 27, had drunk at least t...  
3  Nina dos Santos says Europe must be ready to a...  
4  Fleetwood top of League One after 2-0 win at S...  
                                         id  \
0  

Preprocessing

In [None]:
# Normalise raw text: lowercase, drop punctuation, collapse whitespace

def preprocess(text):
    """Return a lowercase, punctuation-free, single-spaced version of ``text``.

    Args:
        text: The raw string to normalise. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty text.

    Returns:
        The normalised string; ``''`` for missing input.
    """
    # Missing cells would otherwise raise AttributeError on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Strip punctuation BEFORE collapsing whitespace: removing a character
    # that sits between two spaces ("a , b") would otherwise leave a double
    # space behind.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalise both text columns of every split in place.
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(preprocess)


In [None]:
# Wrap each pandas DataFrame in a Hugging Face Dataset (train / val / test order)
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_data, val_data, test_data)
)


In [None]:
# Persist the cleaned splits to CSV; the DataFrame index is not written
for split_frame, csv_path in (
    (train_data, 'cleaned_train_data.csv'),
    (val_data, 'cleaned_val_data.csv'),
    (test_data, 'cleaned_test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

# Fetch the NLTK resources used below: the stopword list and the punkt tokenizer
# NOTE(review): newer NLTK releases appear to look up 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stopwords held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Drop every token that appears in the module-level `stop_words` set
def remove_stopwords(text):
    """Tokenize ``text`` into words and return the non-stopword tokens joined by single spaces."""
    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_words
    )

# Strip stopwords from the 'article' and 'highlights' columns of every split
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(remove_stopwords)

# Persist the stopword-free splits to CSV; the DataFrame index is not written
for split_frame, csv_path in (
    (train_data, 'stopwords_removed_train_data.csv'),
    (val_data, 'stopwords_removed_val_data.csv'),
    (test_data, 'stopwords_removed_test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
## Use spaCy for lemmatization to convert words to their base forms.

In [2]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [3]:
# !pip install spacy tqdm
# !python -m spacy download en_core_web_sm

In [15]:
import pandas as pd
import re
import spacy
from tqdm import tqdm

# Reload the cleaned splits written earlier in the notebook.
# NOTE(review): these are the 'cleaned_*' files, not the 'stopwords_removed_*'
# ones, so the stop-word removal step is not carried forward -- confirm intended.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'cleaned_train_data.csv',
        'cleaned_val_data.csv',
        'cleaned_test_data.csv',
    )
)

# Function to clean text: lowercase, drop punctuation, collapse whitespace
def clean_text(text):
    """Return a lowercase, punctuation-free, single-spaced version of ``text``.

    Args:
        text: The string to normalise. ``None`` and float NaN are treated as
            empty text; an empty string written to CSV is read back by pandas
            as NaN, so reloaded columns can contain such values.

    Returns:
        The normalised string; ``''`` for missing input.
    """
    # A missing cell would otherwise raise AttributeError on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Strip punctuation BEFORE collapsing whitespace so that a removed
    # character sitting between two spaces does not leave a double space.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run clean_text over both text columns of every split
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(clean_text)

# Load the spaCy pipeline once at module level so every call below reuses it
nlp = spacy.load('en_core_web_sm')

# Batch lemmatization through nlp.pipe, wrapped in a tqdm progress bar
def lemmatize_texts_with_progress(texts):
    """Lemmatize each text and return the results as a list of strings.

    Args:
        texts: A sized sequence of strings; ``len(texts)`` sets the progress-bar total.

    Returns:
        One string per input text, made of the token lemmas joined by single spaces.
    """
    # The parser and NER components are disabled for this pass.
    doc_stream = nlp.pipe(texts, batch_size=50, disable=['parser', 'ner'])
    return [
        ' '.join([token.lemma_ for token in doc])
        for doc in tqdm(doc_stream, total=len(texts))
    ]

# Lemmatize both text columns of every split (each call shows its own progress bar)
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = lemmatize_texts_with_progress(
            split_frame[text_column].tolist()
        )

# Persist the lemmatized splits to CSV; the DataFrame index is not written
for split_frame, csv_path in (
    (train_data, 'lemmatized_train_data.csv'),
    (val_data, 'lemmatized_val_data.csv'),
    (test_data, 'lemmatized_test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)


100%|████████████████████████████████████████████████████████████████████████| 287113/287113 [3:35:38<00:00, 22.19it/s]
100%|█████████████████████████████████████████████████████████████████████████| 287113/287113 [14:04<00:00, 339.91it/s]
100%|████████████████████████████████████████████████████████████████████████████| 13368/13368 [08:01<00:00, 27.76it/s]
100%|███████████████████████████████████████████████████████████████████████████| 13368/13368 [00:41<00:00, 318.43it/s]
100%|████████████████████████████████████████████████████████████████████████████| 11490/11490 [07:00<00:00, 27.33it/s]
100%|███████████████████████████████████████████████████████████████████████████| 11490/11490 [00:35<00:00, 326.70it/s]


In [17]:
from collections import Counter

# Swap every run of digits in the text for the <NUM> placeholder
def replace_numbers(text):
    """Return ``text`` with each maximal run of digits replaced by ``<NUM>``."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('<NUM>', text)

# Function to replace rare words in the text with a placeholder <RARE>
# A word is considered rare if its frequency is below the specified threshold (default is 5)
def remove_rare_words(text, freq_threshold=5, word_freq=None):
    """Replace words whose frequency is below ``freq_threshold`` with ``<RARE>``.

    Args:
        text: Whitespace-separated text to filter.
        freq_threshold: Minimum count a word needs in order to be kept.
        word_freq: Optional mapping of word -> count to judge rarity against,
            typically built over a whole corpus. When omitted, counts are
            taken from ``text`` alone (the original behaviour); for a short
            text that marks almost every word as rare.

    Returns:
        The text with rare words replaced, joined by single spaces.
    """
    words = text.split()  # Split the text into words
    if word_freq is None:
        word_freq = Counter(words)  # Fall back to counts within this text only
    # Words missing from the mapping count as frequency 0 and are therefore rare
    return ' '.join(
        word if word_freq.get(word, 0) >= freq_threshold else '<RARE>'
        for word in words
    )

# Apply number replacement first, to the 'article' and 'highlights' columns of all datasets
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(replace_numbers)

# Count word frequencies over the whole training split. Counting inside each
# individual text would turn nearly every word of a short highlight into <RARE>.
# Only the training split is counted so validation/test text does not influence
# which words are kept.
corpus_word_freq = Counter()
for text_column in ('article', 'highlights'):
    for text in train_data[text_column]:
        corpus_word_freq.update(text.split())

# Replace rare words in every split using the training-corpus frequencies
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(
            lambda text: remove_rare_words(text, word_freq=corpus_word_freq)
        )

# Persist the splits after number and rare-word handling; the index is not written
for split_frame, csv_path in (
    (train_data, 'handled_train_data.csv'),
    (val_data, 'handled_val_data.csv'),
    (test_data, 'handled_test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [21]:
# Add a 'sentences' column holding the sent_tokenize output for each article
# NOTE(review): the cleaning step earlier strips every non-alphanumeric
# character, so sentence-ending punctuation is presumably gone by this point
# and each article may come back as a single "sentence" -- confirm.
for split_frame in (train_data, val_data, test_data):
    split_frame['sentences'] = split_frame['article'].apply(sent_tokenize)

# Persist the splits, now including the 'sentences' column; the index is not written
for split_frame, csv_path in (
    (train_data, 'tokenized_train_data.csv'),
    (val_data, 'tokenized_val_data.csv'),
    (test_data, 'tokenized_test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [17]:
# Wrap each processed DataFrame in a Hugging Face Dataset (train / val / test order)
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_data, val_data, test_data)
)