In [None]:
import pandas as pd
import re
import spacy
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

# Read the train / validation / test splits from their CSV files (in that order)
train_data, val_data, test_data = map(
    pd.read_csv,
    ('cleaned_train_data.csv', 'cleaned_val_data.csv', 'cleaned_test_data.csv'),
)

# Function to clean text
def clean_text(text):
    """Normalise a raw text value to lowercase alphanumeric words.

    Steps, in order:
      1. Missing values (``None`` / NaN) become an empty string and any other
         non-string value is converted with ``str()``.
      2. The text is lowercased.
      3. Every character that is not an ASCII letter, a digit or whitespace is
         removed.
      4. Runs of whitespace are collapsed to a single space and leading /
         trailing whitespace is trimmed.

    Args:
        text: The value to clean; normally a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # read_csv yields NaN for empty cells; treat those as empty text
        # instead of failing on ``.lower()``.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse whitespace only AFTER punctuation is gone: removing a
    # standalone symbol (e.g. the dash in "a - b") would otherwise leave a
    # double space behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run clean_text over both text columns of every split, overwriting them in place
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(clean_text)

# Load the pre-trained SpaCy model once, at module level, so every call reuses it
nlp = spacy.load('en_core_web_sm')


# Batch lemmatization through nlp.pipe, wrapped in a tqdm progress bar
def lemmatize_texts_with_progress(texts):
    """Lemmatize a sized collection of strings and return the results as a list.

    Each text is processed by the module-level ``nlp`` pipeline (batches of
    50, with the ``parser`` and ``ner`` components disabled) and rebuilt by
    joining the ``lemma_`` of every token with single spaces.

    Args:
        texts: A sized sequence of strings; ``len(texts)`` is used as the
            progress-bar total.

    Returns:
        A list of lemmatized strings, one per input text, in input order.
    """
    docs = nlp.pipe(texts, batch_size=50, disable=['parser', 'ner'])
    return [
        ' '.join(token.lemma_ for token in doc)
        for doc in tqdm(docs, total=len(texts))
    ]

# Lemmatize both text columns of every split (a progress bar is shown per column)
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = lemmatize_texts_with_progress(
            split_frame[text_column].tolist()
        )

# Write each lemmatized split to its own CSV file, without the index column
lemmatized_outputs = (
    (train_data, 'lemmatized_train_data.csv'),
    (val_data, 'lemmatized_val_data.csv'),
    (test_data, 'lemmatized_test_data.csv'),
)
for split_frame, output_path in lemmatized_outputs:
    split_frame.to_csv(output_path, index=False)
 

In [None]:
## Replace numbers and rare words with placeholder tokens
from collections import Counter

def replace_numbers(text):
    """Return ``text`` with every run of consecutive digits replaced by ``<NUM>``."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('<NUM>', text)

def remove_rare_words(text, freq_threshold=5, word_freq=None):
    """Replace infrequent words in ``text`` with the ``<RARE>`` placeholder.

    Args:
        text: Whitespace-separated string to filter.
        freq_threshold: Minimum count a word needs in order to be kept; words
            counted fewer times are replaced by ``<RARE>``.
        word_freq: Optional mapping of word -> count used to decide rarity,
            e.g. counts taken over a whole corpus. When omitted, counts are
            taken from ``text`` itself (the original per-text behaviour).

    Returns:
        The words of ``text`` joined by single spaces, with rare words
        replaced by ``<RARE>``.
    """
    words = text.split()
    if word_freq is None:
        word_freq = Counter(words)
    # .get(..., 0) treats words missing from a supplied mapping as count 0,
    # and works for plain dicts as well as Counter objects.
    return ' '.join(
        word if word_freq.get(word, 0) >= freq_threshold else '<RARE>'
        for word in words
    )


# Replace digit runs first so that numbers are already collapsed into the
# <NUM> token when word frequencies are counted below.
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(replace_numbers)

# Count word frequencies over the whole training split (both text columns).
# Counting inside each individual text marks almost every word of a short
# text as rare, because few words occur 5+ times within a single document.
# Only the training split is counted, so validation/test text does not
# influence which words are kept.
train_word_freq = Counter()
for text_column in ('article', 'highlights'):
    for document in train_data[text_column]:
        train_word_freq.update(document.split())

# Apply the same training-derived frequencies to every split; Series.apply
# forwards the extra keyword argument to remove_rare_words.
for split_frame in (train_data, val_data, test_data):
    for text_column in ('article', 'highlights'):
        split_frame[text_column] = split_frame[text_column].apply(
            remove_rare_words, word_freq=train_word_freq
        )

# Save data after handling numbers and rare words
train_data.to_csv('handled_train_data.csv', index=False)
val_data.to_csv('handled_val_data.csv', index=False)
test_data.to_csv('handled_test_data.csv', index=False)