In [4]:
import nltk
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

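# One-time setup: uncomment and run these once so the NLTK data used below
# (the English stopword list and the tokenizer models) is available locally.
# NOTE: some newer NLTK releases may ask for 'punkt_tab' instead of 'punkt'.
# nltk.download('stopwords')
# nltk.download('punkt')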

def cleanF(file_path):
    """Tokenize, de-stopword and stem the text columns of a CSV/TSV file.

    A path ending in ``.csv`` (any letter case) is parsed as comma-separated;
    every other path is parsed as tab-separated. A path ending in ``.tsv`` is
    read without a header row and its columns are named ``col_0``, ``col_1``,
    ... while in memory.

    Every column that contains at least one ASCII letter is treated as text.
    Each cell of such a column is tokenized with ``word_tokenize``, reduced to
    lowercase alphabetic tokens that are not English stopwords, stemmed with
    the Porter stemmer and re-joined with single spaces. Non-string cells
    (for example missing values) in those columns become ``""``.

    Vocabulary statistics are printed, and the result is written next to the
    input as ``<name>_cleaned<ext>`` using the same delimiter as the input.
    A header row is written only when the input had one, so the cleaned file
    has the same layout as the file it was derived from.

    Args:
        file_path: Path of the file to clean (``str`` or ``os.PathLike``).

    Returns:
        None. Any exception is caught and reported on stdout, so one bad
        file does not abort a batch run.
    """
    try:
        file_path = os.fspath(file_path)
        lowered_path = file_path.lower()

        # Pick parsing options from the extension, ignoring letter case so
        # that e.g. "DATA.CSV" is not mistaken for a tab-separated file.
        delimiter = ',' if lowered_path.endswith('.csv') else '\t'
        header = None if lowered_path.endswith('.tsv') else 'infer'

        # Everything is read as text; malformed rows are skipped.
        df = pd.read_csv(file_path, dtype=str, delimiter=delimiter, encoding='utf-8',
                         engine='python', on_bad_lines='skip', header=header)

        # Header-less input: give the columns generic names for reporting.
        if header is None:
            df.columns = [f"col_{i}" for i in range(len(df.columns))]

        # A column counts as text when at least one cell contains an ASCII
        # letter. Both object and dedicated string dtypes are accepted, and
        # missing cells are treated as "no match".
        text_columns = [
            col for col in df.columns
            if pd.api.types.is_string_dtype(df[col].dtype)
            and df[col].str.contains(r'[a-zA-Z]', na=False).any()
        ]
        if not text_columns:
            print(f"No valid text columns detected in {file_path}. Skipping file.")
            return

        print(f"Processing columns: {text_columns} in {file_path}")

        # NLTK resources are only set up once there is something to clean.
        stop_words = {w.lower() for w in stopwords.words('english')}
        stemmer = PorterStemmer()

        # Distinct tokens seen at each stage, accumulated over all text columns.
        original_vocab = set()
        filtered_vocab = set()
        stemmed_vocab = set()

        def clean_text(text):
            """Return the stemmed, stopword-free form of one cell."""
            if not isinstance(text, str):
                return ""

            # Tokenization: raw tokens, case-sensitive, punctuation included.
            word_tokens = word_tokenize(text)
            original_vocab.update(word_tokens)

            # Keep lowercase alphabetic tokens that are not stopwords.
            filtered_tokens = [
                w.lower() for w in word_tokens
                if w.isalpha() and w.lower() not in stop_words
            ]
            filtered_vocab.update(filtered_tokens)

            # Stemming.
            stemmed_tokens = [stemmer.stem(w) for w in filtered_tokens]
            stemmed_vocab.update(stemmed_tokens)

            return " ".join(stemmed_tokens)

        for col in text_columns:
            df[col] = df[col].apply(clean_text)

        # Vocabulary statistics. The original vocabulary holds raw tokens, so
        # the first reduction rate also reflects lowercasing and the isalpha()
        # filter, not stopword removal alone.
        OG_vocab_size = len(original_vocab)
        filtered_vocab_size = len(filtered_vocab)
        stemmed_vocab_size = len(stemmed_vocab)
        stopword_reduction_rate = ((OG_vocab_size - filtered_vocab_size) / OG_vocab_size * 100) if OG_vocab_size else 0
        stemming_reduction_rate = ((filtered_vocab_size - stemmed_vocab_size) / filtered_vocab_size * 100) if filtered_vocab_size else 0

        print(f"\nStats for {file_path}:")
        print(f"Original Vocabulary Size: {OG_vocab_size}")
        print(f"Vocabulary Size After Stopword Removal: {filtered_vocab_size}")
        print(f"Reduction Rate After Stopword Removal: {stopword_reduction_rate:.2f}%")
        print(f"Vocabulary Size After Stemming: {stemmed_vocab_size}")
        print(f"Reduction Rate After Stemming: {stemming_reduction_rate:.2f}%")

        # Save next to the input, keeping its extension. The delimiter must
        # match that extension (a ".tsv" stays tab-separated), and the generated
        # col_N names are not written back to a file that had no header.
        base_name, ext = os.path.splitext(file_path)
        new_file_name = f"{base_name}_cleaned{ext}"
        df.to_csv(new_file_name, index=False, sep=delimiter, header=header is not None)
        print(f"Successfully saved cleaned dataset: {new_file_name}")

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and let the
        # caller continue with the next file.
        print(f"Error processing {file_path}: {e}")

# Input datasets to clean: one comma-separated file and one tab-separated file.
file_paths = [
    'bbc_articles.csv',
    'test.tsv',
]

# Clean the datasets one after another, in the order listed above.
for file_path in file_paths:
    cleanF(file_path)


Processing columns: ['region', 'title', 'summary', 'link'] in bbc_articles.csv

Stats for bbc_articles.csv:
Original Vocabulary Size: 6732
Vocabulary Size After Stopword Removal: 4849
Reduction Rate After Stopword Removal: 27.97%
Vocabulary Size After Stemming: 3707
Reduction Rate After Stemming: 23.55%
Successfully saved cleaned dataset: bbc_articles_cleaned.csv
Processing columns: ['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7', 'col_13'] in test.tsv

Stats for test.tsv:
Original Vocabulary Size: 7464
Vocabulary Size After Stopword Removal: 4339
Reduction Rate After Stopword Removal: 41.87%
Vocabulary Size After Stemming: 3303
Reduction Rate After Stemming: 23.88%
Successfully saved cleaned dataset: test_cleaned.tsv
