In [None]:
# 02_Preprocessing.ipynb
import pandas as pd
from tqdm import tqdm
from config import *
from utils.init import *

# Execution guard to prevent double execution.
# The flag is only set at the very end of a successful run, so a run that
# fails part-way (missing input file, pipeline error, unwritable output path)
# can be re-executed directly instead of requiring a kernel restart.
if 'preprocessing_executed' not in globals():
    # Columns produced by the cleaning pipeline and the filler used for rows
    # that end up with no usable content.
    clean_columns = ['title_clean', 'text_clean']
    empty_placeholder = 'no content'

    # Load cleaned data
    print("Loading cleaned data...")
    try:
        df = pd.read_csv(CLEANED_DATA_PATH)
    except Exception as e:
        print(f"Error loading cleaned data: {e}")
        raise
    print(f"Cleaned data shape: {df.shape}")

    # Load validation data
    print("Loading validation data...")
    try:
        val_df = pd.read_csv(VALIDATION_DATA_PATH)
    except Exception as e:
        print(f"Error loading validation data: {e}")
        raise
    print(f"Validation data shape: {val_df.shape}")

    # Check for NaN values BEFORE preprocessing
    print("\n=== CHECKING FOR NaN VALUES BEFORE PREPROCESSING ===")
    print("Training data NaN values:")
    print(df.isna().sum())
    print("\nValidation data NaN values:")
    print(val_df.isna().sum())

    # Handle NaN values: every column is filled, so the pipeline below
    # always receives a string rather than a float NaN.
    df = df.fillna('')
    val_df = val_df.fillna('')

    # Check validation duplicates (reported only; rows are not dropped here)
    val_duplicates = val_df.duplicated().sum()
    print(f"Validation duplicates: {val_duplicates}")

    # Apply preprocessing pipeline.
    # Each entry: (banner, frame, {source column: progress-bar label}).
    # The frames are mutated in place, so `df` / `val_df` gain the new columns.
    pipeline_runs = (
        (
            "\n=== PREPROCESSING TRAINING DATA ===",
            df,
            {'title': "Processing titles", 'text': "Processing text"},
        ),
        (
            "\n=== PREPROCESSING VALIDATION DATA ===",
            val_df,
            {
                'title': "Processing validation titles",
                'text': "Processing validation text",
            },
        ),
    )
    for run_banner, run_frame, progress_labels in pipeline_runs:
        print(run_banner)
        for source_col, progress_label in progress_labels.items():
            # tqdm.pandas() must be re-registered to change the bar description.
            tqdm.pandas(desc=progress_label)
            run_frame[f'{source_col}_clean'] = run_frame[source_col].progress_apply(
                lambda x: full_clean_pipeline(x, CUSTOM_STOPWORDS)
            )

    # Check for NaN values AFTER preprocessing
    print("\n=== CHECKING FOR NaN VALUES AFTER PREPROCESSING ===")
    print("Training data NaN values:")
    print(df[clean_columns].isna().sum())
    print("\nValidation data NaN values:")
    print(val_df[clean_columns].isna().sum())

    # Handle any remaining NaN values
    for run_frame in (df, val_df):
        for clean_col in clean_columns:
            run_frame[clean_col] = run_frame[clean_col].fillna(empty_placeholder)

    # Check empty strings
    print("\n=== CHECKING FOR EMPTY STRINGS ===")
    for split_name, run_frame in (('train', df), ('val', val_df)):
        for clean_col in clean_columns:
            print(f"Empty {clean_col} in {split_name}: {(run_frame[clean_col] == '').sum()}")

    # Replace empty strings.
    # NOTE(review): presumably needed because pandas.read_csv parses empty
    # fields as NaN by default, which would reintroduce NaN when the saved
    # files are loaded again -- confirm against the downstream notebooks.
    for run_frame in (df, val_df):
        for clean_col in clean_columns:
            run_frame[clean_col] = run_frame[clean_col].replace('', empty_placeholder)

    # Save processed data
    save_targets = (
        (df, PROCESSED_TRAIN_PATH, "training"),
        (val_df, PROCESSED_VAL_PATH, "validation"),
    )
    for run_frame, out_path, split_label in save_targets:
        try:
            run_frame.to_csv(out_path, index=False)
        except Exception as e:
            print(f"Error saving processed {split_label} data: {e}")
            raise
        print(f"Processed {split_label} data saved to: {out_path}")

    # Only now is the cell considered done; see the note on the guard above.
    preprocessing_executed = True

else:
    print("Preprocessing already executed. Restart kernel to run again.")

Loading cleaned data...
Cleaned data shape: (36429, 7)
Loading validation data...
Validation data shape: (4956, 5)

=== CHECKING FOR NaN VALUES BEFORE PREPROCESSING ===
Training data NaN values:
label               0
title               0
text                0
subject             0
date            16640
title_length        0
text_length         0
dtype: int64

Validation data NaN values:
label      0
title      0
text       0
subject    0
date       0
dtype: int64
Validation duplicates: 8

=== PREPROCESSING TRAINING DATA ===


  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
Processing titles: 100%|██████████| 36429/36429 [00:13<00:00, 2631.11it/s]
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
Processing text: 100%|██████████| 36429/36429 [01:16<00:00, 473.87it/s]



=== PREPROCESSING VALIDATION DATA ===


  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
Processing validation titles: 100%|██████████| 4956/4956 [00:01<00:00, 2575.93it/s]
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
Processing validation text: 100%|██████████| 4956/4956 [00:13<00:00, 371.56it/s]



=== CHECKING FOR NaN VALUES AFTER PREPROCESSING ===
Training data NaN values:
title_clean    0
text_clean     0
dtype: int64

Validation data NaN values:
title_clean    0
text_clean     0
dtype: int64

=== CHECKING FOR EMPTY STRINGS ===
Empty title_clean in train: 5
Empty text_clean in train: 494
Empty title_clean in val: 2
Empty text_clean in val: 23
Processed training data saved to: c:\Users\Amin\Documents\Ironhack_projects\project-nlp-challenge\processed\train_processed.csv
Processed validation data saved to: c:\Users\Amin\Documents\Ironhack_projects\project-nlp-challenge\processed\val_processed.csv
