# In [None]:
import pandas as pd
from sklearn.utils import shuffle
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag, word_tokenize

# --- NLTK Setup ---
def download_nltk_resources():
    """Download every NLTK resource the preprocessing pipeline relies on.

    Downloads are best-effort: a failure never raises, but the resources
    that could not be fetched are reported through a single
    ``RuntimeWarning`` instead of being silently ignored.

    Both the legacy resource names ('punkt', 'averaged_perceptron_tagger')
    and the names looked up by newer NLTK releases ('punkt_tab',
    'averaged_perceptron_tagger_eng') are requested, so tokenization and
    POS tagging work regardless of the installed NLTK version.

    Returns:
        None
    """
    import warnings  # local import keeps this function self-contained

    required_resources = [
        'punkt', 'punkt_tab',
        'wordnet', 'omw-1.4',
        'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
        'stopwords',
    ]

    failed = []
    for resource in required_resources:
        try:
            # quiet=True suppresses the per-package progress output.
            succeeded = nltk.download(resource, quiet=True)
        except Exception as exc:
            # Keep going: one unavailable package must not block the others.
            failed.append(f"{resource} ({exc})")
        else:
            # nltk.download signals most failures (e.g. no network) by
            # returning False rather than raising.
            if not succeeded:
                failed.append(resource)

    if failed:
        warnings.warn(
            "Could not download NLTK resources: " + ", ".join(failed),
            RuntimeWarning,
            stacklevel=2,
        )

# Load the English stopword list. This runs at import time, i.e. before
# anything has had a chance to call download_nltk_resources(), so on a fresh
# environment the corpus may be missing: fetch it on demand and retry once.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Return the WordNet POS constant matching an NLTK POS tag.

    The tag is classified by its prefix: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    # (tag prefix, name of the wordnet attribute), checked in this order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_table:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

# Translation table that deletes every character in string.punctuation.
# Built once at import time instead of once per token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Tokenize, drop stopwords, strip punctuation and POS-aware lemmatize.

    Args:
        text: The raw text to clean. Missing values (as reported by
            ``pd.isna``) are treated as empty text.

    Returns:
        A single string with the surviving lemmatized tokens joined by one
        space, or ``''`` when the input is missing.
    """
    if pd.isna(text):
        return ''

    cleaned_words = []
    # POS tags are computed on the full token sequence (stopwords and
    # punctuation included) so the tagger sees the original context.
    for word, pos in pos_tag(word_tokenize(text)):
        if word in stop_words:
            continue
        stripped = word.translate(_PUNCTUATION_TABLE)
        # Tokens made only of punctuation collapse to '' here; skip them so
        # they do not leave doubled spaces in the joined result.
        if not stripped.strip():
            continue
        cleaned_words.append(
            lemmatizer.lemmatize(stripped, get_wordnet_pos(pos))
        )

    return ' '.join(cleaned_words)

# --- Main Preprocessing Function ---
def preprocess_data(df, *, random_state=42):
    """Run the full cleaning pipeline and return the processed DataFrame.

    Steps, in order: download NLTK resources, shuffle the rows, drop the
    'Unnamed: 0' column if present, drop rows missing 'title' or 'text',
    build a lowercase 'content' column from title + text, clean it with
    ``preprocess_text``, and drop rows whose cleaned content is empty.

    Args:
        df: DataFrame containing at least the 'title' and 'text' columns.
            It is not modified; a new DataFrame is returned.
        random_state: Seed passed to the shuffle. Defaults to 42, the value
            that was previously hard-coded.

    Returns:
        The shuffled, cleaned DataFrame with an added 'content' column.

    Raises:
        KeyError: If 'title' or 'text' is missing from ``df``.
    """
    download_nltk_resources()

    # Step 0: Shuffle the dataset
    df = shuffle(df, random_state=random_state)

    # 1. Remove unnecessary columns
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # 2. Handle missing data. The explicit copy makes the result an
    # independent frame, so the column assignments below cannot trigger
    # SettingWithCopyWarning.
    df = df.dropna(subset=['title', 'text']).copy()

    # 3. Combine text features. astype(str) lets non-string values (e.g.
    # numeric titles) be concatenated; NaNs were already dropped above.
    df['content'] = df['title'].astype(str) + " " + df['text'].astype(str)

    # 4. Text cleaning: lowercase and strip spaces
    df['content'] = df['content'].str.lower().str.strip()

    # 5. POS-aware lemmatization and final cleaning
    df['content'] = df['content'].apply(preprocess_text)

    # 6. Final cleanup (removing empty content rows)
    has_content = df['content'].str.strip().astype(bool)
    df = df[has_content]

    print("Preprocessing done. Dataset shape:", df.shape)
    return df