In [7]:
# NLP Preprocessing Pipeline (Generalized Python Template)
# ---------------------------------------------------------
# This script includes a generalized preprocessing pipeline for text data in NLP tasks.
# It is designed for reuse in projects like classification, sentiment analysis, topic modeling, etc.
# To use: just copy and paste into your project and call preprocess_text(text).

import html
import re

import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary resources (each package is only fetched once; later runs
# just report that it is already up to date).
# - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Newer NLTK
#   releases (>= 3.8.2) load 'punkt_tab' instead of the pickled 'punkt' model,
#   so both are requested to keep the script working across versions.
# - 'stopwords': the English stop-word list used below.
# - 'wordnet' / 'omw-1.4': data for WordNetLemmatizer (some NLTK releases
#   require 'omw-1.4' alongside 'wordnet').
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Initialize resources once at import time so preprocess_text() can reuse them.
# A set gives O(1) membership tests during stop-word filtering.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """
    Applies a full preprocessing pipeline to input text.

    Steps:
    1. Remove HTML tags and decode HTML entities
    2. Convert to lowercase
    3. Expand contractions
    4. Remove URLs
    5. Remove punctuation & special characters
    6. Remove extra whitespace
    7. Tokenize
    8. Remove stop words
    9. Lemmatize tokens

    Args:
        text: Raw input string. ``None`` or an empty string yields ``[]``.

    Returns:
        List of clean tokens
    """

    # Nothing to process: avoid a TypeError from re.sub on None.
    if not text:
        return []

    # 1. Remove HTML tags.
    # Tags are replaced with a space (not an empty string) so that words on
    # either side of a tag do not fuse together, e.g. "<p>a</p><p>b</p>".
    # The extra spaces are collapsed again in step 6.
    text = re.sub(r'<.*?>', ' ', text)
    # Decode entities such as "&amp;" or "&nbsp;" so they do not survive as
    # junk tokens ("amp", "nbsp") once punctuation is stripped in step 5.
    text = html.unescape(text)

    # 2. Lowercase
    text = text.lower()

    # 3. Expand contractions (e.g., don't -> do not)
    text = contractions.fix(text)

    # 4. Remove URLs ("http\S+" already covers "https...", so no separate
    #    alternative is needed for it)
    text = re.sub(r'http\S+|www\S+', '', text)

    # 5. Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # 6. Remove extra whitespaces
    text = ' '.join(text.split())

    # 7. Tokenize
    tokens = word_tokenize(text)

    # 8. Remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # 9. Lemmatize (called without a POS tag, so the lemmatizer's default
    #    part of speech is used for every token)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Demo: run the pipeline on a small HTML snippet when executed as a script.
if __name__ == "__main__":
    # The snippet exercises HTML tags, a URL, contractions and digits.
    demo_text = (
        "<p>Hey! I'm testing this NLP pipeline with a URL: https://example.com,\n"
        "    some contractions like don't, and some numbers 1234.</p>"
    )
    cleaned_tokens = preprocess_text(demo_text)
    print(cleaned_tokens)


['hey', 'testing', 'nlp', 'pipeline', 'url', 'contraction', 'like', 'number', '1234']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bodya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bodya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bodya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
