# 🔨 Project 1 – Text Cleaner
- Building a reusable, modular text preprocessing pipeline.

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Make sure the required NLTK data is downloaded.
# 'punkt_tab' is the tokenizer resource that word_tokenize() loads in recent
# NLTK releases; 'punkt' is kept for older releases that still look for it.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oumme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oumme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oumme\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\oumme\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Module-level resources shared by every clean_text() call, built once
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Turn a raw string into a list of normalized word tokens.

    Processing order:
    1. Lowercase the text
    2. Strip HTML tags
    3. Strip URLs
    4. Strip punctuation
    5. Strip digits
    6. Tokenize with NLTK
    7. Drop English stopwords
    8. Lemmatize what is left
    Returns: List of clean tokens
    """
    stripped = text.lower()

    # HTML tags have to go before punctuation: once '<' and '>' are stripped
    # the tag pattern can no longer match and tag names would remain as words.
    for pattern in (r'<.*?>', r'http\S+|www\S+', r'[^\w\s]', r'\d+'):
        stripped = re.sub(pattern, '', stripped)

    # Filter stopwords and lemmatize the survivors in a single pass.
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(stripped)
        if word not in stop_words
    ]

### Let's test it

In [4]:
# One sentence containing HTML tags, a number, a URL, punctuation and stopwords,
# so every cleaning step of clean_text() has something to remove.
sample = "The <b>quick</b> brown fox, aged 5, jumped over http://example.com the lazy dog!"
print(clean_text(sample))

['quick', 'brown', 'fox', 'aged', 'jumped', 'lazy', 'dog']


### Now let's enhance it with optional, configurable steps

- Contraction expansion

In [5]:
import contractions  # NEW

def clean_text(text,
               lowercase=True,
               expand_contractions=True,  # NEW
               remove_html=True,
               remove_urls=True,
               remove_punct=True,
               remove_numbers=True,
               remove_stopwords=True,
               lemmatize=True):
    """
    Clean a raw string and return its tokens; every step can be toggled.

    Parameters:
    - text: raw input; anything that is not a str yields an empty list
    - lowercase: lowercase the text first
    - expand_contractions: expand contractions with contractions.fix()
    - remove_html: drop anything that looks like an HTML tag (<...>)
    - remove_urls: drop runs of non-space characters starting with 'http' or 'www'
    - remove_punct: drop characters that are neither word characters nor whitespace
    - remove_numbers: drop runs of digits
    - remove_stopwords: drop English stopwords (compared case-insensitively)
    - lemmatize: lemmatize each remaining token

    Returns: List of clean tokens
    """
    if not isinstance(text, str):
        return []

    if lowercase:
        text = text.lower()

    if expand_contractions:
        text = contractions.fix(text)  # NEW STEP

    if remove_html:
        text = re.sub(r'<.*?>', '', text)

    if remove_urls:
        text = re.sub(r'http\S+|www\S+', '', text)

    if remove_punct:
        text = re.sub(r'[^\w\s]', '', text)

    if remove_numbers:
        text = re.sub(r'\d+', '', text)

    tokens = word_tokenize(text)

    if remove_stopwords:
        # stop_words is all lowercase; compare on the lowercased token so that
        # "The" is still removed when lowercase=False.
        tokens = [t for t in tokens if t.lower() not in stop_words]

    if lemmatize:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return tokens


In [6]:
# "She's" and "isn't" are contractions, so this sentence exercises the new step.
# expand_contractions already defaults to True; it is passed here only for emphasis.
sample = "She's going to the store, and he isn't coming."
cleaned = clean_text(sample, expand_contractions=True)
print(cleaned)

['going', 'store', 'coming']


- Emoji handling (removal or mapping to text)

In [7]:
import emoji  # NEW

def clean_text(text,
               lowercase=True,
               expand_contractions=True,
               remove_html=True,
               remove_urls=True,
               remove_punct=True,
               remove_numbers=True,
               remove_stopwords=True,
               lemmatize=True,
               remove_emojis=True,          # NEW
               map_emojis_to_text=False):   # NEW
    """
    Clean a raw string and return its tokens; every step can be toggled.

    Parameters:
    - text: raw input; anything that is not a str yields an empty list
    - lowercase: lowercase the text first
    - expand_contractions: expand contractions with contractions.fix()
    - remove_html: drop anything that looks like an HTML tag (<...>)
    - remove_urls: drop runs of non-space characters starting with 'http' or 'www'
    - remove_punct: drop characters that are neither word characters nor whitespace
    - remove_numbers: drop runs of digits
    - remove_stopwords: drop English stopwords (compared case-insensitively)
    - lemmatize: lemmatize each remaining token
    - remove_emojis: delete emojis (ignored when map_emojis_to_text is True)
    - map_emojis_to_text: replace each emoji by its text name instead of
      deleting it; takes precedence over remove_emojis

    Returns: List of clean tokens
    """
    if not isinstance(text, str):
        return []

    if lowercase:
        text = text.lower()

    if expand_contractions:
        text = contractions.fix(text)

    if map_emojis_to_text:
        # 😀 → " :grinning_face: ". The padding spaces keep an emoji name from
        # being glued to a neighbouring word or emoji once the colons are
        # stripped as punctuation (e.g. "😂😂" would become one long token).
        text = emoji.demojize(text, delimiters=(" :", ": "))
    elif remove_emojis:
        text = emoji.replace_emoji(text, replace='')

    if remove_html:
        text = re.sub(r'<.*?>', '', text)

    if remove_urls:
        text = re.sub(r'http\S+|www\S+', '', text)

    if remove_punct:
        text = re.sub(r'[^\w\s]', '', text)

    if remove_numbers:
        text = re.sub(r'\d+', '', text)

    tokens = word_tokenize(text)

    if remove_stopwords:
        # stop_words is all lowercase; compare on the lowercased token so that
        # "The" is still removed when lowercase=False.
        tokens = [t for t in tokens if t.lower() not in stop_words]

    if lemmatize:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return tokens

In [8]:
# With map_emojis_to_text=True each emoji is replaced by its text name
# instead of being dropped, so it survives as a token of its own.
sample = "I love pizza 😋 but exams make me 😩!"
print(clean_text(sample, map_emojis_to_text=True))

['love', 'pizza', 'face_savoring_food', 'exam', 'make', 'weary_face']


- Spelling Correction

In [9]:
from textblob import TextBlob  # NEW

def clean_text(text,
               lowercase=True,
               expand_contractions=True,
               remove_html=True,
               remove_urls=True,
               remove_punct=True,
               remove_numbers=True,
               remove_stopwords=True,
               lemmatize=True,
               remove_emojis=True,
               map_emojis_to_text=False,
               correct_spelling=False):  # NEW
    """
    Clean a raw string and return its tokens; every step can be toggled.

    Parameters:
    - text: raw input; anything that is not a str yields an empty list
    - lowercase: lowercase the text first
    - expand_contractions: expand contractions with contractions.fix()
    - remove_html: drop anything that looks like an HTML tag (<...>)
    - remove_urls: drop runs of non-space characters starting with 'http' or 'www'
    - remove_punct: drop characters that are neither word characters nor whitespace
    - remove_numbers: drop runs of digits
    - remove_stopwords: drop English stopwords (compared case-insensitively)
    - lemmatize: lemmatize each remaining token
    - remove_emojis: delete emojis (ignored when map_emojis_to_text is True)
    - map_emojis_to_text: replace each emoji by its text name instead of
      deleting it; takes precedence over remove_emojis
    - correct_spelling: run TextBlob spelling correction on the final tokens
      (applied last, after stopword removal and lemmatization)

    Returns: List of clean tokens
    """
    if not isinstance(text, str):
        return []

    if lowercase:
        text = text.lower()

    if expand_contractions:
        text = contractions.fix(text)

    if map_emojis_to_text:
        # The padding spaces keep an emoji name from being glued to a
        # neighbouring word or emoji once the colons are stripped as
        # punctuation (e.g. "😂😂" would become one long token).
        text = emoji.demojize(text, delimiters=(" :", ": "))
    elif remove_emojis:
        text = emoji.replace_emoji(text, replace='')

    if remove_html:
        text = re.sub(r'<.*?>', '', text)

    if remove_urls:
        text = re.sub(r'http\S+|www\S+', '', text)

    if remove_punct:
        text = re.sub(r'[^\w\s]', '', text)

    if remove_numbers:
        text = re.sub(r'\d+', '', text)

    tokens = word_tokenize(text)

    if remove_stopwords:
        # stop_words is all lowercase; compare on the lowercased token so that
        # "The" is still removed when lowercase=False.
        tokens = [t for t in tokens if t.lower() not in stop_words]

    if lemmatize:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    # Nothing to correct when every token has been filtered out.
    if correct_spelling and tokens:
        # Join tokens, correct spelling, re-tokenize
        blob = TextBlob(" ".join(tokens))
        corrected = blob.correct()
        tokens = word_tokenize(str(corrected))

    return tokens

In [10]:
# Spelling correction is best-effort: in the recorded output below,
# 'reely' becomes 'reply' and 'pizzza' is left unchanged.
text = "I reely loveee delicius pizzza 🍕 in Itly!"
print(clean_text(text, correct_spelling=True))

['reply', 'love', 'delicious', 'pizzza', 'italy']


- Unicode Normalization & Whitespace Cleanup

In [11]:
import unicodedata

def clean_text(text,
               lowercase=True,
               expand_contractions=True,
               remove_html=True,
               remove_urls=True,
               remove_punct=True,
               remove_numbers=True,
               remove_stopwords=True,
               lemmatize=True,
               remove_emojis=True,
               map_emojis_to_text=False,
               correct_spelling=False):
    """
    Clean a raw string and return its tokens; every optional step can be toggled.

    Unicode normalization (NFKC) and whitespace collapsing always run first.

    Parameters:
    - text: raw input; anything that is not a str yields an empty list
    - lowercase: lowercase the text
    - expand_contractions: expand contractions with contractions.fix()
    - remove_html: drop anything that looks like an HTML tag (<...>)
    - remove_urls: drop runs of non-space characters starting with 'http' or 'www'
    - remove_punct: drop characters that are neither word characters nor whitespace
    - remove_numbers: drop runs of digits
    - remove_stopwords: drop English stopwords (compared case-insensitively)
    - lemmatize: lemmatize each remaining token
    - remove_emojis: delete emojis (ignored when map_emojis_to_text is True)
    - map_emojis_to_text: replace each emoji by its text name instead of
      deleting it; takes precedence over remove_emojis
    - correct_spelling: run TextBlob spelling correction on the final tokens
      (slow; applied last, after stopword removal and lemmatization)

    Returns: List of clean tokens
    """
    # Guard before any string operation: unicodedata.normalize() raises
    # TypeError on non-str input instead of letting us return [].
    if not isinstance(text, str):
        return []

    # 1. Unicode normalization & whitespace cleanup
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r'\s+', ' ', text).strip()

    # 2. Lowercase
    if lowercase:
        text = text.lower()

    # 3. Expand contractions
    if expand_contractions:
        text = contractions.fix(text)

    # 4. Emoji handling
    if map_emojis_to_text:
        # The padding spaces keep an emoji name from being glued to a
        # neighbouring word or emoji once the colons are stripped as
        # punctuation (e.g. "😂😂" would become one long token).
        text = emoji.demojize(text, delimiters=(" :", ": "))
    elif remove_emojis:
        text = emoji.replace_emoji(text, replace='')

    # 5. Remove HTML tags
    if remove_html:
        text = re.sub(r'<.*?>', '', text)

    # 6. Remove URLs
    if remove_urls:
        text = re.sub(r'http\S+|www\S+', '', text)

    # 7. Remove punctuation
    if remove_punct:
        text = re.sub(r'[^\w\s]', '', text)

    # 8. Remove numbers
    if remove_numbers:
        text = re.sub(r'\d+', '', text)

    # 9. Tokenize
    tokens = word_tokenize(text)

    # 10. Remove stopwords
    if remove_stopwords:
        # stop_words is all lowercase; compare on the lowercased token so that
        # "The" is still removed when lowercase=False.
        tokens = [t for t in tokens if t.lower() not in stop_words]

    # 11. Lemmatization
    if lemmatize:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    # 12. Spelling correction (slow)
    if correct_spelling and tokens:
        blob = TextBlob(" ".join(tokens))
        corrected = blob.correct()
        tokens = word_tokenize(str(corrected))

    return tokens

In [12]:
# Input with curly quotes, an em-dash, a zero-width space (\u200b), a tab and newlines.
# Stopwords are kept (remove_stopwords=False) so the effect on every word stays visible.
weird_text = "“Tēxt ” with— strange\u200b characters\tand odd spaces.\n\nNewlines."
print(clean_text(weird_text, remove_stopwords=False))

['tēxt', 'with', 'strange', 'character', 'and', 'odd', 'space', 'newlines']


### Batch-Cleaning

In [13]:
from tqdm import tqdm
# This call makes progress_apply available on pandas objects;
# clean_dataframe_column() uses it to show a progress bar while cleaning.
tqdm.pandas()  # Enable progress bar on DataFrame operations

def clean_dataframe_column(df, column, **cleaner_kwargs):
    """
    Apply clean_text() to a DataFrame column.

    Parameters:
    - df: pandas DataFrame (modified in place: the new column is added to it)
    - column: column name containing raw text
    - cleaner_kwargs: keyword arguments passed to clean_text()

    Returns:
    - df: the same DataFrame with a new column: column_cleaned.
      Missing values (None / NaN) become an empty token list; any other
      non-string value is converted with str() before cleaning.
    """
    import pandas as pd  # local import: this cell runs before the notebook imports pandas

    def _clean_cell(value):
        # str(NaN) is the literal text "nan" and str(None) is "None"; cleaning
        # those would invent tokens, so treat missing values as "no text".
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return []
        return clean_text(str(value), **cleaner_kwargs)

    cleaned_col = f"{column}_cleaned"
    df[cleaned_col] = df[column].progress_apply(_clean_cell)
    return df

In [14]:
import pandas as pd

# Three small rows covering HTML tags, a URL, emojis, repeated spaces,
# an em-dash, accented characters and contractions.
data = {
    'raw_text': [
        "This is a <b>test</b> sentence with a link: https://t.co/abc123 😊",
        "Here's another line — with emojis 😂😂 and   whitespace!",
        "The café's crème brûlée wasn't bad... at all!",
    ]
}

df = pd.DataFrame(data)

# Clean it!
# remove_stopwords and remove_emojis already default to True in clean_text();
# they are spelled out here to show how options are forwarded.
df = clean_dataframe_column(df, 'raw_text', remove_stopwords=True, remove_emojis=True)

100%|██████████| 3/3 [00:00<00:00, 119.24it/s]


In [15]:
# Show each raw text next to its cleaned token list (raw_text_cleaned)
df.head()

Unnamed: 0,raw_text,raw_text_cleaned
0,This is a <b>test</b> sentence with a link: ht...,"[test, sentence, link]"
1,Here's another line — with emojis 😂😂 and whi...,"[another, line, emojis, whitespace]"
2,The café's crème brûlée wasn't bad... at all!,"[cafés, crème, brûlée, bad]"
