# Text Preprocessing Techniques

1. Lowercase Conversion

In [1]:
def lowercase_text(text):
    return text.lower()

text = "Hello World! This is an Example."
lowercase_text = lowercase_text(text)
print(lowercase_text)  # hello world! this is an example.

hello world! this is an example.


2. Stop Word Removal

In [None]:
# !pip install nltk -q --user
import nltk
print(nltk.__version__)

3.8.1


In [5]:
# download NLTK stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Amr osama
[nltk_data]     abdellatif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token.lower() not in stop_words]
tokens = ['hello', 'world', 'this', 'is','The', 'an', 'example']
filtered_tokens = remove_stopwords(tokens)
print(filtered_tokens)  # ['hello', 'world', 'example']

['hello', 'world', 'example']


3. Punctuation Removal

4. Regular Expressions (Regex)

In [None]:
import string

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

text = "Hello, world! This is an example: with punctuation."
clean_text = remove_punctuation(text)
print(clean_text)  # "Hello world This is an example with punctuation"

Hello world This is an example with punctuation


In [7]:
# Match Simple Text
import re

text = "Python is fun"
match = re.search("Python", text)
print(match.group())  # Python

Python


In [12]:
# Match Beginning and End

# ^ matches start of string, $ matches end
text = "Python is amazing"
start_match = re.search("^Python", text)
# in case of error means that re.search("^Python", text) did not find a match,
#  so it returned None, and you're trying to call .group() on None.
# print(start_match)
print(start_match.group())  # Python

end_match = re.search("amazing$", text)
print(end_match.group())  # amazing

Python
amazing


In [14]:
# Match Digits

text = "I have 3 apples and 35 oranges"
digits = re.findall(r"\d", text)  # r prefix creates a raw string
print(digits)  # ['3', '5']

text = "I have 3 apples and 35 oranges"
# \d+ matches one or more digits
numbers = re.findall(r"\d+", text)
print(numbers)  # ['3', '5']

['3', '3', '5']
['3', '35']


In [5]:
# Match Word Characters

text = "user_123 has logged in"
# \w matches alphanumeric + underscore
word_chars = re.findall(r"\w+", text)
print(word_chars)  # ['user_123', 'has', 'logged', 'in']

['user_123', 'has', 'logged', 'in']


In [None]:
# matching zero or more 

text = "color colour colouur"
pattern = re.findall(r"colou?r", text)  # ? means 0 or 1 of previous character
print(pattern)  # ['color', 'colour']

['color', 'colour']


In [8]:
# Match One or More

text = "I loooove Python"
pattern = re.findall(r"lo+ve", text)  # + means 1 or more of previous character
print(pattern)  # ['loooove']

['loooove']


In [20]:
# Match Exact Number


text = "Phone numbers: 555-1234 and 555678-5678"
pattern = re.findall(r"\d{3}-\d{4}", text)  # {n} means exactly n occurrences
print(pattern)  # ['555-1234', '555-5678']

['555-1234', '678-5678']


In [12]:
# Match Any of Several Characters

text = "The cat and the rat sat on the mat"
pattern = re.findall(r"[cr]at", text)  # matches 'cat' or 'rat'
print(pattern)  # ['cat', 'rat']

['cat', 'rat']


In [13]:
# Match Range of Characters

text = "a1b2c3D4E5"
letters = re.findall(r"[a-z]", text)  # lowercase letters
print(letters)  # ['a', 'b', 'c']

uppercase = re.findall(r"[A-Z]", text)  # uppercase letters
print(uppercase)  # ['D', 'E']

alphanumeric = re.findall(r"[a-zA-Z0-9]", text)  # all alphanumeric
print(alphanumeric)  # ['a', '1', 'b', '2', 'c', '3', 'D', '4', 'E', '5']

['a', 'b', 'c']
['D', 'E']
['a', '1', 'b', '2', 'c', '3', 'D', '4', 'E', '5']


In [None]:
# Email Validation
# . (unescaped dot) = matches ANY character except newline (wildcard)
# \. (escaped dot) = matches a literal dot character
# {2,} at least 2 chars

emails = ["user@example.com", "invalid@email", "name.last@domain.co.uk"]
pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

for email in emails:
    if re.match(pattern, email):
        print(f"{email} is valid")
    else:
        print(f"{email} is invalid")
# user@example.com is valid
# invalid@email is invalid
# name.last@domain.co.uk is valid

user@example.com is valid
invalid@email is invalid
name.last@domain.co.uk is valid


In [16]:
# Number Removal/Normalization
# /d is for digits
import re

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def replace_numbers(text, replacement='NUM'):
    return re.sub(r'\d+', replacement, text)

text = "There are 123 apples and 456 oranges."
text_no_numbers = remove_numbers(text)
text_normalized = replace_numbers(text)

print(text_no_numbers)  # "There are  apples and  oranges."
print(text_normalized)  # "There are NUM apples and NUM oranges."

There are  apples and  oranges.
There are NUM apples and NUM oranges.


In [17]:
# Noise Removal
import re

def remove_noise(text):
    # Remove special characters and symbols
    # [^] match that is not a word or white space (spaces, tabs, newlines) char
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove ASCII/Unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    return text

text = "Special @#! characters & unicode like 你好 should    be removed."
clean_text = remove_noise(text)
print(clean_text)  # "Special characters  unicode like  should be removed"

Special characters unicode like  should be removed


In [None]:
# Text Normalization with REGEX

import re

def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Replace URLs
    # \S means "non-whitespace character" (the opposite of \s)
    # \s = whitespace (spaces, tabs, newlines)
    # \S = any character that is NOT whitespace

    text = re.sub(r'https?://\S+|www\.\S+', '[URL]', text)
    
    # Replace emails
    text = re.sub(r'\S+@\S+', '[EMAIL]', text)
    
    # Replace phone numbers
    text = re.sub(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', '[PHONE]', text)
    
    # Replace multiple whitespaces with single
    text = re.sub(r'\s+', ' ', text)
    
    # Replace elongated words (e.g., "hellooooo" -> "hello")
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    
    return text.strip()

text = "Contact us at example@gmail.com or visit https://.example.com or call 123-456-7890"
normalized_text = normalize_text(text)
print(normalized_text)  # "contact us at [EMAIL] or visit [URL] or call [PHONE]"

contact us at [EMAIL] or visit [URL] or call [PHONE]


5. Tokenization

Tokenization is the process of splitting text into smaller pieces, called tokens.
These tokens can be:

Words → Word-level tokenization

Characters → Character-level tokenization

Subwords → Subword-level tokenization (used in models like BERT, GPT)

In [5]:
# Using NLTK
import nltk
nltk.download('punkt')

def tokenize_text(text):
    # Word tokenization
    word_tokens = nltk.word_tokenize(text)
    # Sentence tokenization
    sentence_tokens = nltk.sent_tokenize(text)
    return word_tokens, sentence_tokens

text = "Hello world. How are you today?"
word_tokens, sentence_tokens = tokenize_text(text)
print(word_tokens)  # ['Hello', 'world', '.', 'How', 'are', 'you', 'today', '?']
print(sentence_tokens)  # ['Hello world.', 'How are you today?']


# we can look at letters as well

['Hello', 'world', '.', 'How', 'are', 'you', 'today', '?']
['Hello world.', 'How are you today?']


[nltk_data] Downloading package punkt to C:\Users\Amr osama
[nltk_data]     abdellatif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Bert tokenizer

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("Hello world. How are you today?")
print(tokens)


['hello', 'world', '.', 'how', 'are', 'you', 'today', '?']


In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokens = tokenizer.tokenize("Hello world. How are you today?")
print(tokens)

# The Ġ symbol represents a space before the word (tokenized using byte-level BPE).

['Hello', 'Ġworld', '.', 'ĠHow', 'Ġare', 'Ġyou', 'Ġtoday', '?']


In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokens = tokenizer.tokenize("unbelievability")
print(tokens)
tokens = tokenizer.tokenize("banana ")
print(tokens)


['un', 'bel', 'iev', 'ability']
['ban', 'ana', 'Ġ']


6. Stemming

In [18]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

def stem_words(tokens):
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    snowball = SnowballStemmer('english')
    
    porter_stems = [porter.stem(token) for token in tokens]
    lancaster_stems = [lancaster.stem(token) for token in tokens]
    snowball_stems = [snowball.stem(token) for token in tokens]
    
    return porter_stems, lancaster_stems, snowball_stems

tokens = ['running', 'runs', 'ran', 'easily', 'fairly']
porter_stems, lancaster_stems, snowball_stems = stem_words(tokens)
print(f"Porter: {porter_stems}")    # ['run', 'run', 'ran', 'easili', 'fairli']
print(f"Lancaster: {lancaster_stems}")  # ['run', 'run', 'ran', 'easy', 'fair']
print(f"Snowball: {snowball_stems}")    # ['run', 'run', 'ran', 'easili', 'fair']

Porter: ['run', 'run', 'ran', 'easili', 'fairli']
Lancaster: ['run', 'run', 'ran', 'easy', 'fair']
Snowball: ['run', 'run', 'ran', 'easili', 'fair']


7. Lemmatization

In [19]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatize_words(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

tokens = ['running', 'runs', 'ran', 'better', 'mice']
lemmatized_tokens = lemmatize_words(tokens)
print(lemmatized_tokens)  # ['running', 'run', 'ran', 'better', 'mouse']

[nltk_data] Downloading package wordnet to C:\Users\Amr osama
[nltk_data]     abdellatif\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['running', 'run', 'ran', 'better', 'mouse']


8. Spell Correction

In [20]:
# pip install pyspellchecker spellchecker

from spellchecker import SpellChecker

def correct_spelling(tokens):
    spell = SpellChecker()
    corrected = [spell.correction(token) for token in tokens]
    return corrected

tokens = ['helo', 'wrld', 'example']
corrected_tokens = correct_spelling(tokens)
print(corrected_tokens)  # ['hello', 'world', 'example']

['help', 'world', 'example']


9. Text Normalization with TextBlob

In [24]:
# !pip install textblob -q --user
!python -m textblob.download_corpora

Finished.


[nltk_data] Downloading package brown to C:\Users\Amr osama
[nltk_data]     abdellatif\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt_tab to C:\Users\Amr osama
[nltk_data]     abdellatif\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to C:\Users\Amr osama
[nltk_data]     abdellatif\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Amr osama
[nltk_data]     abdellatif\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to C:\Users\Amr osama
[nltk_data]     abdellatif\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.
[nltk_data] Downloading package movie_reviews to C:\Users\Amr osama
[nltk_data]  

In [25]:
'''TextBlob is a Python library that provides a simple API
for common natural language processing (NLP) tasks.
It's built on top of NLTK (Natural Language Toolkit) and
Pattern, making it easier to perform text analysis without diving
into the complexities of those underlying libraries.
'''



from textblob import TextBlob

def normalize_with_textblob(text):
    blob = TextBlob(text)
    
    # Correct spelling
    corrected = blob.correct()
    
    # Get sentiment
    sentiment = blob.sentiment
    
    # Get noun phrases
    noun_phrases = blob.noun_phrases
    
    return str(corrected), sentiment, noun_phrases

text = "The quik brown fox jumpd over the lazzy dog."
text = "guuod feeling"
corrected, sentiment, noun_phrases = normalize_with_textblob(text)

print(f"Corrected: {corrected}")
print(f"Sentiment: {sentiment}")
print(f"Noun phrases: {noun_phrases}")

# Corrected: The quick brown fox jumped over the lazy dog.
# Sentiment: Sentiment(polarity=0.0, subjectivity=0.0)
# Noun phrases: ['brown fox jumpd', 'lazzy dog']

Corrected: good feeling
Sentiment: Sentiment(polarity=0.0, subjectivity=0.0)
Noun phrases: ['guuod feeling']


10. Named Entity Recognition (NER)

In [26]:
# pip intall spacy
# python -m spacy download en_core_web_sm
import spacy

def extract_entities(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

text = "Apple is looking at buying U.K. startup for $1 billion"
entities = extract_entities(text)
print(entities)  # [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]

[('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]


11. Text Cleaning (HTML/XML tags)


In [27]:
import re
from bs4 import BeautifulSoup

def clean_html(html_text):
    # Using BeautifulSoup
    soup = BeautifulSoup(html_text, "html.parser")
    clean_text = soup.get_text(separator=" ", strip=True)
    return clean_text

def clean_html_regex(html_text):
    # Using regex
    clean_text = re.sub(r'<.*?>', '', html_text)
    return clean_text

html = "<div><p>This is <b>sample</b> HTML text?.</p></div>"
clean_bs = clean_html(html)
clean_re = clean_html_regex(html)

print(clean_bs)  # "This is sample HTML text."
print(clean_re)  # "This is sample HTML text."

This is sample HTML text?.
This is sample HTML text?.


12. Contractions Expansion

In [9]:
# pip install contractions
import contractions

def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text

text = "I can't do this and I won't try it."
expanded = expand_contractions(text)
print(expanded)  # "I cannot do this and I will not try it."

I cannot do this and I will not try it.


18. Language Detection and Translation


In [30]:
!pip install googletrans==4.0.0rc1 -q --user
!pip install langdetect -q --user
# restart kernel


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.5 requires httpx>=0.27.0, but you have httpx 0.13.3 which is incompatible.
cohere 5.9.1 requires httpx>=0.21.2, but you have httpx 0.13.3 which is incompatible.
gotrue 2.9.3 requires httpx[http2]<0.28,>=0.26, but you have httpx 0.13.3 which is incompatible.
gradio 5.4.0 requires httpx>=0.24.1, but you have httpx 0.13.3 which is incompatible.
gradio 5.4.0 requires tomlkit==0.12.0, but you have tomlkit 0.12.5 which is incompatible.
gradio 5.4.0 requires typer<1.0,>=0.12; sys_platform != "emscripten", but you have typer 0.9.4 which is incompatible.
gradio-client 1.4.2 requires httpx>=0.24.1, but you have httpx 0.13.3 which is incompatible.
langchain-cohere 0.2.0 requires langchain-core<0.3,>=0.2.24, but you have langchain-core 0.3.66 which is incompatible.
langgraph-sdk 0.1.70 requires httpx>=0.25.2, but

In [31]:

from langdetect import detect
from googletrans import Translator

def detect_and_translate(text, target_lang='en'):
    # Detect language
    source_lang = detect(text)
    
    # Translate text
    translator = Translator()
    translation = translator.translate(text, src=source_lang, dest=target_lang)
    
    return source_lang, translation.text

text = "Bonjour le monde"
source, translation = detect_and_translate(text)
print(f"Detected language: {source}")
print(f"Translation: {translation}")

# Detected language: fr
# Translation: Hello world

Detected language: fr
Translation: Hello world


19. Custom Vocabulary Creation


In [None]:
from collections import Counter

def create_vocabulary(texts, min_freq=2, max_vocab_size=10000):
    # Tokenize all texts
    all_tokens = []
    for text in texts:
        tokens = nltk.word_tokenize(text.lower())
        all_tokens.extend(tokens)
    
    # Count frequency
    token_counts = Counter(all_tokens)
    
    # Filter by frequency and vocabulary size
    vocab = {token: count for token, count in token_counts.most_common(max_vocab_size) 
             if count >= min_freq}
    
    # Create mapping dictionaries
    token2id = {token: idx for idx, (token, _) in enumerate(vocab.items())}
    id2token = {idx: token for token, idx in token2id.items()}
    
    return vocab, token2id, id2token

texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

vocab, token2id, id2token = create_vocabulary(texts, min_freq=2)
print("Vocabulary:", vocab)
print("Token to ID mapping:", token2id)

Vocabulary: {'this': 4, 'is': 4, 'the': 4, 'document': 4, '.': 3, 'first': 2}
Token to ID mapping: {'this': 0, 'is': 1, 'the': 2, 'document': 3, '.': 4, 'first': 5}


Assignment

20. Comprehensive Preprocessing Pipeline
