# DATA 304 — Session 2: Text Normalization

This notebook mirrors the **Text Normalization** session and provides runnable examples.

## 0. Setup

In [None]:
# Core libs
import re, unicodedata, math, random, string
from collections import Counter

# Data
import pandas as pd
import numpy as np

# NLP dependencies — uncomment and run the lines below once if these packages are not installed
# !pip -q install contractions spacy pyspellchecker
# !python -m spacy download en_core_web_sm
# !pip install nltk

## 1. Tiny sample dataset

In [None]:
# Sample documents. Between them they contain contractions, mixed case,
# an em dash, accented letters, trailing spaces, emoji and misspellings.
raw_texts = [
    "They're RUNNING—résumé!  ",
    "Data, data; DATA.",
    "I can't, you won't.  ",
    "The leaves were falling and mice ran",
    "ML Engineer / Data Scientist II",
    "I love data 🧪📊!  ",
    "colur is my favrite",
]

# One row per document, keyed by a 1-based id.
df = pd.DataFrame(
    list(enumerate(raw_texts, start=1)),
    columns=["id", "text"],
)

df

## 2. Lowercasing

In [None]:
# Lowercase every document with the vectorized string accessor.
df = df.assign(lower=df['text'].str.lower())
df.loc[:, ['id', 'text', 'lower']]

## 3. Remove punctuation

In [None]:
# Every character that is neither a word character nor whitespace becomes a
# single space, so the words on either side of a symbol stay separated.
# Apostrophes and emoji are not word characters, so they are replaced too.
punct_as_space = df['lower'].str.replace(r"[^\w\s]", " ", regex=True)
df = df.assign(no_punct=punct_as_space)
df.loc[:, ['id', 'lower', 'no_punct']]

## 4. Normalize whitespace

In [None]:
# Collapse each run of whitespace to one space, then trim both ends.
collapsed = df['no_punct'].str.replace(r"\s+", " ", regex=True)
df = df.assign(ws_norm=collapsed.str.strip())
df.loc[:, ['id', 'no_punct', 'ws_norm']]

## 5. Expand contractions

In [None]:
try:
    import contractions
except Exception:
    print("Install the 'contractions' package to run this cell: pip install contractions")
    raise

# Contractions can only be recognised while their apostrophes are still there.
# 'ws_norm' has already had them replaced by spaces ("they re", "can t"), so
# expanding that column changes nothing. Expand from the lowercased text
# instead, then repeat the punctuation and whitespace clean-up so 'expanded'
# is at the same stage of cleaning as 'ws_norm'.
df['expanded'] = (
    df['lower']
      .apply(contractions.fix)
      .str.replace(r"[^\w\s]", " ", regex=True)
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
)
df[['id','ws_norm','expanded']]

## 6. Strip accents

In [None]:
import unicodedata
def strip_accents(s: str) -> str:
    """Return ``s`` with all combining marks removed.

    The string is first decomposed with Unicode NFKD, which separates base
    characters from their combining marks (and also applies compatibility
    mappings). Every code point whose combining class is non-zero is then
    dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    kept = []
    for ch in decomposed:
        # combining() returns 0 for characters that are not combining marks.
        if unicodedata.combining(ch) == 0:
            kept.append(ch)
    return "".join(kept)

# Apply the accent stripper to each contraction-expanded document.
df = df.assign(no_accents=df['expanded'].map(strip_accents))
df.loc[:, ['id', 'expanded', 'no_accents']]

## 7. Remove emojis

In [None]:
import re
# Code points removed as emoji:
#   U+10000–U+10FFFF  every supplementary-plane character; this is where most
#                     emoji live, though the range holds non-emoji as well
#   U+2600–U+27BF     Miscellaneous Symbols and Dingbats (e.g. ☀ ✨ ❤)
#   U+2B00–U+2BFF     Miscellaneous Symbols and Arrows (e.g. ⭐)
# A variation selector (U+FE0F) or zero-width joiner (U+200D) directly after
# one of those characters is removed with it, so sequences such as "❤️" do not
# leave invisible characters behind. Joiners elsewhere in the text are kept.
emoji_pat = re.compile(
    r"[\U00010000-\U0010FFFF\u2600-\u27BF\u2B00-\u2BFF][\uFE0F\u200D]*"
)
df['no_emoji'] = df['no_accents'].str.replace(emoji_pat, "", regex=True)
df[['id','no_accents','no_emoji']]

## 8. Lemmatization (spaCy)

In [None]:
import spacy
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Loading failed: try to download the model with the interpreter that is
    # running this kernel, then load again. The download's exit status is not
    # checked, so a failed download surfaces as an error from the second load.
    print("Downloading spaCy model 'en_core_web_sm'...")
    import subprocess, sys
    download_cmd = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    subprocess.run(download_cmd, check=False)
    nlp = spacy.load("en_core_web_sm")


def lemmatize(s: str) -> str:
    """Run ``s`` through the loaded spaCy pipeline and join the token lemmas with spaces."""
    lemmas = [tok.lemma_ for tok in nlp(s)]
    return " ".join(lemmas)


# Lemmatize the emoji-free text, one document at a time.
df['lemma'] = df['no_emoji'].map(lemmatize)
df.loc[:, ['id', 'no_emoji', 'lemma']]

## 9. Stemming (NLTK PorterStemmer)

In [None]:
try:
    from nltk.stem import PorterStemmer
except Exception as e:
    print("Install NLTK to run this cell: pip install nltk")
    raise

ps = PorterStemmer()

# Split each document on whitespace, stem every word, and re-join with spaces.
words_per_doc = df['no_emoji'].str.lower().str.split()
df['stem'] = words_per_doc.apply(lambda words: " ".join(map(ps.stem, words)))
df.loc[:, ['id', 'no_emoji', 'stem']]

## 10. Standardize spelling (pyspellchecker)

In [None]:
try:
    from spellchecker import SpellChecker
except Exception as e:
    print("Install pyspellchecker to run this cell: pip install pyspellchecker")
    raise

sp = SpellChecker(distance=1)


def correct_spelling(s: str) -> str:
    """Spell-correct ``s`` one whitespace-separated token at a time.

    Each token is replaced by ``sp.correction(token)``; when that call returns
    a falsy value (no suggestion) the original token is kept. The corrected
    tokens are joined with single spaces.
    """
    fixed = []
    for word in s.split():
        suggestion = sp.correction(word)
        fixed.append(suggestion if suggestion else word)
    return " ".join(fixed)


# Correct the lemmatized text.
df['spcorr'] = df['lemma'].map(correct_spelling)
df.loc[:, ['id', 'lemma', 'spcorr']]

## 11. Pipeline: putting it together

In [None]:
import re
import contractions
def normalize_pipeline(text: str, nlp_model) -> str:
    """Normalize one document end to end and return its space-joined lemmas.

    Steps, in order:
      1. strip accents (NFKD decomposition, combining marks dropped)
      2. lowercase
      3. expand contractions (apostrophes are still present at this point)
      4. replace every non-word, non-whitespace character with a space
      5. collapse whitespace runs and trim
      6. lemmatize with ``nlp_model``

    Accent stripping runs first on purpose. Combining marks are not matched by
    ``\\w``, so if the punctuation pass ran before them a decomposed "é"
    ("e" + U+0301), or the "i" + U+0307 that ``"İ".lower()`` produces, would be
    split into "e " / "i " in the middle of a word. Running NFKD before the
    lowercase and punctuation passes also means anything a compatibility
    mapping introduces ("™" -> "TM", "½" -> "1⁄2") is still lowercased and
    cleaned.

    Parameters
    ----------
    text : str
        Raw document.
    nlp_model : callable
        Called as ``nlp_model(cleaned_text)``; must return an iterable of
        tokens exposing a ``lemma_`` attribute (e.g. a loaded spaCy pipeline).

    Returns
    -------
    str
        The lemmas of the cleaned text joined by single spaces.
    """
    t = strip_accents(text)
    t = t.lower()
    t = contractions.fix(t)
    t = re.sub(r"[^\w\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    doc = nlp_model(t)
    return " ".join(tok.lemma_ for tok in doc)

# Run the whole pipeline on the raw text with the spaCy model loaded earlier.
df['norm'] = df['text'].apply(lambda s: normalize_pipeline(s, nlp))
df[['id','text','norm']]

## 12. Measuring impact of normalization

In [None]:
import re
from collections import Counter

def tokenize(s: str):
    """Lowercase ``s`` and return its runs of word characters as a list of tokens."""
    lowered = s.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]

def charset_size(texts):
    """Count the distinct characters used across all strings in ``texts``."""
    combined = "".join(texts)
    return len({ch for ch in combined})

# Documents before (raw text) and after (pipeline output) normalization.
raw = df['text'].tolist()
clean = df['norm'].tolist()

# Unique docs
unique_before, unique_after = len(set(raw)), len(set(clean))

# Frequencies (tokenize() lowercases, so the raw side is counted case-insensitively)
freq_before = Counter()
for doc_text in raw:
    freq_before.update(tokenize(doc_text))
freq_after = Counter()
for doc_text in clean:
    freq_after.update(tokenize(doc_text))

# Charset
chars_before, chars_after = charset_size(raw), charset_size(clean)

# Effects
vocab_before, vocab_after = set(freq_before), set(freq_after)
# max(1, ...) guards both ratios against an empty vocabulary.
vocab_shrink = 1 - len(vocab_after) / max(1, len(vocab_before))
shared_vocab = vocab_before & vocab_after
all_vocab = vocab_before | vocab_after
jaccard_vocab = len(shared_vocab) / max(1, len(all_vocab))

metrics = {
    "unique_docs_before": unique_before,
    "unique_docs_after": unique_after,
    "tokens_before": sum(freq_before.values()),
    "tokens_after": sum(freq_after.values()),
    "vocab_size_before": len(vocab_before),
    "vocab_size_after": len(vocab_after),
    "vocab_shrink_fraction": round(vocab_shrink, 3),
    "jaccard_vocab_overlap": round(jaccard_vocab, 3),
    "charset_size_before": chars_before,
    "charset_size_after": chars_after,
    "charset_reduction": chars_before - chars_after,
}
summary = pd.Series(metrics, name="value").to_frame()

summary

## 13. Visualize vocabulary change

In [None]:
import matplotlib.pyplot as plt

# Bar heights are read from the summary table built in the previous section.
labels = ["Before", "After"]
sizes = [summary.at[key, 'value'] for key in ('vocab_size_before', 'vocab_size_after')]

fig, ax = plt.subplots()
ax.bar(labels, sizes)
ax.set_title("Vocabulary Size: Before vs After")
ax.set_xlabel("Stage")
ax.set_ylabel("Unique tokens")
plt.show()

## 14. Semantic normalization: dictionary-based mapping

In [None]:
# Raw job titles with inconsistent case, seniority words and level suffixes.
roles = pd.Series(
    [
        "Data Scientist II",
        "Senior Data Analyst",
        "ML Engineer",
        "machine learning engineer",
        "data analyst iii",
        "Director of Data Science",
    ],
    name="role_clean",
)

# Cleaned title -> short standard code. Only whole-string matches are mapped.
map_roles = {
    "data scientist": "ds",
    "data analyst": "da",
    "machine learning engineer": "ml",
}


def clean_role_simple(s: str) -> str:
    """Lowercase a title, blank out unwanted characters, and tidy whitespace.

    Everything other than ASCII letters, whitespace, "/", "+" and "-" is
    replaced by a space; whitespace runs are then collapsed and the ends
    trimmed.
    """
    letters_only = re.sub(r"[^a-zA-Z\s/+-]", " ", s.lower())
    return re.sub(r"\s+", " ", letters_only).strip()


role_norm = roles.map(clean_role_simple)

# Exact mapping, then fallback: any value that is not one of the standard
# codes after the replacement is labelled "other".
replaced = role_norm.replace(map_roles)
is_known_code = replaced.isin(list(map_roles.values()))
mapped = replaced.where(is_known_code, other="other")
pd.DataFrame({"raw": roles, "cleaned": role_norm, "role_std": mapped})

## 15. Regex-based semantic mapping

In [None]:
# Regex -> standard code. Each pattern is anchored at both ends and allows an
# optional suffix after the role name (e.g. a level such as "ii").
patterns = {
    r'^data\s+scientist(\b.*)?$': 'ds',
    r'^data\s+analyst(\b.*)?$': 'da',
    r'^(ml|m/?l|machine\s*learning)\s+engineer(\b.*)?$': 'ml',
}

# Series.replace leaves values that match no pattern unchanged; it never
# produces NaN, so a trailing fillna("other") would not act as a fallback and
# titles such as "senior data analyst" would pass through as-is. Keep only
# values that ended up as one of the standard codes and label everything else
# (including missing values) "other".
replaced = role_norm.replace(patterns, regex=True)
is_known_code = replaced.isin(list(patterns.values()))
role_std_regex = replaced.where(is_known_code, other="other")


display(pd.DataFrame({
    "raw": roles,
    "cleaned": role_norm,
    "role_std": role_std_regex
}))

## 16. Multi-field cleaning workflow demo

In [None]:
demo = pd.DataFrame({
    "address": ["123 Main St, Knoxville, TN 37996", "456 2nd Ave, Nashville, TN 37209"],
    "city_ref": ["knoxville", "nashville"]
})

# Quick profile of the text columns before any parsing.
profile = demo.describe(include='object')
display(profile)

# City: the run of letters/spaces between two commas. The capture group sits
# after the leading comma so the extracted value is "Knoxville", not
# ", Knoxville".
demo['city'] = demo['address'].str.extract(r',\s*([A-Za-z\s]+),').iloc[:,0].str.strip()
# ZIP: five digits at the very end of the address (an optional "-1234" suffix
# is allowed but not captured). Anchoring at the end stops a five-digit street
# number from being taken as the ZIP.
demo['zip']  = demo['address'].str.extract(r'(?<!\d)(\d{5})(?:-\d{4})?\s*$').iloc[:,0]
# Normalized join key: lowercase, single-spaced, trimmed.
demo['city_norm'] = demo['city'].str.lower().str.replace(r"\s+", " ", regex=True).str.strip()

# Left-join against the reference list; the '_merge' indicator column shows
# which rows found a match.
ref = pd.DataFrame({"city_norm": ["knoxville", "nashville"], "valid_city": [True, True]})
demo = demo.merge(ref, on="city_norm", how="left", indicator=True)
demo

## 17. Performance tips and quick profiling

In [None]:
# 20,000 documents drawn with replacement from the sample texts. The fixed
# random_state makes the benchmark input, and therefore the memory figures
# reported below, identical on every run.
big = (
    df['text']
      .sample(n=20000, replace=True, random_state=42)
      .reset_index(drop=True)
      .to_frame("txt")
)

def vec_clean(s: pd.Series) -> pd.Series:
    """Clean a Series of strings using pandas' vectorized ``.str`` methods.

    Lowercases, replaces every non-word, non-whitespace character with a
    space, collapses whitespace runs to one space, and trims both ends.
    """
    lowered = s.str.lower()
    no_punct = lowered.str.replace(r"[^\w\s]", " ", regex=True)
    single_spaced = no_punct.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()

def py_apply_clean(s: pd.Series) -> pd.Series:
    """Clean a Series of strings element by element with ``re`` and ``Series.apply``.

    Performs the same steps as the vectorized version: lowercase, replace
    non-word/non-whitespace characters with a space, collapse whitespace,
    trim.
    """
    def clean_one(value):
        no_punct = re.sub(r"[^\w\s]", " ", value.lower())
        return re.sub(r"\s+", " ", no_punct).strip()

    return s.apply(clean_one)

# Time each implementation on the same column with the %timeit line magic.
ipython = get_ipython()
for banner, statement in (
    ("Vectorized:", "vec_clean(big['txt'])"),
    ("Python apply:", "py_apply_clean(big['txt'])"),
):
    print(banner)
    ipython.run_line_magic('timeit', statement)

# Per-column memory footprint, counting the string payloads (deep=True).
big.memory_usage(deep=True)