# Regex & Data Cleaning

This notebook covers regex patterns and text cleaning techniques.

## Topics:
1. Regex basics
2. Common patterns (phone, email, URLs)
3. PII removal
4. Text normalization
5. Pattern extraction

In [None]:
import re
import pandas as pd

# Demo input that mixes ordinary prose with PII: two e-mail addresses,
# two Lithuanian-style phone numbers and one URL. The value starts and
# ends with a newline.
sample_text = (
    "\n"
    "Please contact john.doe@company.com or call +370 1234567.\n"
    "Alternative: 81234567 or jane.smith@company.com\n"
    "Visit https://support.company.com/ticket/123\n"
)

print(sample_text)

## 1. Regex Basics

Key patterns:
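- `\d` - digit ([0-9]; for `str` patterns also other Unicode digits unless `re.ASCII` is set)
- `\w` - word char ([a-zA-Z0-9_]; for `str` patterns also Unicode letters/digits unless `re.ASCII` is set)
- `\s` - whitespace
- `.` - any char except newline (unless `re.DOTALL` is set)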
- `+` - one or more
- `*` - zero or more
- `?` - zero or one
- `{n}` - exactly n
- `{n,m}` - n to m times
- `[]` - character class
- `()` - capture group
- `|` - alternation (OR)

In [None]:
# Collect every maximal run of consecutive digits, in order of appearance
digits = [hit.group() for hit in re.finditer(r'\d+', sample_text)]
print("Digits found:", digits)

In [None]:
# Split the text into word tokens (runs of \w between word boundaries);
# only the first ten tokens are printed
words = [token.group(0) for token in re.finditer(r'\b\w+\b', sample_text)]
print("Words:", words[:10])

## 2. Phone Number Patterns

In [None]:
# Lithuanian phone numbers in two spellings:
#   "+370" + optional whitespace + 7 digits, or "8" + 7 digits
phone_pattern = r'\+370\s?\d{7}|8\d{7}'

phones = re.compile(phone_pattern).findall(sample_text)
print("Phones found:", phones)

In [None]:
# A looser, multi-format phone pattern. It is written for re.VERBOSE, so the
# whitespace and the "# ..." notes inside the string are ignored by the engine.
phone_pattern_extended = r'''
    (?:\+\d{1,3}[\s.-]?)?  # International prefix (optional)
    (?:\(?\d{1,4}\)?[\s.-]?)?  # Area code (optional)
    \d{3,4}[\s.-]?  # First group
    \d{3,4}[\s.-]?  # Second group
    \d{0,4}  # Last group (optional)
'''

phone_regex = re.compile(phone_pattern_extended, re.VERBOSE)

test_phones = [
    "+370 1234567",
    "81234567",
    "+1 (555) 123-4567",
    "555.123.4567"
]

# search() succeeds if the pattern occurs anywhere in the candidate string
for candidate in test_phones:
    verdict = 'Match' if phone_regex.search(candidate) else 'No match'
    print(f"{candidate}: {verdict}")

## 3. Email Patterns

In [None]:
# Simple e-mail shape: local part, "@", domain, "." and a TLD of 2+ letters
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

emails = re.compile(email_pattern).findall(sample_text)
print("Emails found:", emails)

In [None]:
# Same e-mail shape as above, but each part is captured under a name
# (local / domain / tld) so it can be read back as a dict
email_parse_pattern = r'(?P<local>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+)\.(?P<tld>[a-zA-Z]{2,})'

for address in emails:
    parts = re.match(email_parse_pattern, address)
    if parts is None:
        continue
    print(parts.groupdict())

## 4. URL Patterns

In [None]:
# "http://" or "https://" followed by a run of characters that are not
# whitespace, '<', '>' or '"'
url_pattern = r'https?://[^\s<>"]+'

urls = [hit.group() for hit in re.finditer(url_pattern, sample_text)]
print("URLs found:", urls)

In [None]:
# Capture the part after the scheme (and an optional "www.") up to the
# next "/" as group 1
domain_pattern = r'https?://(?:www\.)?([^/]+)'

for link in urls:
    found = re.match(domain_pattern, link)
    if found is None:
        continue
    print(f"Domain: {found.group(1)}")

## 5. PII Removal Function

In [None]:
def remove_pii(text: str) -> str:
    """
    Replace PII (Personally Identifiable Information) in text with placeholders.

    Substitutions, in the order they are applied:
    1. URLs                      -> '[URL]'
    2. Email addresses           -> '[EMAIL]'
    3. Lithuanian phone numbers  -> '[PHONE]'
       ('+370' + optional whitespace + 7 digits, or '8' + 7 digits)

    The order matters: an email whose local part looks like a phone number
    (e.g. '81234567@company.com') must be consumed by the email rule first.
    Otherwise the phone rule rewrites the local part, the email rule no
    longer matches, and the domain is left in the output.

    Args:
        text: Text that may contain URLs, emails or phone numbers.

    Returns:
        The text with every match replaced by its placeholder.
    """
    # URLs first: they may embed email-like or digit-heavy fragments
    text = re.sub(r'https?://[^\s<>"]+', '[URL]', text)

    # Email addresses before phones (see docstring)
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '[EMAIL]', text)

    # Phone numbers (Lithuanian format)
    text = re.sub(r'\+370\s?\d{7}', '[PHONE]', text)
    text = re.sub(r'\b8\d{7}\b', '[PHONE]', text)

    return text

# Run the redaction over the demo text and show the result under a heading
cleaned = remove_pii(sample_text)
print("Cleaned text:", cleaned, sep="\n")

## 6. Category Prefix Removal

In [None]:
# Example ticket description whose first lines repeat the ticket category
text_with_category = (
    "Category:\n"
    "    Software Installation\n"
    "    I need help installing the software."
)

print("Original:", text_with_category, sep="\n")

In [None]:
def remove_category_prefix(text: str, category: str) -> str:
    """Delete every 'Category: <category>' label from text.

    The category is matched literally (regex metacharacters are escaped) and
    case-insensitively. Whitespace between 'Category:' and the category name,
    and any whitespace right after the name, is removed together with the label.
    """
    label_re = re.compile(
        rf'Category:\s*{re.escape(category)}\s*',
        flags=re.IGNORECASE,
    )
    return label_re.sub('', text)

# Strip the category label from the example ticket and show what is left
cleaned = remove_category_prefix(text_with_category, category="Software Installation")
print("Cleaned:", cleaned, sep="\n")

## 7. Text Normalization

In [None]:
def normalize_text(text: str) -> str:
    """
    Normalize whitespace and punctuation for NLP processing.

    Steps, in order:
    1. Collapse every run of whitespace into a single space and trim the ends.
    2. Remove whitespace that precedes '.', ',', '!' or '?'.
    3. Collapse a repeated '.', '!' or '?' into a single character.

    Step 2 runs before step 3 so that punctuation separated only by spaces
    ("Wait ! !") is first joined ("Wait!!") and then collapsed ("Wait!").
    In the opposite order the joined duplicates would survive.

    Args:
        text: Raw text.

    Returns:
        The normalized text.
    """
    # Remove extra whitespace, including leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Fix spacing before punctuation
    text = re.sub(r'\s+([.,!?])', r'\1', text)

    # Remove repeated punctuation (after spacing is fixed, see docstring)
    text = re.sub(r'([.!?])\1+', r'\1', text)

    return text

# Before/after comparison; quotes make leading/trailing spaces visible
messy_text = "  Hello   world!!!   How are   you?   "
normalized = normalize_text(messy_text)
print(f"Original: '{messy_text}'")
print(f"Normalized: '{normalized}'")

## 8. Complete Cleaning Pipeline

In [None]:
def clean_ticket_description(text: str, category: str = None) -> str:
    """
    Complete cleaning pipeline for ticket descriptions.

    Steps:
    1. Remove the 'Category: <category>' label (if a category is provided)
    2. Remove PII (emails, then phones)
    3. Normalize whitespace

    Emails are removed before phones: an address such as
    '81234567@company.com' would otherwise lose only its local part to the
    phone rule and leave '@company.com' behind.

    Args:
        text: Raw ticket description. A non-string value (e.g. None, or the
            NaN that pandas uses for a missing cell) is treated as an empty
            description.
        category: Category name to strip, matched literally and
            case-insensitively. Ignored when empty or not a string
            (e.g. None or NaN).

    Returns:
        The cleaned description; '' when text is not a string.
    """
    # Missing values are not strings; re.sub would raise TypeError on them
    if not isinstance(text, str):
        return ""

    # Step 1: Remove category prefix. NaN is truthy, so a plain truthiness
    # check is not enough to keep it out of re.escape.
    if isinstance(category, str) and category:
        text = re.sub(rf'Category:\s*{re.escape(category)}\s*', '', text, flags=re.IGNORECASE)

    # Step 2: Remove PII
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', text)  # Email
    text = re.sub(r'\+370\s?\d{7}', '', text)  # Lithuanian phone +370
    text = re.sub(r'\b8\d{7}\b', '', text)  # Lithuanian phone 8...

    # Step 3: Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Smoke test: a description with a category label, a phone and an e-mail
test_text = (
    "Category:\n"
    "    Software Installation\n"
    "    I need help installing Adobe. Contact: 81234567 john@company.com\n"
)

cleaned = clean_ticket_description(test_text, category="Software Installation")
print(f"Cleaned: '{cleaned}'")

## 9. Apply to DataFrame

In [None]:
# Read the ticket fixture (path is relative to the notebook's directory)
df = pd.read_csv("../fixtures/input/tickets.csv")

# Preview the first three raw descriptions, truncated to 100 characters
print("Original descriptions:")
for idx, description in df["description"].head(3).items():
    print(f"\n[{idx}] {description[:100]}...")

In [None]:
# Clean each ticket row by row, passing the row's own category so that its
# "Category: ..." label can be stripped from the description
df["description_clean"] = df.apply(
    lambda ticket: clean_ticket_description(ticket["description"], ticket["category"]),
    axis=1
)

# Preview the first three cleaned descriptions
print("Cleaned descriptions:")
for idx, cleaned_description in df["description_clean"].head(3).items():
    print(f"\n[{idx}] {cleaned_description}")

## Summary

Key patterns:
- Phone: `r'\+370\s?\d{7}|8\d{7}'`
- Email: `r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'`
- URL: `r'https?://[^\s<>"]+'`

Key functions:
- `re.findall()` - find all matches
- `re.sub()` - replace matches
- `re.search()` - find first match
- `re.compile()` - compile for reuse

### Practice:
Now try the tasks in the `../tasks/` folder!