# Topic 0: Text Processing Basics - SOLUTIONS

This notebook contains complete solutions to all exercises from the regex and text basics lesson.

In [None]:
import re
import string

# Sample messy text to work with
# (mixes repeated punctuation, a run of digits, and symbol characters so the
# cleaning steps below each have something to remove)
messy_text = "Hello!!! This is some text with numbers 123 and symbols @#$%"
print("Original text:", messy_text)

## Solution 1: Advanced Text Cleaning Function

In [None]:
def advanced_clean_text(text):
    """Return *text* without punctuation or digits and with tidy spacing.

    The cleaning runs in three passes:

    1. every character listed in ``string.punctuation`` is deleted,
    2. every run of digits is deleted,
    3. each run of whitespace becomes a single space and the ends are
       stripped.

    Characters are deleted, not replaced by a space, so ``"don't"`` becomes
    ``"dont"``.
    """
    # Pass 1: a translation table that maps each punctuation mark to "delete"
    drop_punctuation = str.maketrans('', '', string.punctuation)
    no_punctuation = text.translate(drop_punctuation)

    # Pass 2: strip digit runs
    letters_only = re.sub(r'\d+', '', no_punctuation)

    # Pass 3: squeeze the gaps left behind by the deletions above
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

# Test the function
# Runs the cleaner on the sample string: punctuation and digits are removed
# and whitespace is collapsed to single spaces before printing.
cleaned = advanced_clean_text(messy_text)
print("Cleaned text:", cleaned)

## Solution 2: Find Email Addresses

In [None]:
def find_emails(text):
    """Find all email addresses in text.

    Uses a deliberately simple pattern (not a full RFC 5322 validator):
    a local part of letters, digits and ``._%+-``, an ``@``, a domain of
    letters, digits, dots and hyphens, and a top-level domain of at least
    two ASCII letters.

    Args:
        text: The string to search.

    Returns:
        A list of the matched addresses, in order of appearance. Empty if
        none are found.
    """
    # The TLD class is [A-Za-z]. Inside square brackets '|' is a literal pipe,
    # not alternation, so writing [A-Z|a-z] would let pipe-separated lists such
    # as "a@b.de|c@d.com" be swallowed as one bogus address.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(email_pattern, text)

# Test with sample text
# (the sentence contains two addresses, one on a .com and one on a .de domain)
email_text = "Contact us at info@example.com or support@test.de for help."
found_emails = find_emails(email_text)
print("Found emails:", found_emails)

## Solution 3: Extract Phone Numbers

In [None]:
def find_phone_numbers(text):
    """Find German phone numbers in various formats.

    Three notations are recognised; in each, the area code and the subscriber
    number may be separated by a single whitespace character or hyphen:

    * international: ``+49 123 456789``
    * bracketed area code: ``(0345) 678901``
    * national: ``0234-567890``

    Args:
        text: The string to search.

    Returns:
        A list of the matched numbers exactly as written in *text*, in order
        of appearance and without overlaps. Empty if none are found.
    """
    # The notations are combined into ONE alternation and scanned in a single
    # left-to-right pass. Running them as separate searches reports a number
    # like "+49 0301 234567" twice (once in full, once as its "0301 234567"
    # tail) and groups results by notation instead of by position.
    alternatives = (
        r'\+49[\s-]?\d{2,4}[\s-]?\d{6,8}',  # +49 format
        r'\(0\d{2,4}\)[\s-]?\d{6,8}',       # (0xxx) format
        r'0\d{2,4}[\s-]?\d{6,8}',           # 0xxx format
    )
    # No capture groups are used, so findall returns the full matched strings.
    return re.findall('|'.join(alternatives), text)

# Test with sample text
# (one number in each notation: +49 prefix, leading-zero area code with a
# hyphen, and area code in parentheses)
phone_text = "Call us at +49 123 456789 or 0234-567890 or (0345) 678901"
found_phones = find_phone_numbers(phone_text)
print("Found phone numbers:", found_phones)

## Solution 4: Complete Text Processing Pipeline

In [None]:
class TextProcessor:
    """Complete text processing pipeline.

    Combines entity extraction (emails, phone numbers, URLs) with text
    normalization. The three patterns are stored on the instance as plain
    regex strings and are shared by extraction and cleaning.
    """

    def __init__(self):
        """Set up the regex patterns used by the other methods."""
        # TLD class is [A-Za-z]: inside brackets '|' is a literal pipe, not
        # alternation, so [A-Z|a-z] would accept '|' as part of an address.
        self.email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        # Loose heuristic: optional '+', a digit, at least seven characters
        # that are digits, whitespace, '-', '(' or ')', then a closing digit.
        self.phone_pattern = r'\+?\d[\d\s\-\(\)]{7,}\d'
        # http:// or https:// followed by everything up to the next whitespace.
        self.url_pattern = r'https?://[^\s]+'

    def extract_entities(self, text):
        """Extract emails, phones, and URLs from text.

        Args:
            text: The raw string to search (case is left untouched).

        Returns:
            A dict with the keys ``'emails'``, ``'phones'`` and ``'urls'``,
            each holding a list of matched strings in order of appearance.
        """
        return {
            'emails': re.findall(self.email_pattern, text),
            'phones': re.findall(self.phone_pattern, text),
            'urls': re.findall(self.url_pattern, text)
        }

    def clean_text(self, text):
        """Clean and normalize text.

        Lowercases the text, deletes URLs, emails and phone numbers, deletes
        every remaining character that is not an ASCII letter, a German
        umlaut/eszett or whitespace, and collapses whitespace runs.

        Args:
            text: The raw string to clean.

        Returns:
            The cleaned, lowercased string with single spaces between words.
        """
        # Convert to lowercase
        text = text.lower()

        # Remove URLs, emails, phones. URLs go first: the URL pattern runs to
        # the next whitespace, so an address embedded in a URL is removed with
        # it rather than being cut out of the middle.
        text = re.sub(self.url_pattern, '', text)
        text = re.sub(self.email_pattern, '', text)
        text = re.sub(self.phone_pattern, '', text)

        # Remove punctuation and numbers
        text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', '', text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def process(self, text):
        """Complete processing pipeline.

        Entities are extracted from the original text, so their case is
        preserved; the word and character counts refer to the cleaned text.

        Args:
            text: The raw string to process.

        Returns:
            A dict with the keys ``'original'``, ``'cleaned'``,
            ``'entities'``, ``'word_count'`` and ``'char_count'``.
        """
        entities = self.extract_entities(text)
        cleaned = self.clean_text(text)

        return {
            'original': text,
            'cleaned': cleaned,
            'entities': entities,
            'word_count': len(cleaned.split()),
            'char_count': len(cleaned)
        }

# Test the complete pipeline
# The sample spans two lines and contains one URL, one email address, one
# phone number and a price, so every stage of the processor is exercised.
processor = TextProcessor()
sample_text = """Visit https://example.com or contact info@test.de. 
Call +49 123 456789 for support! Price: 19.99€."""

result = processor.process(sample_text)
print("Processing Results:")
# Print each entry of the result dict on its own line as "key: value"
for key, value in result.items():
    print(f"{key}: {value}")