# Advanced String Manipulation and Regular Expressions Lab

This comprehensive lab covers string manipulation and regular expressions in Python, from basics to advanced concepts with practical examples.

In [2]:
import re
from collections import Counter
import pandas as pd  # For displaying results nicely

## 1. Regular Expression Basics

### 1.1 Basic Characters and Special Characters

In [3]:
# Basic character matching
text = "The quick brown fox jumps over the lazy dog"

# A literal character matches every occurrence of itself
print("Finding 'o':", re.findall(r'o', text))

# Dot (.) matches any single character except a newline
print("Finding 'q.':", re.findall(r'q.', text))  # Matches 'qu'

# Escaping special characters. The pattern is a raw string so the backslash
# reaches the regex engine untouched; in a normal string literal '\.' is an
# invalid escape sequence and triggers a warning on modern Python versions.
text_with_dots = "www.example.com"
print("Finding literal dots:", re.findall(r'\.', text_with_dots))

Finding 'o': ['o', 'o', 'o', 'o']
Finding 'q.': ['qu']
Finding literal dots: ['.', '.']


### 1.2 Character Classes []

In [None]:
text = "The year was 2024, Room A123 was empty."

# Each entry pairs the printed label with the character class it demonstrates.
class_demos = [
    ("Digits:", '[0-9]'),                # any single digit
    ("Uppercase:", '[A-Z]'),             # any single uppercase ASCII letter
    ("Lowercase:", '[a-z]'),             # any single lowercase ASCII letter
    ("Alphanumeric:", '[A-Za-z0-9]'),    # ranges can be combined in one class
    ("Non-digits:", '[^0-9]'),           # a leading ^ negates the class
]

for caption, char_class in class_demos:
    print(caption, re.findall(char_class, text))

### 1.3 Quantifiers and Metacharacters

In [None]:
text = "file1.txt file22.jpg file333.pdf"

# Label -> pattern; only the quantifier applied to \d differs between entries.
quantifier_demos = {
    "Files with any digits:": r'file\d*\.\w+',          # *     zero or more
    "Files with at least one digit:": r'file\d+\.\w+',  # +     one or more
    "Optional digit:": r'file\d?\.\w+',                 # ?     zero or one
    "Files with exactly 2 digits:": r'file\d{2}\.\w+',  # {n}   exactly n
    "Files with 2-3 digits:": r'file\d{2,3}\.\w+',      # {n,m} between n and m
}

for caption, regex in quantifier_demos.items():
    print(caption, re.findall(regex, text))

### 1.4 Common Metacharacter Shortcuts

In [None]:
text = "Hello123_World! \t\nTesting"

# \d - runs of digits
digit_runs = re.compile(r'\d+').findall(text)
print("Digits:", digit_runs)

# \w - runs of word characters (letters, digits and underscore)
word_runs = re.compile(r'\w+').findall(text)
print("Word chars:", word_runs)

# \s - runs of whitespace; repr() makes the tab and newline visible
whitespace_runs = re.compile(r'\s+').findall(text)
print("Whitespace:", list(map(repr, whitespace_runs)))

# \b - word boundary, so the 'T' must begin a word
t_words = re.compile(r'\bT\w+').findall(text)
print("Words starting with 'T':", t_words)

## 2. Advanced Regular Expression Concepts

### 2.1 Groups and Capturing

In [None]:
# Two "First Last (email)" entries to take apart
text = "John Doe (john@example.com), Jane Smith (jane@example.com)"

# Every pair of parentheses captures one field, so findall yields one
# (first, last, email) tuple per entry
pattern = r'(\w+)\s+(\w+)\s+\((\w+@\w+\.\w+)\)'
matches = re.compile(pattern).findall(text)

# Tabulate the captured tuples for display
df = pd.DataFrame(matches, columns=['First Name', 'Last Name', 'Email'])
print("Captured groups:")
display(df)

# Same pattern with named groups: fields are read by name instead of position
pattern = r'(?P<first>\w+)\s+(?P<last>\w+)\s+\((?P<email>\w+@\w+\.\w+)\)'
for match in re.finditer(pattern, text):
    fields = match.groupdict()
    print("\nNamed groups:")
    print(f"First name: {fields['first']}")
    print(f"Last name: {fields['last']}")
    print(f"Email: {fields['email']}")

### 2.2 Lookahead and Lookbehind Assertions

In [None]:
text = "price: $100, discount: $20, total: $80"

# Positive lookahead (?=...): digits immediately followed by a dollar sign.
# No amount in this sample is written as "100$", so the result is empty.
before_dollar = re.findall(r'\d+(?=\$)', text)
print("Numbers followed by dollars:", before_dollar)

# Negative lookahead (?!...): the \d inside the class stops the engine from
# backtracking to a shorter digit run (e.g. "10" out of "100$") just to
# satisfy the assertion, so only whole numbers are reported.
not_before_dollar = re.findall(r'\d+(?![\d$])', text)
print("Numbers not followed by dollars:", not_before_dollar)

# Positive lookbehind (?<=...): digits immediately preceded by a dollar sign.
after_dollar = re.findall(r'(?<=\$)\d+', text)
print("Numbers after dollar sign:", after_dollar)

# Negative lookbehind (?<!...): the \d inside the class prevents a match from
# starting in the middle of a number. Without it, "$100" would yield "00"
# because the character before the first "0" is "1", not "$". Every number in
# this sample follows a dollar sign, so the result is empty.
not_after_dollar = re.findall(r'(?<![\d$])\d+', text)
print("Numbers not after dollar sign:", not_after_dollar)

### 2.3 Greedy vs Non-Greedy Matching

In [None]:
text = "<div>First</div><div>Second</div>"

# Default (greedy): .* grabs as much as it can, spanning both elements
greedy_hits = re.findall(r'<div>.*</div>', text)
print("Greedy:", greedy_hits)

# Appending ? makes the quantifier lazy: it stops at the first closing tag
lazy_hits = re.findall(r'<div>.*?</div>', text)
print("Non-greedy:", lazy_hits)

# The same contrast on a run of identical characters
text = "aaaa"
for caption, regex in (("Greedy a*:", r'a*'), ("Non-greedy a*?:", r'a*?')):
    print(caption, re.findall(regex, text))

## 3. Practical Applications

### 3.1 Email Validation

In [None]:
def validate_email(email):
    """Return True if *email* has the shape ``local@label.label...tld``.

    The local part may contain letters, digits and ``._%+-``. The domain is
    checked label by label: each label starts and ends with a letter or
    digit (hyphens are allowed in between), labels are separated by single
    dots, and the final label is at least two letters. This rejects
    domains with consecutive dots such as ``user@example..com``.

    ``re.fullmatch`` is used so the whole string must match; an address
    followed by a trailing newline is rejected.

    Args:
        email: The string to check.

    Returns:
        bool: True when the string matches the pattern, False otherwise.
    """
    pattern = (
        r'[a-zA-Z0-9._%+-]+'                               # local part
        r'@'
        r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+'  # one or more labels
        r'[a-zA-Z]{2,}'                                    # top-level domain
    )
    return re.fullmatch(pattern, email) is not None

# Addresses to run through validate_email
test_emails = [
    "user@example.com",
    "user.name+tag@example.co.uk",
    "invalid.email@",
    "@invalid.com",
    "user@.com",
    "user@example..com",
]

# Report one verdict line per address
for email in test_emails:
    verdict = 'Valid' if validate_email(email) else 'Invalid'
    print(email + ": " + verdict)

### 3.2 Log File Analysis

In [None]:
log_data = """
2024-02-06 10:15:30 INFO User 'admin' logged in from 192.168.1.100
2024-02-06 10:15:35 ERROR Failed login attempt from 10.0.0.50
2024-02-06 10:16:00 WARNING High CPU usage detected (85%)
2024-02-06 10:16:30 INFO Database backup completed
"""

# Three capture groups: timestamp, level word, and the rest of the line
pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+) (.+)'

# Pair each captured group with its column name, one dict per log line
field_names = ('timestamp', 'level', 'message')
log_entries = [
    dict(zip(field_names, match.groups()))
    for match in re.finditer(pattern, log_data)
]

# Show the parsed entries as a table
df = pd.DataFrame(log_entries)
display(df)

# Pull out dotted-quad sequences (four groups of 1-3 digits)
ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
ip_addresses = re.findall(ip_pattern, log_data)
print("\nIP addresses found:", ip_addresses)

### 3.3 Data Extraction and Cleaning

In [None]:
# Loosely aligned product listing used as raw input
data = """
Product: iPhone 13    Price: $799.99   SKU: IP13-128GB
Product: Galaxy S21   Price: $699.99   SKU: SAM-S21-256
Product: Pixel 6      Price: $599.99   SKU: GP6-128BLK
"""

# Captures the product name (lazily, up to the Price label), the numeric
# price without its dollar sign, and the SKU token
pattern = r'Product:\s*([^\n]+?)\s+Price:\s*\$(\d+\.\d+)\s+SKU:\s*(\S+)'

# findall returns one (name, price, sku) tuple per line; convert each to a record
products = [
    {'product': name.strip(), 'price': float(amount), 'sku': code}
    for name, amount, code in re.findall(pattern, data)
]

# Show the cleaned records as a table
df = pd.DataFrame(products)
display(df)

## 4. Advanced Exercise: Data Validation Tool

Create a comprehensive data validation tool that can:
1. Validate various data formats (email, phone, dates, etc.)
2. Extract and standardize data
3. Generate validation reports

In [None]:
class DataValidator:
    """Validate and standardize common field formats with regular expressions.

    Supported field types are the keys of ``self.patterns``: ``email``,
    ``phone``, ``date`` and ``url``. The ``date`` pattern checks the
    ``YYYY-MM-DD`` layout and the month/day ranges only; it does not check
    that the day exists in the given month.
    """

    def __init__(self):
        # Validation patterns, keyed by field type. They are applied with
        # re.fullmatch, so the entire value has to match.
        self.patterns = {
            # Domain labels are separated by single dots and may not start
            # or end with a hyphen.
            'email': (
                r'^[a-zA-Z0-9._%+-]+@'
                r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}$'
            ),
            # The area code is either fully parenthesized or bare, so
            # unbalanced parentheses are rejected.
            'phone': r'^(\+\d{1,3}[- ]?)?(?:\(\d{3}\)|\d{3})[- ]?\d{3}[- ]?\d{4}$',
            'date': r'^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$',
            # One or more dot-terminated host labels before the final label,
            # so hosts such as docs.python.org are accepted.
            'url': r'^https?://(?:[\w-]+\.)+\w{2,}(?:/[\w\-._/?%&=]*)?$',
        }

    def validate_field(self, field_type, value):
        """Check a single value against the pattern for *field_type*.

        Args:
            field_type: A key of ``self.patterns``.
            value: The value to check; it is converted with ``str()`` first.

        Returns:
            bool: True when the whole string matches the pattern.

        Raises:
            ValueError: If *field_type* is not a key of ``self.patterns``.
        """
        if field_type not in self.patterns:
            raise ValueError(f"Unknown field type: {field_type}")

        # fullmatch (rather than match with "$") also rejects a trailing newline
        return re.fullmatch(self.patterns[field_type], str(value)) is not None

    def standardize_phone(self, phone):
        """Format a 10-digit phone number as ``(XXX) XXX-XXXX``.

        Args:
            phone: The phone number; non-string values are converted with
                ``str()`` before the digits are extracted.

        Returns:
            The formatted string when exactly ten digits are present,
            otherwise *phone* unchanged.
        """
        # Remove all non-digit characters
        digits = re.sub(r'\D', '', str(phone))
        if len(digits) == 10:
            return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
        return phone

    def _field_type(self, field):
        """Return the first ``_``-separated token of *field* that names a pattern, or None."""
        tokens = str(field).split('_')
        return next((token for token in tokens if token in self.patterns), None)

    def validate_dataset(self, data):
        """Validate every recognized field of every record in *data*.

        A field is recognized when any underscore-separated part of its name
        is a key of ``self.patterns`` (``email_address`` -> ``email``,
        ``website_url`` -> ``url``); the first such part wins. Unrecognized
        fields are left out of the validation results.

        Args:
            data: An iterable of dict records.

        Returns:
            list: One dict per record with the keys ``original`` (the record
            itself), ``validation`` (field name -> bool) and ``standardized``
            (a shallow copy of the record in which valid phone numbers have
            been reformatted).
        """
        results = []
        for record in data:
            validation = {
                'original': record,
                'validation': {},
                'standardized': record.copy()
            }

            # Validate each field
            for field, value in record.items():
                field_type = self._field_type(field)
                if field_type is None:
                    continue

                is_valid = self.validate_field(field_type, value)
                validation['validation'][field] = is_valid

                # Only phone numbers that passed validation are reformatted
                if field_type == 'phone' and is_valid:
                    validation['standardized'][field] = self.standardize_phone(value)

            results.append(validation)
        return results

# Exercise the validator on two sample records
validator = DataValidator()

test_data = [
    dict(
        email_address='user@example.com',
        phone_number='123-456-7890',
        date_joined='2024-02-06',
        website_url='https://example.com',
    ),
    dict(
        email_address='invalid.email',
        phone_number='(123) 456 7890',
        date_joined='2024-13-45',
        website_url='not-a-url',
    ),
]

results = validator.validate_dataset(test_data)

# Printed heading -> key of the per-record result dict
report_sections = (
    ("\nOriginal Data:", 'original'),
    ("Validation Results:", 'validation'),
    ("Standardized Data:", 'standardized'),
)

# Print the three sections for every record
for result in results:
    for heading, key in report_sections:
        print(heading, result[key])

## 5. Practice Exercises

1. Create a function to extract all URLs from a text and validate them
2. Build a password strength checker using regex
3. Create a function to parse and validate different date formats
4. Build a function to extract and categorize different types of identifiers (emails, phones, SSNs, etc.)

Try implementing these exercises using the concepts learned above!