In [None]:
# === Environment Setup ===
import os, sys, math, time, random, json, textwrap, warnings, re, timeit, unicodedata
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from IPython.display import display, Markdown

# --- Configuration ---
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.dpi': 150, 'font.size': 12, 'axes.titlesize': 'large'})

# --- Utility Functions ---
def note(msg, **kwargs):
    """Print *msg* wrapped to 100 columns behind a memo icon.

    Continuation lines are indented by three spaces. Any extra keyword
    arguments are forwarded unchanged to ``print``.
    """
    wrapper = textwrap.TextWrapper(width=100, subsequent_indent='   ')
    print("\n📝 " + wrapper.fill(msg), **kwargs)
def sec(title):
    """Print *title* in upper case, framed above and below by a 100-character '=' rule."""
    rule = '=' * 100
    banner_lines = ["", rule, f"| {title.upper()} |", rule]
    print("\n".join(banner_lines))

note("Environment initialized.")

# Part 1: Foundations
## Chapter 1.6: Advanced String Processing with Unicode and Regular Expressions

### Table of Contents
1. [Introduction: Text as Data in Economics](#Introduction:-Text-as-Data-in-Economics)
2. [The String Data Model: Immutability and Performance](#1.-The-String-Data-Model:-Immutability-and-Performance)
3. [From Abstract Text to Physical Bytes: Unicode and Encodings](#2.-From-Abstract-Text-to-Physical-Bytes:-Unicode-and-Encodings)
    - [Unicode Normalization: Canonical Equivalence](#2.1-Unicode-Normalization:-Canonical-Equivalence)
4. [Advanced Text Processing with Regular Expressions](#3.-Advanced-Text-Processing-with-Regular-Expressions)
    - [Greedy vs. Non-Greedy Matching](#3.1-Greedy-vs.-Non-Greedy-Matching)
    - [Verbose Regex for Readability](#3.2-Verbose-Regex-for-Readability)
5. [Exercises](#4.-Exercises)
6. [Solutions to Exercises](#5.-Solutions-to-Exercises)

### Introduction: Text as Data in Economics

A vast and growing amount of economic data is unstructured text. Central bank announcements, corporate filings, news articles, and social media posts all contain valuable information that requires specialized tools to unlock. This chapter moves beyond basic string manipulation to cover the advanced techniques essential for working with text as data.

We will cover three core topics:
1.  **The String Data Model:** Understanding the performance implications of string immutability.
2.  **Unicode and Encodings:** Mastering the distinction between abstract text and its physical byte representation, a common source of errors when working with international or web-scraped data.
3.  **Regular Expressions:** A powerful mini-language for finding and extracting complex patterns from text, which is indispensable for data cleaning and feature engineering.

Mastery of these tools is a prerequisite for more advanced topics in Natural Language Processing (NLP) and is a crucial skill for any modern applied economist.

### 1. The String Data Model: Immutability and Performance

A Python `str` is an immutable sequence of **Unicode** characters. This has two critical implications for a programmer:

1.  **Immutability and Safety:** Once a string is created, it cannot be changed. Any operation that appears to modify a string, such as `my_str.replace('a', 'b')` or `my_str.upper()`, actually creates and returns a *new* string, leaving the original untouched. This prevents a class of subtle bugs where a string passed to a function is unexpectedly modified by that function. It also allows Python to make internal optimizations, such as **interning** commonly used strings (storing only one copy of a string literal in memory to save space).

2.  **Unicode Support:** A string is a sequence of Unicode code points, not bytes. This means a `str` can natively represent characters from virtually any writing system in the world (e.g., Greek `β`, Japanese `円`, Russian `рубль`). This is essential for any economic analysis involving international data, multilingual text, or mathematical symbols.

The immutability of strings is a core feature, and trying to violate it will result in a `TypeError`.

In [None]:
sec("Demonstrating String Immutability")

# Bind a name to a string and record the identity of the object it refers to.
my_string = "alpha"
original_id = id(my_string)
print(f"Original string: '{my_string}' at memory ID: {original_id}")

# .upper() does not edit my_string; it returns a separate string object.
note("Calling .upper() creates a NEW string:")
upper_string = my_string.upper()
upper_id = id(upper_string)
print(f"New string: '{upper_string}' at memory ID: {upper_id}")
print(f"Original string is unchanged: '{my_string}'")

# Item assignment is not supported by str, so this raises TypeError.
note("Attempting to modify a string by index fails:")
try:
    my_string[0] = 'b'
except TypeError as err:
    print(f"Caught expected error: {err}")

#### 1.1 Performance: The `join` Method vs. Concatenation

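A direct consequence of immutability is that building a string from many smaller pieces by repeatedly using the `+` operator in a loop is **extremely inefficient**. Each `+` operation creates a new string and copies the contents of the previous strings. In the worst case this results in quadratic O(n²) time complexity, where `n` is the number of strings to join. (CPython can sometimes extend the left-hand string in place when no other reference to it exists, which softens the penalty in simple loops, but this is an implementation detail that should not be relied upon.)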

The correct, Pythonic, and highly performant way to build a string from a sequence of smaller strings is to use the `str.join()` method. This method takes an iterable (like a list of strings) and concatenates its elements with the string separator. It performs a single memory allocation for the final string, resulting in linear O(n) time complexity.

In [None]:
sec("String Concatenation Performance: `+` vs. `join`")
# Benchmark corpus: the four base words repeated 1000 times (4000 items in total).
words = 1000 * ["alpha", "beta", "gamma", "delta"]

def build_string_with_plus():
    """Concatenate every entry of the global ``words`` with ``+=``, one piece at a time.

    Each word is followed by a single space, so a non-empty result ends with a
    trailing space. Deliberately naive: this is the slow side of the comparison.
    """
    accumulated = ""
    for token in words:
        # Strings are immutable, so each pass produces a new string object.
        accumulated += token + " "
    return accumulated

def build_string_with_join():
    """Return the entries of the global ``words`` joined by single spaces."""
    separator = " "
    # One join call builds the whole result instead of growing it piece by piece.
    return separator.join(words)

# Each builder is run 100 times; timeit reports the total elapsed seconds for the batch.
note("Timing the inefficient O(n^2) approach using `+` in a loop:")
time_plus = timeit.timeit(stmt=build_string_with_plus, number=100)
print(f"  -> Time taken: {time_plus:.4f} seconds")

note("Timing the efficient O(n) approach using `' '.join()`:")
time_join = timeit.timeit(stmt=build_string_with_join, number=100)
print(f"  -> Time taken: {time_join:.4f} seconds")

# Only report a ratio when the denominator is positive (avoids dividing by zero).
if time_join > 0:
    speedup = time_plus / time_join
    note(f"The join() method was approximately {speedup:,.0f}x faster.")

### 2. From Abstract Text to Physical Bytes: Unicode and Encodings

One of the most common sources of errors when working with real-world data is a misunderstanding of the difference between **text** and **bytes**. 

- A `str` in Python is a sequence of abstract **Unicode code points**. It represents text. It has no physical representation on disk or over a network.
- A `bytes` object in Python is a sequence of raw 8-bit bytes. It represents binary data. 

The process of converting a `str` into `bytes` is called **encoding**. The process of converting `bytes` back into a `str` is called **decoding**. To perform these conversions, you need to specify an **encoding**, which is a rule for mapping code points to byte sequences. **UTF-8** is the modern, de facto standard encoding and should be used unless you have a specific reason not to.

This distinction is not academic. When you read a file from disk or receive data from a web API, you are always receiving a stream of bytes. If you try to interpret those bytes using the wrong encoding, you will either get garbage data or a `UnicodeDecodeError`.

In [None]:
sec("Encoding and Decoding Demonstration")
euro_symbol = "€"
print(f"Original string: '{euro_symbol}' (type: {type(euro_symbol)})")

# Encode the same one-character string under two different encodings.
note("Encoding the string to bytes using different standards:")
utf8_bytes = euro_symbol.encode('utf-8')
utf16_bytes = euro_symbol.encode('utf-16')
for label, payload in (("UTF-8", utf8_bytes), ("UTF-16", utf16_bytes)):
    print(f"{label} bytes: {payload} (Length: {len(payload)})")

# Round trip: decoding with the encoding used to encode gives back the text.
note("Decoding the bytes back to a string:")
decoded_string = utf8_bytes.decode('utf-8')
print(f"Decoded string: '{decoded_string}'")

# The UTF-8 bytes of '€' are not valid ASCII, so this decode raises.
note("Attempting to decode using the WRONG encoding (e.g., ASCII) will fail:")
try:
    utf8_bytes.decode('ascii')
except UnicodeDecodeError as err:
    print(f"Caught expected error: {err}")

#### 2.1 Unicode Normalization: Canonical Equivalence
A subtle but critical issue in text processing is that Unicode sometimes provides multiple ways to represent the same character. For example, the character "é" can be represented as a single pre-composed code point (`U+00E9`) or as a base character 'e' (`U+0065`) followed by a separate combining accent mark (`U+0301`).

While these look identical to a human reader, they are different sequences of code points to a computer (and therefore different byte sequences once encoded). A naive string comparison would find them to be unequal, leading to bugs in data cleaning, merging, or feature extraction. The solution is **Unicode normalization**. The `unicodedata` module provides functions to convert strings into a standard, canonical form.

- **NFC (Normalization Form C):** Canonically decomposes the text and then recomposes it, so pre-composed characters are used wherever they exist. This is the most common form for general-purpose text.
- **NFD (Normalization Form D):** Decomposes characters into base characters and combining marks.

**Guideline:** Before performing any string comparisons or processing on text that comes from multiple sources (e.g., different websites, user inputs, or files), you should normalize it to a consistent form, typically NFC.

In [None]:
sec("Unicode Normalization")
# Two spellings that render alike but are built from different code points.
s1 = 'café'          # written with the single pre-composed 'é'
s2 = 'cafe\u0301'    # plain 'e' followed by U+0301, the combining acute accent

print(f"s1: {s1}, s2: {s2}")
print("Are they visually the same? Yes.")
raw_equal = s1 == s2
note(f"Are they equal according to Python? {raw_equal}")
print(f"Length of s1: {len(s1)}, Length of s2: {len(s2)}")

# Bring both spellings to the same normalization form before comparing them.
note("Normalizing both to NFC (Normalization Form C) makes them equal.")
s1_nfc, s2_nfc = (unicodedata.normalize('NFC', raw) for raw in (s1, s2))
print(f"Normalized strings are equal: {s1_nfc == s2_nfc}")
print(f"Length of normalized s1: {len(s1_nfc)}, Length of normalized s2: {len(s2_nfc)}")

### 3. Advanced Text Processing with Regular Expressions

Strings have a rich set of methods for common text manipulation tasks. For more complex pattern matching, Python's `re` module provides **regular expressions** (regex), a powerful mini-language for describing and matching text patterns.

| Method | Description |
|:---|:---|
|`re.search(p, s)`| Scans string `s` for the first location where pattern `p` matches.|
|`re.match(p, s)`| Matches pattern `p` only at the *beginning* of string `s`.|
|`re.findall(p, s)`| Finds all non-overlapping matches of `p` in `s` as a list.|
|`re.finditer(p, s)`| Like `findall`, but returns an iterator of match objects (more memory efficient).|
|`re.sub(p, r, s)`| Replaces all occurrences of `p` in `s` with replacement `r`.|
|`re.compile(p)`| Compiles pattern `p` into a regex object for faster repeated use.|

#### 3.1 Greedy vs. Non-Greedy Matching
A crucial concept in regex is **greediness**. By default, quantifiers like `*` (0 or more) and `+` (1 or more) are **greedy**: they match as much text as possible while still allowing the rest of the pattern to match. This can lead to unexpected results, especially when matching text between two delimiters.

To make a quantifier **non-greedy** (or "lazy"), you add a `?` after it (e.g., `*?` or `+?`). This tells the engine to match as *few* characters as possible.

In [None]:
sec("Greedy vs. Non-Greedy Regex Matching")
text = "The price is <p>100</p> but the limit is <l>200</l>."

# Greedy: `.*` expands as far as it can, so a single match runs from the first '<' to the last '>'.
note("Greedy match (.*) tries to match everything between the first < and last >")
greedy_match = re.search(r'<.*>', text)
if greedy_match:
    greedy_result = greedy_match.group(0)
else:
    greedy_result = 'No match'
print(f"  Result: {greedy_result}")

# Lazy: `.*?` stops at the nearest '>', so every tag is returned as its own match.
note("Non-greedy match (.*?) matches the shortest possible text")
non_greedy_matches = re.findall(r'<.*?>', text)
print(f"  Result: {non_greedy_matches}")

#### 3.2 Verbose Regex for Readability
Complex regular expressions can become almost unreadable. The `re.VERBOSE` flag is a powerful tool for making them maintainable. It allows you to add whitespace and comments to your pattern, which the regex engine will ignore.

**Named Capture Groups `(?P<name>...)`** are essential for clarity. Instead of accessing results by a numeric index, you can access them by name, either one at a time with `match.group('name')` or all at once as a dictionary from the match object's `.groupdict()` method.

In [None]:
sec("Parsing Text with Verbose Regex and Named Groups")
# Sample sentence containing both a percentage and a basis-point figure.
policy_text = (
    "The FOMC raised the target range for the federal funds rate to 5-1/4 to 5.5 percent. "
    "This 25 basis point adjustment is key."
)

# Matches a numeric value followed by a rate unit, exposing the named groups
# 'value' and 'unit'. The value must begin at a digit (optionally preceded by a
# minus sign and/or a leading decimal point) and end on a digit, so a match never
# starts on stray whitespace or on sentence punctuation such as the '.' in
# "fell. 25 bps".
rate_pattern = re.compile(
    r"""
    # Capture the numeric value. This part is complex to handle various formats.
    (?P<value>
        -?\.?\d                # First digit, with an optional sign and/or leading decimal point.
        (?:[\d./\s-]*\d)?      # Optional tail ending on a digit: floats, fractions ('1/4'), ranges ('5-1/4').
    )
    \s*                        # Allow for optional whitespace between value and unit.
    # Capture the unit, allowing for several common variations.
    (?P<unit>percent|basis\spoint|bps|%)
    """,
    re.IGNORECASE | re.VERBOSE
)

note("Found the following rate mentions in the text:")
for hit in rate_pattern.finditer(policy_text):
    # Read both named groups, tidying each before display.
    value_text = hit.group('value').strip()
    unit_text = hit.group('unit').lower()
    print(f"  - Value: '{value_text}', Unit: '{unit_text}'")

### 4. Exercises

1.  **Unicode Normalization Challenge**
    - Create two strings representing the same name: `s1 = 'Jürgen'` and `s2 = 'Ju\u0308rgen'`. 
    - Demonstrate that they are not equal. 
    - Write a function `are_names_equal(n1, n2)` that returns `True` if the names are canonically equivalent, regardless of their initial representation. Use `unicodedata.normalize('NFC', ...)` inside the function.

2.  **Advanced Regex for Financial Data**
    - You are given a string from a financial report: `"The company's revenue was $1.25B in 2022, a 15% increase from the previous year's $1.09B."`
    - Write a single, compiled, verbose regex with named groups to extract all currency amounts. 
    - The pattern should capture the `currency_symbol` ('$'), the `value` (e.g., '1.25'), and the `multiplier` ('B' for billion, 'M' for million, etc.), if present.
    - Iterate through the matches and print out each captured group dictionary.

3.  **Greedy vs. Non-Greedy HTML Parsing**
    - Given the HTML snippet `html = "<p>First paragraph.</p><div>Some content.</div><p>Second paragraph.</p>"`
    - Write a regex using a **greedy** quantifier (`.*`) to find the text between `<p>` and `</p>`. Print the result. Explain why it captures more than you expect.
    - Write a second regex using a **non-greedy** quantifier (`.*?`) to correctly capture the text of *each* paragraph tag individually. Use `re.findall` to get both results.

### 5. Solutions to Exercises

In [None]:
# Solution for Exercise 1
# s1 is written with a literal 'ü'; s2 spells it as 'u' followed by U+0308 (combining diaeresis).
s1, s2 = 'Jürgen', 'Ju\u0308rgen'
def are_names_equal(n1, n2):
    """Return True when *n1* and *n2* are identical after NFC normalization."""
    left, right = (unicodedata.normalize('NFC', name) for name in (n1, n2))
    return left == right
sec("Solution 1: Unicode Normalization")
# A plain comparison looks at the code points as written; the helper normalizes first.
naive_result = s1 == s2
print(f"s1 == s2 is {naive_result}")
normalized_result = are_names_equal(s1, s2)
print(f"are_names_equal(s1, s2) is {normalized_result}")

# Solution for Exercise 2
sec("Solution 2: Regex for Financials")
report_text = "The company's revenue was $1.25B in 2022, a 15% increase from the previous year's $1.09B."
# Extracts currency amounts as three named groups. The decimal part is optional and
# ',ddd' thousands groups are accepted, so whole amounts such as "$5M" or "$1,250"
# are matched as well as "$1.25B". 'multiplier' is None when no suffix follows.
currency_pattern = re.compile(r"""
    (?P<currency_symbol>\$|€|£)           # Match common currency symbols
    (?P<value>\d+(?:,\d{3})*(?:\.\d+)?)   # Digits, optional thousands groups, optional decimals
    (?P<multiplier>[BMT])?                # Optionally match a multiplier (B, M, or T)
    """, re.VERBOSE)
for match in currency_pattern.finditer(report_text):
    print(match.groupdict())

# Solution for Exercise 3
sec("Solution 3: Greedy vs. Non-Greedy")
html = "<p>First paragraph.</p><div>Some content.</div><p>Second paragraph.</p>"

# Greedy: `.*` runs to the LAST `</p>`, so the single match spans from the first `<p>`
# to the end of the snippet and swallows the `<div>` in between.
greedy_hit = re.search(r'<p>.*</p>', html)
print(f"Greedy match: {greedy_hit.group(0)}")

# Non-greedy: `.*?` stops at the nearest `</p>`; findall returns the captured text of each paragraph.
paragraph_texts = re.findall(r'<p>(.*?)</p>', html)
print(f"Non-greedy matches: {paragraph_texts}")