# PyMOTW `re` Examples

Examples from https://pymotw.com/3/re/

## Applications for Regular Expressions

Validation - Is this a legal phone number?  Typically anchor the pattern at both ends so that the whole string must match:
```python
pattern = r'^<pattern>$'
```

Search - Can I find this?

Normalizing - convert to a standard form, e.g. a 9-digit ZIP code to 5 digits

Documentation: https://docs.python.org/3/library/re.html

In [None]:
import re

# What to look for, and the text to look in.
pattern = r'this'
text = 'A bit of this and that.'

# re.search() scans the text and returns a match object for the first hit.
match = re.search(pattern, text)

# span() gives the start and end offsets of the match in a single call.
s, e = match.span()

print(
    f'Found "{match.re.pattern}"\n'
    f'in "{match.string}"\n'
    f'from {s} to {e} "{text[s:e]}"'
)

## To speed up search, compile the pattern

In [None]:
## Compile the patterns
import re

# Compile each pattern once up front; the loop below reuses the compiled
# objects instead of passing pattern strings to the module-level functions.
regexes = [re.compile(p) for p in (r'this', r'that', r'thing')]

text = 'A bit of this and that.'

print(f'Text: {text!r}\n')

for regex in regexes:
    print(f'Seeking "{regex.pattern}" ->',  end=' ')
    # search() returns a match object (truthy) on success and None otherwise.
    print('match!' if regex.search(text) else 'no match')

## Multiple matches

In [None]:
import re

text = 'abbaaabbbbaaaaa'

pattern = r'ab'

# The pattern has no groups, so re.findall() returns each non-overlapping
# match as a plain string (not a match object); no positions are available.
for match in re.findall(pattern, text):
    print(f'Found {match}')

## Create an iterator

In [None]:
import re

text = 'abbaaabbbbaaaaa'

pattern = r'ab'

# finditer() yields one match object per hit, so the position of every
# match is available together with the matched text.
for match in re.finditer(pattern, text):
    s, e = match.span()
    print(f'Found {match.group()} at {s}:{e}')

## Pattern Syntax

In [None]:
import re


def test_patterns(text, patterns):
    """Print every match of each pattern found in *text*.

    For each pattern the output is a header line with the pattern and its
    description, the source text in quotes, and then one line per match.
    Each match line is padded with dots so the quoted match sits directly
    under the place where it occurs in the source text.

    Args:
        text: The string to search.
        patterns: Iterable of ``(pattern, description)`` pairs; ``pattern``
            is passed to ``re.finditer`` and ``description`` is only printed.

    Returns:
        None. All results are written to stdout.
    """
    for pattern, desc in patterns:
        print(f"'{pattern}' ({desc})\n")
        print(f"  '{text}'")
        for match in re.finditer(pattern, text):
            s, e = match.span()
            # The text above is printed verbatim (not as a repr), so its
            # backslashes take one column each. Padding by exactly the start
            # offset keeps the marker aligned even when the text has them.
            prefix = '.' * s
            print(f"  {prefix}'{text[s:e]}'")
        print()


# Despite the "test_" prefix this is a demo helper, not a unit test; opt it
# out of pytest collection in case this code is imported by a test module.
test_patterns.__test__ = False


# Run a minimal demonstration when this code executes as the main program.
if __name__ == '__main__':
    test_patterns(
        'abbaaabbbbaaaaa',
        [(r'ab', "'a' followed by 'b'")],
    )

## Run it

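Matching is greedy by default: a repetition consumes as much of the text as it can.  Following the repetition with `?` makes it match as little as possible instead.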

In [None]:
# Repetition quantifiers applied to the 'b' that follows 'a'.  Each one is
# greedy: it takes as many 'b' characters as its limits allow.
test_patterns(
    'abbaabbba',
    [(r'ab*', 'a followed by zero or more b'),
     (r'ab+', 'a followed by one or more b'),
     (r'ab?', 'a followed by zero or one b'),
     (r'ab{3}', 'a followed by three b'),
     (r'ab{2,3}', 'a followed by two to three b')],
)

## Turn off Greediness by following the repetition instruction with ?

In [None]:
# The same quantifiers with a trailing '?', which makes each one non-greedy:
# it takes as few 'b' characters as still allow a match.  {3} is a fixed
# count, so 'ab{3}?' matches the same text as 'ab{3}'.
test_patterns(
    'abbaabbba',
    [(r'ab*?', 'a followed by zero or more b'),
     (r'ab+?', 'a followed by one or more b'),
     (r'ab??', 'a followed by zero or one b'),
     (r'ab{3}?', 'a followed by three b'),
     (r'ab{2,3}?', 'a followed by two to three b')],
)

## Character Sets

In [None]:
# A character set [...] matches any single character listed inside the
# brackets; quantifiers after the set apply to the set as a whole.
test_patterns(
    'abbaabbba',
    [(r'[ab]', 'either a or b'),
     (r'a[ab]+', 'a followed by 1 or more a or b'),
     (r'a[ab]+?', 'a followed by 1 or more a or b, not greedy')],
)

In [None]:
# A leading '^' inside the brackets negates the set: this matches runs of
# characters that are NOT a hyphen, a period, or a space.
test_patterns(
    'This is some text -- with punctuation.',
    [(r'[^-. ]+', 'sequences without -, ., or space')],
)


In [None]:
# A hyphen between two characters inside a set denotes a range, and several
# ranges can be combined in one set (e.g. [a-zA-Z]).
test_patterns(
    'This is some text -- with punctuation.',
    [(r'[a-z]+', 'sequences of lowercase letters'),
     (r'[A-Z]+', 'sequences of uppercase letters'),
     (r'[a-zA-Z]+', 'sequences of letters of either case'),
     (r'[A-Z][a-z]+', 'one uppercase followed by lowercase')],
)

## Extending Counts

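Counts can be written like a slice: `{m,n}` means from `m` to `n` repetitions, and either bound may be omitted (`{,n}` is "up to `n`", `{m,}` is "`m` or more").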

In [None]:
# Explicit repetition counts: {n} is exactly n, {m,n} is between m and n,
# {,n} omits the lower bound (zero) and {m,} omits the upper bound (no limit).
test_patterns(
    'abbaabbba',
    [(r'ab{3}', 'a followed by three b'),
     (r'ab{2,3}', 'a followed by two to three b'),
     (r'ab{,3}', 'a followed by up to three b'),
     (r'ab{2,}', 'a followed by two or more b'),
     ],
)

## Modifying search

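`re.search` (like `re.findall` and `re.finditer`) takes an optional third parameter, `flags`, that modifies how the pattern is matched: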

```
m = re.search(pattern, string, re.IGNORECASE)
```

In [None]:
import re

string = 'abbAABBA'
pattern = r'ab{2}'

# With re.IGNORECASE the lowercase pattern also matches uppercase text,
# so both 'abb' and 'ABB' are found.
re.findall(pattern, string, re.IGNORECASE)

## Match Zip Codes

In [None]:
import re

string = '1234 90210-1234 01702'
# Two groups: five digits, then a hyphen and four digits.  Both groups are
# required here, so only the ZIP+4 form '90210-1234' matches.
pattern = r'(\d{5})(-\d{4})'

# Because the pattern has groups, findall() returns one tuple per match
# with one item per group.
re.findall(pattern, string)

In [None]:
string = '1234 90210-1234 01702'
# The trailing '?' makes the "-NNNN" group optional, so the plain five-digit
# '01702' now matches as well; its second tuple item is the empty string.
# NOTE(review): the pattern has no \b anchors, so it would also match the
# first five digits of a longer run of digits.
pattern = r'(\d{5})(-\d{4})?'

re.findall(pattern, string)

## Finding TLAs

In [None]:
string = 'I forgot to find a TLA for my FOMO at the ATM!'
# \b is a word boundary: requiring one on each side matches exactly three
# capital letters standing alone, so 'TLA' and 'ATM' match but the
# four-letter 'FOMO' does not.
pattern = r'\b[A-Z]{3}\b'

re.findall(pattern, string)

In [None]:
# Reuses `string` assigned in the preceding cell.
# NOTE(review): this pattern is identical to the one in the preceding cell,
# so the result is the same -- confirm whether a variation was intended.
pattern = r'\b[A-Z]{3}\b'

re.findall(pattern, string)