In [24]:
import re

names_file = open("names.txt", encoding="utf-8")
data = names_file.read()
names_file.close()

#### .match() - Matches a string starting from the beginning

In [8]:
print(re.match(r'Hawkins', data)) # works
print(re.match(r'Derek', data)) # doesn't work

<_sre.SRE_Match object; span=(0, 7), match='Hawkins'>
None


#### .search() - Matches anywhere in string

In [10]:
re.search(r'Derek', data)

<_sre.SRE_Match object; span=(9, 14), match='Derek'>

In [12]:
re.search(r'\w, \w', data)

<_sre.SRE_Match object; span=(6, 10), match='s, D'>

In [20]:
re.search(r'\(\d\d\d\) \d\d\d-\d\d\d\d', data)

<_sre.SRE_Match object; span=(38, 52), match='(555) 555-5555'>

### Exercise 1:
<p>Write a function that checks for n number of consecutive digits in a string.</p>

In [35]:
def find_digits(count, string):
    return re.search(r'\d'*count, string)

find_digits(4, data)

<_sre.SRE_Match object; span=(48, 52), match='5555'>

## Counts

In [43]:
# instead of typing \d and \w consecutively
re.search(r'\(\d{3}\) \d{3}-\d{4}', data) # specify number of digits to look for
re.search(r'\w+, \w+', data) # more than 1 letters

<_sre.SRE_Match object; span=(0, 14), match='Hawkins, Derek'>

In [47]:
# Not all numbers have parantheses
# Find all numbers
re.findall(r'\(?\d{3}\)? \d{3}-\d{4}', data) # '?' means optional

['(555) 555-5555',
 '(555) 555-5554',
 '(555) 555-5543',
 '555 555-5551',
 '(555) 555-5553',
 '(555) 555-4444']

In [53]:
# Find all names
re.findall(r'\w+, \w+', data)

['Hawkins, Derek',
 'Teacher, Coding',
 'Zhai, Mo',
 'Teacher, Coding',
 'Johnson, Joe',
 'Johson, Joe',
 'Österberg, Sven',
 'Governor, Norrbotten',
 'Enchanter, Killer',
 'Butz, Ryan',
 'CEO, Coding',
 'Doctor, The',
 'Lord, Gallifrey',
 'Exampleson, Example',
 'Example, Example',
 'Obama, Barack',
 'President, United',
 'Pael, Ripal',
 'Teacher, Coding',
 'Vader, Darth',
 'Lord, Galactic',
 'Sanz, María',
 'Minister, Spanish']

## Sets

In [73]:
# Find email addresses
re.findall(r'[-\w\d+.]+@[-\w\d.]+', data)

['derek@codingtemple.com',
 'mozhai@codingtemple.com',
 'joejohnson@codingtemple.com',
 'governor@norrbotten.co.se',
 'tim@killerrabbit.com',
 'ryanb@codingtemple.com',
 'doctor+companion@tardis.co.uk',
 'me@example.com',
 'president.44@us.gov',
 'ripalp@codingtemple.com',
 'darth-vader@empire.gov',
 'mtfvs@spain.gov']

## Exercise
<p>Find all email addresses with the codingtemple domain</p>

In [74]:
# Find all email addresses with a codingtemple domain
# [] Looks for unique characters
re.findall(r'[-\w\d+.]+@[codingtempl]{12}.com', data, re.IGNORECASE) # re.I for short

['derek@codingtemple.com',
 'mozhai@codingtemple.com',
 'joejohnson@codingtemple.com',
 'ryanb@codingtemple.com',
 'ripalp@codingtemple.com']

## Negation

In [83]:
# Search for emails
print(re.findall(r'''
    \b@[-\w\d.]* # word boundary, an @ and then any number of chars
    [^gov\t]+ # exclude instances of 'g', 'o', 'v', or 't' and tab
    \b # Another word boundary
''', data, re.X|re.I)) # VERBOSE allows for multiline regex

['@codingtemple.com', '@codingtemple.com', '@codingtemple.com', '@norrbotten.co.se', '@killerrabbit.com', '@codingtemple.com', '@tardis.co.uk', '@example.com', '@us.', '@codingtemple.com', '@empire.', '@spain.']


In [84]:
# search for all names
print(re.findall(r'''
    \b[-\w]+, # find boundary, 1+ hypens or words and a comma
    \s # find 1 whitespace
    [-\w ]+ # 1+ hyphens and chars and a space
    [^\t\n] # ignore tabs and newlines
''', data, re.X|re.I))

['Hawkins, Derek', 'Teacher, Coding Temple', 'Zhai, Mo', 'Teacher, Coding Temple', 'Johnson, Joe', 'Johson, Joe', 'Österberg, Sven-Erik', 'Governor, Norrbotten', 'Enchanter, Killer Rabbit Cave', 'Butz, Ryan', 'CEO, Coding Temple', 'Doctor, The', 'Lord, Gallifrey', 'Exampleson, Example', 'Example, Example Co.', 'Obama, Barack', 'President, United States of America', 'Pael, Ripal', 'Teacher, Coding Temple', 'Vader, Darth', 'Lord, Galactic Empire', 'Sanz, María Teresa', 'Minister, Spanish Govt.']


## Grouping

In [105]:
print(re.findall(r'''
    ^([-\w ]*,\s[-\w ]+)\t # last and first names
    ([-\w\d.+]+@[-\w\d.]+)\t # email
    (\(?\d{3}\)?-?\s?\d{3}-\d{4})\t # phone
    ([\w\s]+,\s[\w\s.]+)\t? # job and company
    (@[\w\d]+)?$ # twitter
''', data, re.X|re.M))

[('Hawkins, Derek', 'derek@codingtemple.com', '(555) 555-5555', 'Teacher, Coding Temple\t', '@derekhawkins'), ('Zhai, Mo', 'mozhai@codingtemple.com', '(555) 555-5554', 'Teacher, Coding Temple', ''), ('Butz, Ryan', 'ryanb@codingtemple.com', '(555) 555-5543', 'CEO, Coding Temple\t', '@ryanbutz'), ('Exampleson, Example', 'me@example.com', '555-555-5552', 'Example, Example Co.\t', '@example'), ('Obama, Barack', 'president.44@us.gov', '555 555-5551', 'President, United States of America\t', '@potus44'), ('Pael, Ripal', 'ripalp@codingtemple.com', '(555) 555-5553', 'Teacher, Coding Temple\t', '@ripalp'), ('Vader, Darth', 'darth-vader@empire.gov', '(555) 555-4444', 'Sith Lord, Galactic Empire\t', '@darthvader')]


## Labels

In [99]:
information = re.findall(r'''
    ^(?P<name>[-\w ]*,\s[-\w ]+)\t # last and first names
    (?P<email>[-\w\d.+]+@[-\w\d.]+)\t # email
    (?P<phone>\(?\d{3}\)?-?\s?\d{3}-\d{4})\t # phone
    (?P<job>[\w\s]+,\s[\w\s.]+)\t? # job and company
    (?P<twitter>@[\w\d]+)?$ # twitter
''', data, re.X|re.M)

print(information)

[('Hawkins, Derek', 'derek@codingtemple.com', '(555) 555-5555', 'Teacher, Coding Temple\t', '@derekhawkins'), ('Zhai, Mo', 'mozhai@codingtemple.com', '(555) 555-5554', 'Teacher, Coding Temple', ''), ('Butz, Ryan', 'ryanb@codingtemple.com', '(555) 555-5543', 'CEO, Coding Temple\t', '@ryanbutz'), ('Exampleson, Example', 'me@example.com', '555-555-5552', 'Example, Example Co.\t', '@example'), ('Obama, Barack', 'president.44@us.gov', '555 555-5551', 'President, United States of America\t', '@potus44'), ('Pael, Ripal', 'ripalp@codingtemple.com', '(555) 555-5553', 'Teacher, Coding Temple\t', '@ripalp'), ('Vader, Darth', 'darth-vader@empire.gov', '(555) 555-4444', 'Sith Lord, Galactic Empire\t', '@darthvader')]


In [106]:
info = re.compile(r'''
    ^(?P<name>(?P<last>[-\w ]*),\s(?P<first>[-\w ]+))\t # last and first names
    (?P<email>[-\w\d.+]+@[-\w\d.]+)\t # email
    (?P<phone>\(?\d{3}\)?-?\s?\d{3}-\d{4})\t # phone
    (?P<job>[\w\s]+,\s[\w\s.]+)\t? # job and company
    (?P<twitter>@[\w\d]+)?$ # twitter
''', re.X|re.M)

for i in info.finditer(data):
    print(i.group('name'))

Hawkins, Derek
Zhai, Mo
Butz, Ryan
Exampleson, Example
Obama, Barack
Pael, Ripal
Vader, Darth
