In [1]:
import re     # The regular expression module.

In [2]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

In [3]:
mo = phoneNumRegex.search('My number is 415-555-4242')

In [4]:
mo.group()

'415-555-4242'

Parenthesis in regular expression strings group items together and can be referenced by number. For example, in order to capture the area code and the rest of the phone number separately, we can use parenthesis to group each collection as below.

In [5]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')

In [6]:
mo = phoneNumRegex.search('My number is 415-555-4242')

In [7]:
mo.group()

'415-555-4242'

Now to reference the area code we can pass the number 1 into the .group method (1 since the area code was within the first pair of parenthesis).

In [8]:
mo.group(1)

'415'

In [9]:
mo.group(2)

'555-4242'

The pipe character "|" acts as an "or" to allow for matching one or another choice

In [10]:
batRegex = re.compile(r'B|bat(man|mobile|copter|bat)')

In [11]:
mo = batRegex.search('batmobile lost a wheel')

In [12]:
mo.group()

'batmobile'

In [13]:
mo.group(1)

'mobile'

In [14]:
longstring = 'Once upon a time batman went searching for his batmobile. He was unable to find it -- must have forgotten where he parked it after having been on a bender. He then went to the batcopter pad and found his trusty batcopter.'

In [15]:
mo = batRegex.findall(longstring)

In [16]:
mo

['man', 'mobile', 'copter', 'copter']

# Matching a specific number of repetitions of something

"?" - Match the preceding group zero or one times.

"*" - Match zero OR MORE times

"+" - Match one or more times

{x} - Match EXACTLY x times

{x, y} - Match at least x and at most y times. greedy matching unless append '?' after the braces

In [17]:
batRegex = re.compile(r'Bat(wo)?man')
mo = batRegex.search('The adventures of Batman')

In [18]:
mo.group()

'Batman'

In [19]:
mo = batRegex.search('The adventures of Batwoman')
mo.group()

'Batwoman'

In [20]:
# area code optional

phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')

In [21]:
mo = phoneRegex.search('My phone number is 408-267-4859')

In [22]:
mo.group()

'408-267-4859'

In [23]:
mo = phoneRegex.search('My phone number is 267-4859')
mo.group()

'267-4859'

In [24]:
haRegex = re.compile(r'(Ha){3}')

In [25]:
haRegex.search('He said "HaHaHa"')

<_sre.SRE_Match object; span=(9, 15), match='HaHaHa'>

In [26]:
haRegex = re.compile(r'(Ha){3,5}')

In [27]:
haRegex.search('He said HaHaHa')

<_sre.SRE_Match object; span=(8, 14), match='HaHaHa'>

In [28]:
haRegex.search('He said HaHaHaHaHa')

<_sre.SRE_Match object; span=(8, 18), match='HaHaHaHaHa'>

In [29]:
# Note that the search returns a full five matches of 'Ha' (greedy match)

haRegex.search('He said HaHaHaHaHaHaHa')

<_sre.SRE_Match object; span=(8, 18), match='HaHaHaHaHa'>

In [30]:
# To make it not greedy (match minimum in the range {3,5} of 'Ha', use '?')

haRegex = re.compile(r'(Ha){3,5}?')

In [31]:
# Here with the '?' after the {3,5}, we match only the first three 'Ha''s.

haRegex.search('He said HaHaHaHaHaHaHa')

<_sre.SRE_Match object; span=(8, 14), match='HaHaHa'>

# The 'findall' Method of the Regex Module

The .search() method returns the FIRST match in a search string.

The .findall() method searches the search string for ALL occurances of the matching expression.

In [32]:
phoneRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

phoneRegex

re.compile(r'\d\d\d-\d\d\d-\d\d\d\d', re.UNICODE)

In [33]:
phoneRegex.search('This string has multiple phone numbers. For example 408-807-3381 is a phone number as is 650-574-6621. It occurs to me that 408-807-3371 is also a phone number.')

<_sre.SRE_Match object; span=(52, 64), match='408-807-3381'>

In [34]:
# No groups in the search pattern: findall returns a list of the matched strings.

phoneRegex.findall('This string has multiple phone numbers. For example 408-807-3381 is a phone number as is 650-574-6621. It occurs to me that 408-807-3371 is also a phone number.')

['408-807-3381', '650-574-6621', '408-807-3371']

In [35]:
# Groups in the search pattern: findall returns a list of tuples, where the grouped patterns are elements of the tuples.

phoneRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')

In [36]:
phoneRegex.findall('This string has multiple phone numbers. For example 408-807-3381 is a phone number as is 650-574-6621. It occurs to me that 408-807-3371 is also a phone number.')

[('408', '807-3381'), ('650', '574-6621'), ('408', '807-3371')]

# Character Classes
\d - match any digit  
\D - match any non-digit  
\w - match any letter, numeric digit or underscore character; "word" characters.  
\W - match non-word characters  
\s - match white space  
\S - match non-white space

A "^" at the beginning of a character class will match everything NOT in that class

In [37]:
# Can make your own character class

vowelRegex = re.compile(r'[aeiouAEIOU]')

In [38]:
vowelRegex.findall('Robocop eats baby food.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o']

In [39]:
doubleVowelRegex = re.compile(r'[aeiouAEIOU]{2}')

In [40]:
doubleVowelRegex.findall('Robocop eats baby food.')

['ea', 'oo']

In [41]:
# Using "^" to make a "negative" character class.

consonantRegex = re.compile(r'[^aeiouAEIOU]')

In [42]:
consonantRegex.findall('Robocop eats baby food.')

['R', 'b', 'c', 'p', ' ', 't', 's', ' ', 'b', 'b', 'y', ' ', 'f', 'd', '.']

# Matching the Beginning and End of a Search String

"^" - Use at the beginning of a regular expression to match the search pattern at the beginning of the search string.

"$" Use at the end of a regular expression to match the search pattern at the end of the search string.

Using both "^" and "$" will require the pattern to match the ENTIRE string.

In [43]:
# Must match "Hello" at the beginning of the search string

beginsWithHelloRegex = re.compile(r'^Hello')

In [44]:
beginsWithHelloRegex.search('Hello there!')

<_sre.SRE_Match object; span=(0, 5), match='Hello'>

In [45]:
beginsWithHelloRegex.search('He said "Hello!"') == None

True

In [46]:
# Must match "world" at the end of the search string

endsWithWorldRegex = re.compile(r'world!$')

In [47]:
endsWithWorldRegex.search('Hello world!')

<_sre.SRE_Match object; span=(6, 12), match='world!'>

In [48]:
endsWithWorldRegex.search('It\'s a large world!, they say') == None

True

In [49]:
# Using both "^" and "$"

allDigitsRegex = re.compile(r'^\d+$')

In [50]:
# Match a string composed of only digits, start to end.

allDigitsRegex.search('23554353453788')

<_sre.SRE_Match object; span=(0, 14), match='23554353453788'>

In [51]:
# No match because of the "x" in the string.

allDigitsRegex.search('4235554x534354')

# "." - The "Match All" Regex Character

"." matches ANY single character except the newline character. Can get it to match every character INCLUDING the newline character by passing "re.DOTALL" as an option to re.compile.

".*" matches ANY collection of characters in a search string. This is greedy; It will try to match as much as possible.

".*?" Like ".*" but in "non-greedy" mode; It will match as least as possible.

In [52]:
# Note that only "lat" in the word "flat" is matched since the dot represents only ONE character before the "at" in the matching string.

atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat')

['cat', 'hat', 'sat', 'lat', 'mat']

In [53]:
# Fixes the problem with the previous example; Now "flat" is matched BUT also matching ANY other previous character in the search strings like " cat" (matching white space)

atRegex = re.compile(r'.{1,2}at')
atRegex.findall('The cat in the hat sat on the flat mat')

[' cat', ' hat', ' sat', 'flat', ' mat']

In [54]:
# Using ".*" in a regex

nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')

nameRegex.findall('First Name: Gary Last Name: Church')

[('Gary', 'Church')]

In [55]:
# Examples of using ".*" in greedy and non-greedy (".*?") mode

serve = '<To serve humans> for dinner.>'

nongreedy = re.compile(r'<(.*?)>')
print(nongreedy.findall(serve))

greedy = re.compile(r'<(.*)>')
print(greedy.findall(serve))

['To serve humans']
['To serve humans> for dinner.']


In [56]:
# '.*' won't match the newline character

prime = 'Serve the public trust.\nProtect the innocent.\nUphold the law'

print(prime)

dotStar = re.compile(r'.*')
dotStar.search(prime)

Serve the public trust.
Protect the innocent.
Uphold the law


<_sre.SRE_Match object; span=(0, 23), match='Serve the public trust.'>

In [57]:
# '.*' matching ALL characters, even the newline character, using the re.DOTALL directive in re.compile

dotStar = re.compile(r'.*', re.DOTALL)
dotStar.search(prime)

<_sre.SRE_Match object; span=(0, 60), match='Serve the public trust.\nProtect the innocent.\nU>

In [58]:
# Passing re.IGNORECASE (or just re.I) to re.compile to ignore case in matches.

vowelRegex = re.compile(r'[aeiou]')

print(vowelRegex.findall('Al, why does your programming book talk about RoboCop so much?'))

print()
vowelRegex = re.compile(r'[aeiou]', re.I)

print(vowelRegex.findall('Al, why does your programming book talk about RoboCop so much?'))

['o', 'e', 'o', 'u', 'o', 'a', 'i', 'o', 'o', 'a', 'a', 'o', 'u', 'o', 'o', 'o', 'o', 'u']

['A', 'o', 'e', 'o', 'u', 'o', 'a', 'i', 'o', 'o', 'a', 'a', 'o', 'u', 'o', 'o', 'o', 'o', 'u']


# Find and Replace with Regular Expressions using the .sub() method

regex.sub(<'replace string'>, <'search string'>)

Can also use \1, \2, ... to replace matched groups in the regex.

In [59]:
namesRegex = re.compile(r'Agent \w+')

namesRegex.findall('Agent Alice gave the secret documents to Agent Bob.')

['Agent Alice', 'Agent Bob']

In [60]:
# Using the .sub(<replace string>, <search string>) to replace the string in <search string> matched with the regex by the string in <replace string>

namesRegex.sub('REDACTED', 'Agent Alice gave the secret documents to Agent Bob.')

'REDACTED gave the secret documents to REDACTED.'

In [61]:
# Using /1, /2, ...

namesRegex = re.compile(r'Agent (\w)\w*')

print(namesRegex.findall('Agent Alice gave the secret documents to Agent Bob.'))

print()

print(namesRegex.sub(r'Agent \1****', 'Agent Alice gave the secret documents to Agent Bob.'))

['A', 'B']

Agent A**** gave the secret documents to Agent B****.


# re.VERBOSE option in re.compile()

This option allows whitespace to be inserted into the search pattern to make it more legible. We can also insert comments to document the search pattern.

In [62]:
# An example of using the re.VERBOSE option in re.compile()

# This regex does not work as intended.

phoneNumberRegex=re.compile(r'''
((\d\d\d-)|     # area code (without parens, with dash)
(\(\d\d\d\) ))  # -or- area code with parens and no dash
(\d\d\d         # first 3 digits
-              # second dash
\d\d\d\d)       # last four digits
\s(x\d{2,4})     # extension, like x1234''', re.VERBOSE)

In [63]:
mo = phoneNumberRegex.search('My phone number is 408-807-3381 x123 and Aaryn\'s phone number is (408) 807-3371 x345')

print(mo.group())
print(mo.group(1))
print(mo.group(2))
print(mo.group(3))

408-807-3381 x123
408-
408-
None


In [64]:
phoneNumberRegex.findall('My phone number is 408-807-3381 x123 and Aaryn\'s phone number is (408) 807-3371 x345')

[('408-', '408-', '', '807-3381', 'x123')]

In [65]:
phoneNumberRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)\s(x\d{2,4})')

mo = phoneNumberRegex.search('My phone number is 408-807-3381 x123 and Aaryn\'s phone number is (408) 807-3371 x345')

print(mo.group(1))
print(mo.group(2))
print(mo.group(3))

408
807-3381
x123


In [66]:
phoneNumberRegex.findall('My phone number is 408-807-3381 x123 and Aaryn\'s phone number is (408-807-3371 x345')

[('408', '807-3381', 'x123'), ('408', '807-3371', 'x345')]

# Example Program to "Scrape" a file of all its phone numbers and email addresses

In [67]:
import pyperclip

In [68]:
# Create a regex for phone numbers

phoneRegex = re.compile(r'''
# 415-555-0000, 555-0000, (415) 555-0000, 555-0000 ext 12345, ext. 12345, x12345

(((\d\d\d) | (\(\d\d\d\)))?    # area code (optional)
(\s|-)     # first separator
\d\d\d     # first 3 digits
-     # separator
\d\d\d\d    # last 4 digits
(((ext(\.)?\s)|x)    # extension word-part (optional)
 (\d{2,5}))?)    # extension number-part (optional)

''', re.VERBOSE)

In [69]:
# Create a regex for email addresses

emailRegex = re.compile(r'''
# some.+_thing@(\d{2,5})?.com

[a-zA-Z0-9_.+]+    # name part
@                  # @ symbol
[a-zA-Z0-9_.+]+    # domain name part

''', re.VERBOSE)

In [70]:
# Get the text off the clipboard

text = pyperclip.paste()

In [71]:
# Extract the email/phone from this text

extractedPhone = phoneRegex.findall(text)
extractedEmail = emailRegex.findall(text)

allPhoneNumbers = [phoneNumber[0] for phoneNumber in extractedPhone]


print(allPhoneNumbers)
print()
print(extractedEmail)

[]

[]


In [72]:
# Join the phone numbers and emails in the lists with newline characters to put them each on a separate line.

results = '\n'.join(allPhoneNumbers) + '\n' + '\n'.join(extractedEmail)

In [73]:
# Copy the "results" from the previous line to the clipboard using pyperclip's copy method.

pyperclip.copy(results)