In [2]:
import re

# The regex describing one or more whitespace characters is \s+.
# Regex patterns are written as raw strings (r'...') so that Python does not
# try to interpret backslash sequences such as \s as string escapes.

text = "foo     bar\t baz   \tqux"
re.split(r'\s+', text)


['foo', 'bar', 'baz', 'qux']

In [4]:
# Creating a regex object with re.compile is highly recommended if you intend to
# apply the same expression to many strings; doing so will save CPU cycles.
# The pattern is a raw string so that \s is passed to the regex engine unchanged
# instead of being treated as a (invalid) string escape.
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [5]:
# If, instead, you want a list of every substring that matches the regex
# (rather than the pieces between matches, as split returns), use the findall method.

regex.findall(text)

['     ', '\t ', '   \t']

In [6]:
'''
match and search are closely related to findall. While findall returns all matches
in a string, search returns only the first match. More rigidly, match only matches at
the beginning of the string. As a less trivial example, let’s consider a block of text and
a regular expression capable of identifying most email addresses.
'''

# Sample text: one "name address" pair per line.
text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)

# username, "@", domain name, ".", then a 2-4 letter suffix
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# The character classes list only uppercase letters, so the pattern is compiled
# with re.IGNORECASE to make it case-insensitive.
regex = re.compile(pattern, re.IGNORECASE)

regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [7]:
# match only succeeds at the very start of the string; text begins with "Dave ",
# not an email address, so this returns None and the cell displays no output.
regex.match(text)

In [8]:
# search returns a match object for the first email address found anywhere in the
# text. Because this pattern defines no capture groups, the match object mainly
# tells us the start and end position of the match in the string:
m = regex.search(text)
m

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [9]:
# Slice the original text with the match's start/end offsets to recover the matched address.
text[m.start():m.end()]


'dave@google.com'

In [11]:
# regex.match returns None here, because match only succeeds when the pattern
# occurs at the very start of the string.
print(regex.match(text))

None


In [12]:
# Relatedly, sub returns a new string in which every occurrence of the pattern
# is replaced by the given replacement string.
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [13]:
# To find email addresses and at the same time split each one into its three
# components, wrap each component of the pattern in parentheses so that it
# becomes a capture group.
pattern = (
    r'([A-Z0-9._%+-]+)'  # group 1: username
    r'@([A-Z0-9.-]+)'    # group 2: domain name
    r'\.([A-Z]{2,4})'    # group 3: domain suffix
)
regex = re.compile(pattern, re.IGNORECASE)

# groups() returns the captured components of the match as a tuple.
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

In [14]:
# When the pattern contains groups, findall returns a list of tuples, one per match.
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [15]:
# sub can also refer to the groups of each match through special symbols such as
# \1 and \2 in the replacement string. \1 corresponds to the first matched group,
# \2 to the second, and so forth:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))


Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [None]:
'''


findall -> Return all non-overlapping matches of the pattern in a string as a list
finditer -> Like findall, but returns an iterator of match objects
match -> Match the pattern at the start of the string, optionally segmenting pattern components into groups;
         if the pattern matches, returns a match object, and otherwise None
search -> Scan the string for a match to the pattern, returning a match object if one is found; unlike match, the
          match can be anywhere in the string as opposed to only at the beginning
split -> Break the string into pieces at each occurrence of the pattern

sub, subn -> Replace occurrences of the pattern in the string with a replacement expression (all of them,
             or only the first n when count=n is passed); subn additionally returns the number of
             replacements made, as a (new_string, count) tuple. Use symbols \1, \2, ... to refer to
             match group elements in the replacement string
'''