In [None]:
# regex patterns should be written as raw strings, e.g. r"\n\w"
# the r prefix tells Python not to process backslash escapes itself, so they reach the regex engine unchanged

In [1]:
# re.search() returns None if there is no match, or an re.Match object if one is found
# it stops at the first match, so it is mainly useful for testing whether a pattern matches at all

In [None]:
#matchObject = re.search(pattern, input_str, flags=0)

In [2]:
import re

In [3]:
# Pattern for "<month> <day>": group 1 captures the month name and
# group 2 captures the day number.
regex = r"([a-zA-Z]+) (\d+)"
# Search once and reuse the result: re.search() returns None when nothing
# matches, otherwise a match object describing the first match.
match = re.search(regex, "June 24")
if match is not None:
    print("Match at index %s, %s" % (match.start(), match.end()))
    print("Full match: %s" % (match.group(0)))
    print("Month: %s" % (match.group(1)))
    print("Day: %s" % (match.group(2)))
else:
    # Reached only when the search found nothing.
    print("The regex pattern does not match.")

Match at index 0, 7
Full match: June 24
Month: June
Day: 24


In [4]:
# for a global search over the whole input string, use re.findall()
# it returns a list of the captured data for every match

# for additional context on each match, use re.finditer() - it returns an iterator over match objects

In [None]:
# matchList = re.findall(pattern, input_str, flags=0)
# matchList = re.finditer(pattern, input_str, flags=0)

In [5]:
# Only the month is captured; the day digits must be present but are not
# part of any group.
regex = r"([a-zA-Z]+) \d+"
matches = re.findall(regex, "June 24, August 9, Dec 12")
# Each list entry is the captured month text, so the loop prints one line
# each for June, August and Dec.
for month in matches:
    print("Match month: %s" % month)

Match month: June
Match month: August
Match month: Dec


In [6]:
# finditer() yields match objects, so every hit also carries its position.
regex = r"([a-zA-Z]+) \d+"
matches = re.finditer(regex, "June 24, August 9, Dec 12")
# span() is the (start, end) pair of a match within the input string; for
# this input the loop prints 0, 7 then 9, 17 then 19, 25.
for match in matches:
    print("Match at index: %s, %s" % match.span())

Match at index: 0, 7
Match at index: 9, 17
Match at index: 19, 25


In [None]:
# re.sub() can be used to find and replace.
# replacedString = re.sub(pattern, replacement_pattern, input_str, count=0, flags=0)

In [7]:
import re
# Swap the day and the month in every date of the input string. The
# replacement template refers to the captured groups through back
# references (\1, \2); those contain backslashes, so the template is
# written as a raw string as well.
regex = r"([a-zA-Z]+) (\d+)"
reordered = re.sub(regex, r"\2 of \1", "June 24, August 9, Dec 12")

# Prints:
#   24 of June, 9 of August, 12 of Dec
print(reordered)

24 of June, 9 of August, 12 of Dec


In [None]:
# optional flags argument, for convenience, instead of embedding the options in the regex itself
# e.g.: re.IGNORECASE (case insensitive), re.MULTILINE (^ and $ match at the beginning/end of each line instead of only the whole string), re.DOTALL (. matches all characters, including newlines \n)

In [None]:
# compiling for performance:
# regexObject = re.compile(pattern, flags=0)

In [9]:
# Compile the pattern once; the resulting object exposes search() directly.
regex = re.compile(r"(\w+) World")
result = regex.search("Hello World is the easiest")
if result is not None:
    # Show the match object itself, then its start and end offsets
    # (0 and 11 for this input).
    print(result)
    print(*result.span())

<re.Match object; span=(0, 11), match='Hello World'>
0 11


In [None]:
# methods are called on the compiled regexObject instead of calling re.search(regex, word)

In [10]:
# The compiled pattern has a single group, so findall() returns the word
# captured before each " World".
greetings = regex.findall("Hello World, Bonjour World")
for result in greetings:
    print(result)

Hello
Bonjour


In [11]:
# \1 re-inserts the captured first word, so only "World" is swapped for "Earth".
earth_greeting = regex.sub(r"\1 Earth", "Hello World")
print(earth_greeting)

Hello Earth
