In [1]:
## Chapter 7 is all about regular expressions (regex) in Python

#The example they use is recognizing whether or not a string is an American phone
    #number. Doing that without regex is kind of a pain and would look something like the following...

In [2]:
# For this topic we are assuming the pattern for a phone number is XXX-XXX-XXXX [X's being any digit 0-9]
def isPhoneNumber(text):
    """Return True if text is exactly a phone number of the form XXX-XXX-XXXX.

    Each X must be an ASCII digit 0-9. Characters that Unicode classifies as
    decimal digits but that are not 0-9 (for example Arabic-Indic digits) are
    rejected, so the function agrees with the pattern stated for this topic.

    Parameters:
        text: the candidate string; it must be the phone number and nothing else.

    Returns:
        True when text is 12 characters long, has a dash at index 3 and at
        index 7, and has an ASCII digit at every other index; otherwise False.
    """
    ascii_digits = '0123456789'
    dash_positions = (3, 7)

    # if the string is not 12 characters long it cannot match XXX-XXX-XXXX
    if len(text) != 12:
        return False

    for index, char in enumerate(text):
        if index in dash_positions:
            # the 4th and 8th characters must be the delimiting dashes
            if char != '-':
                return False
        elif char not in ascii_digits:
            # every other character must be one of 0-9
            return False

    # none of the checks failed, so it must be a phone number
    return True
    

In [3]:
# Try one string with no digits, one valid number, and one with a space where a dash belongs
for candidate in ("XXX-XXX-XXXX", "666-420-6969", "666 420-6969"):
    print(isPhoneNumber(candidate))

False
True
False


In [4]:
## So, to use isPhoneNumber() to search a longer string for phone numbers, the code would look something like the following...
def searchNumber(message):
    """Print every phone number found in message, then print 'Done'.

    Takes a slice of up to 12 characters starting at each position of
    message and passes it to isPhoneNumber(); each slice that passes is
    printed. Nothing is returned.
    """
    window = 12
    for start, _ in enumerate(message):
        candidate = message[start:start + window]
        if not isPhoneNumber(candidate):
            continue
        print('Phone number found: ' + candidate)
    print('Done')
# This function slides a 12-character window along the string, one position at a time, and tests
    #whether each window matches the pattern previously established in isPhoneNumber()

In [5]:
# One message that contains two phone numbers and one that contains none
for sample in (
    'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.',
    'Call me at my office.',
):
    searchNumber(sample)

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done
Done


In [6]:
## As you can see, the manual method is bulky and complicated, so instead we can use regex to simplify that code and make it readable
import re
# \d matches one digit, so the pattern is: three digits, a dash, three digits, a dash, four digits
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# search() returns a match object for the first match in the string, or None when nothing matches
mo = phoneNumRegex.search('My number is 415-555-4242.')
# group() with no argument returns the full matched text
print('Phone number found: ' + mo.group())

Phone number found: 415-555-4242


In [7]:
## You can even create pattern groupings with parentheses
# Group 1 captures the three-digit area code; group 2 captures the rest of the number
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')
# group(n) returns the text captured by the n-th pair of parentheses
print("4: " + mo.group(1))
print("5: " + mo.group(2))
# group(0) and group() both return the entire match
print("6: " + mo.group(0))
print("7: " + mo.group())
# groups() returns all captured groups at once as a tuple...
print("8:", mo.groups())
# ...which can be unpacked into separate variables
areaCode, mainNumber = mo.groups()
print("10: " + areaCode)
print("11: " + mainNumber)

4: 415
5: 555-4242
6: 415-555-4242
7: 415-555-4242
8: ('415', '555-4242')
10: 415
11: 555-4242


In [8]:
## You can match one of several alternatives using the | (pipe) notation
heroRegex = re.compile(r'Batman|Tina Fey')
# Both alternatives occur in this text; search() returns the one that occurs first
mo1 = heroRegex.search('Batman and Tina Fey')
mo1.group()

'Batman'

In [9]:
# Same heroRegex as in the previous cell, but here 'Tina Fey' occurs first in the text
mo2 = heroRegex.search('Tina Fey and Batman') 
mo2.group()

'Tina Fey'

In [10]:
# Parentheses combined with | share a common prefix: 'Bat' followed by any one of the listed suffixes
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
# group() returns the whole match, prefix included
mo.group()

'Batmobile'

In [11]:
# group(1) returns only the part matched inside the parentheses (uses mo from the previous cell)
mo.group(1)

'mobile'

In [12]:
# There is also optional matching using ?: the group before the ? may appear zero times or once
batRegex = re.compile(r'Bat(wo)?man')
# Zero occurrences of 'wo' still matches
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [13]:
# One occurrence of 'wo' matches as well (uses batRegex from the previous cell)
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [14]:
# The leading group (three digits and a dash, i.e. the area code) is optional because of the ?
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
# With an area code present, the whole number is matched
mo1 = phoneRegex.search('My number is 415-555-4242')
mo1.group()

'415-555-4242'

In [15]:
# Without an area code, the same phoneRegex (from the previous cell) still matches
mo2 = phoneRegex.search('My number is 555-4242')
mo2.group()

'555-4242'

In [16]:
# There is also matching zero or more times with *: similar to ?, but it also allows multiple repetitions
batRegex = re.compile(r'Bat(wo)*man')
# Zero repetitions of 'wo'
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [17]:
# One repetition of 'wo' (uses batRegex from the previous cell)
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [18]:
# Several repetitions of 'wo' are matched too (uses batRegex from an earlier cell)
mo3 = batRegex.search('The Adventures of Batwowowowowoman')
mo3.group()

'Batwowowowowoman'

In [19]:
# Match one or more repetitions with +: unlike *, the group must appear at least once
batRegex = re.compile(r'Bat(wo)+man')
# 'Batman' has no 'wo' at all, so search() finds nothing and returns None
mo1 = batRegex.search('The Adventures of Batman')
# Compare against None with `is`, the identity test recommended for singletons
mo1 is None

True

In [20]:
# One repetition of 'wo' satisfies the + (uses batRegex from the previous cell)
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [21]:
# Several repetitions of 'wo' satisfy the + as well (uses batRegex from an earlier cell)
mo3 = batRegex.search('The Adventures of Batwowowowowoman')
mo3.group()

'Batwowowowowoman'

In [22]:
## Match a specific number of repetitions with {}: either a single count such as {3} or a range such as {3,5}
# (Ha){3} requires the group 'Ha' exactly three times in a row
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
mo1.group()

'HaHaHa'

In [23]:
# A single 'Ha' is fewer than the three repetitions haRegex (from the previous cell) requires,
# so search() returns None
mo2 = haRegex.search('Ha')
# Compare against None with `is`, the identity test recommended for singletons
mo2 is None

True

In [24]:
# Repetition can be greedy (the default) or non-greedy; adding a ? after the braces makes it non-greedy
# Greedy: {3,5} takes as many repetitions as are available, up to five; here that is all four
haRegex = re.compile(r'(Ha){3,5}')
mo1 = haRegex.search('HaHaHaHa')
mo1.group()

'HaHaHaHa'

In [25]:
# Non-greedy: {3,5}? stops at the minimum of three repetitions even though five are available
haRegex = re.compile(r'(Ha){3,5}?')
mo2 = haRegex.search('HaHaHaHaHa')
mo2.group()

'HaHaHa'