# Regex

## What?
Regular expressions (called regexes or regex patterns) are a tiny language for dealing with text and character patterns.
With regex patterns we can:
- Check whether a string matches a pattern
- Check whether there is a match for the pattern anywhere in the string
- Modify and split strings in various ways

In [2]:
import pandas as pd
import re # part of the python stdlib

In [4]:
log_file_lines = '''
76.185.131.226 - - [11/May/2020:14:25:53 +0000] "GET / HTTP/1.1" 200 42 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:25:46 +0000] "GET / HTTP/1.1" 200 42 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:25:58 +0000] "GET / HTTP/1.1" 200 42 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
76.185.131.226 - - [11/May/2020:16:25:58 +0000] "GET /favicon.ico HTTP/1.1" 200 162 "https://python.zach.lol/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
104.5.217.57 - - [11/May/2020:16:26:27 +0000] "GET / HTTP/1.1" 200 42 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:26:46 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:26:54 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
104.5.217.57 - - [11/May/2020:16:27:04 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:27:05 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
76.185.131.226 - - [11/May/2020:16:27:10 +0000] "GET /documentation HTTP/1.1" 200 348 "-" "python-requests/2.23.0"
'''

In [6]:
lines = pd.Series(log_file_lines.strip().split('\n'))
lines

0    76.185.131.226 - - [11/May/2020:14:25:53 +0000...
1    76.185.131.226 - - [11/May/2020:16:25:46 +0000...
2    76.185.131.226 - - [11/May/2020:16:25:58 +0000...
3    76.185.131.226 - - [11/May/2020:16:25:58 +0000...
4    104.5.217.57 - - [11/May/2020:16:26:27 +0000] ...
5    76.185.131.226 - - [11/May/2020:16:26:46 +0000...
6    76.185.131.226 - - [11/May/2020:16:26:54 +0000...
7    104.5.217.57 - - [11/May/2020:16:27:04 +0000] ...
8    76.185.131.226 - - [11/May/2020:16:27:05 +0000...
9    76.185.131.226 - - [11/May/2020:16:27:10 +0000...
dtype: object

In [9]:
# Pattern for the access-log lines defined above. Every (?P<name>...) group is
# a named capture group; `lines.str.extract(regex)` below turns each one into
# a column of the resulting DataFrame.
# The pattern is laid out over several lines, which only works because it is
# compiled with re.VERBOSE (whitespace inside the pattern is ignored), so the
# spaces in the log line are matched explicitly with \s.
# NOTE(review): the `.` in `HTTP/1.1` is unescaped, so it matches any character.
regex = r'''
    (?P<ip>.*?)\s.*?\[(?P<timestamp>.*?)\]\s+"(?P<method>[A-Z]+)\s(?P<path>.*?)\sHTTP/1.1"
    \s(?P<status>\d+)\s(?P<bytes_sent>\d+)\s"(?P<referrer>.*?)"\s"(?P<user_agent>.*?)"
    '''

# Rebinds `regex` from the raw pattern string to the compiled pattern object.
regex = re.compile(regex, re.VERBOSE)
regex

re.compile(r'\n    (?P<ip>.*?)\s.*?\[(?P<timestamp>.*?)\]\s+"(?P<method>[A-Z]+)\s(?P<path>.*?)\sHTTP/1.1"\n    \s(?P<status>\d+)\s(?P<bytes_sent>\d+)\s"(?P<referrer>.*?)"\s"(?P<user_agent>.*?)"\n    ',
           re.UNICODE|re.VERBOSE)

In [10]:
lines.str.extract(regex)

Unnamed: 0,ip,timestamp,method,path,status,bytes_sent,referrer,user_agent
0,76.185.131.226,11/May/2020:14:25:53 +0000,GET,/,200,42,-,python-requests/2.23.0
1,76.185.131.226,11/May/2020:16:25:46 +0000,GET,/,200,42,-,python-requests/2.23.0
2,76.185.131.226,11/May/2020:16:25:58 +0000,GET,/,200,42,-,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...
3,76.185.131.226,11/May/2020:16:25:58 +0000,GET,/favicon.ico,200,162,https://python.zach.lol/,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...
4,104.5.217.57,11/May/2020:16:26:27 +0000,GET,/,200,42,-,python-requests/2.23.0
5,76.185.131.226,11/May/2020:16:26:46 +0000,GET,/documentation,200,348,-,python-requests/2.23.0
6,76.185.131.226,11/May/2020:16:26:54 +0000,GET,/documentation,200,348,-,python-requests/2.23.0
7,104.5.217.57,11/May/2020:16:27:04 +0000,GET,/documentation,200,348,-,python-requests/2.23.0
8,76.185.131.226,11/May/2020:16:27:05 +0000,GET,/documentation,200,348,-,python-requests/2.23.0
9,76.185.131.226,11/May/2020:16:27:10 +0000,GET,/documentation,200,348,-,python-requests/2.23.0


## re library functions

### `re.findall` 

 - finds all substrings where the RE matches; returns a list

### Literals - start simple

In [12]:
subject = 'abc'

#### find the letter a

In [13]:
regexp = r'a'

re.findall(regexp, subject)

['a']

In [14]:
re.findall(r'a', 'abc')

['a']

#### find the letter c

In [15]:
regexp = r'c'

re.findall(regexp, subject)

['c']

#### find the letter d

In [16]:
regexp = r'd'

re.findall(regexp, subject)

[]

### Literals - make it more complex

In [17]:
subject = 'Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one'

#### find mary

In [18]:
regexp = r'mary'

re.findall(regexp, subject)

[]

In [19]:
regexp = r'Mary'

re.findall(regexp, subject)

['Mary']

##### regex flag: re.IGNORECASE

In [21]:
regexp = r'mary'

re.findall(regexp, subject, re.IGNORECASE)

['Mary']

#### find little

In [24]:
regexp = r'little'

re.findall(regexp, subject)

['little', 'little']

#### find the number 1

In [25]:
regexp = r'1'

re.findall(regexp, subject)

['1', '1', '1']

### Metacharacters

-  `.` : any single character except a newline


- `\w`: any letter, digit, or underscore
- `\W`: anything that is *not* a letter, digit, or underscore



- `\d`: any digit
- `\D`: anything that is *not* a digit


- `\s` : any whitespace

In [43]:
subject = 'abccc. 123'

#### try all the metacharacters

In [37]:
regexp = r'.'

re.findall(regexp, subject)

['a', 'b', 'c', 'c', 'c', '.', ' ', '1', '2', '3']

In [38]:
regexp = r'\w'

re.findall(regexp, subject)

['a', 'b', 'c', 'c', 'c', '1', '2', '3']

In [39]:
regexp = r'\W'

re.findall(regexp, subject)

['.', ' ']

In [40]:
regexp = r'\d'

re.findall(regexp, subject)

['1', '2', '3']

In [41]:
regexp = r'\D'

re.findall(regexp, subject)

['a', 'b', 'c', 'c', 'c', '.', ' ']

In [42]:
regexp = r'\s'

re.findall(regexp, subject)

[' ']

#### what does \w\w bring back?

In [45]:
subject

'abccc. 123'

In [44]:
regexp = r'\w\w'

re.findall(regexp, subject)

['ab', 'cc', '12']

#### match the period

In [46]:
regexp = r'.'

re.findall(regexp, subject)

['a', 'b', 'c', 'c', 'c', '.', ' ', '1', '2', '3']

In [53]:
#escape it out with a backslash
regexp = r'\.'

re.findall(regexp, subject)

['.']

In [54]:
# Raw string so the backslash is kept literally: in a normal string literal
# '\.' is an invalid escape sequence and triggers a warning on newer Pythons.
# The resulting value is unchanged: abccc, a backslash, a period, a space, 123.
subject = r'abccc\. 123'

In [57]:
# escape both characters: `\\` matches a literal backslash, `\.` a literal period
regexp = r'\\\.'

re.findall(regexp, subject)

['\\.']

#### match the string 'c 1' using only metacharacters

In [70]:
subject = 'c 1'

In [71]:
regexp = r'...'

re.findall(regexp, subject)

['c 1']

In [72]:
regexp = r'\w\s\w'

re.findall(regexp, subject)

['c 1']

In [73]:
regexp = r'\w\s\d'

re.findall(regexp, subject)

['c 1']

### Repeating

- `{}`: custom number of repetitions
    - `{x}`: exactly x repetitions
    - `{x,}`: x or more
    - `{x,y}`: between x and y repetitions
- `*`: zero or more
- `+`: one or more
- `?`: optional
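- `?` after another quantifier (`*?`, `+?`, `??`, `{x,y}?`): non-greedy (lazy) — match as few characters as possible; quantifiers are greedy by default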

#### what will be returned?

In [78]:
subject = 'abc 12345'

In [79]:
regexp = r'\w+\s?\d'

re.findall(regexp, subject)

['abc 1', '2345']

In [82]:
regexp = r'\w+'

re.findall(regexp, subject)

['abc', '12345']

In [89]:
regexp = r'\w+\s?\d'

re.findall(regexp, subject)

['abc 1', '2345']

### Find the matches

In [91]:
subject = """
    Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, 
    San Antonio, TX 78230. 
    You can find us online at http://codeup.com 
    and our alumni portal is located at https://alumni.codeup.com.
    """

#### match all the numbers

In [95]:
regexp = r'\d+'

re.findall(regexp, subject)

['2014', '600', '350', '78230']

#### match a 5 digit number, but not a number with fewer digits

In [98]:
regexp = r'\d\d\d\d\d'

re.findall(regexp, subject)

['78230']

In [99]:
regexp = r'\d{5}'

re.findall(regexp, subject)

['78230']

#### match a 4 or more digit number

In [100]:
regexp = r'\d{4,}'

re.findall(regexp, subject)

['2014', '78230']

#### match 3 to 4 digit number

In [105]:
regexp = r'\d{3,4}'

re.findall(regexp, subject)

['2014', '600', '350', '7823']

#### match `http://` or `https://`

In [111]:
#using | (or) operator
regexp = r'http://|https://'

re.findall(regexp, subject)

['http://', 'https://']

In [112]:
#using ? optional
regexp = r'https?://'

re.findall(regexp, subject)

['http://', 'https://']

In [115]:
#using * zero or more
regexp = r'https*:*//'

re.findall(regexp, subject)

['http://', 'https://']

In [121]:
subject = """
    Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, 
    San Antonio, TX 78230. 
    You can find us online at http://codeup.com 
    and our alumni portal is located at https:::://alumni.codeup.com.
    """

In [122]:
#using * zero or more
regexp = r'https*:*//'

re.findall(regexp, subject)

['http://', 'https:::://']

### Any of / None of

- `[]`: matches any single character listed inside the brackets
- `[^]`: matches any single character NOT listed inside the brackets
- `[-]`: matches any single character in the given range (e.g. `[a-z]`, `[0-9]`)

In [132]:
subject = 'abc 12345 1bc'

#### match using brackets

In [137]:
regexp = r'[a1 ]'

re.findall(regexp, subject)

['a', ' ', '1', ' ', '1']

In [138]:
regexp = r'[a1][b2][c3]'

re.findall(regexp, subject)

['abc', '123', '1bc']

In [139]:
regexp = r'[a1][b2][c]'

re.findall(regexp, subject)

['abc', '1bc']

In [144]:
regexp = r'[a-z0-9]'

re.findall(regexp, subject)

['a', 'b', 'c', '1', '2', '3', '4', '5', '1', 'b', 'c']

In [145]:
subject

'abc 12345 1bc'

In [147]:
regexp = r'[a-z][0-9]'

re.findall(regexp, subject)

[]

In [148]:
regexp = r'[0-9][a-z]'

re.findall(regexp, subject)

['1b']

In [149]:
subject

'abc 12345 1bc'

#### match using caret

In [163]:
regexp = r'[a-z]+'

re.findall(regexp, subject)

['abc', 'bc']

In [170]:
regexp = r'[^a-z]'

re.findall(regexp, subject)

[' ', '1', '2', '3', '4', '5', ' ', '1']

In [171]:
regexp = r'[^a-z]\s'

re.findall(regexp, subject)

['5 ']

#### match using range

In [172]:
subject

'abc 12345 1bc'

In [173]:
regexp = r'[2-5]'

re.findall(regexp, subject)

['2', '3', '4', '5']

#### match using range and caret

In [183]:
regexp = r'[^2-5]'

re.findall(regexp, subject)

['a', 'b', 'c', ' ', '1', ' ', '1', 'b', 'c']

### Anchors

- `^`: starts with
- `$`: ends with
- `\b`: word boundary

In [191]:
subject = """
    Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, 
    San Antonio, TX 78230. 
    You can find us online at http://codeup.com 
    and our alumni portal is located at https://alumni.codeup.com.
    """

#### match 3 or 4 digit number

In [199]:
regexp = r'\b\d{3,4}\b'

re.findall(regexp, subject)

['2014', '600', '350']

In [200]:
subject = 'kiwi aardvark banana codeup data science academy extra'

#### match all words that start with a vowel

In [203]:
regexp = r'[aeiou]\w+'

re.findall(regexp, subject)

['iwi', 'aardvark', 'anana', 'odeup', 'ata', 'ience', 'academy', 'extra']

In [204]:
# using a word boundary: \b only lets the vowel match at the start of a word
regexp = r'\b[aeiou]\w+'

re.findall(regexp, subject)

['aardvark', 'academy', 'extra']

In [206]:
subject

'kiwi aardvark banana codeup data science academy extra'

In [205]:
#using an anchor
regexp = r'^[aeiou]\w+'

re.findall(regexp, subject)

[]

In [210]:
#split subjects to use anchor
subjects = subject.split()
subjects

['kiwi', 'aardvark', 'banana', 'codeup', 'data', 'science', 'academy', 'extra']

In [211]:
# `^` anchors the match to the start of the string, so each word is tested on
# its own rather than searching the whole sentence.
regexp = r'^[aeiou]\w+'

# Iterate with a dedicated name: looping with `subject` as the loop variable
# would overwrite the sentence stored in `subject` and leave it bound to the
# last word once the loop finishes.
for word in subjects:
    print(re.findall(regexp, word))

[]
['aardvark']
[]
[]
[]
[]
['academy']
['extra']


#### match all words that end with a vowel

In [214]:
# `$` anchors the match to the end of the string, so each word is tested on
# its own rather than searching the whole sentence.
regexp = r'\w+[aeiou]$'

# Iterate with a dedicated name so the loop does not rebind `subject` to the
# last word of the list.
for word in subjects:
    print(re.findall(regexp, word))

['kiwi']
[]
['banana']
[]
['data']
['science']
[]
['extra']


In [217]:
subject = 'kiwi aardvark banana codeup data science academy extra'

In [218]:
regexp = r'\w+[aeiou]$'

re.findall(regexp, subject)

['extra']

### Capture Groups

- `()`: grab what's contained in parentheses 

In [247]:
subject = '''
    You can find us on the web at https://codeup.com. Our ip address is 123.123.123.123 (maybe).
    '''

#### find the domain only

In [248]:
# Escape the dot so it matches a literal '.' only. Unescaped, `.` matches any
# character, so the pattern would also accept text such as 'https://codeupXcom'.
# The parentheses capture just the domain, which is what findall returns.
regexp = r'https://(\w+)\.com'

re.findall(regexp, subject)

['codeup']

#### find everything after the first sentence

In [259]:
regexp = r'\.\s(.+)'

re.findall(regexp, subject)

['Our ip address is 123.123.123.123 (maybe).']

#### find the ip address

In [261]:
subject.strip()

'You can find us on the web at https://codeup.com. Our ip address is 123.123.123.123 (maybe).'

In [270]:
regexp = r'Our ip address is (.*)\s'

re.findall(regexp, subject)

['123.123.123.123 (maybe).']

In [274]:
#dont be greedy
regexp = r'Our ip address is (.*?)\s'

re.findall(regexp, subject)

['123.123.123.123']

In [292]:
regexp = r'\d+\.\d+\.\d+\.\d+'

re.findall(regexp, subject)

['123.123.123.123']

In [297]:
regexp = r'((\d+\.){3}\d+)'

re.findall(regexp, subject)

[('123.123.123.123', '123.')]

#### find domain and ip address

In [299]:
subject.strip()

'You can find us on the web at https://codeup.com. Our ip address is 123.123.123.123 (maybe).'

In [306]:
# Both dots are escaped so they match literal periods (the one before 'com' and
# the one ending the sentence) instead of any character.
# Group 1 captures the domain; group 2 lazily captures everything up to the
# next whitespace, i.e. the ip address.
regexp = r'https://(\w+)\.com\. Our ip address is (.*?)\s'

re.findall(regexp, subject)

[('codeup', '123.123.123.123')]

In [307]:
regexp = r'\/(\w+).*?(\d.*?)\s'

re.findall(regexp, subject)

[('codeup', '123.123.123.123')]

#### find the protocol, domain, and tld

In [314]:
subject = '''
    You can find us on the web at httsss://codeup.com. Our ip address is 123.123.123.123 (maybe).
    '''

In [316]:
regexp = r'(http?s*)://(\w+)\.(\w+)'

re.findall(regexp, subject)

[('httsss', 'codeup', 'com')]

### Non Capture Group

- `(?:...)`: non-capturing group (also called a "shy" group) — groups part of the pattern without capturing it

#### find the ip address

In [320]:
regexp = r'((?:\d{3}\.){3}\d+)'

re.findall(regexp, subject)

['123.123.123.123']

#### find the protocol and tld

In [322]:
regexp = r'(http?s*)://(?:\w+)\.(\w+)'

re.findall(regexp, subject)

[('httsss', 'com')]

## more re library functions

### `re.search`

- scans through a string, looking for any location where the RE matches; returns match object

In [326]:
subject = """
    Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, 
    San Antonio, TX 78230. 
    You can find us online at http://codeup.com and
    our alumni portal is located at https://alumni.codeup.com.
    """

#### find the word located

In [327]:
regexp = r'located'

re.search(regexp, subject)

<re.Match object; span=(33, 40), match='located'>

In [328]:
#results type
type(re.search(regexp, subject))

re.Match

#### find numbers

In [329]:
regexp = r'\d+'

re.search(regexp, subject)

<re.Match object; span=(24, 28), match='2014'>

#### find navarro

In [335]:
regexp = r'navarro'

re.search(regexp, subject)

In [336]:
#results type
type(re.search(regexp, subject))

NoneType

In [338]:
regexp = r'navarro'

re.search(regexp, subject, re.IGNORECASE)

<re.Match object; span=(48, 55), match='Navarro'>

### Name Capture Group

- `(?P<name>...)`: to name a capture group

In [342]:
subject = '''
    You can find us on the web at https://codeup.com. Our ip address is 123.123.123.123 (maybe).
    '''

#### find the protocol, domain, and tld and name them

In [346]:
# Three named groups: protocol, domain and top-level domain.
# `https?` matches exactly 'http' or 'https'. The previous `http?s*` made the
# 'p' optional and allowed any number of 's', so it also accepted strings such
# as 'htt' or 'httsss'.
regexp = r'(?P<protocol>https?)://(?P<domain>\w+)\.(?P<tld>\w+)'

# re.search returns the first match as a Match object (or None if there is none)
match = re.search(regexp, subject)
match

<re.Match object; span=(35, 53), match='https://codeup.com'>

In [348]:
#groups()
match.groups()

('https', 'codeup', 'com')

In [351]:
#groupdict()
match.groupdict()

{'protocol': 'https', 'domain': 'codeup', 'tld': 'com'}

##### regex flag:  `re.VERBOSE`

- `re.VERBOSE` ignores whitespace in the regex pattern and allows `#` comments inside it, so a pattern can be spread over several lines

In [355]:
# With re.VERBOSE the pattern may span several lines: whitespace is ignored and
# everything after an unescaped `#` on a line is treated as a comment.
# `https?` matches exactly 'http' or 'https' (the earlier `http?s*` also
# accepted strings such as 'htt' or 'httsss').
regexp = r'''
            (?P<protocol>https?)://   # http or https, then the literal ://
            (?P<domain>\w+)\.         # domain name followed by a literal dot
            (?P<tld>\w+)              # top-level domain
            '''
match = re.search(regexp, subject, re.VERBOSE)
match

<re.Match object; span=(35, 53), match='https://codeup.com'>

### `re.sub`

- allows us to match a regex and substitute in a new substring for the match; returns string

In [356]:
subject = 'abc 12345xyz'

#### remove all the digits

In [357]:
regexp = r'\d'

re.sub(regexp, '' ,subject)

'abc xyz'

#### replace all digits with an o 

In [358]:
regexp = r'\d'

re.sub(regexp, 'o' ,subject)

'abc oooooxyz'

#### replace all the digits with a single o

In [360]:
regexp = r'\d+'

re.sub(regexp, 'o' ,subject)

'abc oxyz'

#### Using regex with `str.replace`: add the `regex=True` argument

In [370]:
pd.Series(subjects).str.replace('[a-c]','***',regex=True)

0              kiwi
1    ******rdv***rk
2    ******n***n***
3          ***odeup
4          d***t***
5       s***ien***e
6     *********demy
7           extr***
dtype: object

### `re.compile`

- prepare a regular expression for use ahead of time; returns a compiled pattern object

In [371]:
emails = [
    "jane@company.com",
    "bob@company.com",
    "jane.janeway@company.com",
    "jane.janeway@dogood.org",
    "jane.janet.janeway@dogood.org", # bonus for the 3 part address
]

#### get name, domain, tld

In [380]:
# Three capture groups: name, domain, tld.
# NOTE: `\w+\.*\w+` allows only one run of dots inside the name, so the 3-part
# address is captured from its second part onward ('janet.janeway'); it also
# needs at least two word characters before the '@'.
regexp = r'(\w+\.*\w+)@(\w+)\.(\w+)'

for email in emails:
    print(re.findall(regexp, email))

[('jane', 'company', 'com')]
[('bob', 'company', 'com')]
[('jane.janeway', 'company', 'com')]
[('jane.janeway', 'dogood', 'org')]
[('janet.janeway', 'dogood', 'org')]


In [389]:
pattern = re.compile(r'''
        (?P<name>\w+\.*\w+)@
        (?P<domain>\w+)\.
        (?P<tld>\w+)
        ''', re.VERBOSE)
pattern

re.compile(r'\n        (?P<name>\w+\.*\w+)@\n        (?P<domain>\w+)\.\n        (?P<tld>\w+)\n        ',
           re.UNICODE|re.VERBOSE)

In [394]:
# Build one dict of named groups per email address.
# `pattern` is already compiled, so call its own `search` method directly, and
# skip addresses that do not match: `search` returns None for those, and
# calling `.groupdict()` on None would raise AttributeError.
contacts = [
    found.groupdict()
    for found in map(pattern.search, emails)
    if found is not None
]
contacts

[{'name': 'jane', 'domain': 'company', 'tld': 'com'},
 {'name': 'bob', 'domain': 'company', 'tld': 'com'},
 {'name': 'jane.janeway', 'domain': 'company', 'tld': 'com'},
 {'name': 'jane.janeway', 'domain': 'dogood', 'tld': 'org'},
 {'name': 'janet.janeway', 'domain': 'dogood', 'tld': 'org'}]

In [395]:
pd.DataFrame(contacts)

Unnamed: 0,name,domain,tld
0,jane,company,com
1,bob,company,com
2,jane.janeway,company,com
3,jane.janeway,dogood,org
4,janet.janeway,dogood,org
