In [2]:
import re

https://regex101.com/

### 

## re Module Functions

### Searching Functions

#### re.search()
- Scans a string for a regex match

In [4]:
bool(re.search('[0-9]', 'f20ew494'))

True

In [5]:
re.search(r'\d+', '123foobar')

<re.Match object; span=(0, 3), match='123'>

In [6]:
re.search(r'\d+', 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

#### re.match()
- Looks for a regex match at the beginning of a string

In [7]:
re.match(r'\d+', '123foobar')

<re.Match object; span=(0, 3), match='123'>

In [8]:
print(re.match(r'\d+', 'foo123bar'))

None


#### re.fullmatch()
- Looks for a regex match on an entire string

In [9]:
print(re.fullmatch(r'\d+', 'foo123bar'))

None


In [10]:
re.fullmatch(r'\d+', '123')

<re.Match object; span=(0, 3), match='123'>

#### re.findall()
- Returns a list of all non-overlapping regex matches in a string

In [11]:
re.findall(r'\w+', '...foo,,,,bar:%$baz//|')

['foo', 'bar', 'baz']

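If the regex contains a capturing group, `findall()` returns only the content of the group, not the whole match. If it contains several groups, it returns a list of tuples with one item per group.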

In [12]:
re.findall(r'#(\w+)#', '#foo#.#bar#.#baz#')

['foo', 'bar', 'baz']

In [13]:
re.findall(r'(\w+),(\w+)', 'foo,bar,baz,qux,quux,corge')

[('foo', 'bar'), ('baz', 'qux'), ('quux', 'corge')]

#### re.finditer()
- Returns an iterator that yields regex matches from a string

In [19]:
it = re.finditer(r'\w+', '...foo,,,,bar:%$baz//|')

In [20]:
next(it)

<re.Match object; span=(3, 6), match='foo'>

In [21]:
for i in re.finditer(r'\w+', '...foo,,,,bar:%$baz//|'):
    print(i)

<re.Match object; span=(3, 6), match='foo'>
<re.Match object; span=(10, 13), match='bar'>
<re.Match object; span=(16, 19), match='baz'>


### 

### Substitution Functions

#### re.sub(\<regex\>, \<repl\>, \<string\>, count=0, flags=0)
- Scans a string for a regex match
- Replace the matching portions of the string with the specified replacement string
- Returns the result

#### Substitution by string

In [22]:
s = 'foo.123.bar.789.baz'

In [23]:
re.sub(r'\d+', '#', s)

'foo.#.bar.#.baz'

In [24]:
re.sub(r'(\w+),bar,baz,(\w+)',
        r'\2,bar,baz,\1',
       'foo,bar,baz,qux')

'qux,bar,baz,foo'

In [26]:
re.sub(r'foo,(?P<w1>\w+),(?P<w2>\w+),qux',
        r'foo,\g<w2>,\g<w1>,qux',
        'foo,bar,baz,qux')

'foo,baz,bar,qux'

#### Substitution by function
- Calls the \<repl\> function for each match
- Each match is passed as an argument

In [48]:
def f(match_obj):
    """Replacement callback for re.sub() / re.subn().

    Args:
        match_obj: The re.Match object for the current match.

    Returns:
        str: The matched text multiplied by 10 if it is a decimal number,
        otherwise the matched text upper-cased.
    """
    s = match_obj.group(0)

    # s.isdecimal() returns True if ALL characters in s are decimal digits.
    # It is used instead of s.isdigit() because isdigit() also accepts
    # characters such as superscript digits, which \w matches but int()
    # rejects with a ValueError.
    if s.isdecimal():
        return str(int(s) * 10)
    return s.upper()

In [47]:
re.sub(r'\w+', f, 'foo.10.bar.20.baz.30')

foo
10
bar
20
baz
30


'FOO.100.BAR.200.BAZ.300'

#### Limiting the number of replacements

In [30]:
re.sub(r'\w+', 'xxx', 'foo.bar.baz.qux')

'xxx.xxx.xxx.xxx'

In [31]:
re.sub(r'\w+', 'xxx', 'foo.bar.baz.qux', count=2)

'xxx.xxx.baz.qux'

#### re.subn()
- Identical to re.sub(), but returns a 2-tuple: (the new string, the number of substitutions made)

In [32]:
re.subn(r'\w+', 'xxx', 'foo.bar.baz.qux')

('xxx.xxx.xxx.xxx', 4)

In [33]:
re.subn(r'\w+', 'xxx', 'foo.bar.baz.qux', count=2)

('xxx.xxx.baz.qux', 2)

In [34]:
def f(match_obj):
    """Replacement callback: multiply decimal numbers by 10, upper-case the rest.

    Args:
        match_obj: The re.Match object for the current match.

    Returns:
        str: The replacement text for this match.
    """
    m = match_obj.group(0)
    # isdecimal() rather than isdigit(): isdigit() is also True for characters
    # like superscript digits, which int() cannot convert.
    if m.isdecimal():
        return str(int(m) * 10)
    return m.upper()

In [36]:
re.subn(r'\w+', f, 'foo.10.bar.20.baz.30')

('FOO.100.BAR.200.BAZ.300', 6)

### 

### Utility Functions

#### re.split(\<regex\>, \<string\>, maxsplit=0, flags=0)
- Splits a string into substrings using a regex as a delimiter

In [226]:
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
re.split(r'\s*[,;/]\s*', 'foo,bar  ;  baz / qux')

['foo', 'bar', 'baz', 'qux']

In [227]:
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
# The capturing group makes split() keep the delimiters in the result.
re.split(r'(\s*[,;/]\s*)', 'foo,bar  ;  baz / qux')

['foo', ',', 'bar', '  ;  ', 'baz', ' / ', 'qux']

In [39]:
string = 'foo, bar ; baz / qux'

In [40]:
regex = r'(\s*[,;/]\s*)'

In [41]:
a = re.split(regex, string)

In [42]:
a

['foo', ', ', 'bar', ' ; ', 'baz', ' / ', 'qux']

In [43]:
# Wrap every token in angle brackets, leaving the delimiters untouched.
for idx, piece in enumerate(a):
    # Delimiters fully match the split regex; tokens do not.
    if re.fullmatch(regex, piece):
        continue
    a[idx] = f'<{piece}>'

In [44]:
a

['<foo>', ', ', '<bar>', ' ; ', '<baz>', ' / ', '<qux>']

In [45]:
''.join(a)

'<foo>, <bar> ; <baz> / <qux>'

#### 

In [49]:
string = 'foo,bar ;  baz / qux'

In [50]:
regex = r'(?:\s*[,;/]\s*)'

In [51]:
re.split(regex, string)

['foo', 'bar', 'baz', 'qux']

#### 

In [55]:
s = 'foo, bar, baz, qux, quux, corge'

In [56]:
re.split(r',\s*', s)

['foo', 'bar', 'baz', 'qux', 'quux', 'corge']

In [59]:
re.split(r',\s*', s, maxsplit=3) # 4th element is remainder

['foo', 'bar', 'baz', 'qux, quux, corge']

#### 

In [61]:
re.split('(/)', '/foo/bar/') # adds an empty string when the delimiter matches at the start/end of the string

['', '/', 'foo', '/', 'bar', '/', '']

#### 

#### re.escape(\<regex\>)
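- Escapes special characters in a regex
- Returns a copy of regex with each character that has a special meaning in a regular expression preceded by a backslash (before Python 3.7, other non-alphanumeric characters were escaped as well)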

In [62]:
print(re.match('foo^bar(baz)|qux', 'foo^bar(baz)|qux'))

None


In [66]:
# Raw string: '\^', '\(', '\)' and '\|' are invalid escapes in a normal string literal.
re.match(r'foo\^bar\(baz\)\|qux', 'foo^bar(baz)|qux')

<re.Match object; span=(0, 16), match='foo^bar(baz)|qux'>

In [67]:
# Raw string on the right-hand side so the backslashes are taken literally.
re.escape('foo^bar(baz)|qux') == r'foo\^bar\(baz\)\|qux'

True

In [69]:
re.match(re.escape('foo^bar(baz)|qux'), 'foo^bar(baz)|qux')

<re.Match object; span=(0, 16), match='foo^bar(baz)|qux'>

### 

## Compiled Regex Objects in Python

### 

### re.compile(\<regex\>, flags=0)
- **Precompile** a regex into a **regular expression object**

re_obj = re.compile(\<regex\>, \<flags\>)

result = re.search(re_obj, \<string\>)

result = re_obj.search(\<string\>)

result = re.search(\<regex\>, \<string\>, \<flags\>)

In [228]:
re.search(r'(\d+)', 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [229]:
re_obj = re.compile(r'(\d+)')

In [230]:
re_obj

re.compile(r'(\d+)', re.UNICODE)

In [72]:
re.search(re_obj, 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [74]:
re_obj.search('foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [75]:
r1 = re.search('ba[rz]', 'FOOBARBAZ', flags=re.I)

In [77]:
re_obj = re.compile('ba[rz]', flags=re.I)

In [81]:
r2 = re.search(re_obj, 'FOOBARBAZ')

In [78]:
r3 = re_obj.search('FOOBARBAZ')

In [79]:
r1 

<re.Match object; span=(3, 6), match='BAR'>

In [82]:
r2

<re.Match object; span=(3, 6), match='BAR'>

In [83]:
r3

<re.Match object; span=(3, 6), match='BAR'>

### 

### Why bother compiling a regex?
- Modularity
- A little bit faster

In [84]:
s1, s2, s3, s4 = 'foo.bar', 'foo123bar', 'baz99', 'qux & grault'

In [85]:
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
re_obj = re.compile(r'\d+')

In [89]:
re_obj.search(s1)

In [88]:
re_obj.search(s2)

<re.Match object; span=(3, 6), match='123'>

In [90]:
re_obj.search(s3)

<re.Match object; span=(3, 5), match='99'>

In [92]:
re_obj.search(s4)

In [93]:
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
regex = r'\d+'

In [94]:
re.search(regex, s1)

In [95]:
re.search(regex, s2)

<re.Match object; span=(3, 6), match='123'>

### 

### Regular Expression Object Methods
- **accepts optional arguments pos and endpos**

re_obj.search(\<string\>[, \<pos\>[, \<endpos\>]])

re_obj.match(\<string\>[, \<pos\>[, \<endpos\>]])

re_obj.fullmatch(\<string\>[, \<pos\>[, \<endpos\>]])

re_obj.findall(\<string\>[, \<pos\>[, \<endpos\>]])

re_obj.finditer(\<string\>[, \<pos\>[, \<endpos\>]])

In [99]:
re_obj = re.compile(r'\d+')
s = 'foo123barbaz'

In [100]:
re_obj.search(s)

<re.Match object; span=(3, 6), match='123'>

In [101]:
s[6:9]

'bar'

In [102]:
print(re_obj.search(s, 6, 9))

None


In [104]:
re_obj = re.compile('^bar')
s = 'foobarbaz'

In [105]:
s[3:]

'barbaz'

In [107]:
print(re_obj.search(s, 3)) # None: '^' still anchors at the real start of the string, not at pos

None


re_obj.split(\<string\>, maxsplit=0)

re_obj.sub(\<repl\>, \<string\>, count=0)

re_obj.subn(\<repl\>, \<string\>, count=0)

### 

### Regular Expression Object Attributes

### re_obj.flags
- Any \<flags\> that are in effect for the regex

In [109]:
re_obj = re.compile(r'(?m)(\w+), (\w+)', re.I)

In [110]:
re_obj.flags

42

### re_obj.groups
- The number of capturing groups in the regex

In [113]:
re_obj.groups

2

### re_obj.pattern
- The \<regex\> pattern that produced this object

In [114]:
re_obj.pattern

'(?m)(\\w+), (\\w+)'

### re_obj.groupindex
- A dictionary mapping each symbolic group name defined by the (?P\<name\>) construct (if any) to the corresponding group number

In [116]:
# Give each named group a body (\w+); with empty groups the pattern would
# only ever match the literal ', '.
re_obj = re.compile(r'(?P<w1>\w+), (?P<w2>\w+)')

In [117]:
re_obj.groupindex

mappingproxy({'w1': 1, 'w2': 2})

In [118]:
re_obj.groupindex['w1']

1

### 

## Match Object Methods and Attributes

In [119]:
m = re.search('bar', 'foo.bar.baz')

In [120]:
bool(m)

True

### 

### Match Object Methods

### match.group([\<group1\>, ...])
- The specified captured group or groups from match

In [121]:
m = re.search(r'(\w+),(\w+),(\w+)', 'foo,bar,baz')

In [122]:
m.group(1)

'foo'

In [123]:
m = re.search(r'(?P<w1>\w+),(?P<w2>\w+),(?P<w3>\w+)', 'foo,bar,baz')

In [124]:
m.group('w3')

'baz'

In [126]:
m.group('w1', 'w3', 'w1')

('foo', 'baz', 'foo')

In [128]:
m = re.match(r'(\w{3},)+', 'foo,bar,baz,qux')

In [129]:
m

<re.Match object; span=(0, 12), match='foo,bar,baz,'>

In [131]:
m.group(1) # a repeated group keeps only its last repetition; earlier ones aren't accessible

'baz,'

In [132]:
m.group(0)

'foo,bar,baz,'

### match.\_\_getitem\_\_()
- A captured group from match
- Identical to match.group(\<grp\>)

In [135]:
m = re.search(r'(\w+),(\w+),(\w+)', 'foo,bar,baz')

In [136]:
m.__getitem__(2)

'bar'

In [137]:
m[2]

'bar'

### match.groups(default=None)
- All the captured groups from match

In [140]:
m = re.search(r'(\w+),(\w+),(\w+)?', 'foo,bar,')

In [141]:
m.groups()

('foo', 'bar', None)

In [231]:
m.groups(default='---')

('foo', 'bar', '---')

### match.groupdict(default=None)
- A dictionary of named captured groups from match

In [154]:
m = re.match(r'foo,(?P<w1>\w+),(?P<w2>\w+),qux','foo,bar,baz,qux')

In [155]:
m.groupdict()

{'w1': 'bar', 'w2': 'baz'}

In [156]:
m.groupdict()

{'w1': 'bar', 'w2': 'baz'}

In [159]:
m = re.match(r'foo,(?P<w1>\w+),(?P<w2>\w+)?,qux','foo,bar,,qux')

In [160]:
m.groupdict(default='---')

{'w1': 'bar', 'w2': '---'}

### match.expand(\<template\>)
- The result of performing backreference substitutions from match

In [161]:
m = re.search(r'(\w+),(\w+),(\w+)', 'foo,bar,baz')

In [163]:
m.groups()

('foo', 'bar', 'baz')

In [164]:
m.expand(r'\2')

'bar'

In [166]:
m.expand(r'[\2] -> [\1]')

'[bar] -> [foo]'

In [168]:
m = re.search(r'(?P<num>\d+)', 'foo123qux')

In [170]:
m.expand(r'--- \g<num> ---')

'--- 123 ---'

### match.start()
### match.end()
- The starting/ending index of match

In [171]:
s = 'foo123bar456baz'

In [172]:
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
m = re.search(r'\d+', s)

In [173]:
m

<re.Match object; span=(3, 6), match='123'>

In [174]:
m.start()

3

In [175]:
m.end()

6

In [176]:
s[m.start():m.end()]

'123'

In [178]:
m = re.search(r'(\d+)\D*(?P<num>\d+)', s)

In [180]:
m.group('num')

'456'

In [181]:
m.start('num'), m.end('num')

(9, 12)

### match.span()
- Both the starting and ending indices of match as a tuple

In [182]:
m.span()

(3, 12)

In [183]:
m.span(1)

(3, 6)

In [184]:
m.span('num')

(9, 12)

### Match Object Attributes

### match.pos
### match.endpos
- The effective values of the pos and endpos arguments

In [185]:
re_obj = re.compile(r'\d+')

In [186]:
m = re_obj.search('foo123bar', 2, 7)

In [188]:
m.pos, m.endpos

(2, 7)

In [189]:
m = re_obj.search('foo123bar')

In [190]:
m.pos, m.endpos

(0, 9)

### match.lastindex
- The index of the last captured group

In [191]:
m = re.search(r'(\w+),(\w+),(\w+)', 'foo,bar,baz')

In [192]:
m.lastindex

3

In [193]:
m[m.lastindex]

'baz'

In [199]:
m = re.search(r'(\w+),(\w+),(\w+)?', 'foo,bar,')

In [200]:
m.groups()

('foo', 'bar', None)

In [201]:
m.lastindex, m[m.lastindex]  # determine how many groups participated in the match

(2, 'bar')

In [202]:
m = re.match('((a)(b))', 'ab')

In [203]:
m.groups()

('ab', 'a', 'b')

In [204]:
m.lastindex

1

In [205]:
m[m.lastindex]

'ab'

### match.lastgroup
- The name of the last captured group

In [206]:
s = 'foo123bar456baz'

In [207]:
m = re.search(r'(?P<n1>\d+)\D*(?P<n2>\d+)', s)

In [208]:
m.lastgroup

'n2'

In [209]:
m = re.search(r'(\d+)\D*(\d+)', s)

In [210]:
m.groups()

('123', '456')

In [211]:
print(m.lastgroup)

None


### match.re
- The compiled regular expression object for the match
- Identical to the object that re.compile() returns for the same pattern and flags (the re module caches recently compiled patterns)

In [213]:
regex = r'(\w+),(\w+),(\w+)'

In [214]:
m1 = re.search(regex, 'foo,bar,baz')

In [216]:
m1.re

re.compile(r'(\w+),(\w+),(\w+)', re.UNICODE)

In [217]:
re.compile(regex) is m1.re

True

In [218]:
m1.re.groups

3

In [219]:
m1.re.pattern

'(\\w+),(\\w+),(\\w+)'

In [220]:
m1.re.flags

32

In [221]:
m = re.search(r'(\w+),(\w+),(\w+)', 'foo,bar,baz')

In [222]:
m.re

re.compile(r'(\w+),(\w+),(\w+)', re.UNICODE)

In [223]:
m.re.match('wuewr,fewfew,few')

<re.Match object; span=(0, 16), match='wuewr,fewfew,few'>

### match.string
- The search string for the match

In [224]:
m.string

'foo,bar,baz'

### 

### Regex library

https://pypi.org/project/regex/