In [2]:
import re

# Match object

In [4]:
match_object = re.search(r'ab*c', 'abc ac adc abbbc')
match_object

<re.Match object; span=(0, 3), match='abc'>

In [7]:
match_object = re.fullmatch(r'1(2|3)*4', '1233224')
match_object

<re.Match object; span=(0, 7), match='1233224'>

In [9]:
# `span` method.
sentence = 'That is quite a fabricated tale.'
m = re.search(r'q.*?t', sentence)
beg, end = m.span()
sentence[beg:end]

'quit'

In [13]:
# `group` or indexing can retrieve capture groups as one string.
m = re.search(r'b.*d', 'abc ac adc abbbc')
m.group() == m[0]  # group() == group(0)

True

In [26]:
# `groups` retrieve the matching capture groups as a list.
m = re.fullmatch(r'a(.*?) (.*)d(.*)c', 'abc ac adc abbbc')
print(m.groups())  # all captured groups
print(m.group(3, 1))  # 3rd, 1st capture group

('bc', 'ac a', 'c abbb')


('c abbb', 'bc')

In [34]:
m = re.search(r'w(.*)me', 'awesome')
print(fullmatch_span := m.span(0))
print(firstmatch_span := m.span(1))

print(m.start(1), m.end(1))  # start idx of the 1st group

(1, 7)
(2, 5)
2 5


<re.Match object; span=(1, 7), match='wesome'>

In [40]:
pat = re.compile(r'hi.*bye')
# pos: 검색 시작 위치(inclusive), endpos: 검색 종료 위치(exclusive)
m = pat.search('This is goodbye then', pos=1, endpos=15)

print(f'The searching subject text: "{m.string}"')
print(f'The compiled pattern was: {m.re}')
print(f'The search starts at {m.pos} and ends at{m.endpos}..idea')

The searching subject text: "This is goodbye then"
The compiled pattern was: re.compile('hi.*bye')
The search starts at 1 and ends at15..idea


In [41]:
# Assignment expression(walrus operator)
text = ['type: fruit', 'date: 2020/04/28']
for ip in text:
    if m := re.search(r'type: (.*)', ip):
        print(m[1])
    elif m := re.search(r'date: (.*?)/(.*?)/', ip):
        print(f'month: {m[2]}, year: {m[1]}')

fruit
month: 04, year: 2020


In [46]:
# Replace the matched strings by a function.
# The function must have a return value in `str`.
re.sub(r'(a|b)\^2',  # pattern
       lambda m: m[0].upper(),  # repl
       'a^2 + b^2 - C*3')  # string

'A^2 + B^2 - C*3'

In [48]:
re.sub(r'2|3',
       lambda m: str(int(m[0]) ** 2),
       'a^2 + b^2 - C*3')

'a^4 + b^4 - C*9'

In [49]:
d = {'1': 'one', '2': 'two', '4': 'four'}
re.sub(r'1|2|4', lambda m: d[m[0]], '9234012')

'9two3four0onetwo'

In [50]:
# Swapping technique
swap = {'cat': 'tiger', 'tiger': 'cat'}
words = 'cat tiger dog tiger cat'
re.sub(r'cat|tiger', lambda m: swap[m[0]], words)

'tiger cat dog cat tiger'

In [51]:
# note that numbers have been converted to strings here
# otherwise, you'd need to convert it in the lambda code
d = {'hand': '1', 'handy': '2', 'handful': '3', 'a^b': '4'}

# sort the keys to handle precedence rules
words = sorted(d.keys(), key=len, reverse=True)
# add anchors and flags if needed
pat = re.compile('|'.join(re.escape(s) for s in words))
print(pat.pattern)
print(pat.sub(lambda m: d[m[0]], 'handful hand pin handy (a^b)'))
# Consider using flashtext if too many K-V pairs in a dict:
# https://github.com/vi3k6i5/flashtext

handful|handy|hand|a\^b
3 1 pin 2 (4)


# findall

In [56]:
print(re.findall(r'ab*c', 'abc ac adc abbbc'))  # 모든 일치 부분을 찾는다.
print(re.findall(r'ab+c', 'abc ac adc abbbc'))

['abc', 'abbbc']

In [58]:
not_intended = re.findall(r't.*a', 'that is quite a fabricated tale.')
intended = re.findall(r't.*?a', 'that is quite a fabricated tale.')
print(not_intended)
print(intended)

['that is quite a fabricated ta']
['tha', 't is quite a', 'ted ta']


In [61]:
# 0 capture group
case1 = re.findall(r'ab*c', 'abc ac adc abbc xabbbcz bbb bc abbbbbc')
# 1 capture group
case2 = re.findall(r'a(b*)c', 'abc ac adc abbc xabbbcz bbb bc abbbbbc')
# 2 or more than capture groups
case3 = re.findall(r'(.*?)/(.*?)/(.*?),', '2020/04/25,1986/Mar/02,77/12/31')

print(case1)
print(case2)  # 패턴 전체가 아닌, '()'로 묶인 그룹 부분만 나열.
print(case3)  # List[Tuple[str]], str은 각 그룹에 대응하는 문자열.

['abc', 'ac', 'abbc', 'abbbc', 'abbbbbc']
['b', '', 'bb', 'bbb', 'bbbbb']
[('2020', '04', '25'), ('1986', 'Mar', '02')]


# finditer

In [62]:
m_iter = re.finditer(r'ab+c', 'abc ac adc abbbc')
for m in m_iter:
    print(m)

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(11, 16), match='abbbc'>


# `split` 과 capture group
1. 패턴대로 문자열을 쪼갠다.
2. 1 중에서 그룹에 의해 쪼개진 문자열 내에서 그룹이 아닌 부분을 제외한다.
3. 2의 단계에서 보던 문자열을 그룹별로 쪼갠다.

In [64]:
# pattern without capture group
case1 = re.split(r'1*4?2', '31111111111251111426')
# pattern with capture group
case2 = re.split(r'(1*4?2)', '31111111111251111426')
print(case1)
print(case2)

['3', '5', '6']
['3', '11111111112', '5', '111142', '6']


In [65]:
# here 4?2 is outside capture group, so that portion won't be in output
re.split(r'(1*)4?2', '31111111111251111426')
# matched: 3, 11111111112, 5, 111142, 6

['3', '1111111111', '5', '1111', '6']

In [66]:
# multiple capture groups example
# note that the portion matched by b+ isn't present in the output
re.split(r'(a+)b+(c+)', '3.14aabccc42')
# matched: 3.14, 'aabccc', 42

['3.14', 'aa', 'ccc', '42']

In [67]:
# here (4)? matches zero times on the first occasion
re.split(r'(1*)(4)?2', '31111111111251111426')
# matched: 3, '11111111112', 5, '111142', 6

['3', '1111111111', None, '5', '1111', '4', '6']

# subn

In [69]:
greeting = 'Have a nice weekend'

re.sub(r'e', 'E', greeting)

'HavE a nicE wEEkEnd'

In [70]:
# with `re.subn`, you can infer that 5 substitutions were made
re.subn(r'e', 'E', greeting)  # (replaced_string, times_replaced)

('HavE a nicE wEEkEnd', 5)

In [71]:
word = 'coffining'
# recursively delete 'fin'
while True:
    word, cnt = re.subn(r'fin', '', word)
    if cnt == 0:
        break
print(word)

cog


# Exercises

In [73]:
# E1. For the given strings, extract the matching portion from first 'is' to last 't'.
str1 = 'This the biggest fruit you have seen?'
str2 = 'Your mission is to read and practice consistently'

In [80]:
# A1
pat = re.compile(r'is.*t')
print(pat.search(str1).group())  # s1: 'is the biggest fruit'
print(pat.search(str2).group())  # s2: 'ission is to read and practice consistent'

is the biggest fruit
ission is to read and practice consistent


In [102]:
# E2. Find the starting index of first occurrence of 'is' or 'the' or 'was' or
# 'to' for the given input strings.
s1 = 'match after the last newline character'
s2 = 'and then you want to test'
s3 = 'this is good bye then'
s4 = 'who was there to see?'

In [106]:
# A2
pat = re.compile(r'was|the|is|to')
start1, _ = pat.search(s1).span()  # 12
start2, _ = pat.search(s2).span()  # 4
start3, _ = pat.search(s3).span()  # 2
start4, _ = pat.search(s4).span()  # 4
print(start1, start2, start3, start4)

12 4 2 4


In [104]:
# E3. Find the starting index of last occurrence of 'is' or 'the' or 'was' or
# 'to' for the given input strings.
s1 = 'match after the last newline character'
s2 = 'and then you want to test'
s3 = 'this is good bye then'
s4 = 'who was there to see?'

In [121]:
# A3
pat = re.compile(r'was|the|is|to')
start1, _ = [_ for _ in pat.finditer(s1)][-1].span()  # 12
start2, _ = [_ for _ in pat.finditer(s2)][-1].span()  # 18
start3, _ = [_ for _ in pat.finditer(s3)][-1].span()  # 17
start4, _ = [_ for _ in pat.finditer(s4)][-1].span()  # 14
print(start1, start2, start3, start4)

12 18 17 14


In [122]:
# E4. The given input string contains ':' exactly once.
# Extract all characters after the ':' as output.
ip = 'fruits:apple, mango, guava, blueberry'

In [125]:
# A4
start, _ = re.search(r':.*', ip).span()
ip[start + 1:]  # 'apple, mango, guava, blueberry'

'apple, mango, guava, blueberry'

In [126]:
# E5. The given input strings contains some text followed by '-' followed by a
# number. Replace that number with its log value using `math.log()`.
s1 = 'first-3.14'
s2 = 'next-123'

In [154]:
# A5
import math

pat = re.compile(r'-.+')
replaced1 = pat.sub(lambda m: '-' + str(math.log(float(m.group()[1:]))),
        s1)  # 'first-1.144222799920162'
replaced2 = pat.sub(lambda m: '-' + str(math.log(float(m.group()[1:]))),
        s2)  # 'next-4.812184355372417'
print(replaced1, '\n', replaced2)

first-1.144222799920162 
 next-4.812184355372417


In [162]:
# E7. Replace all occurrences of 'par' with 'spar', 'spare' with 'extra' and
# 'park' with 'garden' for the given input strings.
str1 = 'apartment has a park'
str2 = 'do you have a spare cable'
str3 = 'write a parser'

In [165]:
# A7
repl = {
    'par': 'spar',
    'spare': 'extra',
    'park': 'garden'
}

def replace(m:re.Match):
    return repl[m[0]]

pat = re.compile(r'spare|park|par')
print(pat.sub(replace, str1))  # 'aspartment has a garden'
print(pat.sub(replace, str2))  # 'do you have a extra cable'
print(pat.sub(replace, str3))  # 'write a sparser'

aspartment has a garden
do you have a extra cable
write a sparser


In [156]:
# E8. Extract all words between '(' and ')' from the given input string as a
# list. Assume that the input will not contain any broken parentheses.
ip = 'another (way) to reuse (portion) matched (by) capture groups'

In [169]:
# A8
re.findall(r'\((.*?)\)', ip)  # ['way', 'portion', 'by']

['way', 'portion', 'by']

In [171]:
# E9. Extract all occurrences of '<' up to next occurrence of '>',
# provided there is at least one character in between '<' and '>'.
ip = 'a<apple> 1<> b<bye> 2<> c<cat>'

In [175]:
# A9
re.findall(r'<.+?>', ip)  # ['<apple>', '<> b<bye>', '<> c<cat>']

['<apple>', '<> b<bye>', '<> c<cat>']

In [177]:
# E10. Use `re.findall` to get the output as shown below for the given input
# strings. Note the characters used in the input strings carefully.
row1 = '-2,5 4,+3 +42,-53 4356246,-357532354 '
row2 = '1.32,-3.14 634,5.63 63.3e3,9907809345343.235 '

In [183]:
# A10
pat = re.compile(r'(.+?),(.+?) ')

print(pat.findall(row1))  # [('-2', '5'), ('4', '+3'), ('+42', '-53'), ('4356246', '-357532354')]
print(pat.findall(row2))  # [('1.32', '-3.14'), ('634', '5.63'), ('63.3e3', '9907809345343.235')]


[('-2', '5'), ('4', '+3'), ('+42', '-53'), ('4356246', '-357532354')]
[('1.32', '-3.14'), ('634', '5.63'), ('63.3e3', '9907809345343.235')]


In [None]:
# E11. This is an extension to previous question.
#  - For row1, find the sum of integers of each tuple element.
#  For example, sum of -2 and 5 is 3.
#  - For row2, find the sum of floating-point numbers of each tuple element.
#  For example, sum of 1.32 and -3.14 is -1.82.
row1 = '-2,5 4,+3 +42,-53 4356246,-357532354 '
row2 = '1.32,-3.14 634,5.63 63.3e3,9907809345343.235 '

In [None]:
# A11
# should be same as previous question
pat = re.compile(r'(.+?),(.+?) ')
pat.  # [3, 7, -11, -353176108]
# [-1.82, 639.63, 9907809408643.234]

In [184]:
# E12. Use `re.split` to get the output as shown below.
ip = '42:no-output;1000:car-truck;SQEX49801'

In [192]:
# A12
# ':no-', ';', ':car-' -> ''
re.split(r'(:|;).*?-?', ip)  # ['42', 'output', '1000', 'truck', 'SQEX49801']

['42', ':', 'no-output', ';', '1000', ':', 'car-truck', ';', 'SQEX49801']

In [None]:
# E13. For the given list of strings, change the elements into a tuple of
# original element and number of times 't' occurs in that element.
words = ['sequoia', 'attest', 'tattletale', 'asset']

In [None]:
# A13
# [('sequoia', 0), ('attest', 3), ('tattletale', 4), ('asset', 1)]

In [226]:
# E14. The given input string has fields separated by ':'. Each field contains
# four uppercase alphabets followed optionally by two digits. Ignore the last
# field, which is empty. See docs.python: Match.groups and use `re.finditer` to
# get the output as shown below. If the optional digits aren't present, show
# 'NA' instead of `None`.
ip = 'TWXA42:JWPA:NTED01:'

In [232]:
# A14
# [('TWXA', '42'), ('JWPA', 'NA'), ('NTED', '01')]
[item.groups() for item in re.finditer(r'(.{4})(.*?):', ip)]

# Note that this is different from `re.findall` which will just give empty
# string instead of `None` when a capture group doesn't participate.

[('TWXA', '42'), ('JWPA', ''), ('NTED', '01')]

In [209]:
# E15. Convert the comma separated strings to corresponding dict objects as
# shown below.
row1 = 'name:rohan,maths:75,phy:89,'
row2 = 'name:rose,maths:88,phy:92,'

In [224]:
# A15
pat = re.compile(r',?(.+?):(.+?)\b')
print(dict(pat.findall(row1)))  # for row1: {'name': 'rohan', 'maths': '75', 'phy': '89'}
print(dict(pat.findall(row2)))  # for row1: {'name': 'rose', 'maths': '88', 'phy': '92'}

{'name': 'rohan', 'maths': '75', 'phy': '89'}
{'name': 'rose', 'maths': '88', 'phy': '92'}
