In [1]:
import re

# dot metacharacter
개행 문자(`\n`)를 제외한 임의의 문자 하나를 의미한다.

In [4]:
# Every `.` in a pattern matches on its own: the characters it stands for
# inside one string do not have to be the same.
dot_demos = (
    (r'c.t', 'X', 'tac tin cat abc;tuv acute'),
    # any two characters may sit between 'r' and 'd' ('read', 'rked', 'rted', ...)
    (r'r..d', 'X', 'breadth markedly reported overrides'),
    # the tab between '2' and '3' counts as a character as well
    (r'2.3', '8', '42\t35'),
)
for dot_pattern, replacement, text in dot_demos:
    print(re.sub(dot_pattern, replacement, text))

taXin X abXuv aXe
bXth maXly repoX oveXes
485


In [6]:
# Unless re.DOTALL is given, a newline is the one character `.` will not
# match, so this search returns None (and the cell shows no output).
newline_text = 'a\nb'
re.search(r'a.b', newline_text)

# split

In [9]:
# Split on every hyphen first, then allow only a single split.
hyphenated = 'apple-85-mango-70'
print(re.split(r'-', hyphenated))
print(re.split(r'-', hyphenated, maxsplit=1))

['apple', '85', 'mango', '70']
['apple', '85-mango-70']


# Quantifiers

## Greedy quantifiers
 'greedy'라는 말은 최대한 문자를 많이 먹을 수 있는 방향으로 해석한다는 뜻이다.
  - `?`: 문자나 그룹 하나를 0회 또는 1회 상징한다.
  - `*`: 문자나 그룹 하나를 0회 이상 상징한다.
  - `+`: 문자나 그룹 하나를 1회 이상 상징한다.

### `?`

In [10]:
# `?` is greedy: the optional 'e' is consumed whenever it is present,
# so r'e?ar' behaves like r'(ear|ar)'.
optional_e = re.compile(r'e?ar')
optional_e.sub('X', 'far feat flare fear')

'fX feat flXe fX'

In [12]:
# r'\bpart?\b' is shorthand for r'\bpar(t|)\b': the whole words 'par' or 'part'.
par_or_part = re.compile(r'\bpart?\b')
par_or_part.sub('X', 'par spare part party')

'X spare X party'

In [15]:
words = ['red', 'read', 'ready', 're;d', 'road', 'redo', 'reed', 'rod']

# `.?` and `(.|)` are two spellings of "at most one arbitrary character",
# so both patterns keep exactly the same words.
for equivalent_pattern in (r'\bre.?d\b', r'\bre(.|)d\b'):
    matcher = re.compile(equivalent_pattern)
    matched = list(filter(matcher.search, words))
    print(matched)

['red', 'read', 're;d', 'reed']
['red', 'read', 're;d', 'reed']


In [19]:
# A quantifier can apply to a whole group:
# r'par(ro)?t' == r'par(ro|)t' == r'(parrot|part)'
birds = 'par part parrot parent'
for group_pattern in (r'par(ro)?t', r'(parrot|part)'):
    replaced1 = re.sub(group_pattern, 'X', birds)
    print(replaced1)

par X X parent
par X X parent


In [20]:
# With an alternation inside the optional group:
# r'par(en|ro)?t' == r'(parrot|parent|part)'
parts = 'par part parrot parent'
for group_pattern in (r'par(en|ro)?t', r'(parrot|parent|part)'):
    replaced2 = re.sub(group_pattern, 'X', parts)
    print(replaced2)

par X X X
par X X X


### `*`

In [21]:
# r'ta*r' matches 'tr', 'tar', 'taar', 'taaar', ...: any number of 'a'
# (including none) between 't' and 'r'.
zero_or_more_a = re.compile(r'ta*r')
zero_or_more_a.sub('X', 'tr tear tare steer sitaara')

'X tear Xe steer siXa'

In [22]:
# Between 't' and 'r' there may be nothing at all, or any mix of 'e' and 'a'.
any_e_or_a = re.compile(r't(e|a)*r')
any_e_or_a.sub('X', 'tr tear tare steer sitaara')

'X X Xe sX siXa'

In [24]:
# Matches a '2' on its own, or a '2' preceded by a run of consecutive '1's.
digits = '3111111111125111142'
re.sub(r'1*2', 'X', digits)

'3X511114X'

In [27]:
# The same pattern used as a separator; maxsplit=1 stops after the first split.
digits = '3111111111125111142'
ones_then_two = re.compile(r'1*2')
print(ones_then_two.split(digits))
print(ones_then_two.split(digits, maxsplit=1))

['3', '511114', '']
['3', '5111142']


In [30]:
# r'u*' matches either the empty string or a run of consecutive 'u',
# so the text is also split at every empty match.
run_of_u = re.compile(r'u*')
run_of_u.split('clouudy')

['', 'c', 'l', 'o', '', 'd', 'y', '']

### `+`

In [32]:
# 'tar', 'taar', 'taaaa...r': at least one 'a' must sit between 't' and 'r',
# so a bare 'tr' is no longer matched.
one_or_more_a = re.compile(r'ta+r')
one_or_more_a.sub('X', 'tr tear tare steer sitaara')

'tr tear Xe steer siXa'

In [33]:
# Between 't' and 'r' there must be at least one character, each being
# either 'e' or 'a'.
some_e_or_a = re.compile(r't(e|a)+r')
some_e_or_a.sub('X', 'tr tear tare steer sitaara')

'tr X Xe sX siXa'

In [35]:
# With `+` the '2' needs at least one '1' in front, so the final '42' is untouched.
digits = '3111111111125111142'
re.sub(r'1+2', 'X', digits)

'3X5111142'

In [37]:
# Every run of one or more '1's acts as a single separator.
run_of_ones = re.compile(r'1+')
run_of_ones.split('3111111111125111142')

['3', '25', '42']

## `{}`: 중괄호 안에 지정된 방법만큼 상징한다.
  - `{m,n}`: [m,n]
  - `{m,}`: [m, Inf)
  - `{,n}`: [0, n]
  - `{n}`: exactly n

In [39]:
demo = ['abc', 'ac', 'adc', 'abbc', 'xabbbcz', 'bbb', 'bc', 'abbbbbc']

# One pattern per `{}` form, in the order the results are printed:
#   ab{1,4}c -> ['abc', 'abbc', 'xabbbcz']   (1 to 4 'b')
#   ab{3,}c  -> ['xabbbcz', 'abbbbbc']       (3 or more 'b')
#   ab{,2}c  -> ['abc', 'ac', 'abbc']        (up to 2 'b')
#   ab{3}c   -> ['xabbbcz']                  (exactly 3 'b')
for brace_pattern in (r'ab{1,4}c', r'ab{3,}c', r'ab{,2}c', r'ab{3}c'):
    matcher = re.compile(brace_pattern)
    matched = list(filter(matcher.search, demo))
    print(matched)

['abc', 'abbc', 'xabbbcz']
['xabbbcz', 'abbbbbc']
['abc', 'ac', 'abbc']
['xabbbcz']


In [41]:
# `.*` between two literals: 'Error' followed, anywhere later, by 'valid'.
log_line = 'Error: not a valid input'
re.search(r'Error.*valid', log_line)

<re.Match object; span=(0, 18), match='Error: not a valid'>

# Logical AND technique

In [43]:
seq1 = 'cat and dog'
seq2 = 'dog and cat'

# Spell out both orderings so that either sequence satisfies the pattern.
compiled_pattern = re.compile(r'cat.*dog|dog.*cat')
for seq in (seq1, seq2):
    print(compiled_pattern.search(seq) is not None)  # True

# When only a True/False answer is needed, testing each word separately
# scales better than enumerating every ordering.
patterns = (r'cat', r'dog')
for seq in (seq1, seq2):
    print(all(re.search(p, seq) for p in patterns))  # True

True
True
True
True


# greedy 동작 설명

In [51]:
# r'f.?o' is r'(f.o|fo)' with the longer alternative tried first:
# the greedy `?` takes the optional character, so 'foo' is replaced, not 'fo'.
greedy_optional = re.compile(r'f.?o')
greedy_optional.sub('X', 'foot')

'Xt'

In [56]:
# A more practical example: put a '\' in front of every '<' that lacks one,
# so that both '<' and '\<' end up as '\<'.
# All three arguments are raw strings. The pattern needs the doubled
# backslash: a single '\' directly before '?' would escape the quantifier.
maybe_escaped_lt = r'\\?<'
escaped_lt = r'\<'
markup = r'blah \< foo < bar \< blah < baz'
print(re.sub(maybe_escaped_lt, escaped_lt, markup))

blah \< foo \< bar \< blah \< baz


In [57]:
# An optional group replaces the longest-first alternation
# r'handful|handy|hand'.
hand_variants = re.compile(r'hand(y|ful)?')
hand_variants.sub('X', 'hand handy handful')

'X X X'

## backtracking
greedy 수량자가 자신이 먹은 문자열을 끝에서부터 하나씩 뱉어내면서 일치 여부를 가리는 동작.

In [58]:
sentence = 'that is quite a fabricated tale'

# A clear illustration of greedy matching: r't.*a' always spans from the
# first 't' to the last 'a'.
# count=1 is passed purely for illustration purposes.
first_t_to_last_a = re.compile(r't.*a')
for text in (sentence, 'star'):
    print(first_t_to_last_a.sub('X', text, count=1))

Xle
sXr


In [60]:
sentence = 'that is quite a fabricated tale'
# Several greedy `.*` in one pattern: each gives characters back until the
# literals that follow it can still be matched.
for greedy_pattern in (r't.*a.*q.*f', r't.*a.*u'):
    print(re.sub(greedy_pattern, 'X', sentence, count=1))

Xabricated tale
Xite a fabricated tale


## non-greedy quantifiers
greedy 수량자 뒤에 '?'를 붙여서 'lazy' 혹은 'reluctant'로 만들 수 있다.
이들은 일단 문자를 하나씩만 먹고 일치 여부를 따져보는 방식이다.

  - greedy: 문자열을 마지막 자리까지 통째로 먹고 하나씩 뱉어보면서 일치 여부를 따져보기
  - lazy: 문자열을 0번째 자리부터 하나씩만 먹어보면서 일치 여부 따져보기

In [61]:
# In r'.??' the first `?` is the quantifier and the second one makes it lazy.
lazy_optional = re.compile(r'f.??o')
print(lazy_optional.sub('X', 'foot', count=1))
print(lazy_optional.sub('X', 'frost', count=1))
# A lazy range takes the minimum (2 characters) when nothing forces more.
print(re.compile(r'.{2,5}?').sub('X', '123456789', count=1))
# Lazy `.*?` stops at the nearest closing ':' instead of the last one.
print(re.compile(r':.*?:').split('green:3.14:teal::brown:oh!:blue'))

Xot
Xst
X3456789
['green', 'teal', 'brown', 'blue']


In [63]:
sentence = 'that is quite a fabricated tale'

# Lazy counterparts of the greedy patterns: each `.*?` stops at the first
# place where the rest of the pattern can match.
for lazy_pattern in (r't.*?a', r't.*?a.*?f'):
    print(re.sub(lazy_pattern, 'X', sentence, count=1))

Xt is quite a fabricated tale
Xabricated tale


# Exercises

In [66]:
# E1. Replace 42//5 or 42/5 with 8 for the given input.
# Each of the two spellings occurs once in `ip`.
ip = 'a+42//5-c pressure*3+42/5-14256'

In [68]:
# A1
# `/?` makes only the second slash optional, so exactly '42/5' and '42//5'
# are replaced; r'42/+5' would also swallow '42///5' and longer runs.
re.sub(r'42//?5', '8', ip)  # 'a+8-c pressure*3+8-14256'

'a+8-c pressure*3+8-14256'

In [69]:
# E2. For the list `items`, filter all elements starting with 'hand' and
# ending with at most one more character or 'le'.
items = ['handed', 'hand', 'handled', 'handy', 'unhand', 'hands', 'handle']

In [71]:
# A2
# Expected: ['hand', 'handy', 'hands', 'handle']
# \A and \Z anchor the whole string, so after 'hand' only one optional
# character or the literal 'le' may follow.
compiled_pattern = re.compile(r'\Ahand(.?|le)\Z')
list(filter(compiled_pattern.search, items))

['hand', 'handy', 'hands', 'handle']

In [73]:
# E3. Use `re.split` to get the output as shown for the given input strings.
# The separator is '42//5' or '42/5'. Note that eqn3 also contains '42///5'
# (three slashes) and '42/53'.
eqn1 = 'a+42//5-c'
eqn2 = 'pressure*3+42/5-14256'
eqn3 = 'r*42-5/3+42///5-42/53+a'

In [80]:
# A3
# One or two slashes between '42' and '5'; the triple-slash '42///5' in
# eqn3 is therefore not a separator.
compiled_pattern = re.compile(r'42//?5')
splited1, splited2, splited3 = (
    compiled_pattern.split(eqn) for eqn in (eqn1, eqn2, eqn3)
)

print(splited1)  # ['a+', '-c']
print(splited2)  # ['pressure*3+', '-14256']
print(splited3)  # ['r*42-5/3+42///5-', '3+a']

['a+', '-c']
['pressure*3+', '-14256']
['r*42-5/3+42///5-', '3+a']


In [81]:
# E4. For the given input strings, remove everything from the first occurrence
# of 'i' till end of the string.
# The first 'i' is inside 'special' for s1 and inside 'while' for s2.
s1 = 'remove the special meaning of such constructs'
s2 = 'characters while constructing'

In [90]:
# A4
# Starting at the first 'i', the greedy `.*` runs to the end of the string (\Z).
# Prints: remove the spec / characters wh
pat = re.compile(r'i.*\Z')
for text in (s1, s2):
    print(pat.sub('', text))

remove the spec
characters wh


In [91]:
# E5. For the given strings, construct a RE to get output as shown.
# Expected results: 'a+b', 'a/b + c%d' and 'Hi there. Nice day'.
# Note that the last parenthesised part of str3 contains a nested '('.
str1 = 'a+b(addition)'
str2 = 'a/b(division) + c%d(#modulo)'
str3 = 'Hi there(greeting). Nice day(a(b)'

In [97]:
# A5
# Lazy `.*?` ends each match at the nearest ')', so every parenthesised part
# is removed separately.
# Prints: a+b / a/b + c%d / Hi there. Nice day
remove_parentheses = re.compile(r'\(.*?\)')
for text in (str1, str2, str3):
    print(remove_parentheses.sub('', text))

a+b
a/b + c%d
Hi there. Nice day


In [99]:
# E6. Correct the given RE to get the expected output.
# Expected: 'plX XmX tX wX X cautX sentient'
words = 'plink incoming tint winter in caution sentient'

# wrong output: alternatives are tried left to right, so the short 'in'
# wins before 'ing', 'inco', 'inter' or 'ink' are ever attempted.
change = re.compile(r'int|in|ion|ing|inco|inter|ink')
change.sub('X', words)  # 'plXk XcomXg tX wXer X cautX sentient'

'plXk XcomXg tX wXer X cautX sentient'

In [103]:
# A6
# ink, ing, int, inter, inco, in, ion -> X
# Fold every 'in...' alternative into one optional group so the longest
# suffix is tried before falling back to a bare 'in'; 'ion' stays separate.
# (r'io?n.*?\b' is not a fix: it replaces the whole of 'incoming' with a
# single 'X' instead of producing 'XmX'.)
change = re.compile(r'in(ter|co|t|g|k)?|ion')
change.sub('X', words)  # 'plX XmX tX wX X cautX sentient'

'plX XmX tX wX X cautX sentient'

In [None]:
# E7. For the given greedy quantifiers, what would be the equivalent form using
# '{m,n}' representation?

In [147]:
# A7
def same_matches(quantified: str, braced: str, text: str) -> bool:
    """Return True when both patterns produce the same list of matches.

    Every match in ``text`` is compared (via ``re.findall``), not just the
    first one, so an empty match at index 0 cannot hide a difference.
    """
    return re.findall(quantified, text) == re.findall(braced, text)


sample = 'I am very grateful for your mercy.'
# `sample` has no run of several 'a', which would leave `a?` and `a*`
# indistinguishable, so a text with runs of 'a' is checked as well.
sample_with_runs = 'a aa caaat'

# r'?' == {,1}
result = all(same_matches(r'a?', r'a{,1}', text)
             for text in (sample, sample_with_runs))
print(result)

# r'*' == {0,}
result = all(same_matches(r'a*', r'a{0,}', text)
             for text in (sample, sample_with_runs))
print(result)

# r'+' == {1,}
result = all(same_matches(r'a+', r'a{1,}', text)
             for text in (sample, sample_with_runs))
print(result)

True
True
True


In [None]:
# E8. (a*|b*) is same as (a|b)* — True or False?

In [107]:
# A8. False: the two patterns are not equivalent.
raw_pattern1 = r'(a*|b*)'  # a run of 'a' OR a run of 'b'
raw_pattern2 = r'(a|b)*'  # any sequence mixing 'a' and 'b'
sample = 'abraham'
matched1, matched2 = (
    re.compile(raw).search(sample) for raw in (raw_pattern1, raw_pattern2)
)
print(f'{matched1.group()} != {matched2.group()}')

a != ab


In [140]:
# E9. For the given input strings, remove everything from the first occurrence
# of 'test' (irrespective of case) till end of the string, provided 'test'
# isn't at the end of the string.
# s1 ends with 'Test', so it has to come back unchanged.
s1 = 'this is a Test'
s2 = 'always test your RE for corner cases'
s3 = 'a TEST of skill tests?'

In [142]:
# A9
# `.+` demands at least one character after 'test', so a 'test' sitting
# right before \Z (as in s1) must not be removed.
# Prints, quotes included: 'this is a Test' / 'always ' / 'a '
pat = re.compile(r'test.+\Z', flags=re.IGNORECASE)
for text in (s1, s2, s3):
    print(f"'{pat.sub('', text)}'")

'this is a Test'
'always '
'a '


In [None]:
# E10. For the input list words, filter all elements starting with 's' and
# containing 'e' and 't' in any order.
# Expected: ['subtle', 'sets', 'site']
words = ['sequoia', 'subtle', 'exhibit', 'asset', 'sets', 'tests', 'site']

In [138]:
# A10
# Anchored at the start with 's', then 'e' before 't' or 't' before 'e'
# anywhere in the rest of the word.
compiled_pattern = re.compile(r'\As.*(e.*t|t.*e).*')
matched = list(filter(compiled_pattern.search, words))
print(matched)  # ['subtle', 'sets', 'site']

['subtle', 'sets', 'site']


In [None]:
# E11. For the input list 'words', remove all elements having less than 6
# characters.
# Expected: ['sequoia', 'subtle', 'exhibit']
words = ['sequoia', 'subtle', 'exhibit', 'asset', 'sets', 'tests', 'site']

In [136]:
# A11
# Any word in which six or more characters can be found in a row is kept.
compiled_pattern = re.compile(r'.{6,}')
matched = list(filter(compiled_pattern.search, words))
print(matched)  # ['sequoia', 'subtle', 'exhibit']

['sequoia', 'subtle', 'exhibit']


In [131]:
# E12. For the input list `words`, filter all elements starting with 's' or 't'
# and having a maximum of 6 characters.
# Expected: ['subtle', 'sets', 'tests', 'site']
words = ['sequoia', 'subtle', 'exhibit', 'asset', 'sets', 'tests', 'site']

In [135]:
# A12
# The first character is 's' or 't'; at most five more may follow before
# the end of the string, giving a maximum length of six.
compiled_pattern = re.compile(r'\A(s|t).{,5}\Z')
matched = list(filter(compiled_pattern.search, words))
print(matched)  # ['subtle', 'sets', 'tests', 'site']


['subtle', 'sets', 'tests', 'site']


In [119]:
# E13. Can you reason out why this code results in the output shown?
# The goal was to strip every <characters> pattern while keeping the bare <>
# ones, i.e. to end up with 'a 1<> b 2<> c'.
ip = 'a<apple> 1<> b<bye> 2<> c<cat>'

unexpected = re.sub(r'<.+?>', '', ip)
print(f"Unexpected: {unexpected}")  # 'a 1 2'

Unexpected: a 1 2


In [129]:
# A13
# What r'<.+?>' replaced: '<apple>', '<> b<bye>', '<> c<cat>'.
# In '<>' the `.+?` has to consume at least one character, so it takes the
# '>' itself and keeps scanning until the next closing bracket.
# Requiring one or more non-bracket characters between '<' and '>' leaves
# '<>' alone for any tag content, not only tags starting with 'a', 'b' or 'c'.
re.sub(r'<[^<>]+>', '', ip)  # 'a 1<> b 2<> c'

'a 1<> b 2<> c'

In [113]:
# E14. Use `re.split` to get the output as shown below for given input strings.
# Expected: ['go there', '"this // that"'], ['a//b', 'c//d e//f // 4//5'] and
# ['42// hi//bye//see', 'carefully'] -- only the first space-padded '//' splits.
s1 = 'go there  //   "this // that"'  # first separator: '  //   '
s2 = 'a//b // c//d e//f // 4//5'  # first separator: ' // '
s3 = '42// hi//bye//see // carefully'  # separator: ' // '

In [116]:
# A14
# The separator is '//' with at least one space on each side. Both runs of
# spaces are matched greedily so that no padding leaks into the next field
# (a lazy trailing ' +?' stops after a single space), and maxsplit=1 keeps
# any later ' // ' inside the second field.
pat = re.compile(r' +// +')
print(pat.split(s1, maxsplit=1))  # ['go there', '"this // that"']
print(pat.split(s2, maxsplit=1))  # ['a//b', 'c//d e//f // 4//5']
print(pat.split(s3, maxsplit=1))  # ['42// hi//bye//see', 'carefully']

['go there', '"this // that"']
['a//b', 'c//d e//f // 4//5']
['42// hi//bye//see', 'carefully']
