In [1]:
import re

capture group에 의해 포착된 문자열을 backreference로 재사용할 수 있다.
단순 capture group에 의해 잡을 수 없던 문자열을 특별한 그룹 문법으로 잡을수 있다.

# 문법 설명
The syntax is \N or \g<N> where N is the capture group you want. The below syntax variations is applicable for replacement section, assuming they are used within raw strings.

  - \1, \2 up to \99 to refer to the corresponding capture group
    - provided there are no digit characters after
    - \0 and \NNN will be interpreted as octal value
  - \g<1>, \g<2> etc (not limited to 99) to refer to the corresponding capture group
    - this also helps to avoid ambiguity between backreference and digits that follow
  - \g<0> to refer to entire matched portion, similar to index 0 of re.Match objects
    - \0 cannot be used because numbers starting with 0 are treated as octal value

## `\N` 방식 역참조

In [3]:
# 주어진 문자열에서 [nn] 표현을 각괄호를 없애고 숫자만 남긴다.
# repl 역참조 패턴은 raw string임에 주의.
re.sub(r'\[(\d+)\]', r'\1', '[52] apples and [31] mangoes')

'52 apples and 31 mangoes'

In [4]:
# '__'는 '_'로, '_'만 있다면 아예 삭제.
re.sub(r'(_)?_', r'\1', '_foo_ __123__ _baz_')

'foo _123_ baz'

In [6]:
# 쉼표로 구분된 두 단어의 위치를 맞바꾼다.
re.sub(r'(\w+),(\w+)', r'\2,\1', 'good,bad 42,24')

'bad,good 24,42'

In [16]:
# capture group은 Match 객체의 group 메소드로도 잡을 수 있다.
result = ''
for matched in re.finditer(r'(\w+),(\w+)', 'good,bad 42,24'):
    w1, w2 = matched.groups()
    result += f'{w2},{w1} '
print(result)

bad,good 24,42 


In [18]:
# \1 다음에 오는 숫자도 그룹 번호의 일부로 인식하기 때문에 문제가 생긴다.
re.sub(r'\[(\d+)\]', r'(\15)', '[52] apples and [31] mangoes')
# re.error: invalid group reference 15 at position 2

error: invalid group reference 2 at position 2

In [20]:
# \g<N>을 쓰면 그룹 번호 바로 앞뒤로 숫자를 붙여도 무방해진다.
re.sub(r'\[(\d+)\]', r'\g<1>5', '[52] apples and [31] mangoes')
# '(525) apples and (315) mangoes'

'525 apples and 315 mangoes'

In [21]:
# \0<N>는 8진법으로 인식된다. \065 == 5 in dec.
re.sub(r'\[(\d+)\]', r'\1\065', '[52] apples and [31] mangoes')
# '(525) apples and (315) mangoes'

'525 apples and 315 mangoes'

In [24]:
# 알파벳으로 이루어진 단어 주위에 '{}'를 붙인다.
re.sub(r'([a-z]+)', r'{\1}', '[52] apples and [31] mangoes')  # '[52] {apples} {and} [31] {mangoes}'

'[52] {apples} {and} [31] {mangoes}'

In [25]:
# note the use of '+' instead of '*' quantifier to avoid empty matching
re.sub(r'.+', r'Hi. \g<0>. Have a nice day', 'Hello world')

'Hi. Hello world. Have a nice day'

In [28]:
# 그룹이 잡은 문자열을 맨 뒤에 ',\1'로 추가한다.
re.sub(r'\A([^,]+),.*', r'\g<0>,\1', 'fork,42,nice,3.14')
# \0 잡은 문자열 전체가 아니라, 8진수를 의미함에 주의.

'fork,42,nice,3.14,fork'

패턴 정의 내에 역참조가 필요한 경우에는 `\<N>` 방식 밖에 사용할 수 없다.

In [31]:
# 연속하는 동일 글자 2개가 포함된 단어 찾기.
words = ['effort', 'flee', 'facade', 'oddball', 'rat', 'tool']
for word in words:
    matched = re.search(r'''
        \b      # 단어 시작
        \w*     # 임의 글자
        (\w)\1  # 연속된 글자 2개
        \w*     # 임의 글자
        \b      # 단어 종료
    ''', word, flags=re.X)
    if matched: print(word)

effort
flee
oddball
tool


In [35]:
# 공백으로 구분된 sub 문자열 중 연속하면서 동일한 것은 하나만 남기고 제거
# use \W+ instead of space to cover cases like 'a;a<-;a'
re.sub(r'\b(\w+)( \1)+\b', r'\1', 'aa a a a 42 f_1 f_1 f_13.14')

'aa a 42 f_1 f_13.14'

In [36]:
# even though there's only one capture group, \11 will give an error
re.sub(r'(.).*\11', 'X', 'abcdefghijklmna1d')

error: invalid group reference 11 at position 6

In [38]:
# RE 정의 내라서 \<N> 방식 역참조를 꼭 써야 하고 뒤에 숫자가 와야 한다면
# \x, \0 등으로 escaped number를 사용할 수 있다.  '\x31' == 1 in dec.
re.sub(r'(.).*\1\x31', 'X', 'abcdefghijklmna1d')  # 'Xd'

'Xd'

In [42]:
# there are 12 capture groups here, so no error
re.sub(r'(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.).*\11',  # \11은 이번 예제에서 'k'를 상징한다.
       'X',
       'abcdefghijklmna1kd')

'Xd'

In [43]:
# use escapes again
re.sub(r'(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.).*\1\x31', 'X', 'abcdefghijklmna1d')  # 'Xd'

'Xd'

# Non-capturing groups
backreferencing이 필요 없다면 사용. 생성 문법은 `(?:pat)`이다.

In [45]:
# normal capture group will hinder ability to get whole match
# non-capturing group to the rescue
re.findall(r'''
    \b
    \w*(?:st|in)  # 변수에 유효한 글자들로 이루어졌으며, 'st' or 'in'으로 끝난다
    \b
''', 'cost akin more east run against', flags=re.X)
# ['cost', 'akin', 'east', 'against']

['cost', 'akin', 'east', 'against']

In [46]:
# capturing wasn't needed here, only common grouping and quantifier
# ['123', '42', '777', '500']
re.split(r'hand(?:ful|y)?', '123hand42handy777handful500')

['123', '42', '777', '500']

In [66]:
# 수량자가 뒤따라 오는 capture group의 경우 수량자의 마지막 횟수에서 붙들린 문자열만 capture group이 된다.
# 수량자만큼의 capture group이 생성되진 않는다.
# 이 예제에서는 group 2가 필요 없음에도 3번이나 갱신되야 하는 불필요한 계산이 존재한다.
# 이는 non-capture group을 사용함으로써 절약할 수 있는 부분이다.
re.sub(r'''
    \A
    (            # start of group 1
    ([^,]+,){3}  # group 2: {3}에 의해 3회를 돌았어도 그룹은 1개만 생기며,
                 # 그 그룹이 가리키는 문자열은 마지막으로 포착된 '3,'이다.
    )            # end of group 1
    ([^,]+)      # group 3
    ''',
       r'\1(\3)',
       '1,2,3,4,5,6,7',
       flags=re.X)
# '1,2,3,(4),5,6,7'

'1,2,3,(4),5,6,7'

In [69]:
# 위 예제에서 각 그룹은 어떤 문자열을 가리키는지 확인해보자.
matched = re.search(r'''
                    \A
                    (([^,]+,){3})
                    ([^,]+)
                    ''',
                    '1,2,3,4,5,6,7',
                    flags=re.X)

for i, g in enumerate(matched.groups()):
    print(f'group {i + 1}: {g}')

group 1: 1,2,3,
group 2: 3,
group 3: 4


In [78]:
# using non-capturing groups, only relevant groups have to be tracked
re.sub(raw_pattern := r'''
        \A
        (              # 그룹 1
        (?:[^,]+,){3}  # 안쪽 grouping 메타 문자는 더 이상 역참조 기능을 하지 않는다.
        )
        ([^,]+)        # 그룹 2
        ''',
       r'\1(\2)',
       string := '1,2,3,4,5,6,7',
       flags=re.X)
# '1,2,3,(4),5,6,7'

'1,2,3,(4),5,6,7'

In [80]:
# 이번에도 어떤 그룹들이 잡혔는지 확인해보자.
for i, g in enumerate(re.search(raw_pattern, string, flags=re.X).groups()):
    print(f'group {i}: {g}')

group 0: 1,2,3,
group 1: 4


In [81]:
# capture group의 예 - 마지막 captured str('123')만 그룹이 가리킨다.
re.findall(r'(123)+', 'hi 123123123 bye 456123456')  # ['123', '123']

# 개념적으로는 (123)(123)(123) 이렇게 세 번을 가리키지만 매번 참조가 덮어 씌워져서 결국 123만 남게 된다고 생각하자.
for captured in ['123', '123', '123']:
    group1 = captured

['123', '123']

In [None]:
# non-capture group의 예 - capture 대신 구분 기호로서의 역할만 한다.
re.findall(r'(?:123)+', 'hi 123123123 bye 456123456')  # ['123123123', '123']

In [88]:
# 그렇지만 이 문제는 `sub` 혹은 `subn`에는 무관하다.
replaced = re.sub(r'(123)+', 'X', 'hi 123123123 bye 456123456')  # 'hi X bye 456X456'
print(replaced)

# 교체되는 대상은 group[0]이니깐.
replacing_subject = re.search(r'(123)+', 'hi 123123123 bye 456123456').group(0)
print(replacing_subject)

hi X bye 456X456
123123123


In [89]:
re.sub(r'''
        \A([^,]+,){3}  # \1 == '3.14,'
        ([^,]+)        # \2 == '42'
        ''',
       r'\1"\2"',
       'one,2,3.14,42,five',
       flags=re.X)  # '3.14,"42",five'

'3.14,"42",five'

In [90]:
re.sub(r'''
        \A((?:[^,]+,){3})  # \1 == 'one,2,3.14,',
        ([^,]+)            # \2 == '42'
        ''',
       r'\1"\2"',
       'one,2,3.14,42,five',
       flags=re.X)  # 'one,2,3.14,"42",five'

'one,2,3.14,"42",five'

In [96]:
words = 'effort flee facade oddball rat tool'
# whole words containing at least one consecutive repeated character
repeat_char = re.compile(
    r'''
    \b
    \w*
    (\w)\1  # 연속되는 같은 2글자
    \w*
    \b
    ''',
    flags=re.X)

In [97]:
# `findall`과 capture group의 조합은 반환값에 오직 capture group만 포함되게 만든다.
repeat_char.findall(words)  # ['f', 'e', 'l', 'o']

['f', 'e', 'l', 'o']

In [98]:
# finditer to the rescue
m_iter = repeat_char.finditer(words)
[m[0] for m in m_iter]  # ['effort', 'flee', 'oddball', 'tool']

['effort', 'flee', 'oddball', 'tool']

# Named capture groups
  - capturing: `(?P<name>pattern)`
  - back-referencing: `(?P=name)`
  - replacement: `\g<name>`

In [99]:
# giving names to first and second captured words
re.sub(r'(?P<fw>\w+),(?P<sw>\w+)',  # 'fw', 'sw'란 이름으로 각각 그룹 캡쳐.
       r'\g<sw>,\g<fw>',
       'good,bad 42,24')

'bad,good 24,42'

In [113]:
# Used hexcode for space(\x20) because all whitespace are ignored in re.X flag.
re.sub(r'''
    \b
    (?P<dup>\w+)
    (\x20(?P=dup))+
    \b
    ''',
       r'\g<dup>',
       'aa a a a 42 f_1 f_1 f_13.14',
       flags=re.X)

'aa a 42 f_1 f_13.14'

In [114]:
sentence = 'I bought an apple'
m = re.search(r'(?P<fruit>\w+)\Z', sentence)

print(m[1])  # 'apple'
print(m['fruit'])  # 'apple'
print(m.group('fruit'))  # 'apple'

apple
apple
apple


In [115]:
# single match
details = '2018-10-25,car,2346'
re.search(r'''
    (?P<date>[^,]+),
    (?P<product>[^,]+)
    ''',
          details,
          flags=re.X).groupdict()
# {'date': '2018-10-25', 'product': 'car'}

{'date': '2018-10-25', 'product': 'car'}

In [116]:
# normal groups won't be part of the output
re.search(r'(?P<date>[^,]+),([^,]+)', details).groupdict()
# {'date': '2018-10-25'}

{'date': '2018-10-25'}

In [117]:
# multiple matches
s = 'good,bad 42,24'
compiled_pattern = re.compile(r'''
    (?P<fw>\w+),(?P<sw>\w+)
    ''',
                              flags=re.X)
[m.groupdict() for m in compiled_pattern.finditer(s)]

[{'fw': 'good', 'sw': 'bad'}, {'fw': '42', 'sw': '24'}]

# Conditional groups
`(?(id/name)yes-pattern|no-pattern)`

  - yes-pattern: ?(id/name)에 지정된 그룹에 비어 있지 않은 문자열이 매칭된 경우
    - 단독으로 사용될 경우: 비일치 시 검사할 아무 패턴도 없다고 생각할 것
  - no-pattern: ?(id/name)으로 지정된 그룹에 빈 문자열 매칭된 경우

In [130]:
# Only yes-pattern.
words = ['"hi"', 'bye', 'bad"', '"good"', '42', '"3']
pat = re.compile(r'''
    (")?     # 후에 나올 조건 그룹에 지정된 ID 1의 그룹
    \w+
    (?(1)")  # ID: 1, yes-pattern(there was double quotation mark): '"'
    ''',
                 flags=re.X)
[w for w in words if pat.fullmatch(w)]  # ['"hi"', 'bye', '"good"', '42']

['"hi"', 'bye', '"good"', '42']

In [131]:
# for this simple case, you can also expand it manually
# but for complex patterns, it is better to use conditional groups
# as it will avoid repeating the complex pattern
[w for w in words if re.fullmatch(r'"\w+"|\w+', w)]  # ['"hi"', 'bye', '"good"', '42']

['"hi"', 'bye', '"good"', '42']

In [132]:
# cannot simply use ? quantifier as they are independent, not constrained
[w for w in words if re.fullmatch(r'"?\w+"?', w)]  # ['"hi"', 'bye', 'bad"', '"good"', '42', '"3']

['"hi"', 'bye', 'bad"', '"good"', '42', '"3']

In [133]:
# also, `fullmatch` plays a big role in avoiding partial match
pat.search('"bad')  # 큰따옴표는 건너 뛰고 보면 조건이 충족되기 때문에 의도와는 달리 출력된다.
[w for w in words if pat.search(w)]  # ['"hi"', 'bye', 'bad"', '"good"', '42', '"3']

['"hi"', 'bye', 'bad"', '"good"', '42', '"3']

In [135]:
# search를 쓰면서 fullmatch처럼 동작하도록 하기 위해 \A와 \Z로 묶을 수 있다.
[word for word in words
    if re.search(r'''
        \A
        (")?     # 후에 나올 조건 그룹에 지정된 ID 1의 그룹
        \w+
        (?(1)")  # ID: 1, yes-pattern(there was double quotation mark): '"'
        \Z
        ''',
        word,
        flags=re.X)]

['"hi"', 'bye', '"good"', '42']

In [None]:
# no-pattern도 존재하는 경우
# filter elements containing word characters surrounded by ()
# or, containing word characters separated by a hyphen
words = ['(hi)', 'good-bye', 'bad', '(42)', '-oh', 'i-j', '(-)']

# same as: r'\(\w+\)|\w+-\w+'
pat = re.compile(
    r'''
    (\()?  # condition
    \w+
    (?(1)
    \)     # yes-pattern: 여는 괄호 있었으면,
    |
    -\w+   # no-pattern: 여는 괄호 없었다면,
    )
    ''',
    flags=re.X)
[w for w in words if pat.fullmatch(w)]  # ['(hi)', 'good-bye', '(42)', 'i-j']

# Match.expand
`Match.expand(template)`의 template은 `\<N>`으로 표현된 역참조 그룹을 포함한 RE 패턴이다.
이 역참조 그룹을 포함한 패턴이 실제 문자열로 어떻게 치환되는지를 보여준다.
`re.sub`와 유사하나 차이점은 원래 문자열이 아닌 `template` 인자에 들어간 내용만 대체해서 보여준다는 점이다.

In [138]:
# re.sub vs Match.expand
# 패턴은 'wesom', \1은 'eso'를 잡았다.
# 원래 문자열인 'awesome'이 어떻게 대체됐는지를 출력
re.sub(r'w(.*)m', r'[\1]', 'awesome')  # 'a[eso]e'

'a[eso]e'

In [139]:
# expand() 안 인자가 뭘로 대체됐는지를 리턴.
re.search(r'w(.*)m', 'awesome').expand(r'[\1]')  # '[eso]'

'[eso]'

In [140]:
# example with re.finditer
dates = '2020/04/25,1986/03/02,77/12/31'
m_iter = re.finditer(r'([^/]+)/([^/]+)/[^,]+,?', dates)

# same as: [f'Month:{m[2]}, Year:{m[1]}' for m in m_iter]
[m.expand(r'Month:\2, Year:\1') for m in m_iter]

['Month:04, Year:2020', 'Month:03, Year:1986', 'Month:12, Year:77']

# Exercises

In [None]:
# a) Replace the space character that occurs after a word ending with 'a' or
# 'r' with a newline character.
ip = 'area not a _a2_ roar took 22'

In [None]:
# A1
print(re.sub())
# area
# not a
# _a2_ roar
# took 22

In [None]:
# E2. Add '[]' around words starting with 's' and containing 'e' and 't' in any
# order.
ip = 'sequoia subtle exhibit asset sets tests site'

In [None]:
# A2
# 'sequoia [subtle] exhibit asset [sets] tests [site]'

In [None]:
# E3. Replace all whole words with 'X' that start and end with the same word
# character. Single character word should get replaced with 'X' too, as it
# satisfies the stated condition.
ip = 'oreo not a _a2_ roar took 22'

In [None]:
# A3
# 'X not X X X took X'

In [None]:
# E4. Convert the given markdown headers to corresponding anchor tag. Consider
# the input to start with one or more '#' characters followed by space and
# word characters. The `name` attribute is constructed by converting the header
# to lowercase and replacing spaces with hyphens. Can you do it without using a
# capture group?
header1 = '# Regular Expressions'
header2 = '## Compiling regular expressions'

In [None]:
# A4
# '# <a name="regular-expressions"></a>Regular Expressions'
# '## <a name="compiling-regular-expressions"></a>Compiling regular expressions'

In [None]:
# E5. Convert the given markdown anchors to corresponding hyperlinks.
anchor1 = '# <a name="regular-expressions"></a>Regular Expressions'
anchor2 = '## <a name="subexpression-calls"></a>Subexpression calls'

In [None]:
# A5
# '[Regular Expressions](#regular-expressions)'
# '[Subexpression calls](#subexpression-calls)'


In [None]:
# E6. Count the number of whole words that have at least two occurrences of
# consecutive repeated alphabets. For example, words like 'stillness' and
# 'Committee' should be counted but not words like 'root' or 'readable' or
# 'rotational'.
ip = '''oppressed abandon accommodation bloodless
... carelessness committed apparition innkeeper
... occasionally afforded embarrassment foolishness
... depended successfully succeeded
... possession cleanliness suppress'''

In [None]:
# A6
# 13

In [None]:
# E7. For the given input string, replace all occurrences of digit sequences
# with only the unique non-repeating sequence. For example, '232323' should
# be changed to '23' and '897897' should be changed to '897'. If there no
# repeats (for example '1234') or if the repeats end prematurely (for example
# '12121'), it should not be changed.
ip = '1234 2323 453545354535 9339 11 60260260'

In [None]:
# A7
# '1234 23 4535 9339 1 60260260'

In [None]:
# E8. Replace sequences made up of words separated by ':' or '.' by the first
# word of the sequence. Such sequences will end when ':' or '.' is not followed
# by a word character.
ip = 'wow:Good:2_two:five: hi-2 bye kite.777.water.'

In [None]:
# A8
# 'wow hi-2 bye kite'

In [None]:
# E9. Replace sequences made up of words separated by ':' or '.' by the last
# word of the sequence. Such sequences will end when ':' or '.' is not
# followed by a word character.
ip = 'wow:Good:2_two:five: hi-2 bye kite.777.water.'

In [None]:
# A9
# 'five hi-2 bye water'

In [None]:
# E10. Split the given input string on one or more repeated sequence of 'cat'.
ip = 'firecatlioncatcatcatbearcatcatparrot'

In [None]:
# A10
# ['fire', 'lion', 'bear', 'parrot']

In [None]:
# E11. For the given input string, find all occurrences of digit sequences with
# at least one repeating sequence. For example, '232323' and '897897'. If the
# repeats end prematurely, for example '12121', it should not be matched.
ip = '1234 2323 453545354535 9339 11 60260260'

In [None]:
# A11
pat = re.compile()
# entire sequences in the output
# ['2323', '453545354535', '11']
# only the unique sequence in the output
# ['23', '4535', '1']

In [None]:
# E12. Convert the comma separated strings to corresponding `dict` objects as
# shown below. The keys are 'name', 'maths' and 'phy' for the three fields
# in the input strings.
row1 = 'rohan,75,89'
row2 = 'rose,88,92'

In [None]:
# A12
pat = re.compile()  ##### add your solution here
# {'name': 'rohan', 'maths': '75', 'phy': '89'}
# {'name': 'rose', 'maths': '88', 'phy': '92'}

In [None]:
# E13. Surround all whole words with '()'. Additionally, if the whole word is
# 'imp' or 'ant', delete them. Can you do it with single substitution?
ip = 'tiger imp goat eagle ant important'

In [None]:
# A13
# '(tiger) () (goat) (eagle) () (important)'

In [None]:
# E14. Filter all elements that contains a sequence of lowercase alphabets
# followed by - followed by digits. They can be optionally surrounded by
# '{{' and '}}'. Any partial match shouldn't be part of the output.
ip = ['{{apple-150}}', '{{mango2-100}}', '{{cherry-200', 'grape-87']

In [None]:
# A14
# ['{{apple-150}}', 'grape-87']

In [None]:
# E15. The given input string has sequences made up of words separated by ':'
# or '.' and such sequences will end when ':' or '.' is not followed by a word
# character. For all such sequences, display only the last word followed by '-'
# followed by first word.
ip = 'wow:Good:2_two:five: hi-2 bye kite.777.water.'

In [None]:
# A15
# ['five-wow', 'water-kite']