In [1]:
import re

lookarounds(zero-width patterns) == some kind of grouping
 - lookarounds의 조건 패턴은 문자열을 먹지 않는다는 점에서 앵커처럼 작동한다.
 - 문법: (?<dir><sign><pattern>)
   - 예시1: foo(?=\d)  positive lookahead
   - 예시2: foo(?!\d)  negative lookahead
   - 예시3: (?<=\d)foo positive lookbehind
   - 예시4: (?<!\d)foo negative lookbehind
 - negative/positive: !와 =의 차이가 있음.
 - look{ahead/behind}: behind는 < 가 붙음.


In [3]:
# Sample strings shared with the following cell.
items = ['1,2,3,4', 'a,b,c,d', '#foo 123']

# Keep only the strings that contain both a '#' and at least one digit.
has_digit_and_sharp = list(
    filter(lambda text: '#' in text and re.search(r'\d', text), items)
)
print(has_digit_and_sharp)

['#foo 123']
1 
a 


In [4]:
# For every string that does not start with '#', replace everything from the
# first comma onwards with a single space and print the result.
for s in items:
    if s[0] == '#':
        continue
    only_first_char = re.sub(r',.+', ' ', s)
    print(only_first_char)

1 
a 


In [7]:
# Negative lookahead: 'foo' is replaced only where it is NOT followed by a
# digit. "Ahead" refers to what comes after the match in reading direction.
re.sub(pattern=r'foo(?!\d)', repl='baz', string='hey food! foo42 foot5 foofoo')

'hey bazd! foo42 bazt5 bazbaz'

In [8]:
# Negative lookbehind, example 1: replace 'foo' unless it is preceded by '_'.
re.compile(r'(?<!_)foo').sub('baz', 'foo _foo 42foofoo')

'baz _foo 42bazbaz'

In [9]:
# Negative lookbehind, example 2: the trailing '.' consumes one more character,
# so the character right after 'foo' is replaced together with it.
re.compile(r'(?<!_)foo.').sub('baz', 'food _fool 42foo_foot')

'baz _fool 42bazfoot'

In [15]:
# Replace every whole word with 'X', except words directly preceded by ':' or '-'.
re.compile(
    r'''
        (?<![:-])  # only if it is not preceded by ':' or '-'
        \b\w+\b    # whole word
        ''',
    flags=re.X,
).sub('X', ':cart <apple -rest ;tea')

':cart <X -rest ;X'

In [2]:
# Insert a space at every word boundary except at the very start and end of
# the string. Roughly equivalent to:
#   re.sub(r'\b', ' ', 'foo_baz=num1+35*42/num2').strip()
re.sub(pattern=r'(?<!\A)\b(?!\Z)', repl=' ', string='foo_baz=num1+35*42/num2')
# 'foo_baz = num1 + 35 * 42 / num2'

'foo_baz = num1 + 35 * 42 / num2'

In [3]:
# A lookbehind need not be written at the left edge of the pattern, nor a
# lookahead at the right edge.
# Both calls replace a character as long as it is not preceded by 'p' or 'r'.
re.compile(r'(?<![pr]).').sub('*', 'spare')   # '**a*e'
re.compile(r'.(?<![pr].)').sub('*', 'spare')  # '**a*e'

'**a*e'

In [8]:
# Replace 'par' as long as no 's' occurs later in the input.
# Writing the lookahead before or after 'par' gives the same result here
# because 's' cannot overlap with 'par'; a condition on 'r', for example,
# would interact with the 'par' match itself.
print(re.compile(r'par(?!.*s)').sub('X', 'par spare part party'))  # par sXe Xt Xty
print(re.compile(r'(?!.*s)par').sub('X', 'par spare part party'))  # par sXe Xt Xty

par sXe Xt Xty
par sXe Xt Xty


In [9]:
# Lookahead and lookbehind mean the same thing wherever they are written
# relative to the rest of the pattern: both orderings print identical output.
print(re.sub(pattern=r'(?!\Z)\b(?<!\A)', repl=' ', string='foo_baz=num1+35*42/num2'))
print(re.sub(pattern=r'(?<!\A)\b(?!\Z)', repl=' ', string='foo_baz=num1+35*42/num2'))

foo_baz = num1 + 35 * 42 / num2
foo_baz = num1 + 35 * 42 / num2


In [10]:
# Positive lookahead: find digit sequences that are immediately followed by a comma.
re.compile(r'\d+(?=,)').findall('42 foo-5, baz3; x-83, y-20: f12')

['5', '83']

In [11]:
# Digit sequences that have '-' right before them and ':' or ';' right after.
re.compile(
    r'''
            (?<=-)    # '-' leads
            \d+       # digit sequence
            (?=[:;])  # ';' or ':' follows
            ''',
    flags=re.X,
).findall('42 foo-5, baz3; x-83, y-20: f12')

['20']

In [13]:
# 'par' is replaced only while the whole word 'part' still lies ahead of it.
# 'party' does not count as that whole word because of its trailing 'y'.
re.sub(pattern=r'par(?=.*\bpart\b)', repl='X', string='par spare part party')

'X sXe part party'

In [14]:
# Fields that have a comma on both sides.
re.compile(r'(?<=,)[^,]+(?=,)').findall('1,two,3,four,5')

['two', '3', 'four']

In [15]:
# Insert 'NA' wherever a comma-separated field is empty, including the fields
# at the very start and end of the string.
re.sub(pattern=r'(?<![^,])(?![^,])', repl='NA', string=',1,,,two,3,,,')

'NA,1,NA,NA,two,3,NA,NA,NA'

In [16]:
# Note that a capture group is used inside the lookahead.
# For the first match, 'a b', \1 == 'a ' and \2 == 'b'.
print(re.compile(r'(\S+\s+)(?=(\S+)\s)').sub(r'\1\2\n', 'a b c d e'))

a b
b c
c d
d e


In [17]:
# When the pattern contains a capture group, findall returns only what the
# group captured instead of the whole match.
re.compile(
    r'''
            (?<=(po|ca)re)  # \1
            \d+
            ''',
    flags=re.X,
).findall('pore42 car3 pare7 care5')

['po', 'ca']

In [18]:
# With a non-capturing group (?:...) inside the lookbehind, findall returns
# the whole match (the digits) again.
re.compile(
    r'''
            (?<=(?:po|ca)re)  # (?:pattern)은 non-capture group임을 기억
            \d+
            ''',
    flags=re.X,
).findall('pore42 car3 pare7 care5')

['42', '5']

# Working as a logical AND operator

In [21]:
# Words containing 'b', 'e' and 't' in any order.
# Same as: r'b.*e.*t|b.*t.*e|e.*b.*t|e.*t.*b|t.*b.*e|t.*e.*b'
words = ['sequoia', 'subtle', 'questionable', 'exhibit', 'equation']
compiled_pattern = re.compile(r'''
    (?=.*b)  # 'sub'
    (?=.*e)  # 'subtle'
    .*t      # 'subt'
    ''', re.X)
# Each lookahead is one AND-ed condition; '.*t' supplies the third one.
list(filter(compiled_pattern.search, words))
# ['subtle', 'questionable', 'exhibit']

['subtle', 'questionable', 'exhibit']

In [None]:
# Words that contain every lowercase vowel, in any order.
list(filter(re.compile(r'(?=.*a)(?=.*e)(?=.*i)(?=.*o).*u').search, words))
# ['sequoia', 'questionable', 'equation']

In [None]:
# Words that contain 'a' and 'q' but do not have 'n' as their last character.
list(filter(re.compile(r'(?=.*a)(?=.*q)(?!.*n\Z)').search, words))
# ['sequoia', 'questionable']

# Variable length lookbehind
lookbehind 내부 패턴은 고정된 길이의 패턴만 가능하다. 수량자도 *처럼 가변 길이가 아니라 {3}처럼 정해진 길이만 가능.

In [23]:
# Allowed case 1: both alternatives inside the lookbehind have the same length.
re.compile(
    r'''
            (?<=(?:po|da)re)  # The non-capture group ignored in `findall`.
            \d+               # This pattern matched.
            ''',
    flags=re.X,
).findall('pore42 tar3 dare7 care5')
# ['42', '7']

['42', '7']

In [24]:
# Allowed case 2: a counted quantifier such as {4} keeps the lookbehind fixed-width.
re.compile(
    r'''
            (?<=\b[a-z]{4})
            \d+
            ''',
    flags=re.X,
).findall('pore42 tar3 dare7 care5')
# ['42', '7', '5']

['42', '7', '5']

In [26]:
# Not allowed, case 1: len('tar') != len('dare'), so the width of the
# lookbehind is undetermined (3 or 4) and `re` refuses to compile it.
# The error is the point of this cell, so it is caught and shown instead of
# being left to abort a "Restart & Run All".
try:
    print(re.findall(r'(?<!tar|dare)\d+', 'pore42 tar3 dare7 care5'))
except re.error as err:
    print(f'error: {err}')
# re.error: look-behind requires fixed-width pattern

error: look-behind requires fixed-width pattern

In [27]:
# Workaround for case 1: use a separate fixed-width lookbehind per alternative.
re.compile(r'(?<!tar)(?<!dare)\d+').findall('pore42 tar3 dare7 care5')

['42', '5']

In [41]:
# Workaround for case 1 without lookarounds: match the prefix normally and
# capture only the digits.
# Same as: re.findall(r'(?:(?<=tar)|(?<=dare))\d+', 'pore42 tar3 dare7 care5')
re.compile(r'(?:tar|dare)(\d+)').findall('pore42 tar3 dare7 care5')
# ['3', '7']

['3', '7']

In [25]:
# Not allowed, case 2: the '*' quantifier makes the lookbehind variable-width.
# The error is the point of this cell, so it is caught and shown instead of
# being left to abort a "Restart & Run All".
try:
    print(re.findall(r'(?<=\b[pd][a-z]*)\d+', 'pore42 tar3 dare7 care5'))
except re.error as err:
    print(f'error: {err}')
# re.error: look-behind requires fixed-width pattern

error: look-behind requires fixed-width pattern

In [40]:
# Attempted workaround for case 2 (left as an open question in these notes).
# Only '[pd]' sits inside the lookbehind; the letters matched by '[a-z]*' are
# consumed, so they appear in the result together with the digits.
re.compile(r'(?<=[pd])(?:[a-z]*)\d+').findall('pore42 tar3 dare7 care5')

['ore42', 'are7']

In [43]:
# Workaround for case 2 without lookarounds:
# get digits only if they are preceded by a word starting with 'p' or 'd'.
# The input is written out here: the bare name `s` is otherwise only bound as
# the loop variable of an earlier cell, where it ends up holding an unrelated
# string, so a fresh top-to-bottom run would not reproduce the result below.
re.findall(r'\b[pd][a-z]*(\d+)', 'pore42 tar3 dare7 care5')
# ['42', '7']

['42', '7']

In [None]:
# Not allowed, case 3: '\A' is a zero-width anchor while ',' has width 1, so
# the lookbehind as a whole would be 0 or 1 wide and is rejected.
# The error is the point of this cell, so it is caught and shown instead of
# being left to abort a "Restart & Run All".
try:
    print(re.sub(r'(?<=\A|,)(?=,|\Z)', 'NA', ',1,,,two,3,,,'))
except re.error as err:
    print(f'error: {err}')
# re.error: look-behind requires fixed-width pattern

In [28]:
# Workaround for case 3: give each alternative its own lookbehind.
re.compile(r'((?<=\A)|(?<=,))(?=,|\Z)').sub('NA', ',1,,,two,3,,,')

'NA,1,NA,NA,two,3,NA,NA,NA'

In [42]:
# Delete digits only if they are preceded by 'tar' or 'dare': the prefix is
# captured and written back through \1, so only the digits disappear.
re.sub(pattern=r'(tar|dare)\d+', repl=r'\1', string='pore42 tar3 dare7 care5')

'pore42 tar dare care5'

In [None]:
# Delete digits only if they are preceded by a word starting with 'p' or 'd'.
# The input is written out here: the bare name `s` is otherwise only bound as
# the loop variable of an earlier cell, where it ends up holding an unrelated
# string, so a fresh top-to-bottom run would not reproduce the result below.
re.sub(r'(\b[pd][a-z]*)\d+', r'\1', 'pore42 tar3 dare7 care5')
# 'pore tar3 dare care5'

# Negated groups

In [44]:
# The \A anchor forces the negated group to cover every character up to 'dog'.
# 'cat' is met on the way to 'dog', so the search fails.
re.search(pattern=r'\A((?!cat).)*dog', string='fox,cat,dog,parrot') is not None

False

In [45]:
# 'parrot' only occurs after 'dog', so the negated group does not block the match.
re.compile(r'\A((?!parrot).)*dog').search('fox,cat,dog,parrot')  # 'fox,cat,dog'

<re.Match object; span=(0, 11), match='fox,cat,dog'>

In [49]:
# Without '\A' the match may begin after the 'c' of 'cat', so the search succeeds.
re.compile(r'((?!cat).)*dog').search('fox,cat,dog,parrot')

<re.Match object; span=(5, 11), match='at,dog'>

In [52]:
# Looking at the matched portion makes the negated group easier to understand.
re.compile(r'\A((?!cat).)*').search('fox,cat,dog,parrot').group(0)

'fox,'

In [53]:
# Same check for 'parrot': everything before it is matched.
re.compile(r'\A((?!parrot).)*').search('fox,cat,dog,parrot').group(0)

'fox,cat,dog,'

In [54]:
# Consume characters from the start until a character is immediately repeated.
# Group numbering: the outer group is \1 and the inner (.) is \2; the
# backreference \2 inside the lookahead compares that character with the next.
re.compile(
    r'''
            \A(    # \2
            (?!
            (.)\2
            ).
            )*
            ''',
    flags=re.X,
).search('fox,cat,dog,parrot').group(0)

'fox,cat,dog,pa'

In [56]:
# Match only if 'do' does not occur between 'at' and 'par'.
# Input: 'fox,cat,dog,parrot'
re.search(pattern=r'at((?!do).)*par', string='fox,cat,dog,parrot') is not None

False

In [57]:
# Match only if 'go' does not occur between 'at' and 'par'.
re.compile(
    r'''at
            ((?!go).)*
            par''',
    flags=re.X,
).search('fox,cat,dog,parrot')

<re.Match object; span=(5, 15), match='at,dog,par'>

In [58]:
# A non-capturing group keeps findall returning the whole match.
re.compile(r'a(?:(?!\d).)*z').findall('at,baz,a2z,bad-zoo')

['at,baz', 'ad-z']

# Exercises

In [None]:
# E1. Replace all whole words with 'X' unless it is preceded by '(' character.
ip = '(apple) guava berry) apple (mango) (grape'

In [None]:
# A1
# '(apple) X X) X (mango) (grape'

In [None]:
# E2. Replace all whole words with 'X' unless it is followed by ')' character.
ip = '(apple) guava berry) apple (mango) (grape'

In [None]:
# A2
# '(apple) X berry) X (mango) (X'

In [None]:
# E3. Replace all whole words with 'X' unless it is preceded by '(' or followed
# by ')' characters.
ip = '(apple) guava berry) apple (mango) (grape'

In [None]:
# A3
# '(apple) X berry) X (mango) (grape'

In [None]:
# E4. Extract all whole words that do not end with 'e' or 'n'.
ip = 'at row on urn e note dust n'

In [None]:
# A4
# ['at', 'row', 'dust']

In [None]:
# E5. Extract all whole words that do not start with 'a' or 'd' or 'n'.
ip = 'at row on urn e note dust n'

In [None]:
# A5
# ['row', 'on', 'urn', 'e']

In [None]:
# E6. Extract all whole words only if they are followed by ':' or ',' or '-'.
ip = 'poke,on=-=so:ink.to/is(vast)ever-sit'

In [None]:
# A6
# ['poke', 'so', 'ever']

In [None]:
# E7. Extract all whole words only if they are preceded by '=' or '/' or '-'.
ip = 'poke,on=-=so:ink.to/is(vast)ever-sit'

In [None]:
# A7
# ['so', 'is', 'sit']

In [None]:
# E8. Extract all whole words only if they are preceded by '=' or ':' and
# followed by ':' or '.'.
ip = 'poke,on=-=so:ink.to/is(vast)ever-sit'

In [None]:
# A8
# ['so', 'ink']

In [None]:
# E9. Extract all whole words only if they are preceded by '=' or ':' or '.' or
# '(' or '-' and not followed by '.' or '/'.
ip = 'poke,on=-=so:ink.to/is(vast)ever-sit'

In [None]:
# A9
# ['so', 'vast', 'sit']

In [None]:
# E10. Remove leading and trailing whitespaces from all the individual fields
# where ',' is the field separator.
csv1 = ' comma  ,separated ,values \t\r '
csv2 = 'good bad,nice  ice  , 42 , ,   stall   small'

In [None]:
# A10
# `re.compile()` without a pattern raises TypeError, so a pattern is supplied.
# A whitespace run is removed when it touches a field edge: it is either not
# preceded by a non-comma character (start of string or right after ','), or
# not followed by a non-comma character (end of string or right before ',').
remove_whitespace = re.compile(r'(?<![^,])\s+|\s+(?![^,])')
# remove_whitespace.sub('', csv1)  # 'comma,separated,values'
remove_whitespace.sub('', csv2)  # 'good bad,nice  ice,42,,stall   small'

In [None]:
# E11. Filter all elements that satisfy all of these rules:
#     should have at least two alphabets
#     should have at least 3 digits
#     should have at least one special character among '%' or '*' or '#' or '$'
#     should not end with a whitespace character
pwds = ['hunter2', 'F2H3u%9', '*X3Yz3.14\t', 'r2_d2_42', 'A $B C1234']

In [None]:
# A11
# ['F2H3u%9', 'A $B C1234']

In [None]:
# E12. For the given string, surround all whole words with '{}' except for
# whole words 'par' and 'cat' and 'apple'.
ip = 'part; cat {super} rest_42 par scatter apple spar'

In [None]:
# A12
# '{part}; cat {{super}} {rest_42} par {scatter} apple {spar}'

In [None]:
# E13. Extract integer portion of floating-point numbers for the given string.
# A number ending with '.' and no further digits should not be considered.
ip = '12 ab32.4 go 5 2. 46.42 5'

In [None]:
# A13
['32', '46']

In [None]:
# E14. For the given input strings, extract all overlapping two character
# sequences.
s1 = 'apple'
s2 = '1.2-3:4'

In [None]:
# A14
# `re.compile()` without a pattern raises TypeError, so a pattern is supplied.
# A lookahead consumes nothing, so the scan advances one character at a time
# and each captured pair may overlap with the next one.
# Usage: pat.findall(s1), pat.findall(s2)
pat = re.compile(r'(?=(..))')
# ['ap', 'pp', 'pl', 'le']
# ['1.', '.2', '2-', '-3', '3:', ':4']

In [None]:
# E15. The given input strings contain fields separated by ':' character.
# Delete ':' and the last field if there is a digit character anywhere before
# the last field.
s1 = '42:cat'
s2 = 'twelve:a2b'
s3 = 'we:be:he:0:a:b:bother'

In [None]:
# A15
# `re.compile()` and `pat.sub()` were called without arguments (TypeError).
# Group 1 starts at a digit and, being greedy, extends to just before the last
# ':'; that ':' and the final field are dropped by writing back only \1.
# Without a digit before the last field nothing matches and the input is kept.
pat = re.compile(r'(\d.*):.*')
pat.sub(r'\1', s1)  # '42'
pat.sub(r'\1', s2)  # 'twelve:a2b'
pat.sub(r'\1', s3)  # 'we:be:he:0:a:b'

In [None]:
# E16. Extract all whole words unless they are preceded by ':' or '<=>' or
# '----' or '#'.
ip = '::very--at<=>row|in.a_b#b2c=>lion----east'

In [None]:
# A16
['at', 'in', 'a_b', 'lion']

In [None]:
# E17. Match strings if it contains 'qty' followed by 'price' but not if there
# is whitespace or the string 'error' between them.
str1 = '23,qty,price,42'
str2 = 'qty price,oh'
str3 = '3.14,qty,6,errors,9,price,3'
str4 = '42\nqty-6,apple-56,price-234,error'
str5 = '4,price,3.14,qty,4'

In [None]:
# A17
# `re.compile()` without a pattern raises TypeError, so a pattern is supplied.
# After 'qty', each character is accepted only if it is not whitespace and does
# not start the string 'error'; 'price' must then follow.
neg = re.compile(r'qty((?!\s|error).)*price')
bool(neg.search(str1))  # True
bool(neg.search(str2))  # False
bool(neg.search(str3))  # False
bool(neg.search(str4))  # True
bool(neg.search(str5))  # False

In [None]:
# E18. Can you reason out why the output shown is different for these two
# regular expressions?
# The replacement is a raw string: '\g' is not a valid escape sequence in a
# normal string literal and triggers an invalid-escape warning.
# Both results are printed so that each call's output is visible in the cell.
ip = 'I have 12, he has 2!'
print(re.sub(r'\b..\b', r'{\g<0>}', ip))
# {I }have {12}{, }{he} has{ 2}!
print(re.sub(r'(?<!\w)..(?!\w)', r'{\g<0>}', ip))
# I have {12}, {he} has {2!}