In [2]:
import re

# case-insensitive

In [27]:
# To set several flags at once, combine them with the bitwise OR operator '|'.
# Here re.I makes the match case-insensitive and re.S lets '.' match '\n'.
result = re.search(r'ab.c', 'Ab\nC', flags=re.I|re.S)
# A bare assignment displays nothing; echo the match object so the cell shows it.
result

<re.Match object; span=(0, 4), match='Ab\nC'>

In [3]:
# re.IGNORECASE: compare the same search with and without the flag.
# re.search returns None when nothing matches, so test against None directly.
sensitive_result = re.search(r'cat', 'Cat') is not None
insensitive_result = re.search(r'cat', 'Cat', flags=re.IGNORECASE) is not None
print(sensitive_result, insensitive_result)

False True


In [4]:
# With the ignore-case flag, 'c' and 't' match in either case; '.' is any one character.
re.compile(r'c.t', flags=re.IGNORECASE).findall('Cat cot CATER ScUtTLe')

['Cat', 'cot', 'CAT', 'cUt']

In [5]:
# Without the flag the character class has to list both cases: r'[a-zA-Z]+'.
# With the flag either case alone is enough, e.g. r'[A-Z]+' works as well.
re.compile(r'[a-z]+', flags=re.IGNORECASE).findall('Sample123string42with777numbers')

['Sample', 'string', 'with', 'numbers']

# Dotall

In [6]:
# Without any flag '.' stops at a newline, so 'the.*ice' cannot reach from
# the first line into the second one and the input comes back unchanged.
re.compile(r'the.*ice').sub('X', 'Hi there\nHave a Nice Day')

'Hi there\nHave a Nice Day'

In [9]:
# With re.S (re.DOTALL) '.' also matches the newline character, so the match
# now runs across the line break.
re.compile(r'the.*ice', flags=re.DOTALL).sub('X', 'Hi there\nHave a Nice Day')

'Hi X Day'

In [8]:
# `re.S` is the short alias of `re.DOTALL`; the two compare equal.
re.DOTALL == re.S

True

# Multiline
문자열을 '\n' 기준으로 줄 단위로 나눈 뒤, 각 줄마다 anchor(`^`, `$`)를 검사한다.

In [21]:
# Does any line of the string start with 'top'?
# re.M makes '^' match at the start of every line; re.I ignores case.
re.search(r'^top', "hi hello\nTop spot", flags=re.MULTILINE | re.IGNORECASE) is not None

True

In [11]:
# Does any line of the string end with 'ar'?
# re.M makes '$' match at the end of every line, not only at the end of the string.
re.search(r'ar$', "spare\npar\ndare", flags=re.MULTILINE) is not None

True

# Verbose


In [13]:
# Verbose form of: pat = re.compile(r'\A((?:[^,]+,){3})([^,]+)')
# A triple-quoted raw string lets the pattern span several lines; under
# re.VERBOSE (re.X) the layout whitespace and the '#' comments are ignored.
pat = re.compile(r'''
    \A(                 # group-1, captures first 3 columns
    (?:[^,]+,){3}   # non-capturing group to get the 3 columns
    )
    ([^,]+)             # group-2, captures 4th column
''', flags=re.VERBOSE)
# Wrap the 4th column in parentheses and keep the first three columns as they are.
re.sub(pat, r'\1(\2)', '1,2,3,4,5,6,7')

'1,2,3,(4),5,6,7'

In [16]:
# Under re.X the unescaped space is dropped from the pattern, so this search
# fails. The value is False but is not displayed: it is neither printed nor
# the last expression of the cell.
re.search(r't a', 'cat and dog', flags=re.X) is not None
# Three ways to match a literal space while re.X is active:
# escape it, put it in a character class, or use its hex escape.
for spaced in (r't\ a', r't[ ]a', r't\x20a'):
    print(re.search(spaced, 'cat and dog', flags=re.X) is not None)

True
True
True


In [17]:
# Under re.X an unescaped '#' starts a comment, so r'a#b' only means 'a'.
# Escaping it as '\#' matches a literal '#'.
print(re.search(r'a#b', 'foo a#b 123', flags=re.VERBOSE).group())
print(re.search(r'a\#b', 'foo a#b 123', flags=re.VERBOSE).group())

a
a#b


In [19]:
# Inline comments are written as (?#comment text goes here).
# They can be used even without the re.X flag.
pat = re.compile(r'\A((?:[^,]+,){3})(?#3-cols)([^,]+)(?#4th-col)')
re.sub(pat, r'\1(\2)', '1,2,3,4,5,6,7')

'1,2,3,(4),5,6,7'

# Inline flags
'inline'의 의미는 '패턴 문자열 내'라는 의미이다.
`re.compile`, `re.search` 등에 선언된 플래그 인자보다 이 종류의 플래그가 우선순위가 높다.
그리고 그룹 기호 안에 들어가긴 하지만 capture group이 아니다.
플래그 이름은 `re.I`처럼 대문자로 쓰던 축약 알파벳을 소문자로 바꿔 쓴다. 예를 들어 `(?i:pat)`는 `pat` 부분에만 적용되는 inline case-insensitive 플래그이다.

  - (?flags:pat) 패턴 문자열 중에서도 지정된 일부분에만 적용된다.
  - (?-flags:pat) will negate flags only for this portion
  - (?flags-flags:pat) will apply and negate particular flags only for this portion
  - (?flags)가 패턴 문자열의 맨 처음에 올 경우 그 패턴 전체 범위에 적용된다.
  - if anchors are needed, they should be specified after (?flags)

In [None]:
# Both results are printed: a notebook cell only displays its last bare
# expression, so the first result would otherwise be lost.

# Case-sensitive for the whole RE definition.
print(re.findall(r'Cat[a-z]*\b', 'Cat SCatTeR CATER cAts'))       # ['Cat']
# Case-insensitive only for the '[a-z]*' portion; 'Cat' itself stays case-sensitive.
print(re.findall(r'Cat(?i:[a-z]*)\b', 'Cat SCatTeR CATER cAts'))  # ['Cat', 'CatTeR']

In [None]:
# Every result is printed: a notebook cell only displays its last bare
# expression, so the first two results would otherwise be lost.

# Case-insensitive for the whole RE definition using the flags argument.
print(re.findall(r'Cat[a-z]*\b', 'Cat SCatTeR CATER cAts', flags=re.I))
# ['Cat', 'CatTeR', 'CATER', 'cAts']

# Case-insensitive for the whole RE definition using an inline flag.
print(re.findall(r'(?i)Cat[a-z]*\b', 'Cat SCatTeR CATER cAts'))
# ['Cat', 'CatTeR', 'CATER', 'cAts']

# Case-sensitive only for the 'Cat' portion: (?-i:...) switches re.I off locally.
print(re.findall(r'(?-i:Cat)[a-z]*\b', 'Cat SCatTeR CATER cAts', flags=re.I))
# ['Cat', 'CatTeR']

# Exercises

In [None]:
# E1. Remove from first occurrence of 'hat' to last occurrence of 'it' for the
# given input strings. Match these markers case insensitively.
# s1 contains two '\n' characters: its first 'hat' (inside 'THAT') is on the
# first line and its last 'it' (inside 'quite') is on the third line, so the
# removed text crosses line breaks. s2 is a single line.
s1 = 'But Cool THAT\nsee What okay\nwow quite'
s2 = 'it this hat is sliced HIT.'

In [None]:
# A1
# The greedy '.*' runs from the first 'hat' to the last 'it'. re.S lets '.'
# cross the newlines in s1 and re.I makes both markers case-insensitive.
pat = re.compile(r'hat.*it', flags=re.I | re.S)

# Print both results; otherwise only the last expression would be displayed.
print(repr(pat.sub('', s1)))  # 'But Cool Te'
print(repr(pat.sub('', s2)))  # 'it this .'

In [None]:
# E2. Delete from 'start' if it is at the beginning of a line up to the next
# occurrence of the 'end' at the end of a line. Match these markers case
# insensitively.
# NOTE: the lines of the literal must not carry a leading '... ' (it looks
# like a pasted REPL continuation prompt). With that prefix no line begins
# with 'start', so the expected output listed with the answer
# ('good start', '', 'hi there', '', '42', '', 'bye') could never be produced.
para = '''\
good start
start working on that
project you always wanted
to, do not let it end
hi there
start and end the end
42
Start and try to
finish the End
bye'''

In [None]:
# A2
# re.M anchors '^' and '$' at every line, re.S lets '.' cross newlines, and the
# lazy '.*?' stops at the first 'end' that closes a line. re.I makes 'start'
# and 'end' case-insensitive.
pat = re.compile(r'^start.*?end$', flags=re.I | re.M | re.S)
print(pat.sub('', para))
# Expected output when the lines of `para` carry no leading '... ' text:
# good start
#
# hi there
#
# 42
#
# bye

In [None]:
# E3. For the given input strings, match all of these three patterns:
#  - 'This' case sensitively
#  - 'nice' and 'cool' case insensitively
# The three words appear in a different order in s1 and s3, so the order must
# not matter. s2 only contains a lowercase 'this'.
s1 = 'This is nice and Cool'
s2 = 'Nice and cool this is'
s3 = 'What is so nice and cool about This?'

In [None]:
# A3
# Three lookaheads checked from the same position make the order of the words
# irrelevant. 'This' stays case-sensitive; the (?i:...) group turns on
# case-insensitive matching only for 'nice' and 'cool'.
pat = re.compile(r'(?=.*This)(?i:(?=.*nice)(?=.*cool))')
# Print every result; otherwise only the last expression would be displayed.
print(bool(pat.search(s1)))  # True
print(bool(pat.search(s2)))  # False
print(bool(pat.search(s3)))  # True

In [None]:
# E4. For the given input strings, match if the string begins with 'Th' and
# also contains a line that starts with 'There'.
# s1: begins with 'Th' and its first line starts with 'There'.
# s2: begins with 'Th' and its last line starts with 'There'.
# s3: has a line starting with 'There', but the string itself begins with 'Oh'.
s1 = 'There there\nHave a cookie'
s2 = 'This is a mess\nYeah?\nThereeeee'
s3 = 'Oh\nThere goes the fun'

In [None]:
# A4
# '\A(?=Th)' checks the very start of the string without consuming it, so the
# 'There' line may also be the first line. '(?s:.*)' skips over any text,
# newlines included, and with re.M the '^' matches at the start of any line.
pat = re.compile(r'\A(?=Th)(?s:.*)^There', flags=re.M)
# Print every result; otherwise only the last expression would be displayed.
print(bool(pat.search(s1)))  # True
print(bool(pat.search(s2)))  # True
print(bool(pat.search(s3)))  # False

In [None]:
# E5. Explore what the `re.DEBUG` flag does by compiling the sample patterns
# below with it.
re.compile(r'\Aden|ly\Z', re.DEBUG)
re.compile(r'\b(0x)?[\da-f]+\b', re.DEBUG)
re.compile(r'\b(?:0x)?[\da-f]+\b', re.IGNORECASE | re.DEBUG)