# String anchors

In [5]:
import re

## 용어 설명
- literal characters: 특별한 기능 없이 문자 그대로 해석되어 자기 자신과 매칭되는 문자.
- metacharacters: 특수한 기능을 담고 있는 문자. 이 기능을 지닌 문자로 해석되지 않게 만들려면 escape해야 한다.

## \A - the start of a string

In [7]:
# \A matches only at the very start of the string, so 'cat' is found only
# when the string itself begins with it.
start_anchor = re.compile(r'\Acat')
for candidate in ('cater', 'concatenation'):
    print(start_anchor.search(candidate))

<re.Match object; span=(0, 3), match='cat'>
None


In [8]:
two_lines = 'hi hello\ntop spot'
# The text after '\n' is still part of the same string, so \A does not match
# there: only the first search succeeds.
for start_pattern in (r'\Ahi', r'\Atop'):
    print(re.search(start_pattern, two_lines))

<re.Match object; span=(0, 2), match='hi'>
None


## \Z - the end of a string

In [10]:
# \Z matches only at the very end of the string, so 'are' is found only when
# the string ends with it.
end_anchor = re.compile(r'are\Z')
for candidate in ('spare', 'nearest'):
    print(end_anchor.search(candidate))

<re.Match object; span=(2, 5), match='are'>
None


In [12]:
# Report, for every word, whether it ends with 'er' (\Z = end of string).
raw_pattern = r'er\Z'
compiled_pattern = re.compile(raw_pattern)
words = ['surrender', 'unicorn', 'newer', 'door', 'empty', 'eel', 'pest']
for word in words:
    print(f'{raw_pattern} in {word}: {compiled_pattern.search(word)}')

er\Z in surrender: <re.Match object; span=(7, 9), match='er'>
er\Z in unicorn: None
er\Z in newer: <re.Match object; span=(3, 5), match='er'>
er\Z in door: None
er\Z in empty: None
er\Z in eel: None
er\Z in pest: None


In [14]:
# Anchors match positions rather than characters, so substituting on them
# inserts text: \A adds a prefix and \Z adds a suffix.
for anchor, addition, base in ((r'\A', 're', 'live'), (r'\Z', 'er', 'hack')):
    print(re.sub(anchor, addition, base))

relive
hacker


In [20]:
# The third positional argument of re.search() is `flags`, not a start
# offset; only a compiled pattern's search() method accepts `pos`.
at_start = re.compile(r'\Aat')

# Even when the search begins at index 1, \A still refers to the real start
# of the string, so nothing is found.
not_found = at_start.search('cater', 1)
# Slicing builds a new string that genuinely begins with 'at'.
found = at_start.search('cater'[1:])

print(not_found)
print(found)

None
<re.Match object; span=(0, 2), match='at'>


## fullmatch

In [23]:
# For this pattern, re.fullmatch(r'cat', s) and re.search(r'\Acat\Z', s) agree:
# both succeed only when the whole string is exactly 'cat'.
whole_word, embedded = 'cat', 'concatenation'

found = re.search(r'\Acat\Z', whole_word)
not_found = re.search(r'\Acat\Z', embedded)

matched = re.fullmatch(r'cat', whole_word)
not_matched = re.fullmatch(r'cat', embedded)

# Line anchors

## ^ - start of the line
여기서 'line'은 MULTILINE 플래그가 지정되었을 때 \n을 기준으로 나뉜 문자열의 각 줄을 의미하며, ^는 각 줄의 시작(문자열의 시작과 \n 바로 다음)에서 매칭된다.
이 플래그가 없으면 line의 시작은 문자열의 시작과 일치한다.

In [45]:
string = '\nabc\n123'
line_pattern = r'^123'

# Without re.M the whole string is treated as a single line, so ^ can match
# only at index 0 and the search fails.
print(re.search(line_pattern, string))
# With re.M, ^ also matches right after each '\n', so '123' is found.
print(re.search(line_pattern, string, flags=re.MULTILINE))

None
<re.Match object; span=(5, 8), match='123'>


In [46]:
# With re.MULTILINE, $ matches at the end of the string and just before every
# '\n'. Without the flag it matches only at the end of the string or just
# before a final trailing '\n'. \Z matches only at the very end of the string.
greeting = 'hi there\nhave a nice day\n'
line_end = re.compile(r'there$', flags=re.M)
print(line_end.search(greeting))

<re.Match object; span=(3, 8), match='there'>


In [52]:
lines = 'catapults\nconcatenate\ncat'
# With re.M, substituting on ^ prefixes every line and substituting on $
# suffixes every line.
for line_anchor, inserted in ((r'^', '* '), (r'$', '.')):
    print(re.sub(line_anchor, inserted, lines, flags=re.MULTILINE))

* catapults
* concatenate
* cat
catapults.
concatenate.
cat.


## word anchors
word: alphanumeric 문자나 _로만 구성된 문자 시퀀스. 양 옆에는 그 외의 문자가 놓이거나 문자열의 시작/끝이 온다.

In [53]:
words = 'par spar apparent spare part'

# Replace 'par' with 'X' under four different word-boundary rules:
#   par      -> wherever it occurs
#   \bpar    -> only at the start of a word
#   par\b    -> only at the end of a word
#   \bpar\b  -> only when it is a whole word on its own
for boundary_pattern in (r'par', r'\bpar', r'par\b', r'\bpar\b'):
    print(re.sub(boundary_pattern, 'X', words))

X sX apXent sXe Xt
X spar apparent spare Xt
X sX apparent spare part
X spar apparent spare part


In [58]:
# \b is a zero-width position, so substituting on it inserts text at every
# word boundary instead of replacing characters.
print(re.sub(r'\b', '"', words))
for sample in ('---hello---', 'circumference=2*pi*r'):
    print(re.sub(r'\b', ' ', sample))

"par" "spar" "apparent" "spare" "part"
--- hello ---
 circumference = 2 * pi * r 


In [67]:
words = 'par spar apparent spare part'

# \B is the opposite of \b: it matches wherever there is no word boundary.
#   \Bpar    -> 'par' not at the start of a word
#   \Bpar\b  -> 'par' at the end of a word but not at its start
#   par\B    -> 'par' not at the end of a word
#   \Bpar\B  -> 'par' strictly inside a word
for non_boundary_pattern in (r'\Bpar', r'\Bpar\b', r'par\B', r'\Bpar\B'):
    print(re.sub(non_boundary_pattern, 'X', words))

par sX apXent sXe part
par sX apparent spare part
par spar apXent sXe Xt
par spar apXent sXe part


In [71]:
# \b and \B are complementary, so together these substitutions mark every
# position in the string exactly once.
re.sub(r'\b', ':', 'copper')  # ':copper:'
re.sub(r'\B', ':', 'copper')  # 'c:o:p:p:e:r'

re.sub(r'\b', ' ', '---hello---')  # '--- hello ---'
# The values above are discarded; only this last expression appears as the
# notebook cell's output.
re.sub(r'\B', ' ', '---hello---')

' - - -h e l l o- - - '

# Exercises

In [74]:
# E1. Check whether each of the given strings starts with 'be'.
line1, line2 = 'be nice', '"best!"'
line3, line4 = 'better?', 'oh no\nbear spotted'

In [77]:
# A1
# No re.M flag is given, so ^ anchors to the start of the whole string only;
# the 'be' after the newline in line4 therefore does not count.
pat = re.compile(r'^be')

# Expected: True, False, True, False
for candidate in (line1, line2, line3, line4):
    print(pat.search(candidate) is not None)

True
False
True
False


In [73]:
# E2. In the given input string, replace 'red' with 'brown' only where it
# occurs as a whole word.
words = 'bred red spread credible'

In [80]:
# A2
# \b on both sides restricts the match to the standalone word 'red'.
substr = re.sub(r'\bred\b', 'brown', words)
print(substr)  # bred brown spread credible

'bred brown spread credible'

In [85]:
# E3. From the given input list, keep only the elements that contain '42'
# surrounded by word characters (\w) on both sides.
words = ['hi42bye', 'nice1423', 'bad42', 'cool_42a', 'fake4b']

In [88]:
# A3
# '4' and '2' are word characters, so \B on either side of '42' requires
# another word character to be adjacent there.
# Expected: ['hi42bye', 'nice1423', 'cool_42a']
inner_42 = re.compile(r'\B42\B')
matched = list(filter(inner_42.search, words))
print(matched)

['hi42bye', 'nice1423', 'cool_42a']


In [91]:
# E4. From the given input list, keep only the strings that start with 'den'
# or end with 'ly' (start/end of the whole string, not of a line).
items = ['lovely', '1\ndentist', '2 lonely', 'eden', 'fly\n', 'dent']

In [94]:
# A4
# \A and \Z anchor to the whole string, so '1\ndentist' and 'fly\n' are
# rejected.
# Expected: ['lovely', '2 lonely', 'dent']
raw_pattern1 = r'\Aden'
raw_pattern2 = r'ly\Z'
matched = [
    item for item in items
    if any(re.search(p, item) for p in (raw_pattern1, raw_pattern2))
]
print(matched)

['lovely', '2 lonely', 'dent']


In [96]:
# E5. In the given input string, replace the whole word 'mall' with '1234'
# only where it appears at the start of a line.
para = '''\
ball fall wall tall
mall call ball pall
wall mall ball fall
mallet wallet malls'''

In [118]:
# A5
# With re.M, ^ matches at the start of every line; \b keeps 'mallet' intact.
# Expected:
#   ball fall wall tall
#   1234 call ball pall
#   wall mall ball fall
#   mallet wallet malls
line_start_mall = re.compile(r'^mall\b', flags=re.MULTILINE)
print(line_start_mall.sub('1234', para))

ball fall wall tall
1234 call ball pall
wall mall ball fall
mallet wallet malls


In [119]:
# E6. From the given list, keep only the elements that have at least one line
# starting with 'den' or ending with 'ly'.
items = ['lovely', '1\ndentist', '2 lonely', 'eden', 'fly\nfar', 'dent']

In [123]:
# A6
# re.M makes ^ and $ work per line instead of per string.
# Expected: ['lovely', '1\ndentist', '2 lonely', 'fly\nfar', 'dent']
den_line_start = re.compile(r'^den', flags=re.MULTILINE)
ly_line_end = re.compile(r'ly$', flags=re.MULTILINE)
matched = [
    item for item in items
    if den_line_start.search(item) or ly_line_end.search(item)
]
print(matched)

['lovely', '1\ndentist', '2 lonely', 'fly\nfar', 'dent']


In [124]:
# E7. From the given input list, keep only the elements that are exactly
# '12\nthree' as a whole, irrespective of case.
items = ['12\nthree\n', '12\nThree', '12\nthree\n4', '12\nthree']

In [130]:
# A7
# Two equivalent solutions: fullmatch on the bare pattern, or search with the
# pattern wrapped in \A ... \Z. \Z rejects the element with a trailing '\n'.
# Expected: ['12\nThree', '12\nthree']
whole_pattern = re.compile(r'12\nthree', flags=re.IGNORECASE)
anchored_pattern = re.compile(r'\A12\nthree\Z', flags=re.IGNORECASE)
matched1 = list(filter(whole_pattern.fullmatch, items))
matched2 = list(filter(anchored_pattern.search, items))
print(matched1, matched2)

['12\nThree', '12\nthree'] ['12\nThree', '12\nthree']


In [131]:
# E8. In the given input list, replace 'hand' with 'X' in every element that
# starts with 'hand' followed by at least one word character.
items = ['handed', 'hand', 'handy', 'unhanded', 'handle', 'hand-2']

In [139]:
# A8
# \B after 'hand' demands that a word character follows, so a bare 'hand' and
# 'hand-2' are left untouched.
# Expected: ['Xed', 'hand', 'Xy', 'unhanded', 'Xle', 'hand-2']
hand_prefix = re.compile(r'^hand\B')
subs = [hand_prefix.sub('X', item) for item in items]
print(subs)

['Xed', 'hand', 'Xy', 'unhanded', 'Xle', 'hand-2']


In [140]:
# E9. From the given input list, keep only the elements that start with 'h',
# and replace every 'e' with 'X' in the elements that are kept.
items = ['handed', 'hand', 'handy', 'unhanded', 'handle', 'hand-2']

In [145]:
# A9
# Filter first (string must start with 'h'), then substitute in the survivors.
# Expected: ['handXd', 'hand', 'handy', 'handlX', 'hand-2']
starts_with_h = re.compile(r'\Ah')
subs = [re.sub(r'e', 'X', item) for item in filter(starts_with_h.search, items)]
print(subs)

['handXd', 'hand', 'handy', 'handlX', 'hand-2']
