In [1]:
import re

# Alternation
'alternation'은 여기서 '대안(여러 후보 중 하나)'이라는 의미로 쓰이며, 이를 나타내는 데 `|` 기호를 사용한다. 예를 들어 `cat|dog`는 'cat' 또는 'dog'에 일치한다.

In [12]:
# Sample sentences: the first two contain a word covered by the pattern,
# the third does not.
sentence1 = 'I like cats'
sentence2 = 'I like dogs'
sentence3 = 'I like parrots'

# `|` separates alternatives: the pattern matches either 'cat' or 'dog'.
compiled_pattern = re.compile(r'cat|dog')
matched1, matched2, matched3 = (
    compiled_pattern.search(sentence)
    for sentence in (sentence1, sentence2, sentence3)
)

# search() returns a match object (truthy) on success and None (falsy) otherwise.
print(*map(bool, (matched1, matched2, matched3)))

True True False


In [13]:
# Each alternative carries its own anchor:
#   \Acat -> 'cat' at the very start of the string
#   cat\b -> 'cat' immediately followed by a word boundary (end of a word)
re.compile(r'\Acat|cat\b').sub('X', 'catapults concatenate cat scat')

'Xapults concatenate X sX'

In [19]:
# Build the alternation from a list of words instead of typing it out by hand.
raw_pattern = '|'.join(['cat', 'dog'])
# Show the generated pattern; the recorded output of this cell includes it.
print(raw_pattern)
# Last expression of the cell: every 'cat' or 'dog' becomes 'mammal'.
re.sub(raw_pattern, 'mammal', 'cat dog parrot')

cat|dog


'mammal mammal parrot'

# Grouping
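Wrap part of a pattern in `()` to group it: alternation (`|`) then applies only inside the parentheses, so the portion shared by all alternatives can be written once outside the group.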

In [25]:
# Without grouping: every alternative is spelled out in full.
substr1 = re.compile(r'reform|rest').sub('X', 'red reform read arrest')
# With grouping: the common prefix 're' is written once, outside the group.
substr2 = re.compile(r're(form|st)').sub('X', 'red reform read arrest')
# Both forms produce the same result.
print(substr1, substr2, sep='\n')

red X read arX
red X read arX


In [27]:
# Three equivalent ways to replace the whole words 'par' and 'part':
# 1) repeat the word boundaries in every alternative,
matched1 = re.compile(r'\bpar\b|\bpart\b').sub('X', 'par spare part party')
# 2) factor the boundaries out of the group,
matched2 = re.compile(r'\b(par|part)\b').sub('X', 'par spare part party')
# 3) additionally factor out the shared 'par'; the empty alternative covers plain 'par'.
matched3 = re.compile(r'\bpar(|t)\b').sub('X', 'par spare part party')
print(matched1, matched2, matched3, sep='\n')

X spare X party
X spare X party
X spare X party


In [32]:
terms = ['no', 'ten', 'it']
items = ['dip', 'nobody', 'it', 'oh', 'no', 'bitten']

# Escape every term so that any regex metacharacter in it is matched literally.
alternatives = '|'.join(map(re.escape, terms))

# Method 1: search() with word boundaries around the grouped alternation.
# The group is required; without it `\b` would apply only to the first and
# last alternatives.
raw_pattern = r'\b(' + alternatives + r')\b'
compiled_pattern = re.compile(raw_pattern)
# Keep the item itself and run the search only once per item.
matched = [item for item in items if compiled_pattern.search(item)]
print(matched)

# Method 2 (using `fullmatch`): the whole item must match, so neither anchors
# nor grouping are needed. For this data both methods give the same result.
raw_pattern = alternatives
compiled_pattern = re.compile(raw_pattern)
matched = [item for item in items if compiled_pattern.fullmatch(item)]
print(matched)

['it', 'no']
['it', 'no']


# Precedence rules

In [39]:
# Scenario 1: the alternatives match at different positions
words = 'lion elephant are rope not'

matched1 = re.search(r'on', words)
beg1, end1 = matched1.start(), matched1.end()

matched2 = re.search(r'ant', words)
beg2, end2 = matched2.start(), matched2.end()

print(beg1, beg2)

# 'on' occurs earlier in the text than 'ant', so the match that starts
# earliest wins, regardless of the order in which the alternatives are written.
substr1, substr2 = (
    re.sub(pattern, 'X', words, count=1)
    for pattern in (r'on|ant', r'ant|on')
)
print([substr1, substr2])

2 10
['liX elephant are rope not', 'liX elephant are rope not']


In [41]:
# Scenario 2: both alternatives can match at the same starting index
mood = 'best years'
year_match = re.search(r'year', mood)
years_match = re.search(r'years', mood)
beg1, end1 = year_match.start(), year_match.end()
beg2, end2 = years_match.start(), years_match.end()
print(beg1, beg2)

# 'year' and 'years' always start at the same index, so the alternative
# listed first is the one that gets replaced.
substr1 = re.sub(r'year|years', 'X', mood, count=1)
substr2 = re.sub(r'years|year', 'X', mood, count=1)
print(f'{substr1} \n {substr2}')

5 5
best Xs 
 best X


In [42]:
# How alternation is resolved for (alt1|alt2|...):
# at each position in the text the engine tries the alternatives from left to
# right and takes the first one that matches there; the remaining alternatives
# are not tried at that position.
# So if a short alternative must not pre-empt a longer one that starts the
# same way, list the alternatives from longest to shortest.
words = 'ear xerox at mare part learn eye'

substr1, substr2, substr3 = (
    re.sub(pattern, 'X', words)
    for pattern in (
        # behaves the same as r'ar': 'are' and 'art' never get a chance
        r'ar|are|art',
        # behaves the same as r'are|ar': 'art' never gets a chance
        r'are|ar|art',
        # longest first: every alternative can match
        r'are|art|ar',
    )
)

print(substr1, substr2, substr3, sep='\n')

eX xerox at mXe pXt leXn eye
eX xerox at mX pXt leXn eye
eX xerox at mX pX leXn eye


# Exercises

In [51]:
# E1. From the list below, keep every element that starts with 'den'
# or ends with 'ly'.
items = [
    'lovely',
    '1\ndentist',
    '2 lonely',
    'eden',
    'fly\n',
    'dent',
]

In [55]:
# A1
# Expected: ['lovely', '2 lonely', 'dent']
# \A and \Z anchor to the start and end of the whole string, not to
# individual lines, so '1\ndentist' and 'fly\n' are rejected.
raw_pattern = r'\Aden|ly\Z'
compiled_pattern = re.compile(raw_pattern)
matched = list(filter(compiled_pattern.search, items))
print(matched)

['lovely', '2 lonely', 'dent']


In [56]:
# E2. From the list below, keep every element that has a line starting
# with 'den' or a line ending with 'ly'.
items = [
    'lovely',
    '1\ndentist',
    '2 lonely',
    'eden',
    'fly\nfar',
    'dent',
]

In [59]:
# A2
# Expected: ['lovely', '1\ndentist', '2 lonely', 'fly\nfar', 'dent']
# With the MULTILINE flag, ^ and $ also match at the start and end of each line.
raw_pattern = r'^den|ly$'
compiled_pattern = re.compile(raw_pattern, re.MULTILINE)
matched = list(filter(compiled_pattern.search, items))
print(matched)

['lovely', '1\ndentist', '2 lonely', 'fly\nfar', 'dent']


In [60]:
# E3. In both input strings, replace every occurrence of 'removed', 'reed',
# 'received' or 'refused' with 'X'.
s1, s2 = (
    'creed refuse removed read',
    'refused reed redo received',
)

In [63]:
# A3
# The shared prefix 're' and suffix 'ed' are written once; the empty last
# alternative in the group covers 'reed'.
pat = re.compile(r're(mov|ceiv|fus|)ed')

# Expected output:
#   cX refuse X read
#   X X redo X
print(*(pat.sub('X', text) for text in (s1, s2)), sep='\n')

cX refuse X read
X X redo X


In [66]:
# E4. In both input strings, replace every match of an entry in `words`
# with 'A'.

s1, s2 = (
    'plate full of slate',
    "slated for later, don't be late",
)
words = [
    'late',
    'later',
    'slated',
]

In [67]:
# A4
# Put longer words first so that 'late' cannot win over 'later' when both
# could match at the same starting position.
longest_first = sorted(words, key=len, reverse=True)
raw_pattern = '|'.join(longest_first)
pat = re.compile(raw_pattern)

# Expected output:
#   pA full of sA
#   A for A, don't be A
print(*(pat.sub('A', text) for text in (s1, s2)), sep='\n')

pA full of sA
A for A, don't be A


In [75]:
# E5. From `items`, keep only the elements that are, as a whole, equal to
# one of the entries in `words`.
items = [
    'slate',
    'later',
    'plate',
    'late',
    'slates',
    'slated ',
]
words = [
    'late',
    'later',
    'slated',
]

In [79]:
# A5
# Expected: ['later', 'late']
longest_first = sorted(words, key=len, reverse=True)
raw_pattern = '|'.join(longest_first)
compiled_pattern = re.compile(raw_pattern)

# fullmatch() succeeds only when the entire element matches one alternative,
# so 'slated ' (with its trailing space) is rejected.
matched = list(filter(compiled_pattern.fullmatch, items))
print(matched)

['later', 'late']
