In [1]:
import re

`[]` 안에 들어 있는 글자들 중 하나를 상징한다.

In [5]:
words = ['cute', 'cat', 'cot', 'coat', 'cost', 'scuttle']

# same as: r'cot|cut' or r'c(o|u)t'
result = [word for word in words if re.search(r'c[ou]t', word)]
result

['cute', 'cot', 'scuttle']

In [7]:
# same as: r'(a|e|o)+t'
re.sub(r'[aeo]+t', 'X', 'meeting cute boat site foot')

'mXing cute bX site fX'

metachar들은 charset 안에서 본래 의미 대신 새로운 의미로 쓰이는 경우가 있다.

In [9]:
# 'a-c', '1-3'처럼 문자집합 내 시퀀스를 표현한다.
re.findall(r'[0-9]+', 'Sample123string42with777numbers')

['123', '42', '777']

In [10]:
# 소문자나 숫자의 연속된 조합으로 이루어진 단어.
re.findall(r'\b[a-z0-9]+\b', 'coat Bin food tar12 best')

['coat', 'food', 'tar12', 'best']

In [11]:
# p에서 z로 시작한 다음 알파벳 소문자로 이루어진 단어, 혹은 p에서 z로만 이뤄진 단어.
re.findall(r'\b[p-z][a-z]*\b', 'coat tin food put stoop best')

['tin', 'put', 'stoop']

In [12]:
# a-f, p-t 사이 글자로만 최소 하나 이상 모여 이뤄진 단어
re.findall(r'\b[a-fp-t]+\b', 'coat tin food put stoop best')

['best']

## negating
'^'는 문자 집합 처음에 와 그 이후의 문자들은 사용되지 않음을 알린다.

In [13]:
re.findall(r'[^0-9]+', 'Sample123string42with777numbers')

['Sample', 'string', 'with', 'numbers']

In [14]:
# 문자열 초두에 오는 'abc:def:' 이 패턴을 지우고자 한다.
re.sub(r'\A([^:]+:){2}', '', 'foo:123:bar:baz', count=1)

'bar:baz'

In [15]:
re.sub(r'=[^=]+\Z', '', 'foo=42; baz=123', count=1)

'foo=42; baz'

In [16]:
dates = '2020/04/25,1986/Mar/02,77/12/31'
re.findall(r"""  # 숫자 외 문자도 날짜 표현에 사용될 수 있다.
    ([^/]+)/     # 연, 월, 일 중 하나
    ([^/]+)/     # 연, 월, 일 중 하나
    ([^/]+),?    # 연, 월, 일 중 하나
""", dates, flags=re.VERBOSE)

[('2020', '04', '25,1986'), ('Mar', '02,77', '12')]

In [22]:
words = ['tryst', 'fun', 'glyph', 'pity', 'why']
not_vowels1 = [word for word in words if re.search(r'\A[^aeiou]+\Z', word)]
not_vowels2 = [word for word in words if not re.search(r'[aeiou]', word)]
print(not_vowels1, not_vowels2)

['tryst', 'glyph', 'why'] ['tryst', 'glyph', 'why']


# Escaping in charset

In [23]:
# 알파벳 소문자나 하이픈으로 이루어진 단어.
# 문자 집합 안에서 짝을 이루지 못한 하이픈은 literally 해석된다.
re.findall(r'\b[a-z-]{2,}\b', 'ab-cd gh-c 12-423')

['ab-cd', 'gh-c']

In [28]:
# 짝을 이루지 못하게 하든지, '\'로 escaping 시키든지 할 수 있다.
literal_hyphen1 = re.findall(r'\b[a-z\-0-9]{2,}\b', 'ab-cd gh-c 12-423')
literal_hyphen2 = re.findall(r'\b[a-z-0-9]{2,}\b', 'ab-cd gh-c 12-423')
print(literal_hyphen1, literal_hyphen2)

['ab-cd', 'gh-c', '12-423'] ['ab-cd', 'gh-c', '12-423']


In [29]:
# 문자 집합 내 처음에 오지 않는 '^'는 literally(글자 모양 그대로) 해석된다.
re.findall(r'a[+^]b', 'f*(a^b) - 3*(a+b)')

['a^b', 'a+b']

In [30]:
# 문자 집합 처음에 왔어도 '\'로 인해 literally 해석된다.
re.findall(r'a[\^+]b', 'f*(a^b) - 3*(a+b)')

['a^b', 'a+b']

In [32]:
# 아래 패턴에서 마지막에 등장한 '['와 처음에 등장한 ']'는 escaped.
# 알파벳 소문자, 숫자, 혹은 각괄호가 최소 1번은 등장하며 이루어진 시퀀스.
re.search(r'[]a-z0-9[]+', 'words[5] = tea')[0]

'words[5]'

In [33]:
# 이번에도 의미는 같으나, escaping할 각괄호는 순서에 상관 없이 나열할 수 있다.
re.search(r'[a-z\[\]0-9]+', 'words[5] = tea')[0]

'words[5]'

In [34]:
# '\'는 '\'가 escape시킨다.
re.search(r'[a\\b]+', r'5ba\babc2')[0]

'ba\\bab'

# 문자 집합을 대신할 escaped chars
  - `\w == [a-zA-Z0-9_]`
    - `\W == [^a-zA-Z0-9_]`
  - `\d == [0-9]`
    - `\D == [^0-9]`
  - `\s == [ \t\n\r\f\v]`
    - `\S == [^ \t\n\r\f\v]`

In [35]:
# 연속된 숫자를 구분자로 사용.
re.split(r'\d+', 'Sample123string42with777numbers')

['Sample', 'string', 'with', 'numbers']

In [36]:
re.findall(r'\d+', 'foo=5, bar=3; x=83, y=120')

['5', '3', '83', '120']

In [37]:
''.join(
    re.findall(r'\b\w', 'sea eat car rat eel tea')
)

'secret'

In [38]:
re.findall(r'[\w\s]+', 'tea sea-pit sit-lean\tbean')

['tea sea', 'pit sit', 'lean\tbean']

## Negate logic in escaped sequences

In [39]:
re.sub(r'\D+', '-', 'Sample123string42with777numbers')

'-123-42-777-'

In [40]:
re.sub(r'\W+', '', 'foo=5, bar=3; x=83, y=12')

'foo5bar3x83y12'

In [44]:
ws_excluded = re.findall(r'\S+', '   1..3  \v\f  foo_baz 42\tzzz   \r\n1-2-3  ')
print(ws_excluded)
# \S를 구분자로 쪼개는 기능은 str.split()과 동일하다.
print('   1..3  \v\f  foo_baz 42\tzzz   \r\n1-2-3  '.split())

['1..3', 'foo_baz', '42', 'zzz', '1-2-3']
['1..3', 'foo_baz', '42', 'zzz', '1-2-3']


# Numeric ranges

In [45]:
# [10,30)인 정수
re.findall(r'\b[12]\d\b', '23 154 12 26 98234')

['23', '12', '26']

In [46]:
# 세 자리 숫자로 이뤄진 시퀀스 추출
re.findall(r'\b\d{3,}\b', '23 154 12 26 98234 000')

['154', '98234', '000']

In [48]:
# leading 0s가 올 수도 있는 세 자리 숫자.
re.findall(r'\b0*[1-9]\d{2,}\b', '0501 035 154 12 26 98234')

['0501', '154', '98234']

In [52]:
#
m_iter = re.finditer(r'\d+', '45 349 651 593 4 204')
under_350 = [m[0] for m in m_iter if int(m[0]) < 350]
print(under_350)

['45', '349', '4', '204']


In [53]:
def num_range(s):
    """Find a number between [200,650]."""
    return '1' if 200 <= int(s[0]) <= 650 else '0'


re.sub(r'\d+', num_range, '45 349 651 593 4 204')

'0 1 0 1 0 1'


# Exercises

In [56]:
# E1. For the list `items`, filter all elements starting with 'hand' and ending
# with 's' or 'y' or 'le'.
items = ['-handy', 'hand', 'handy', 'unhand', 'hands', 'handle']

In [60]:
# A1
result = [item for item in items if re.search(r'\Ahand.*[sy(le)]\Z', item)]
print(result)  # ['handy', 'hands', 'handle']

['handy', 'hands', 'handle']


In [61]:
# E2. Replace all whole words 'reed' or 'read' or 'red' with 'X'.
ip = 'redo red credible :read: rod reed'

In [63]:
# A2
re.sub(r're[ea]*d\b', 'X', ip)  # 'redo X credible :X: rod X'

'redo X credible :X: rod X'

In [64]:
# E3. For the list `words`, filter all elements containing 'e' or 'i' followed
# by 'l' or 'n'. Note that the order mentioned should be followed.
words = ['surrender', 'unicorn', 'newer', 'door', 'empty', 'eel', 'pest']

In [67]:
# A3
result = [word for word in words if re.search(r'[ei].*[ln]', word)]
print(result)  # ['surrender', 'unicorn', 'eel']

['surrender', 'unicorn', 'eel']


In [68]:
# E4. For the list `words`, filter all elements containing 'e' or 'i', and 'l'
# or 'n' in any order.
words = ['surrender', 'unicorn', 'newer', 'door', 'empty', 'eel', 'pest']

In [82]:
# A4
# ['surrender', 'unicorn', 'newer', 'eel']
for word in words:
    matched = re.search(r'([ei].*[ln])|([ln].*[ei])', word)
    if matched:
        print(word)

surrender
unicorn
newer
eel


In [83]:
# E5. Extract all hex character sequences, with '0x' optional prefix.
# Match the characters case insensitively, and the sequences shouldn't be
# surrounded by other word characters.
str1 = '128A foo 0xfe32 34 0xbar'
str2 = '0XDEADBEEF place 0x0ff1ce bad'

In [99]:
# A5
hex_seq = re.compile(r'\b0[xX][0-9a-fA-F]+\b|\b[0-9a-fA-F]+\b')
print(hex_seq.findall(str1))  # ['128A', '0xfe32', '34']
print(hex_seq.findall(str2))  # ['0XDEADBEEF', '0x0ff1ce', 'bad']

['128A', '0xfe32', '34']
['0XDEADBEEF', '0x0ff1ce', 'bad']


In [101]:
# E6. Delete from '(' to the next occurrence of ')' unless they contain
# parentheses characters in between.
# 내부에 또 다른 괄호쌍을 품지 않은 괄호쌍만 지울 것.
str1 = 'def factorial()'
str2 = 'a/b(division) + c%d(#modulo) - (e+(j/k-3)*4)'
str3 = 'Hi there(greeting). Nice day(a(b)'

In [105]:
# A6
remove_parentheses = re.compile(r'\(.*?\)')  ##### add your solution here
print(remove_parentheses.sub('', str1))  # 'def factorial'
print(remove_parentheses.sub('', str2))  # 'a/b + c%d - (e+*4)'
print(remove_parentheses.sub('', str3))  # 'Hi there. Nice day(a'

def factorial
a/b + c%d - *4)
Hi there. Nice day


In [10]:
# E7. For the list `words`, filter all elements not starting with 'e' or 'p' or 'u'.
words = ['surrender', 'unicorn', 'newer', 'door', 'empty', 'eel', 'pest']

In [106]:
# A7
# ['surrender', 'newer', 'door']
for word in words:
    if re.search(r'\A[^epu]', word):
        print(word)

surrender
newer
door


In [107]:
# E8. For the list `words`, filter all elements not containing 'u' or 'w' or 'ee' or '-'.
words = ['p-t', 'you', 'tea', 'heel', 'owe', 'new', 'reed', 'ear']

In [131]:
# A8
# ['tea', 'ear']
for word in words:
    pattern1 = r'\A[^uw-]+\Z'
    if re.search('[(e{2})]', word):
        print(word)

tea
heel
owe
new
reed
ear


In [132]:
# E9. The given input strings contain fields separated by ',' and fields can be
# empty too. Replace last three fields with 'WHTSZ323'.
row1 = '(2),kite,12,,D,C,,'
row2 = 'hi,bye,sun,moon'

In [134]:
# A9
pat = re.compile(r'\w*,\w*,\w*\Z')
print(pat.sub('WHTSZ323', row1))  # '(2),kite,12,,D,WHTSZ323'
print(pat.sub('WHTSZ323', row2))  # 'hi,WHTSZ323'

(2),kite,12,,D,WHTSZ323
hi,WHTSZ323


In [135]:
# E10. Split the given strings based on consecutive sequence of digit or whitespace characters.
str1 = 'lion \t Ink32onion Nice'
str2 = '**1\f2\n3star\t7 77\r**'

In [136]:
# A10
pat = re.compile(r'[\d\s]+')
print(pat.split(str1))  # ['lion', 'Ink', 'onion', 'Nice']
print(pat.split(str2))  # ['**', 'star', '**']

['lion', 'Ink', 'onion', 'Nice']
['**', 'star', '**']


In [137]:
# E11. Delete all occurrences of the sequence <characters> where characters are
# one or more non '>' characters and cannot be empty.
ip = 'a<apple> 1<> b<bye> 2<> c<cat>'

In [138]:
# A11
# 'a 1<> b 2<> c'
re.sub(r'<[^>]+>', '', ip)

'a 1<> b 2<> c'

In [139]:
# E12. '\b[a-z](on|no)[a-z]\b' is same as '\b[a-z][on]{2}[a-z]\b'.
# True or False? Sample input lines shown below might help to understand
# the differences, if any.
print('known\nmood\nknow\npony\ninns')

known
mood
know
pony
inns


In [141]:
# A12
counterexample = 'soon'  # 'oo', 'nn'
raw_pattern1 = r'\b[a-z](on|no)[a-z]\b'
raw_pattern2 = r'\b[a-z][on]{2}[a-z]\b'

print(re.search(raw_pattern1, counterexample))
print(re.search(raw_pattern2, counterexample))

None
<re.Match object; span=(0, 4), match='soon'>


In [142]:
# E13. For the given list, filter all elements containing any number sequence
# greater than '624'.
items = ['hi0000432abcd', 'car00625', '42_624 0512', '3.14 96 2 foo1234baz']

In [153]:
# A13
# ['car00625', '3.14 96 2 foo1234baz']

from typing import List


def compare_over_624(nums: List[str]):
    def extract_num(nums: List[str]):
        return list(map(float, nums))

    return any(list(map(lambda n: n > 624, extract_num(nums))))


compiled_pattern = re.compile(r'[\d\.]+')
result = [item for item in items if compare_over_624(compiled_pattern.findall(item))]
print(result)

['car00625', '3.14 96 2 foo1234baz']


In [None]:
# E14. Count the maximum depth of nested braces for the given strings.
# Unbalanced or wrongly ordered braces should return '-1'. Note that this will
# require a mix of regular expressions and Python code.
def max_nested_braces(ip):
    pass

In [None]:
# A14
max_nested_braces('a*b')  # 0
max_nested_braces('}a+b{')  # -1
max_nested_braces('a*b+{}')  # 1
max_nested_braces('{{a+2}*{b+c}+e}')  # 2
max_nested_braces('{{a+2}*{b+{c*d}}+e}')  # 3
max_nested_braces('{{a+2}*{\n{b+{c*d}}+e*d}}')  # 4
max_nested_braces('a*{b+c*{e*3.14}}}')  # -1

In [154]:
# E15. By default, `str.split` method will split on whitespace and remove empty
# strings from the result. Which `re` module function would you use to
# replicate this functionality?
ip = ' \t\r  so  pole\t\t\t\n\nlit in to \r\n\v\f  '
ip.split()  # ['so', 'pole', 'lit', 'in', 'to']

['so', 'pole', 'lit', 'in', 'to']

In [163]:
# A15
# ['so', 'pole', 'lit', 'in', 'to']
re.findall(r'\S+', ip)

['so', 'pole', 'lit', 'in', 'to']

In [21]:
# E16. Convert the given input string to two different lists as shown below.
ip = 'price_42 roast^\t\n^-ice==cat\neast'

In [22]:
# A16
# ['price_42', 'roast', 'ice', 'cat', 'east']
# ['price_42', ' ', 'roast', '^\t\n^-', 'ice', '==', 'cat', '\n', 'east']

In [23]:
# E17. Filter all elements whose first non-whitespace character is not a '#'
# character. Any element made up of only whitespace characters should be
# ignored as well.
items = ['    #comment', '\t\napple #42', '#oops', 'sure', 'no#1', '\t\r\f']

In [None]:
# A17
# ['\t\napple #42', 'sure', 'no#1']