In [1]:
import re

# Data preparation

In [13]:
with open(textpath := 'pride_and_prejudice.txt', 'r') as fin:
    novel = fin.read()

In [17]:
print(len(novel))
print(novel[:150])

774838
﻿The Project Gutenberg eBook of Pride and Prejudice, by Jane Austen

This eBook is for the use of anyone anywhere in the United States and
most other 


In [29]:
# pattern 문자열에는 앞으로 raw string을 사용해 파이썬이 esc_char을 해석하는 방식이 영향을 미치지 못하게 함이 좋다.
raw_pattern = r'pride'
compiled_pattern = re.compile(raw_pattern, )
print(compiled_pattern.pattern)
print(compiled_pattern.search(novel))

pride
<re.Match object; span=(32427, 32432), match='pride'>


# Bytes and String

In [34]:
byte_string = b'Hello world!'
raw_pattern = rb'hello'  # make the pattern bytes if the text is bytes type.

matched = re.match(raw_pattern, byte_string, flags=re.I)
print(matched)

<re.Match object; span=(0, 5), match=b'Hello'>


# Two ways to call methods

In [None]:
raw_pattern = r'hope'
sentence = 'I hope to master the regular expression.'

# 1. 컴파일 후 패턴 메소드를 사용
# 패턴을 여러 텍스트에서 검사할 일이 많다면 컴파일된 패턴을 사용하는 것이 좋다.
# 또한 re 모듈에서 호출하는 것보다 컴파일 시 더 많은 제어 옵션을 추가할 수 있다.
compiled_pattern = re.compile(raw_pattern, sentence)
matched = compiled_pattern.search(raw_pattern, sentence)
print(matched)

# 2. 컴파일 없이 re 모듈에서 직접 메소드 호출
matched = re.search(raw_pattern, sentence)
print(matched)

# Methods

## match

In [12]:
matched = compiled_pattern.match(sentence)
print(matched)

None


In [13]:
# pattern.match: 지정된 검색 시작 지점에서 일치 문자열을 찾을 수 없었다면 None 반환.
matched = compiled_pattern.match(sentence, pos=1)
print(matched.group())

o


## search

In [14]:
# pattern.search: `match`와 달리 모든 위치에서부터 검색을 시작하고 그래도 찾지 못하면 None을 반환.
# match()를 모든 위치에서 수행한다고 생각하면 편하다.
matched = compiled_pattern.search(sentence)
print(matched)

<re.Match object; span=(1, 2), match='o'>


In [None]:
raw_pattern = r'is'
sentence = 'This is a sample sentence.'

bool(re.search(raw_pattern, sentence))

## findall

In [35]:
# pattern.findall: 패턴에 맞는 최초의 문자열만 지니는 match, search와 달리 패턴에 맞는 모든 문자열을 담는다.
raw_pattern = '[a-z]+'
sentence = 'I lived in the california.'
pattern = re.compile(raw_pattern)
matched = pattern.findall(sentence)
print(matched)

['lived', 'in', 'the', 'california']


## finditer

In [37]:
# pattern.finditer: findall은 일치 문자열의 리스트였으나, 일치 문자열을 감싼 Match 객체들을 얻어야 한다면 이걸 써라.
matched_iter = pattern.finditer(sentence)
print(matched_iter)
for item in matched_iter:
    print(item.group())

<callable_iterator object at 0x106dd3df0>
lived
in
the
california


In [40]:
# pattern.<method>() 대신 re.<method>(raw_pattern, sentence)의 1회용 축약형을 쓸 수도 있다.
matched = re.findall(raw_pattern, sentence)
print(matched)

['lived', 'in', 'the', 'california']


## sub(substitue)

In [8]:
sentence = 'Have a nice weekend.'
raw_pattern = r'e'

# `count = 0` means that replace all the matched.
substituted_sentence = re.sub(raw_pattern, 'E', sentence, count=1)

'HavE a nice weekend.'

# FLAGS

## Case insensitive

In [41]:
# re.IGNORECASE == re.I
compiled_pattern = re.compile(raw_pattern, re.IGNORECASE)
matched = compiled_pattern.match(sentence)
print(matched)

<re.Match object; span=(0, 1), match='I'>


# Make New line in dot's matching range

In [43]:
# re.DOTALL == re.S
# '.'이 개행 문자도 패턴 일치 대상으로 삼을 수 있게 만든다.
raw_pattern = 'that.she'
sentence = '''
I noticed that
she had the gun.
'''

compiled_pattern = re.compile(raw_pattern, re.DOTALL)
matched = pattern.search(sentence)
print(matched)

<re.Match object; span=(11, 19), match='that\nshe'>


# New line breaks down one sentence to each sentences.

In [45]:
# re.MULTILINE == re.M
# 여러 줄 문자열의 개행 문자 다음을 각 줄의 새 시작으로 인식.
compiled_pattern = re.compile("^python\s\w+", re.MULTILINE)

sentence = """python one
life is too short
python two
you need python
python three"""

matched = compiled_pattern.findall(sentence)
print(matched)

['python one', 'python two', 'python three']


# Comments in pattern expression

In [None]:
# re.VERBOSE
# 컴파일 시 어떻게 정규표현식을 해석할지 영향을 준다.
compiled_pattern = re.compile(r"""
 &[#]                # Start of a numeric entity reference
 (
     0[0-7]+         # Octal form
   | [0-9]+          # Decimal form
   | x[0-9a-fA-F]+   # Hexadecimal form
 )
 ;                   # Trailing semicolon
""", re.VERBOSE)

In [47]:
matched = re.findall('.', sentence)
print(matched)

['I', ' ', 'n', 'o', 't', 'i', 'c', 'e', 'd', ' ', 't', 'h', 'a', 't', 's', 'h', 'e', ' ', 'h', 'a', 'd', ' ', 't', 'h', 'e', ' ', 'g', 'u', 'n', '.']


# Metacharacters

In [76]:
raw_pattern = re.escape('\d.*')
sentence = '\d.*'
compiled_pattern = re.compile(raw_pattern)
matched = compiled_pattern.search(sentence)
print(matched)

<re.Match object; span=(0, 4), match='\\d.*'>


In [65]:
sentence = 'the the tHe THE'
raw_pattern1 = '(the) \1'
raw_pattern2 = '(?i)the'
raw_pattern3 = '(T|t)(H|h)(e|E)'
compiled_pattern = re.compile(raw_pattern1)
matched = compiled_pattern.search(sentence)
print(matched)

None


In [96]:
raw_pattern = r'^(?P<one>\S+) (?P=one)'
sentence = 'the the'
print(re.match(raw_pattern, sentence))

<re.Match object; span=(0, 7), match='the the'>


In [101]:
raw_pattern = 'the|The|THE'
sentence = 'The the THE'
print(re.findall(raw_pattern, sentence))

['The', 'the', 'THE']


# Exercises

In [35]:
# E1. Find whether '0xB0' in the sample strings.
line1 = 'start address: 0xA0, func1 address: 0xC0'
line2 = 'end address: 0xFF, func2 address: 0xB0'

False
True


In [36]:
# A1
raw_pattern = r'0xB0'

compiled_pattern = re.compile(raw_pattern)
print(bool(compiled_pattern.search(line1)))
print(bool(compiled_pattern.search(line2)))

False
True


In [37]:
# E2. Replace all occurrences of 5 with 'five' for the given string.
ip = 'They ate 5 apples and 5 oranges.'

In [41]:
# A2
raw_pattern = r'5'
substituted_sentence = re.sub(raw_pattern, 'five', ip)
print(substituted_sentence)

They ate five apples and five oranges


In [42]:
# E3. Replace *first* occurrence of 5 with 'five' for the given string.
ip = 'They ate 5 apples and 5 oranges'

In [43]:
# A3
raw_pattern = r'5'
substituted_sentence = re.sub(raw_pattern, 'five', ip, count=1)
print(substituted_sentence)

They ate five apples and 5 oranges


In [46]:
# E4. For the given list, filter all elements that do not contain 'e'.
words = ['goal', 'new', 'user', 'sit', 'eat', 'dinner']

In [55]:
# A4
compiled_pattern = re.compile(raw_pattern := r'e')

contains = [word for word in words if bool(compiled_pattern.search(word))]
not_contains = list(set(words) - set(contains))
print(f"words containing 'e': {contains}")
print(f"words not containing 'e': {not_contains}")

words containing 'e': ['new', 'user', 'eat', 'dinner']
words not containing 'e': ['goal', 'sit']


In [56]:
# E5. Replace all occurrences of 'note' irrespective of case with 'X'.
ip = 'This note should not be NoTeD'

In [58]:
# A5
substituted_sentence = re.sub(r'note', 'X', ip, flags=re.I)
print(substituted_sentence)

This X should not be XD


In [59]:
# E6. Check if 'at' is present in the given byte input data.
ip = b'tiger imp goat'

In [61]:
matched = re.search(rb'at', ip)
print(bool(matched))

True


In [63]:
# E7. For the given input string, display all lines not containing 'start' irrespective of case.
paragraph = '''good start
Start working on that
project you always wanted
stars are shining brightly
hi there
start and try to
finish the book
bye'''

In [71]:
compiled_pattern = re.compile(r'start', flags=re.I)
lines_not_matching = [line for line in paragraph.splitlines() if not bool(compiled_pattern.search(line))]
print(lines_not_matching)

['project you always wanted', 'stars are shining brightly', 'hi there', 'finish the book', 'bye']


In [72]:
# E8. For the given list, filter all elements that contains either 'a' or 'w'.
words = ['goal', 'new', 'user', 'sit', 'eat', 'dinner']

In [76]:
# A8
compiled_pattern1 = re.compile(r'a')
compiled_pattern2 = re.compile(r'w')

words_matched = [word for word in words if compiled_pattern1.search(word)
                 or compiled_pattern2.search(word)]
print(words_matched)

['goal', 'new', 'eat']


In [74]:
# E9. For the given list, filter all elements that contains both 'e' and 'n'.
words = ['goal', 'new', 'user', 'sit', 'eat', 'dinner']

In [77]:
# A9
compiled_pattern1 = re.compile(r'e')
compiled_pattern2 = re.compile(r'n')

words_matched = [word for word in words if compiled_pattern1.search(word)
                 and compiled_pattern2.search(word)]
print(words_matched)

['new', 'dinner']


In [79]:
# E10. For the given string, replace '0xA0' with '0x7F', and '0xC0' with '0x1F'.
ip = 'start address: 0xA0, func1 address: 0xC0'

In [80]:
# A10
sub1 = re.sub(r'0xA0', '0x7F', ip)
sub2 = re.sub(r'0xC0', '0x1F', sub1)
print(sub2)

start address: 0x7F, func1 address: 0x1F
