## Regular Expression

In [1]:
import re

### 문자 클래스 [ ]

### match: matches zero or more characters at the beginning of the string

In [2]:
# Pattern for a run of one or more lowercase ASCII letters (a-z).
p = re.compile('[a-z]+')

# match() only tries the pattern at the very start of the subject string.
sample = 'python'
m = p.match(sample)
print(m)

<_sre.SRE_Match object; span=(0, 6), match='python'>


In [3]:
# `p` is the pattern compiled in an earlier cell. match() is anchored at the
# start of the subject, which here begins with a digit.
sample = '3 python'
m = p.match(sample)
print(m)

None


### search: scan through string looking for a match

In [4]:
# Pattern for a run of one or more lowercase ASCII letters (a-z).
p = re.compile('[a-z]+')

# search() scans the whole subject and reports the first match it finds.
sample = 'wow python 1234'
m = p.search(sample)
print(m)

<_sre.SRE_Match object; span=(0, 3), match='wow'>


### findall: return a list of all non-overlapping matches of pattern in string

In [5]:
# Pattern for a run of one or more lowercase ASCII letters (a-z).
p = re.compile('[a-z]+')

# findall() collects every non-overlapping match as a list of strings.
sample = 'life is too short'
m = p.findall(sample)
print(m)

['life', 'is', 'too', 'short']


In [6]:
# Pattern for a run of one or more lowercase ASCII letters (a-z).
p = re.compile('[a-z]+')

# finditer() yields the same matches as findall(), but lazily and as match
# objects instead of plain strings.
sample = 'life is too short'
m = p.finditer(sample)
for r in m:
    print(r)

<_sre.SRE_Match object; span=(0, 4), match='life'>
<_sre.SRE_Match object; span=(5, 7), match='is'>
<_sre.SRE_Match object; span=(8, 11), match='too'>
<_sre.SRE_Match object; span=(12, 17), match='short'>


In [7]:
# Pattern for a run of one or more lowercase ASCII letters (a-z).
p = re.compile('[a-z]+')
m = p.match('python')

# One value per output line:
#   group() -> the matched text
#   start() -> index where the match begins
#   end()   -> index just past the end of the match
#   span()  -> (start, end) as a tuple
print(m.group(), m.start(), m.end(), m.span(), sep='\n')

python
0
6
(0, 6)


### Dot(.) : 줄바꿈(\n)을 제외한 모든 문자와 매치

In [8]:
# Without flags, "." matches any character except a newline, so a subject
# with "\n" between "a" and "b" does not match.
p = re.compile('a.b')
sample = 'a\nb'
m = p.match(sample)
print(m)

None


#### DOTALL: DOT 에서 \n 허용

In [9]:
# re.DOTALL lets "." match a newline as well, so the same subject now matches.
p = re.compile('a.b', re.DOTALL)
sample = 'a\nb'
m = p.match(sample)
print(m)

<_sre.SRE_Match object; span=(0, 3), match='a\nb'>


#### IGNORECASE: 대소문자 무시

In [10]:
# re.IGNORECASE (short form: re.I) makes [a-z] accept uppercase letters too,
# so the first character of every spelling below matches.
p = re.compile('[a-z]', re.IGNORECASE)
for word in ('python', 'Python', 'PYTHON'):
    print(p.match(word))

<_sre.SRE_Match object; span=(0, 1), match='p'>
<_sre.SRE_Match object; span=(0, 1), match='P'>
<_sre.SRE_Match object; span=(0, 1), match='P'>


#### MULTILINE, M: ^(캐럿)를 문자열 전체의 시작뿐 아니라 각 라인의 시작에서도 매치되도록 해석

In [11]:
# Raw string so that \s and \w reach the regex engine untouched; in a plain
# string literal they are invalid escape sequences and trigger a warning.
# re.M (MULTILINE) makes ^ match at the start of every line, not only at the
# start of the whole string.
p = re.compile(r"^python\s\w+", re.M)

data = """python one
life is too hosrt
python two
you need python
python three"""

# Only lines that begin with "python", followed by one whitespace character
# and a word, are reported.
print(p.findall(data))

['python one', 'python two', 'python three']


#### VERBOSE, X: 패턴 안의 공백과 줄바꿈을 무시하고 주석을 허용 (정규식을 여러 줄로 나누어 작성 가능)

In [12]:
# Compact single-line pattern: "&#", then an octal number (leading 0), a
# decimal number, or a hexadecimal number (leading x), then ";".
# The number is captured as group 1.
charref = re.compile(r'&[#](0[0-7]+|[0-9]+|x[0-9a-fA-F]+);')

In [13]:
# With re.VERBOSE, whitespace and newlines inside the pattern are ignored, so
# the three alternatives (octal, decimal, hexadecimal) can be laid out one per
# line. "#" is written as [#] because a bare "#" would start a comment in
# verbose mode.
charref = re.compile(r"""
&[#]
(
    0[0-7]+
    | [0-9]+
    | x[0-9a-fA-F]+
)
;
""", re.VERBOSE)

#### 메타문자

In [14]:
# "|" is alternation: the pattern matches either "Crow" or "Servo".
p = re.compile("Crow|Servo")
sample = "CrowHello"
m = p.match(sample)
print(m)

<_sre.SRE_Match object; span=(0, 4), match='Crow'>


In [15]:
# "^" anchors the pattern to the start of the string, so only a subject that
# begins with "Life" produces a match.
for subject in ("Life is too short", "MY Life"):
    print(re.search("^Life", subject))

<_sre.SRE_Match object; span=(0, 4), match='Life'>
None


In [16]:
# "$" anchors the pattern to the end of the string, so only a subject that
# ends with "short" produces a match.
for subject in ("Life is too short", "MY Life short, you need it"):
    print(re.search("short$", subject))

<_sre.SRE_Match object; span=(12, 17), match='short'>
None


In [17]:
# \b is a zero-width word boundary (the edge between a word character and a
# non-word character), not a whitespace character. "class" therefore matches
# only as a standalone word, not inside "declassified" or "subclass".
p = re.compile(r'\bclass\b')
for sentence in (
    "no class at all",
    "the declassified algorithm",
    "one subclass is",
):
    print(p.search(sentence))

<_sre.SRE_Match object; span=(3, 8), match='class'>
None
None


## Grouping ()

In [18]:
# Parentheses group "ABC" so that "+" repeats the whole group rather than
# only the final character.
p = re.compile("(ABC)+")
sample = "ABCABCABC OK?"
m = p.search(sample)

# First line: the match object itself; second line: the full matched text.
print(m, m.group(), sep='\n')

<_sre.SRE_Match object; span=(0, 9), match='ABCABCABC'>
ABCABCABC


In [19]:
# A word (captured as group 1), whitespace, then three digit runs joined by "-".
p = re.compile(r"(\w+)\s+\d+[-]\d+[-]\d+")
sample = "park 010-1234-1234"
m = p.search(sample)

# group(1) returns only the text captured by the first parenthesised group.
print(m.group(1))

park


In [20]:
# \1 is a backreference to whatever group 1 captured, so the pattern finds a
# word that is immediately repeated.
p = re.compile(r'(\b\w+)\s+\1')
repeated = p.search("Paris in the the spring")

# group() with no argument returns the entire match (both copies of the word),
# not just the captured group.
print(repeated.group())

the the


#### `(?P<name>...)`: 그룹에 `name`이라는 이름을 부여해 호출

In [21]:
# (?P<name>...) gives the first group the name "name"; the phone number is
# captured as group 2 and its first digit run as group 3.
p = re.compile(r"(?P<name>\w+)\s+((\d+)[-]\d+[-]\d+)")
sample = "park 010-1234-1234"
m = p.search(sample)

# A named group can be looked up by its name instead of its index.
print(m.group("name"))

park


#### 문자 클래스 [ ]

#### Dot(.)

In [22]:
x = "2020, python is fun"

# ^\w+ matches the run of word characters at the very start of the string;
# the comma ends that run.
leading_word = re.compile(r"^\w+")
r1 = leading_word.findall(x)
print(r1)

['2020']
