## 정규 표현식(Regular Expression)


#### 1) . 기호

In [3]:
import re
# "." in the pattern stands for exactly one arbitrary character between "a" and "c".
r = re.compile("a.c")
# "kkk" contains no "a", so search() returns None and the cell shows no output.
r.search("kkk")

In [4]:
# "abc" has one character ("b") between "a" and "c", so the pattern matches.
r.search("abc")

<re.Match object; span=(0, 3), match='abc'>

#### 2) ? 기호

In [10]:
# "?" makes the preceding "b" optional: zero or one occurrence.
r = re.compile("ab?c")
for sample in ("abc", "ac"):
    print(r.search(sample))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 2), match='ac'>


#### 3) * 기호

In [12]:
# "*" lets the preceding "b" repeat zero or more times.
r = re.compile("ab*c")
for sample in ("a", "ac", "abc", "abbbbbbbbbbbc"):
    print(r.search(sample))

None
<re.Match object; span=(0, 2), match='ac'>
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 13), match='abbbbbbbbbbbc'>


#### 4) + 기호

In [13]:
# "+" requires the preceding "b" at least once.
r = re.compile("ab+c")
for sample in ("ac", "abc", "abbc", "a"):
    print(r.search(sample))

None
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 4), match='abbc'>
None


#### 5) ^ 기호

In [14]:
# "^" anchors the match to the start of the string.
r = re.compile("^a")
for sample in ("bbc", "ab", "abc"):
    print(r.search(sample))

None
<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='a'>


#### 6) {숫자} 기호

In [16]:
# "{2}" requires the preceding "b" exactly twice.
r = re.compile("ab{2}c")
for sample in ("ac", "abc", "abbc", "abbbc"):
    print(r.search(sample))

None
None
<re.Match object; span=(0, 4), match='abbc'>
None


#### 7) {숫자1, 숫자2} 기호

In [20]:
# "{2,8}" accepts between two and eight repetitions of the preceding "b".
r = re.compile("ab{2,8}c")
for sample in ("ac", "abc", "abbc", "abbbc", "abbbbbbbbc", "abbbbbbbbbbbc"):
    print(r.search(sample))

None
None
<re.Match object; span=(0, 4), match='abbc'>
<re.Match object; span=(0, 5), match='abbbc'>
<re.Match object; span=(0, 10), match='abbbbbbbbc'>
None


#### 8) {숫자,} 기호

In [22]:
# "{2,}" requires the preceding "a" at least twice, with no upper bound.
r = re.compile("a{2,}bc")
for sample in ("aabc", "abc", "aaaaaaaaaabc"):
    print(r.search(sample))

<re.Match object; span=(0, 4), match='aabc'>
None
<re.Match object; span=(0, 12), match='aaaaaaaaaabc'>


#### 9) [] 기호

In [28]:
# "[abc]" matches a single character that is "a", "b", or "c";
# search() reports the first such character it finds.
r = re.compile("[abc]")
for sample in ("zzz", "a", "b", "c", "ab", "baac"):
    print(r.search(sample))

None
<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='b'>
<re.Match object; span=(0, 1), match='c'>
<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='b'>


In [31]:
# "[a-z]" matches one lowercase ASCII letter; uppercase letters and digits do not match.
r = re.compile("[a-z]")
for sample in ("aaa", "AAA", "1111"):
    print(r.search(sample))

<re.Match object; span=(0, 1), match='a'>
None
None


#### 10) [^문자] 기호

In [35]:
# "[^abc]" matches one character that is NOT "a", "b", or "c".
r = re.compile("[^abc]")
for sample in ("a", "b", "ab", "d", "dda", "1"):
    print(r.search(sample))

None
None
None
<re.Match object; span=(0, 1), match='d'>
<re.Match object; span=(0, 1), match='d'>
<re.Match object; span=(0, 1), match='1'>


### 모듈 함수

#### 1) re.match()와 re.search()의 차이

In [3]:
import re

r = re.compile("ab.")
# search() scans the whole string, so it finds "abc" after the "kkk" prefix.
print(r.search("kkkabc"))
# match() only succeeds when the pattern matches at the very start of the string.
for sample in ("kkkabc", "abckkk"):
    print(r.match(sample))

<re.Match object; span=(3, 6), match='abc'>
None
<re.Match object; span=(0, 3), match='abc'>


#### 2) re.split()

In [4]:
# Split on a single space; each word becomes one list element.
text = "사과 딸기 수박 메론 바나나"
re.split(" ", text)

['사과', '딸기', '수박', '메론', '바나나']

In [5]:
# The triple-quoted literal embeds real newlines, which serve as the delimiter.
text = """사과
딸기
수박
메론
바나나"""
re.split("\n", text)

['사과', '딸기', '수박', '메론', '바나나']

In [6]:
text = "사과+딸기+수박+메론+바나나"
# "+" is a regex quantifier, so it must be escaped to split on a literal plus sign.
# The pattern is a raw string because "\+" is not a valid Python string escape
# and a plain literal triggers an invalid-escape warning on current Python versions.
re.split(r"\+", text)

['사과', '딸기', '수박', '메론', '바나나']

#### 3) re.findall()

In [11]:
text = """이름 : 김철수
전화번호 : 010 - 1234 - 1234
나이 : 30
성별 : 남"""
# Raw string: "\d" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning on current Python versions.
print(re.findall(r"\d+", text))
# No digits in the input, so findall() returns an empty list.
print(re.findall(r"\d+", "문자열입니다."))

['010', '1234', '1234', '30']
[]


#### 4) re.sub()

In [12]:
# Replace every character that is not an ASCII letter with a single space.
text = "Regular expression : A regular expression, regex or regexp[1] (sometimes called a rational expression)[2][3] is, in theoretical computer science and formal language theory, a sequence of characters that define a search pattern."
re.sub("[^a-zA-Z]", " ", text)

'Regular expression   A regular expression  regex or regexp     sometimes called a rational expression        is  in theoretical computer science and formal language theory  a sequence of characters that define a search pattern '

### 정규 표현식 텍스트 전처리 예제

In [13]:
text = """100 John    PROF
101 James   STUD
102 Mac   STUD"""

# Split on any run of whitespace (spaces and newlines alike).
# Raw string: "\s" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning on current Python versions.
re.split(r"\s+", text)

['100', 'John', 'PROF', '101', 'James', 'STUD', '102', 'Mac', 'STUD']

In [17]:
# Collect every run of digits. The raw string keeps "\d" as a regex token
# instead of an invalid Python string escape.
re.findall(r"\d+", text)

['100', '101', '102']

In [18]:
# Each uppercase ASCII letter is returned as its own one-character match.
re.findall('[A-Z]', text)

['J', 'P', 'R', 'O', 'F', 'J', 'S', 'T', 'U', 'D', 'M', 'S', 'T', 'U', 'D']

In [19]:
# Four consecutive uppercase letters per match (here: the role codes).
re.findall('[A-Z]{4}', text)

['PROF', 'STUD', 'STUD']

In [20]:
# One uppercase letter followed by one or more lowercase letters (here: the names).
re.findall('[A-Z][a-z]+',text)

['John', 'James', 'Mac']

In [22]:
# Replace every character that is not an ASCII letter (digits, spaces, newlines)
# with a space, leaving only the alphabetic tokens.
letters_only = re.sub('[^a-zA-Z]', ' ', text)
print(letters_only)

    John    PROF     James   STUD     Mac   STUD


### 정규 표현식을 이용한 토큰화

In [23]:
import nltk
from nltk.tokenize import RegexpTokenizer

In [24]:
# Each token is a run of word characters, so punctuation and apostrophes act as
# separators ("Don't" -> "Don", "t").
# Raw string: "\w" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning on current Python versions.
tokenizer = RegexpTokenizer(r"[\w]+")
print(tokenizer.tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop"))

['Don', 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'Mr', 'Jone', 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


In [26]:
# gaps=True makes the pattern describe the separators between tokens rather
# than the tokens themselves, so the text is split on runs of whitespace.
# Raw string: "\s" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning on current Python versions.
tokenizer = RegexpTokenizer(r"[\s]+", gaps=True)
print(tokenizer.tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop"))

["Don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name,', 'Mr.', "Jone's", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']
