### 정규 표현식
- 특정한 패턴과 일치하는 문자열을 검색, 치환, 제거하는 기능을 지원
- re 모듈 사용
    - match(), search(), findall(), finditer(), ...
    - [a-zA-Z] : 모든 알파벳 패턴
    - . : \n 을 제외한 모든 문자
    - * : 0 ~ 무한대, + : 1 ~ 무한대, ? : 0 ~ 1, {2}, {2, 10}

In [1]:
import re

# 검색할 패턴 작성
pattern = re.compile("D.A") # 시작 D, 끝 A, 가운데는 상관 x

# 원본 문자열
origin = "DAA"

# 원본 문자열과 패턴이 일치하는가
result = pattern.search(origin)
print(result)

print("패턴 시작 위치 ",result.start())
print("패턴 끝 위치 ",result.end())
print("re와 일치하는 문자열 반환 ",result.group())
print("패턴 위치 ",result.span())

<re.Match object; span=(0, 3), match='DAA'>
패턴 시작 위치  0
패턴 끝 위치  3
re와 일치하는 문자열 반환  DAA
패턴 위치  (0, 3)


In [2]:
origin = "D00A"

# 원본 문자열과 패턴이 일치하는가
result = pattern.search(origin)
result

In [3]:
origin = "D0A D1A 0111"

# 원본 문자열과 패턴이 일치하는가
result = pattern.search(origin)
result

<re.Match object; span=(0, 3), match='D0A'>

In [4]:
re.search(r"D.A","DAA")

<re.Match object; span=(0, 3), match='DAA'>

In [5]:
pattern = re.compile("D?A") # D가 최소 0, 최대 1 가능, A 문자가 있어야 함
print(pattern.search("A"))
print(pattern.search("DA"))
print(pattern.search("AA"))

<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(0, 2), match='DA'>
<re.Match object; span=(0, 1), match='A'>


In [6]:
pattern = re.compile("D+A") # D가 최소 1, 최대 무한대 가능, A 문자가 있어야 함
print(pattern.search("A"))
print(pattern.search("DA"))
print(pattern.search("DDDDDDDDDDDDDDDDDDDDDDDDAA"))

None
<re.Match object; span=(0, 2), match='DA'>
<re.Match object; span=(0, 25), match='DDDDDDDDDDDDDDDDDDDDDDDDA'>


In [7]:
pattern = re.compile("AD{2}A") # D가 최소 2, 최대 2, A 문자가 있어야 함
print(pattern.search("ADA"))
print(pattern.search("ADDA"))
print(pattern.search("ADDDDDDDDDDDDDDDDDDDDDDDDAA"))

None
<re.Match object; span=(0, 4), match='ADDA'>
None


In [8]:
pattern = re.compile("AD{2,6}A") # D가 최소 2, 최대 6, A 문자가 있어야 함
print(pattern.search("ADA"))
print(pattern.search("ADDA"))
print(pattern.search("ADDDDDDDDDDDDDDDDDDDDDDDDAA"))

None
<re.Match object; span=(0, 4), match='ADDA'>
None


In [9]:
pattern = re.compile("[ABCDEFGabcdefg]")
print(pattern.search("aa1234"))
print(pattern.search("A4567"))

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='A'>


In [10]:
pattern = re.compile("[A-Ga-g]")
print(pattern.search("aa1234"))
print(pattern.search("A4567"))

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='A'>


In [11]:
pattern = re.compile("[A-Ga-g]+")
print(pattern.search("aa1234"))
print(pattern.search("A4567"))

<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 1), match='A'>


In [12]:
pattern = re.compile("[A-Ga-g0-9]+")
print(pattern.search("aa1234"))
print(pattern.search("A4567"))

<re.Match object; span=(0, 6), match='aa1234'>
<re.Match object; span=(0, 5), match='A4567'>


In [13]:
# [^찾을패턴] : 찾을 패턴이 아닌(not)
pattern = re.compile("[^A-Ga-g]+")
print(pattern.search("aa1234!@#$%"))
print(pattern.search("A4567"))

<re.Match object; span=(2, 11), match='1234!@#$%'>
<re.Match object; span=(1, 5), match='4567'>


In [14]:
pattern = re.compile("[가-힣]+")
print(pattern.search("aa1234대한민국"))
print(pattern.search("A백두산4567"))

<re.Match object; span=(6, 10), match='대한민국'>
<re.Match object; span=(1, 4), match='백두산'>


In [15]:
pattern = re.compile("[a-z]+")
print(pattern.search("aa1234대한민국"))
print(pattern.match("aa1234대한민국"))

<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 2), match='aa'>


In [17]:
origin = "DDA D1A DDA DA"
# sub(패턴, 바꿀문자열, 원본문자열) : 문자열 찾아서 바꾸기
print(re.sub("D.A","Dave",origin))

Dave Dave Dave DA


In [19]:
pattern = re.compile("D.A")
pattern.sub("Dave",origin)

'Dave Dave Dave DA'

In [21]:
# findall() : 표현식과 매칭되는 문자들을 리스트로 반환
pattern = re.compile("[a-z]+")
origin = "Game of Life in Python"

# pattern.findall(origin)
for w in pattern.findall(origin):
    print(w)

ame
of
ife
in
ython


In [22]:
for w in pattern.finditer(origin):
    print(w.group())

<re.Match object; span=(1, 4), match='ame'>
<re.Match object; span=(5, 7), match='of'>
<re.Match object; span=(9, 12), match='ife'>
<re.Match object; span=(13, 15), match='in'>
<re.Match object; span=(17, 22), match='ython'>


In [23]:
pattern = re.compile(":")
pattern.split("python:java:javascript")

['python', 'java', 'javascript']

In [29]:
# VS 로 문자열 분리 → python, java
origin = "python VS java"
pattern = re.compile(" VS ")
print(pattern.split(origin))

# - 기호를 * 로 바꿔서 출력
jumin = "801210-1011323"
pattern = re.compile("-")
print(pattern.sub("*",jumin))

['python', 'java']
801210*1011323


In [55]:
# data_kr 엑셀 읽기
# 주민번호 뒷자리를 * 로 바꿔서 보여주기
from openpyxl import load_workbook

excel_file = load_workbook("./file/data_kr.xlsx")

work_sheet = excel_file.active

pattern = re.compile(r"[0-9]{7}")

for row in work_sheet.rows:
    print(re.sub(pattern, "*******", row[1].value))

excel_file.close()

주민등록번호
800215-*******
821030-*******
841230-*******
790903-*******
800125-*******
820612-*******


In [57]:
origin = "<b>아이폰</b>"

# * : 매칭을 최대화 - <b>아이폰</b> → < ~ > 전체 조회
# +?, *? : 매칭을 최소화 <b>
pattern = re.compile("<.*?>")
pattern.search(origin)

<re.Match object; span=(0, 3), match='<b>'>

In [63]:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.naver.com")
soup = BeautifulSoup(res.text, "lxml")

# h로 시작하는 모든 태그 요소 찾기
print(soup.find_all(string=re.compile(r"h\d")))

# 이미지 요소 찾기 (jpg or png)
print(soup.find_all("img",attrs={"src":re.compile(r".+\.jpg|png")}))


['\nwindow["EAGER-DATA"] = window["EAGER-DATA"] || {};\nwindow["EAGER-DATA"]["PC-FEED-WRAPPER"] = {"@type":"BLOCK","blocks":[{"@type":"BLOCK","blocks":[{"@type":"PC-FEED-BLOCK","excludeInPaging":false,"positionForPaging":0,"realtime":false,"filterList":[{"id":"ALL","name":"인기","defaultType":true},{"id":"ANIMAL","name":"동물","defaultType":false},{"id":"TRAVEL","name":"여행","defaultType":false},{"id":"WEDDING","name":"연애결혼","defaultType":false},{"id":"MOVIE","name":"영화","defaultType":false},{"id":"MOMKIDS","name":"육아","defaultType":false}],"needCologger":false,"ad":false,"@type":"PC-FEED-BLOCK","@code":"PC-FEED-RECOMMEND-FILTER-TAB","@template":"PC-FEED-RECOMMEND-FILTER-TAB"},{"@type":"PC-FEED-BLOCK","materials":[{"@type":"MATERIAL-PC-FEED","title":"학교에 가야 배고프지 않을 수 있는 케냐 학생들","url":"https://happybean.naver.com/donations/H000000195252?p=p&s=nmrp","image":{"url":"https://s.pstatic.net/dthumb.phinf/?src=%22https%3A%2F%2Fhappybean-phinf.pstatic.net%2F20240510_163%2F1715322132927t0pio_JPEG%2Fn

In [75]:
excel_file = load_workbook("./file/train.xlsx")

work_sheet = excel_file.active

pattern = re.compile(r" Mr.")

for row in work_sheet.rows:
    if len(pattern.findall(row[3].value)) > 0:
        if pattern.search(row[3].value)[0].strip() == "Mr.":
            print(row[3].value)

excel_file.close()

Braund, Mr. Owen Harris
Allen, Mr. William Henry
Moran, Mr. James
McCarthy, Mr. Timothy J
Saundercock, Mr. William Henry
Andersson, Mr. Anders Johan
Williams, Mr. Charles Eugene
Fynney, Mr. Joseph J
Beesley, Mr. Lawrence
Sloper, Mr. William Thompson
Emir, Mr. Farred Chehab
Fortune, Mr. Charles Alexander
Todoroff, Mr. Lalio
Wheadon, Mr. Edward H
Meyer, Mr. Edgar Joseph
Holverson, Mr. Alexander Oskar
Mamee, Mr. Hanna
Cann, Mr. Ernest Charles
Kraeff, Mr. Theodor
Rogers, Mr. William John
Lennon, Mr. Denis
Samaan, Mr. Youssef
Nosworthy, Mr. Richard Cater
Ostby, Mr. Engelhart Cornelius
Woolner, Mr. Hugh
Novel, Mr. Mansouer
Sirayanian, Mr. Orsen
Harris, Mr. Henry Birkhardt
Stewart, Mr. Albert A
Crease, Mr. Ernest James
Kink, Mr. Vincenz
Jenkin, Mr. Stephen Curnow
Hood, Mr. Ambrose Jr
Chronopoulos, Mr. Apostolos
Bing, Mr. Lee
Moen, Mr. Sigurd Hansen
Staneff, Mr. Ivan
Moutal, Mr. Rahamin Haim
Waelens, Mr. Achille
Sheerlinck, Mr. Jan Baptist
Carrau, Mr. Francisco M
Ford, Mr. William Neal
Slocovs

In [80]:
# Mr. → 남성, Miss. → 미혼여성, Mrs. → 기혼여성, X
from openpyxl import Workbook

excel_file = load_workbook("./file/train.xlsx")

work_sheet = excel_file.active

# 새 엑셀 파일 작성
# 파일 새로 만들면 시트 1개는 기본으로 들어있음
wb = Workbook()
work_sheet_man = wb.active
work_sheet_man.column_dimensions["D"].width = 70
work_sheet_man.title = "남성"

work_sheet_women = wb.create_sheet(title="기혼여성")
work_sheet_women.column_dimensions["D"].width = 70

work_sheet_solo_women = wb.create_sheet(title="미혼여성")
work_sheet_solo_women.column_dimensions["D"].width = 70

work_sheet_others = wb.create_sheet(title="기타")
work_sheet_others.column_dimensions["D"].width = 70

pattern = re.compile(" [A-Za-z]+\.")

list1 = []
for row in work_sheet.rows:
    if pattern.search(row[3].value):
        data = pattern.search(row[3].value).group()
        # print(data)

    # 제목 행 옮기기
    if row[0].row == 1:
        # for title in row:
        #     list1.append(title.value)

        # work_sheet_man.append(list1)
        # work_sheet_women.append(list1)
        # work_sheet_solo_women.append(list1)
        # work_sheet_others.append(list1)
        
        work_sheet_man.append([title.value for title in row])
        work_sheet_women.append([title.value for title in row])
        work_sheet_solo_women.append([title.value for title in row])
        work_sheet_others.append([title.value for title in row])

    else:
        if data:
            if data == " Mr.":
                work_sheet_man.append([col.value for col in row])
                # survived 컬럼 값이 1(생존자) or 0(사망자)
            elif data == " Mrs.":
                work_sheet_women.append([col.value for col in row])
            elif data == " Miss.":
                work_sheet_solo_women.append([col.value for col in row])
            else:
                work_sheet_others.append([col.value for col in row])

wb.save("./file/train_gender.xlsx")
wb.close()

excel_file.close

  pattern = re.compile(" [A-Za-z]+\.")


<bound method Workbook.close of <openpyxl.workbook.workbook.Workbook object at 0x000002AD0E3C9AF0>>