## Regular Expression 常用內容補充

#### python re 常用變數名稱，以身分證字號為例

In [1]:
import re
# Naming conventions commonly used with the `re` module.
# `text`: the string that is being searched.
text = 'J123123456 is valid, and J12345 is invalid.'

# `pattern`: the regular expression, written as a raw string.
pattern = r'[A-Z][1-2]\d{8}'

# `regex`: the object returned by re.compile(). A descriptive prefix can be
# added for readability, e.g. `ID_regex`.
regex = re.compile(r'[A-Z][1-2]\d{8}')

# `mo`: the match object returned by a search. The three call forms below
# all produce the same result.
mo_1, mo_2, mo_3 = (
    re.search(pattern, text),
    re.search(regex, text),
    regex.search(text),
)
for label, found in enumerate((mo_1, mo_2, mo_3), start=1):
    print(f'{label} :', found)

1 : <re.Match object; span=(0, 10), match='J123123456'>
2 : <re.Match object; span=(0, 10), match='J123123456'>
3 : <re.Match object; span=(0, 10), match='J123123456'>


#### <re.Match object; span=(0, 10), match='J123123456'>
#### 觀察上述程式print出的結果，span代表配對到的內容在text中的起始與結束index（結束index不包含在內），match代表抓到的字串

In [5]:
import re
text = 'J123123456 is valid, and J12345 is invalid.'
regex = re.compile(r'[A-Z][1-2]\d{8}')
mo = regex.search(text)
# start() and end() are the two members of the span() pair.
print(f'Start : {mo.start()}')
print(f'Stop : {mo.end()}')
# The repr labels the matched text "match", but it has to be read with
# mo.group(); mo.match() cannot be used for this.
print(f'Match : {mo.group()}')

Start : 0
Stop : 10
Match : J123123456


#### re常用語法補充

In [7]:
# {n} repeats the preceding token exactly n times,
# so \d\d\d\d\d and \d{5} describe the same pattern.
text = '12345'
regex = re.compile(r'\d{5}')
five_digits_mo = regex.search(text)
print(five_digits_mo)

<re.Match object; span=(0, 5), match='12345'>


In [10]:
# A character class is not a literal: r'[oil]' != r'oil'.
# [oil] matches a single character that is 'o', 'i' or 'l';
# oil matches the exact three-character sequence "oil".
text = 'oil lollipop'
with_brackets_regex = re.compile(r'[oil]')
without_brackets_regex = re.compile(r'oil')
for label, compiled in (
    ('[oil] =>', with_brackets_regex),
    ('oil =>', without_brackets_regex),
):
    print(label, compiled.findall(text))

[oil] => ['o', 'i', 'l', 'l', 'o', 'l', 'l', 'i', 'o']
oil => ['oil']


In [12]:
# ^ anchors the pattern to the start of the string, $ to the end.
valid_id = 'E44123123'
invalid_id = 'E441231234'

regex_false = re.compile(r'[A-Z]\d{8}')
regex_true_1 = re.compile(r'^[A-Z]\d{8}$')

# Note: invalid_id has one extra digit, yet the un-anchored pattern still
# finds something in it; only the anchored pattern rejects it.
for banner, compiled in (
    ('=====false=====', regex_false),
    ('=====true=====', regex_true_1),
):
    print(banner)
    for candidate in (valid_id, invalid_id):
        print(compiled.findall(candidate))

=====false=====
['E44123123']
['E44123123']
=====true=====
['E44123123']
[]


#### findall()，finditer()差別

In [17]:
text = 'oil gasoil boil oily'
regex = re.compile(r'\w*oil\w*')

# findall() returns a list holding the matched strings themselves,
# not their positions (indexes).
print('=====findall=====')
findall_mo = regex.findall(text)
print(findall_mo)

# finditer() returns an iterator; each item is a match object like the one
# search() returns, so it carries both span and match.
print('=====finditer=====')
finditer_mo = regex.finditer(text)
print(finditer_mo)
for each_mo in finditer_mo:
    print(each_mo)

=====findall=====
['oil', 'gasoil', 'boil', 'oily']
=====finditer=====
<callable_iterator object at 0x0000021500B07D30>
<re.Match object; span=(0, 3), match='oil'>
<re.Match object; span=(4, 10), match='gasoil'>
<re.Match object; span=(11, 15), match='boil'>
<re.Match object; span=(16, 20), match='oily'>


## Homework 8-1

In [18]:
import re
# Split the string into a leading digit run, a lowercase-letter run and a
# trailing digit run using three capture groups.
s = '123asd789'
regex = re.compile(r'([0-9]*)([a-z]*)([0-9]*)')
mo = regex.search(s)
print(mo.groups())
# group(0) is the whole match; groups 1-3 are the captured parts in order.
for group_index in range(4):
    print(mo.group(group_index))

('123', 'asd', '789')
123asd789
123
asd
789


## Homework 8-2

#### 基礎

In [30]:
# Read a student ID from standard input and check it against the basic
# format: one uppercase letter followed by exactly eight digits.
ID = input()
regex = re.compile(r'^[A-Z]\d{8}$')
mo = regex.findall(ID)

# findall() yields an empty list when the format does not match.
if mo:
    print('Id :', mo[0], 'is correct.')
else:
    print("This id isn't correct.")

Id : E44106062 is correct.


#### 進階

#### 事實上，學生證第二碼有些系所是英文字母，先把所有系所代碼爬下來看

In [26]:
import requests
from bs4 import BeautifulSoup

# Scrape the department-code page.
url = 'https://reg-acad.ncku.edu.tw/p/405-1041-167305,c7569.php?Lang=zh-tw'
# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() reports an HTTP error right here instead of letting
# an error page fail later with an unrelated ValueError from list.remove().
response = requests.get(url, timeout=10)
response.raise_for_status()
resp = response.text
soup = BeautifulSoup(resp, 'html.parser')

# Tidy up: collect the stripped text of every <b> element, then drop the two
# label strings (presumably the table's column headers) so only codes remain.
department_list = [i.get_text().strip() for i in soup.find_all('b')]
department_list.remove('系所代碼')
department_list.remove('系所名稱')

print(department_list)

['A1', 'A2', 'A3', 'A4', 'A5', 'A7', 'A9', 'AH', 'AN', 'B1', 'B2', 'B3', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'D2', 'D4', 'D5', 'D8', 'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'F0', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'H1', 'H2', 'H3', 'H4', 'H5', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'K1', 'K2', 'K3', 'K4', 'K5', 'K7', 'K8', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'LA', 'N0', 'N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'NA', 'NB', 'NC', 'ND', 'NE', 'P0', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'PA', 'PB', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q7', 'R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'RA', 'RB', 'RD', 'RE', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'SA', 'SB', 'SC', 'T1', 'T2', 'T3', 'T4', 'T6', 'T7', 'T8', 'T9', 'TA', 'TB', 'TC', 'U1', 'U2', 'U3', 'U5', 'U6', 'U7', 'U8', 'Z1', 'Z2', 'Z3', 'Z5', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'VA', 'VB', 'VC', 'VD

#### 修改正則表達式讓第二碼可以是英文或數字，最後再加上系所代碼真實性的檢查

In [33]:
import re
import requests
from bs4 import BeautifulSoup

# Set up: the student ID to validate.
ID = 'E44123123'

# Scrape the department-code page.
url = 'https://reg-acad.ncku.edu.tw/p/405-1041-167305,c7569.php?Lang=zh-tw'
# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() reports an HTTP error right here instead of letting
# an error page fail later with an unrelated ValueError from list.remove().
response = requests.get(url, timeout=10)
response.raise_for_status()
resp = response.text
soup = BeautifulSoup(resp, 'html.parser')

# Collect the stripped text of every <b> element, then drop the two label
# strings (presumably the table's column headers) so only codes remain.
department_list = [i.get_text().strip() for i in soup.find_all('b')]
department_list.remove('系所代碼')
department_list.remove('系所名稱')

# Format check: an uppercase letter, an uppercase letter or digit, then seven
# digits. fullmatch() is used because `$` on its own also matches just before
# a trailing newline, which would let 'E44123123\n' through.
regex = re.compile(r'^[A-Z][A-Z0-9]\d{7}$')
mo = regex.fullmatch(ID)

# Department-code check.
if mo is None:
    print(f"ID : {ID} format isn't correct.")
# The first two characters must appear in the scraped department-code list.
elif ID[0:2] in department_list:
    print(f'Id : {ID} is correct.')
else:
    print(f"Department {ID[0:2]} isn't exist.")

Id : E44123123 is correct.


#### 再進階，包成function方便調用

In [35]:
import re
import requests
from bs4 import BeautifulSoup

# Scrape the department-code page.
url = 'https://reg-acad.ncku.edu.tw/p/405-1041-167305,c7569.php?Lang=zh-tw'
# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() reports an HTTP error right here instead of letting
# an error page fail later with an unrelated ValueError from list.remove().
response = requests.get(url, timeout=10)
response.raise_for_status()
resp = response.text
soup = BeautifulSoup(resp, 'html.parser')

# Collect the stripped text of every <b> element, then drop the two label
# strings (presumably the table's column headers) so only codes remain.
department_list = [i.get_text().strip() for i in soup.find_all('b')]
department_list.remove('系所代碼')
department_list.remove('系所名稱')

def student_id_validation(ID, departments=None):
    """Validate a student ID and return a human-readable verdict.

    The ID must be exactly nine characters: an uppercase letter, an uppercase
    letter or digit, then seven digits. Its first two characters must also be
    a known department code.

    Args:
        ID: The student ID string to check.
        departments: Optional collection of valid two-character department
            codes. When omitted, the module-level ``department_list`` is used.

    Returns:
        One of three messages: the format is wrong, the ID is correct, or the
        department code is not in the list.
    """
    if departments is None:
        departments = department_list

    # Format check. fullmatch() is required here: `$` alone also matches just
    # before a trailing newline, so 'E44123123\n' used to be accepted.
    regex = re.compile(r'^[A-Z][A-Z0-9]\d{7}$')
    if regex.fullmatch(ID) is None:
        return f"ID : {ID} format isn't correct."

    # Department check: the first two characters must be a known code.
    department = ID[0:2]
    if department in departments:
        return f'Id : {ID} is correct.'
    return f"Department {department} isn't exist."

In [36]:
# Expected outcomes: the first two IDs are valid, the third has an unknown
# department code, and the fourth has one digit too many.
ID_list = ['E44123123', 'VB1234567', 'ZZ1234567', 'E441231234']
for ID in ID_list:
    message = student_id_validation(ID)
    print(message)

Id : E44123123 is correct.
Id : VB1234567 is correct.
Department ZZ isn't exist.
ID : E441231234 format isn't correct.


## Homework 8-3

#### gasoil算在內

In [1]:
import re
with open('../data/FeatureArticle.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Note how the regex is written: a bare, case-insensitive "oil" with no
# boundary check, so an "oil" inside a longer word (e.g. "gasoil") counts too.
regex = re.compile(r'oil', re.IGNORECASE)
mo = regex.finditer(text)
# Print the start index of every occurrence.
for hit in mo:
    print(hit.start())

7
45
562
748
1537
1861
2037
2200
2339
2534
2662
2767
2877
3204
3410
3622


#### gasoil不算在內，少了index 1861

In [2]:
import re
with open('../data/FeatureArticle.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Note how the regex is written: match "oil" (case-insensitive) only when it
# has no letter directly before or after it, so "gasoil" is excluded.
# The lookarounds are zero-width, so they do not consume the neighbouring
# characters. That fixes two problems of the consuming form [^a-z]oil[^a-z]:
#   * back-to-back hits such as "oil oil" are both found (the shared space
#     is no longer used up by the first match);
#   * "oil" at the very start or end of the text matches without padding the
#     text with spaces, so start() is already the index in the original text.
regex = re.compile(r'(?<![a-z])oil(?![a-z])', re.IGNORECASE)
mo = regex.finditer(text)
for i in mo:
    print(i.start())

7
45
562
748
1537
2037
2200
2339
2534
2662
2767
2877
3204
3410
3622
