# 最常规匹配

In [2]:
import re

# Sample text (41 characters) that the pattern below matches end to end.
content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
# Raw string literal: \s, \d and \w must reach the regex engine untouched.
# In a plain literal they are invalid string escapes (DeprecationWarning
# since Python 3.6, SyntaxWarning from 3.12).
result = re.match(r'^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$', content)
print(result)
print(result.group())  # full matched text
print(result.span())   # (start, end) offsets of the match

41
<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)


# 泛匹配

In [3]:
import re

# Generic match: `.*` stands in for everything between the fixed prefix
# "Hello" and the fixed suffix "Demo".
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.compile('^Hello.*Demo$').match(content)
# One print call, one item per line: match object, matched text, span.
print(result, result.group(), result.span(), sep='\n')

<re.Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)


# 匹配目标

In [10]:
import re

content = 'Hello 1234567 World_This is a Regex Demo'
# Raw string so \s, \d and \w are passed to the regex engine as written
# (they are invalid escapes in a plain string literal).
# Group 1 captures the digit run, group 2 the following word characters.
result = re.match(r'^Hello\s(\d+)\s(\w+).*Demo$', content)
print(result)
print(result.group(2))  # second parenthesised group
print(result.span())

<re.Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
World_This
(0, 40)


# 贪婪匹配

In [11]:
import re

content = 'Hello 1234567 World_This is a Regex Demo'

# Raw string so \d is not treated as an (invalid) string escape.
# The greedy `.*` consumes as much as it can, leaving only the last digit
# for the (\d+) group.
result = re.match(r'^He.*(\d+).*Demo$', content)
print(result)
print(result.group(1))  # only the final digit survives the greedy `.*`
print(result.span())

<re.Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
7
(0, 40)


# 非贪婪匹配

In [12]:
import re

content = 'Hello 1234567 World_This is a Regex Demo'

# Raw string so \d is not treated as an (invalid) string escape.
# The lazy `.*?` stops as early as possible, so (\d+) captures the whole
# digit run.
result = re.match(r'^He.*?(\d+).*Demo$', content)
print(result)
print(result.group(1))  # the complete number
print(result.span())

<re.Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)


# 匹配模式

In [17]:
import re

# The text contains line breaks, including a trailing one.
content = '''Hello 1234567 World_This
is a Regex Demo
'''

# Raw string so \d is not treated as an (invalid) string escape.
# re.S (DOTALL) lets `.` match newlines, so `.*?` can cross the line break.
# `$` matches just before the trailing newline, hence the span ends at 40.
result = re.match(r'^He.*?(\d+).*?Demo$', content, re.S)
print(result)
print(result.group(1))
print(result.span())

<re.Match object; span=(0, 40), match='Hello 1234567 World_This\nis a Regex Demo'>
1234567
(0, 40)


# 转义

In [19]:
import re

content = 'price is $5.00'

# `$` and `.` are regex metacharacters and must be backslash-escaped to
# match literally. The raw string keeps those backslashes intact; in a
# plain literal `\$` and `\.` are invalid string escapes.
result = re.match(r'price is \$5\.00', content)
print(result)

<re.Match object; span=(0, 14), match='price is $5.00'>


总结：尽量使用泛匹配、使用括号得到匹配目标、尽量使用非贪婪匹配模式、有换行符就用re.S

# re.search

In [3]:
import re

content = 'Artificial intelligence (AI) is an important technology that 121212121 supports daily social life and economic activities'
# Raw string so \d is not treated as an (invalid) string escape.
# The leading lazy `.*?` makes the reported match start at offset 0;
# group 1 holds the first run of digits.
result = re.search(r'.*?(\d+).*?', content)
print(result)
print(result.group(1))

<re.Match object; span=(0, 70), match='Artificial intelligence (AI) is an important tech>
121212121


# 匹配演练

re.findall
搜索整个字符串，以列表的形式返回所有匹配的子串；当模式中含有多个分组时，列表中的每一项是由各分组内容组成的元组。
re.sub
将字符串中所有匹配的子串替换为指定内容，并返回替换后的新字符串。

In [21]:
import re

# Navigation-bar HTML snippet used as the search text.
html = '''
<div class="nav-con fl">
    <ul>
        <li report-id="playpage_main" class="nav-item home">
            <a href="//www.bilibili.com" title="主站" class="t">
            <i class="header-iconfont header-icon-bilibili-tv">
            </i>主站
    <!----></a><!---->
    </li>
        <li report-id="Webtab_click_audio" class="nav-item mbili">
            <a href="//www.bilibili.com/audio/home/?type=10" target="_blank" title="来探索bilibili音乐的世界吧~" class="t">音频</a>
        </li>
        <li report-id="playpage_game" class="nav-item game">
            <a href="//game.bilibili.com" target="_blank" title="游戏中心" class="t">游戏中心</a>
            <div class="i-frame" style="width: 680px; height: 260px; display: none;">
            <iframe src="https://www.bilibili.com/page-proxy/game-nav.html" frameborder="0" width="100%" height="100%"></iframe>
            </div>
        </li>
        <li report-id="playpage_live" class="nav-item live">
            <a href="//live.bilibili.com" target="_blank" title="直播" class="t">直播</a>
<!----></li>
        <li report-id="playpage_buy" class="nav-item buy">
            <a href="//show.bilibili.com/platform/home.html?msource=pc_web" target="_blank" title="会员购" class="t">会员购</a>
        </li>
        <li report-id="playpage_manga" class="nav-item manga">
            <a href="//manga.bilibili.com" target="_blank" title="漫画" class="t">漫画</a>
        </li>
        <li class="nav-item loc-menu">
            <a href="https://bml.bilibili.com/" target="_blank" class="t">BML</a>
<!----></li>
<!----><li report-id="playpage_download" class="nav-item mobile">
            <i class="header-iconfont header-icon-Navbar_mobile b-icon-app"></i>
            <a id="header-mobile-app" href="//app.bilibili.com" target="_blank" title="下载APP" class="t">下载APP</a>
<!----></li>
    </ul>
</div>
'''

# Each hit is a (href, link text) tuple, one per <a>...</a> element.
# No re.S here, so `.` does not cross line breaks: the first anchor, whose
# content spans several lines, is not matched.
link_pattern = re.compile('<a.*?href="(.*?)".*?>(.*?)</a>')
results = link_pattern.findall(html)
for result in results:
    print(*result, sep=" --- ")

//www.bilibili.com/audio/home/?type=10 --- 音频
//game.bilibili.com --- 游戏中心
//live.bilibili.com --- 直播
//show.bilibili.com/platform/home.html?msource=pc_web --- 会员购
//manga.bilibili.com --- 漫画
https://bml.bilibili.com/ --- BML
//app.bilibili.com --- 下载APP


# re.compile

将一个正则表达式字符串编译成正则表达式对象（Pattern），以便在后续匹配中复用该正则对象。

# 实战练习

In [1]:
import requests
import re

# Fetch the page. The timeout stops the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() surfaces HTTP errors instead of
# silently running the regex over an error page.
response = requests.get('https://ac.qq.com/Light', timeout=10)
response.raise_for_status()
content = response.text
# re.S lets `.` cross line breaks inside each <div> block.
# Groups: 1 = href, 2 = cover name, 3/4 = optional tag-item part.
# NOTE(review): in the recorded output groups 3 and 4 are always empty;
# presumably the preceding lazy `.*?` lets the `(...)*` group match zero
# times. Confirm against the page markup before relying on the tags.
pattern = re.compile('<div.*?update-item.*?href="(.*?)".*?cover-name.*?>(.*?)</a>.*?(tag-item.*?>(.*?)</a>)*.*?</div>', re.S)
results = pattern.findall(content)
for result in results:
    print(result)

('/Comic/comicInfo/id/622957', '暗黑革命', '', '')
('/Comic/comicInfo/id/626832', '龙娘七七七埋藏的', '', '')
('/Comic/comicInfo/id/638326', '碧阳学园学生会默', '', '')
('/Comic/comicInfo/id/536117', '魔法的禁书目录', '', '')
('/Comic/comicInfo/id/541392', '永生之酒', '', '')
('/Comic/comicInfo/id/552204', '魔法的禁书目录S', '', '')
('/Comic/comicInfo/id/549129', '重装武器', '', '')
('/Comic/comicInfo/id/542718', '无头骑士异闻录S', '', '')
('/Comic/comicInfo/id/547903', '奋斗吧！系统工程', '', '')
('/Comic/comicInfo/id/547529', '龙与虎', '', '')
('/Comic/comicInfo/id/541413', '天魔黑兔', '', '')
('/Comic/comicInfo/id/551413', '黑色子弹', '', '')
('/Comic/comicInfo/id/635592', '神样家族', '', '')
('/Comic/comicInfo/id/634757', '为美好的世界献上', '', '')
('/Comic/comicInfo/id/637677', '大传说中勇者的传', '', '')
('/Comic/comicInfo/id/536137', '圣剑锻造师', '', '')
('/Comic/comicInfo/id/551426', '金色时光', '', '')
('/Comic/comicInfo/id/536079', '绯弹的亚里亚', '', '')
('/Comic/comicInfo/id/637081', '传说的勇者的传说', '', '')
('/Comic/comicInfo/id/631422', '魔装少女就是本少', '', '')
('/Comic/comicInfo/