In [2]:
# 2.1 Splitting a string on multiple delimiters with re.split()

line = 'asdf fjdk; afed, fjek, asdf, foo'
import re

# A delimiter is one semicolon, comma, or whitespace character followed by
# any amount of extra whitespace; the delimiters themselves are discarded.
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [3]:
# Wrapping the delimiter alternatives in a capture group makes re.split()
# keep each matched delimiter character in the result list. The trailing \s*
# is outside the group, so the extra whitespace is still dropped.
fields = re.split(r'(;|,|\s)\s*', line)
fields

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [4]:
# Even positions of `fields` hold the values, odd positions the captured delimiters.
values = fields[::2]
# Pad with an empty string so that there is one delimiter entry per value.
delimiters = fields[1::2] + ['']
values

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [6]:
# Display the captured delimiters; the final '' is the padding entry.
delimiters

[' ', ';', ',', ',', ',', '']

In [7]:
# Reform the line using the same delimiters.
# Only the captured delimiter character is restored: the whitespace that
# followed each delimiter was not captured, so it does not reappear.
''.join(v + d for v, d in zip(values, delimiters))

'asdf fjdk;afed,fjek,asdf,foo'

In [9]:
# 2.2 Matching text at the start or end of a string with str.startswith()/str.endswith()

filename = 'spam.txt'
filename.endswith('t')

True

In [13]:
import os

# Names of the entries in the current working directory.
filenames = os.listdir('.')
# Not the last expression of the cell, so this value is not displayed.
filenames
[name for name in filenames if name.endswith(('.py', '.ipynb'))]  # the alternative suffixes are passed together as a tuple

['Chapter 1.ipynb', 'Chapter 1.py', 'Chapter 2.ipynb', 'main.py']

In [21]:
# 2.3 Matching strings with shell wildcard patterns: fnmatch() and fnmatchcase()
from fnmatch import fnmatch, fnmatchcase

# Only the result of the last call is displayed; the earlier results are discarded.
fnmatch('foo.txt', '*.txt')
fnmatch('foo.txt', 'f?o.txt')
fnmatch('Dat45.csv', 'Dat[0-9]*')
# NOTE(review): fnmatch() is understood to follow the host OS's case rules,
# so this particular call may give different results per platform - confirm.
fnmatch('foo.txt', '*.TXT')
fnmatchcase('foo.txt', '*.TXT')  # case-sensitive comparison

False

In [23]:
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]
# Addresses ending in ' ST' (result discarded: not the cell's last expression).
[addr for addr in addresses if fnmatchcase(addr, '* ST')]
# Addresses that start with 54 plus two more digits and a space, and contain 'CLARK'.
[addr for addr in addresses if fnmatchcase(addr, '54[0-9][0-9] *CLARK*')]

['5412 N CLARK ST']

In [25]:
# 2.4 Matching and searching for text
text = 'yeah, but no, but yeah, but no, but yeah'
# Search for the location of the first occurrence
text.find('no')

10

In [28]:
text1 = '11/27/2012'
text2 = 'Nov 27, 2012'
# Print 'Y' when text1 starts with a digits/digits/digits date, otherwise 'N'.
print('Y' if re.match(r'\d+/\d+/\d+', text1) else 'N')

Y


In [30]:
# When the same pattern is used for many matches, precompile the pattern string into a pattern object first.
datepat = re.compile(r'\d+/\d+/\d+')
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat.findall(text)

['11/27/2012', '3/13/2013']

In [34]:
# Use parentheses to define capture groups.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
m.group(0)  # the whole match: 11/27/2012
m.group(1)  # the first group: 11
m.groups()

('11', '27', '2012')

In [39]:
# Result discarded (not the cell's last expression).
datepat.findall(text)
# Each findall() item is unpacked into its three captured groups, which are
# then printed in a different order, joined with dashes.
for mm, dd, yyyy in datepat.findall(text):
    print('{}-{}-{}'.format(yyyy, mm, dd))

2012-11-27
2013-3-13


In [41]:
# 2.5 Searching and replacing text with str.replace()
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')

'yep, but no, but yep, but no, but yep'

In [44]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
import re

re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)
# The first argument of sub() is the pattern to match and the second is the replacement pattern. Backslash-digit sequences such as \3 refer to the capture group numbers of the pattern.

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [45]:
# Same substitution as a method call on a precompiled pattern object.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat.sub(r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [47]:
# 2.6 Case-insensitive search and replace
text = 'UPPER PYTHON, lower python, Mixed Python'
# Every match gets the same literal replacement, whatever the case of the matched text.
re.sub('python', 'snake', text, flags=re.IGNORECASE)

'UPPER snake, lower snake, Mixed snake'

In [48]:
def matchcase(word):
    """Build an ``re.sub`` replacement callback that mirrors the match's case.

    Args:
        word: Replacement text to insert in place of each match.

    Returns:
        A function that takes a match object and returns ``word`` recased to
        follow the matched text: upper-cased when the match is all upper
        case, lower-cased when it is all lower case, capitalized when the
        match starts with an upper-case character, and unchanged otherwise
        (including for a zero-length match).
    """

    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        # Slice rather than index: a zero-length match (possible with
        # patterns such as 'x*') must fall through instead of raising
        # IndexError.
        elif text[:1].isupper():
            return word.capitalize()
        else:
            return word

    return replace


# Passing a callback as the replacement: each case-insensitive match is
# replaced by 'snake' recased to follow the matched text.
re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)

'UPPER SNAKE, lower snake, Mixed Snake'

In [5]:
# 2.7 Shortest-match (non-greedy) patterns
import re

# Capture whatever lies between double quotes; `*` is a greedy repetition.
str_pat = re.compile(r'"(.*)"')
text1 = 'Computer says "no."'
str_pat.findall(text1)

['no.']

In [11]:
text2 = 'Computer says "no." Phone says "yes."'
# Result discarded (not the cell's last expression): with the greedy
# pattern a single match runs from the first quote to the last one.
str_pat.findall(text2)
# `*?` makes the repetition non-greedy, so each quoted string is matched on its own.
str_pat = re.compile(r'"(.*?)"')
str_pat.findall(text2)

['no.', 'yes.']

In [17]:
# 2.8 Patterns that match across multiple lines

text2 = '''/* this is a
 multiline comment */'''
comment = re.compile(r'/\*((?:.|\n)*?)\*/')  # (?:.|\n) adds support for newlines inside the match
comment.findall(text2)

[' this is a\n multiline comment ']

In [22]:
# 2.9 Normalizing Unicode text

# s1 spells the n-with-tilde as the single code point U+00F1; s2 spells it as
# a plain 'n' followed by the combining tilde U+0303.
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'
s1, s2
import unicodedata

# English summary of the note in the string literal below: the first argument
# of normalize() selects the normalization form. NFC means characters should
# be fully composed (a single code point where possible); NFD means characters
# should be decomposed into multiple combining characters.
'''
normalize() 第一个参数指定字符串标准化的方式。
NFC表示字符应该是整体组成(比如可能的话就使用单一编码)，
NFD表示字符应该分解为多个组合字符表示。
'''
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
# Result discarded (not displayed): compares the two NFC forms.
t1 == t2
t3 = unicodedata.normalize('NFD', s1)
t4 = unicodedata.normalize('NFD', s2)
# ascii() escapes non-ASCII characters, making the code points of t3 visible.
print(ascii(t3))

'Spicy Jalapen\u0303o'


In [24]:
# 2.11 Stripping unwanted characters from strings
# English summary of the note in the string literal below: strip() removes
# characters from the beginning or end of a string; lstrip() and rstrip()
# do the removal from the left and from the right respectively.
'''
strip() 方法能用于删除开始或结尾的字符。
lstrip() 和 rstrip() 分别从左和从右执行删除操作
'''
s = ' hello     world \n'
# Only leading and trailing whitespace is removed; the inner run of spaces stays.
s = s.strip()
s

'hello     world'

In [29]:
import re

# Collapse every run of whitespace into a single space. The pattern is a raw
# string so that \s reaches the regex engine as written instead of relying on
# Python's deprecated handling of unrecognized string escapes.
re.sub(r'\s+', ' ', s)

'hello world'

In [31]:
# 2.12 Sanitizing and cleaning up text

s = 'pýtĥöñ\fis\tawesome\r\n'
# Translation table keyed by code point: tab and form feed become a space,
# carriage return is removed. str.maketrans() converts the one-character
# keys to their ordinals.
remap = str.maketrans({
    '\t': ' ',
    '\f': ' ',
    '\r': None,  # Deleted
})
a = s.translate(remap)
a

'pýtĥöñ is awesome\n'

In [34]:
import unicodedata
import sys

# Map every combining character to None so that translate() deletes it.
# range() excludes its stop value, hence the + 1: sys.maxunicode is itself
# a valid code point and must be examined too.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode + 1)
                         if unicodedata.combining(chr(c)))
# Decompose first so that accents become separate combining characters.
b = unicodedata.normalize('NFD', a)
b
b.translate(cmb_chrs)

'python is awesome\n'

In [35]:
# Map every Unicode decimal digit (category 'Nd') to the code point of the
# ASCII digit with the same value. range() excludes its stop value, hence
# the + 1: sys.maxunicode is itself a valid code point.
digitmap = {c: ord('0') + unicodedata.digit(chr(c))
            for c in range(sys.maxunicode + 1)
            if unicodedata.category(chr(c)) == 'Nd'}
len(digitmap)
# Arabic digits
x = '\u0661\u0662\u0663'
x.translate(digitmap)

'123'