# 第二章：字符串和文本 

##  使用多个界定符分割字符串

In [1]:
import re

# Fields are separated by a semicolon, a comma or whitespace, each of which
# may be followed by additional whitespace.
line = 'asdf fjdk; afed, fjek,asdf, foo'
fields = re.compile(r'[;,\s]\s*').split(line)
print(fields)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']


## 字符串开头或结尾匹配

In [2]:
import os

# Test a single name against a fixed suffix and a fixed prefix.
filename = 'spam.txt'
for matched in (filename.endswith('.txt'), filename.startswith('file:')):
    print(matched)

# List the current directory and keep only the notebook files.
filenames = os.listdir('.')
print(list(filter(lambda entry: entry.endswith('.ipynb'), filenames)))

True
False
['ch1.ipynb', 'ch2.ipynb']


## 用 Shell 通配符匹配字符串

In [3]:
from fnmatch import fnmatch, fnmatchcase
# Each pair is (name, shell-style wildcard pattern).
for candidate, pattern in (
    ('foo.txt', '*.txt'),
    ('foo.txt', '?oo.txt'),
    ('Dat45.csv', 'Dat[0-9]*'),
):
    print(fnmatch(candidate, pattern))
# fnmatch() follows the case-sensitivity rules of the underlying operating
# system. When that difference matters, use fnmatchcase() instead: it
# compares case exactly as the pattern is written.
print("------------------")
for matcher in (fnmatch, fnmatchcase):
    print(matcher('foo.txt', '*.TXT'))

True
True
True
------------------
True
False


## 字符串匹配和搜索

In [4]:
text = 'yeah, but no, but yeah, but no, but yeah'
# Simple literal checks need nothing more than the str methods.
print(text.startswith('yeah'))
print(text.endswith('yeah'))
print(text.find('no'))

# More complicated matching calls for regular expressions.
text1 = '11/27/2012'
text2 = 'Nov 27, 2012'
import re

# Patterns are written as raw strings so that \d reaches the regex engine
# untouched; in a plain literal it is an invalid escape sequence that
# current Python versions warn about.
for candidate in (text1, text2):
    print('yes' if re.match(r'\d+/\d+/\d+', candidate) else 'no')

# Precompile the pattern when it is going to be reused.
datepat = re.compile(r'\d+/\d+/\d+')
for candidate in (text1, text2):
    print('yes' if datepat.match(candidate) else 'no')

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print(datepat.findall(text))

# Parentheses in the pattern define capture groups that can be read back
# individually from the match object.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
print(m.group(1), m.group(2), m.group(3))
print(m.groups())

True
True
10
yes
no
yes
no
['11/27/2012', '3/13/2013']
11 27 2012
('11', '27', '2012')


##  字符串搜索和替换

In [5]:
import re
from calendar import month_abbr

# Plain literal replacement is a job for str.replace().
text = 'yeah, but no, but yeah, but no, but yeah'
print(text.replace('yeah','yep'))

# Backreferences in the replacement reorder the captured date parts.
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print(re.sub(r'(\d+)/(\d+)/(\d+)',r'\3-\1-\2',text))


def change_date(m):
    """Return the date captured by match *m* as ``day month-abbreviation year``.

    Group 1 is read as the month number, group 2 as the day and group 3 as
    the year.
    """
    month, day, year = m.group(1, 2, 3)
    return ' '.join((day, month_abbr[int(month)], year))


# A callable replacement receives each match object in turn.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
print(datepat.sub(change_date, text))

# subn() additionally reports how many substitutions were made.
newtext, n = datepat.subn(r'\3-\1-\2',text)
for item in (newtext, n):
    print(item)

yep, but no, but yep, but no, but yep
Today is 2012-11-27. PyCon starts 2013-3-13.
Today is 27 Nov 2012. PyCon starts 13 Mar 2013.
Today is 2012-11-27. PyCon starts 2013-3-13.
2


## 字符串忽略大小写的搜索替换

In [6]:
text = 'UPPER PYTHON, lower python, Mixed Python'
# flags=re.IGNORECASE makes both the search and the substitution ignore case.
for outcome in (
    re.findall('python', text, flags=re.IGNORECASE),
    re.sub('python','snake', text, flags=re.IGNORECASE),
):
    print(outcome)


def matchcase(word):
    """Return a replacement callback that recases *word* like the matched text.

    All-upper matches yield ``word.upper()``, all-lower matches yield
    ``word.lower()``, matches starting with an uppercase letter yield
    ``word.capitalize()``; anything else yields *word* unchanged.
    """
    def replace(m):
        found = m.group()
        if found.isupper():
            return word.upper()
        if found.islower():
            return word.lower()
        if found[0].isupper():
            return word.capitalize()
        return word
    return replace


print(re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE))

['PYTHON', 'python', 'Python']
UPPER snake, lower snake, Mixed snake
UPPER SNAKE, lower snake, Mixed Snake


## 最短匹配模式

In [7]:
text1 = 'Computer says "no."'
text2 = 'Computer says "no." Phone says "yes."'

# Greedy mode: "*" grabs as much as possible, so the match runs from the
# first quote to the last one.
str_pat = re.compile('\"(.*)\"')
for sample in (text1, text2):
    print(str_pat.findall(sample))

# Non-greedy mode: "*?" stops at the nearest closing quote.
str_pat = re.compile(r'\"(.*?)\"')
print(str_pat.findall(text2))

['no.']
['no." Phone says "yes.']
['no.', 'yes.']


## 多行匹配模式

In [9]:
# Without DOTALL, "." does not match a newline, so this pattern only finds
# comments that sit on a single line.
# The patterns are raw strings so that \* reaches the regex engine as
# written; in a plain literal it is an invalid escape sequence.
comment = re.compile(r'/\*(.*?)\*/')
text1 = '/* this is a comment */'
text2 = '''/* this is a
multiline comment */
'''
print(comment.findall(text1))
print(comment.findall(text2))

# Add newline support by matching "any character or a newline" explicitly.
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
print(comment.findall(text2))

# re.DOTALL makes "." match every character, newlines included.
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
print(comment.findall(text2))


[' this is a comment ']
[]
[' this is a\nmultiline comment ']
[' this is a\nmultiline comment ']


## 将 Unicode 文本标准化

In [19]:
import unicodedata

# The same text spelled two ways: a single precomposed code point versus
# a base letter followed by a combining mark.
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'
for spelling in (s1, s2):
    print(spelling, 'len:{}'.format(len(spelling)))
print('s1 == s2 : {}'.format(s1 == s2))

# NFC asks for fully composed characters, while NFD asks for characters
# decomposed into several combining pieces.
t1, t2 = (unicodedata.normalize('NFC', raw) for raw in (s1, s2))
print('t1 == t2 : {}'.format(t1 == t2))
print("t1 : ", t1)
print("t2 : ", t2)
print(ascii(t1))

t3, t4 = (unicodedata.normalize('NFD', raw) for raw in (s1, s2))
print('t3 == t4 : {}'.format(t3 == t4))
print("t3 : ", t3)
print("t4 : ", t4)
print(ascii(t3))

Spicy Jalapeño len:14
Spicy Jalapeño len:15
s1 == s2 : False
t1 == t2 : True
t1 :  Spicy Jalapeño
t2 :  Spicy Jalapeño
'Spicy Jalape\xf1o'
t3 == t4 : True
t3 :  Spicy Jalapeño
t4 :  Spicy Jalapeño
'Spicy Jalapen\u0303o'


##  在正则式中使用 Unicode

In [25]:
import re
# Raw string: "\d" in a plain literal is an invalid escape sequence.
# For str patterns \d matches any Unicode decimal digit, which is why the
# Arabic-Indic digits below match as well.
num = re.compile(r'\d+')
print('123:')
print(num.match('123'))
print('\u0661\u0662\u0663:')
print(num.match('\u0661\u0662\u0663'))

123:
<re.Match object; span=(0, 3), match='123'>
١٢٣:
<re.Match object; span=(0, 3), match='١٢٣'>


## 删除字符串中不需要的字符

In [8]:
s = ' hello world \n'
# strip() trims both ends, lstrip() only the left side and rstrip() only
# the right side.
for trim in (str.strip, str.lstrip, str.rstrip):
    print(ascii(trim(s)))

# An argument names the set of characters to remove instead of whitespace.
t = '-----hello====='
for trimmed in (t.lstrip('-'), t.strip('-=')):
    print(trimmed)

'hello world'
'hello world \n'
' hello world'
hello=====
hello


##  审查清理文本字符串

In [15]:
s = 'pýtĥöñ\fis\tawesome\r\n'
print(s)

# Turn tab and form feed into a plain space and drop carriage returns
# (a None value deletes the character).
remap = {
    ord('\t'): ' ',
    ord('\f'): ' ',
    ord('\r'): None,
}

a = s.translate(remap)
print(a)

import sys
import unicodedata

# Map every combining character to None so that translate() deletes it.
# sys.maxunicode is itself a valid code point, hence the "+ 1" to keep it
# inside the scanned range.
cmb_chrs = dict.fromkeys(
    c for c in range(sys.maxunicode + 1) if unicodedata.combining(chr(c))
)
# Decompose first so that accents become separate combining characters.
b = unicodedata.normalize('NFD', a)
print(b)
print(b.translate(cmb_chrs))

pýtĥöñis	awesome

pýtĥöñ is awesome

pýtĥöñ is awesome

python is awesome



##  字符串对齐

In [28]:
text = 'Hello World'
# Method one: the str alignment methods, optionally with a fill character.
print('方法一：')
for padded in (
    text.ljust(20),
    text.rjust(20),
    text.center(20),
    text.center(20, '-'),
):
    print(ascii(padded))
# Method two: the built-in format() with an alignment spec.
print('方法二：')
for spec in ('<20', '>20', '^20', '-^20'):
    print(ascii(format(text, spec)))
# Method three: alignment specs inside a str.format() template.
print('方法三：')
print(ascii('{:>10s} {:>10s}'.format('Hello', 'World')))

方法一：
'Hello World         '
'         Hello World'
'    Hello World     '
'----Hello World-----'
方法二：
'Hello World         '
'         Hello World'
'    Hello World     '
'----Hello World-----'
方法三：
'     Hello      World'


## 合并拼接字符串

In [30]:
# For a handful of concatenations the + operator is enough;
# join() is the convenient way to combine a whole sequence.
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
for separator in (' ', ','):
    print(separator.join(parts))


Is Chicago Not Chicago?
Is,Chicago,Not,Chicago?


## 字符串中插入变量

In [49]:
# A template with named fields that are filled in later.
s = '{name} has {n} messages.'
print(s.format(name='Guido', n = 37))

# With the values bound as variables, format_map() can read them straight
# out of the current namespace returned by vars().
name, n = 'Guido', 37
print(s.format_map(vars()))


class Info:
    """Plain record storing a name and a count as instance attributes."""

    def __init__(self, name, n):
        self.name, self.n = name, n


# vars() also works on an instance, exposing its attribute dictionary.
a = Info('Guido', 37)
print(s.format_map(vars(a)))

import sys


class safesub(dict):
    """dict whose missing keys render back as their own ``{key}`` placeholder."""

    def __missing__(self, key):
        return ''.join(('{', key, '}'))


def sub(text):
    """Fill the ``{name}`` fields of *text* from the caller's local variables.

    Fields with no matching variable are left in place as ``{name}``.
    """
    # sys._getframe(1) is the caller's stack frame; its f_locals attribute
    # holds the caller's local variables.
    caller_locals = sys._getframe(1).f_locals
    return text.format_map(safesub(caller_locals))


for template in (
    'Hello {name}',
    'You have {n} messages.',
    'Your favorite color is {color}',
):
    print(sub(template))

Guido has 37 messages.
Guido has 37 messages.
Guido has 37 messages.
Hello Guido
You have 37 messages.
Your favorite color is {color}


##  以指定列宽格式化字符串

In [63]:
import textwrap

s = ("Look into my eyes, look into my eyes, the eyes, the eyes, "
     "the eyes, not around the eyes, don't look around the eyes, "
     "look into my eyes, you're under.")
rule = '-' * 70

# Wrap at 70 columns, then at 40, then at 40 with an indented first line,
# and finally at 40 with every line after the first indented.
print(textwrap.fill(s, width=70))
print(rule)
print(textwrap.fill(s, width=40))
print(rule)
print(textwrap.fill(s, width=40, initial_indent='  '))
print(rule)
print(textwrap.fill(s, width=40, subsequent_indent=' '))

# The textwrap module is very handy when printing strings, especially when
# the output should adapt to the terminal width. os.get_terminal_size()
# returns the size of the terminal.

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.
----------------------------------------------------------------------
Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
----------------------------------------------------------------------
  Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
----------------------------------------------------------------------
Look into my eyes, look into my eyes,
 the eyes, the eyes, the eyes, not
 around the eyes, don't look around the
 eyes, look into my eyes, you're under.


## 在字符串中处理 html 和 xml

In [70]:
s = 'Elements are written as "<tag>text</tag>".'
import html
print(s)
# escape() replaces &, < and > and, unless quote=False, quote characters too.
print(html.escape(s))
print(html.escape(s, quote=False))

# Non-ASCII characters can be emitted as numeric character references.
s = 'Spicy Jalapeño'
print(s.encode('ascii', errors='xmlcharrefreplace'))

s = 'Spicy &quot;Jalape&#241;o&quot.'
from html.parser import HTMLParser
# HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
# supported replacement for decoding entities and character references.
print(html.unescape(s))

t = 'The prompt is &gt;&gt;&gt;'
from xml.sax.saxutils import unescape
print(unescape(t))

Elements are written as "<tag>text</tag>".
Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".
b'Spicy Jalape&#241;o'
Spicy "Jalapeño".
The prompt is >>>


##  字符串令牌解析

In [79]:
import re
# One named group per token type; the name of the group that matched is
# reported as the token's type.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'
master_pat = re.compile('|'.join((NAME, NUM, PLUS, TIMES, EQ, WS)))


def generate_tokens(pat, text):
    """Yield a ``Token(type, value)`` for each consecutive match of *pat* in *text*.

    ``type`` is the name of the group that matched and ``value`` is the
    matched text. Iteration ends at the first position where *pat* fails
    to match.
    """
    from collections import namedtuple

    Token = namedtuple('Token', ['type', 'value'])
    scanner = pat.scanner(text)
    found = scanner.match()
    while found is not None:
        yield Token(found.lastgroup, found.group())
        found = scanner.match()


for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


## 实现一个简单的递归下降分析器

```
expr ::= expr + term
| expr - term
| term
term ::= term * factor
| term / factor
| factor
factor ::= ( expr )
| NUM
```

In [80]:
import re
import collections

# One named group per token type; the name of the group that matched is
# reported as the token's type.
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'
master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
                                  DIVIDE, LPAREN, RPAREN, WS]))
Token = collections.namedtuple('Token', ['type', 'value'])


def generate_tokens(text):
    """Yield the non-whitespace tokens of *text* as ``Token(type, value)``.

    Raises:
        SyntaxError: if *text* contains a character that no token pattern
            matches. The error is raised when iteration reaches that
            character, rather than silently ending the token stream there.
    """
    scanner = master_pat.scanner(text)
    pos = 0  # End offset of the last successful match
    for m in iter(scanner.match, None):
        pos = m.end()
        if m.lastgroup != 'WS':
            yield Token(m.lastgroup, m.group())
    # The scanner gives up at the first unmatched character; reaching this
    # point before the end of the text means there is input left over.
    if pos != len(text):
        raise SyntaxError(
            'Unexpected character {!r} at position {}'.format(text[pos], pos))

class ExpressionEvaluator:
    """Recursive-descent evaluator for arithmetic on integer literals.

    Supports ``+``, ``-``, ``*``, ``/`` and parentheses. Each grammar rule
    is a method that consumes tokens through a one-token lookahead
    (``self.nexttok``) and returns the value of what it parsed.
    """

    def parse(self, text):
        """Evaluate the expression in *text* and return its value.

        Raises:
            SyntaxError: if the expression is malformed, including when
                tokens are left over after a complete expression
                (for example ``'2 3'`` or ``'2 + 3)'``).
        """
        self.tokens = generate_tokens(text)
        self.tok = None  # Last symbol consumed
        self.nexttok = None  # Next symbol tokenized
        self._advance()  # Load first lookahead token
        result = self.expr()
        # A valid expression uses up every token; anything still pending
        # is trailing input that would otherwise be silently ignored.
        if self.nexttok is not None:
            raise SyntaxError('Unexpected token ' + repr(self.nexttok.value))
        return result

    def _advance(self):
        """Advance one token ahead."""
        # nexttok becomes None once the token stream is exhausted.
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self, toktype):
        """Consume the next token if it is of type *toktype*.

        Returns True when a token was consumed, False otherwise.
        """
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        return False

    def _expect(self, toktype):
        """Consume the next token if it matches *toktype*, else raise SyntaxError."""
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)

    def expr(self):
        """expression ::= term { ('+'|'-') term }*"""
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            # _accept() has just stored the operator in self.tok.
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        """term ::= factor { ('*'|'/') factor }*"""
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                # True division: the result becomes a float.
                termval /= right
        return termval

    def factor(self):
        """factor ::= NUM | ( expr )"""
        if self._accept('NUM'):
            return int(self.tok.value)
        if self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        raise SyntaxError('Expected NUMBER or LPAREN')
            
def descent_parser():
    """Evaluate a few sample expressions and print each result."""
    evaluator = ExpressionEvaluator()
    for expression in ('2', '2 + 3', '2 + 3 * 4', '2 + (3 + 4) * 5'):
        print(evaluator.parse(expression))

descent_parser()

2
5
14
37


## 字节字符串上的字符串操作

In [87]:
import re

# The same operations work on an immutable byte string and on a mutable
# byte array.
for data in (b'Hello World', bytearray(b'Hello World')):
    print(data[0:5])
    print(data.startswith(b'Hello'))
    print(data.split())
    print(data.replace(b'Hello', b'Hello Cruel'))

data = b'FOO:BAR,SPAM'
re.split(b'[:,]',data) # Notice: pattern as bytes

b'Hello'
True
[b'Hello', b'World']
b'Hello Cruel World'
bytearray(b'Hello')
True
[bytearray(b'Hello'), bytearray(b'World')]
bytearray(b'Hello Cruel World')


[b'FOO', b'BAR', b'SPAM']