# 2.字符串和文本

## 2.1针对任意的分隔符来拆分字符

In [1]:
line = 'ask  fu,ke; liu, kang'
import re
re.split(r'[,;\s]\s+', line)

['ask', 'fu,ke', 'liu', 'kang']

分隔符是逗号、分号或空白字符，并且其后必须至少跟一个空白字符（`\s+` 表示一个或多个，且为贪婪匹配），因此 'fu,ke' 中紧跟字母的逗号不会被拆分。

In [2]:
# 当使用re.split时候，注意捕获组是否包含在了括号中：如果含的话，捕获组也会在列表中
re.split(r'(;|,|\s)\s+', line)

['ask', ' ', 'fu,ke', ';', 'liu', ',', 'kang']

In [3]:
re.split(r'(?:;|,|\s)\s+', line) # 参考2.8

['ask', 'fu,ke', 'liu', 'kang']

## 2.2在字符串开头和结尾处做文本匹配

In [4]:
filename = 'span.txt'

In [5]:
filename.endswith('.txt')

True

In [6]:
filename.startswith('span')

True

In [7]:
'http://'.startswith('http:')

True

In [8]:
# 如果需要同时针对多项进行检查，只需要给startswith()或endswith()提供包含可能选项的元组即可
import os
file = os.listdir('.')

In [9]:
file

['.ipynb_checkpoints',
 '1-Copy1.第一章数据结构和算法.ipynb',
 '1.第一章数据结构和算法.ipynb',
 '1.第一章数据结构和算法.md',
 '1.第一章数据结构和算法.py',
 'cookbook.jpg',
 'notebook.tex',
 'untitled.txt',
 '新建文件夹',
 '第2章.字符串和文本.ipynb']

In [10]:
[n for n in file if n.endswith(('.txt','.ipynb'))] # (('.txt', '.ipynb'))

['1-Copy1.第一章数据结构和算法.ipynb',
 '1.第一章数据结构和算法.ipynb',
 'untitled.txt',
 '第2章.字符串和文本.ipynb']

In [11]:
# 切片也可以完成类似的操作
filename

'span.txt'

In [12]:
filename[-4:] == '.txt'

True

In [13]:
# 使用这个方法来检测目录中是否有相关文件
if any(name.endswith('.py') for name in os.listdir(r'C:\Users\李博\Documents')):
    print('yes')

In [14]:
any(name.endswith('ts') for name in os.listdir(r'C:\Users\李博\Documents'))

True

## 2.3使用shell通配符做文本检测

In [15]:
from fnmatch import fnmatch, fnmatchcase
fnmatch('123.txt', '*.txt')

True

In [16]:
fnmatch('123.txt', '?23.txt')

True

In [17]:
fnmatch('1s.txt', '[1-9]*.txt')

True

In [18]:
# 在windows系统中
fnmatch('123.txt', '*.TxT')

True

In [19]:
# 如果要使用严格的大小写匹配，那么：
fnmatchcase('123.txt', '*.TXT')

False

上面这两个函数的功能介于简单的字符串方法和强大的正则表达式之间，有时能给我们提供许多便利。

## 2.4文本模式的匹配和查找

In [20]:
txt = '世界，你好 hello world'

In [21]:
txt.find('你好')

3

In [22]:
txt.find('he')

6

In [23]:
# 复杂一点的匹配
import re

date = '2018/12/28'
date2 = 'Dec 28, 2018'

In [24]:
re.match(r'\d*/\d*/\d*', date)

<_sre.SRE_Match object; span=(0, 10), match='2018/12/28'>

In [25]:
print(re.match(r'\d*/\d*/\d*', date2))

None


match（）总是从开头处匹配，并返回一个匹配对象。如果想要在整个文本中搜寻特定字符，可以使用findall（）方法

In [26]:
date3 = date2 + date
date3

'Dec 28, 20182018/12/28'

In [27]:
re.findall(r'\d*/\d*/\d*', date3)

['20182018/12/28']

**如果使用了捕获组（），那么我们可以对括号内的内容进行提取**

In [28]:
date = '如果我们123使用了捕获组（），我们321那么我们可以对括号内的内容进行提取'

# Raw string: in a plain literal '\d' is an invalid string escape (a warning today,
# an error in future Python versions). Compare with r'(\d*).*?(\d*)': `*` also
# matches zero occurrences, so that variant returns empty-string groups.
re.findall(r'(\d+).*?(\d+)', date)

[('123', '321')]

In [29]:
# Raw string so that \d reaches the regex engine instead of being an invalid string escape.
result = re.search(r'(\d+).*?(\d+)', date)

In [30]:
result.groups()

('123', '321')

In [31]:
result.group(1) # 从1开始计数

'123'

## 2.5查找替换文本

In [32]:
txt = 'yes but no ,not but yes, do you do'

In [33]:
txt.replace('no','not')

'yes but not ,nott but yes, do you do'

In [34]:
# 对于复杂的文本，可以使用正则表达式re
text = 'today is 2018/12/28, tomorrow is balabal'
# Raw string so that \d reaches the regex engine instead of being an invalid string escape.
re.sub(r'\d+/\d+/\d+', 'tomorrow', text)

'today is tomorrow, tomorrow is balabal'

In [35]:
# Both pattern and replacement are raw strings; \3-\2-\1 reorders the captured groups.
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\2-\1', text)

'today is 28-12-2018, tomorrow is balabal'

## 2.6以不区分大小写的方式做查找和替换

In [36]:
txt = 'yes but no ,not but yes, Do you Do'
re.findall(r'do',txt, flags=re.IGNORECASE) # ignore-case

['Do', 'Do']

In [37]:
re.sub('Do', 'did', txt, flags=re.IGNORECASE)

'yes but no ,not but yes, did you did'

In [38]:
# 如果在替换过程中，想要维持原来位置的大小写：

def matchtcase(word):
    """Build a ``re.sub`` replacement callback that mirrors the case of each match.

    Args:
        word: The replacement text (its own case is irrelevant unless the
            matched text has no recognisable case pattern).

    Returns:
        A function that takes a match object and returns ``word`` upper-cased,
        lower-cased or capitalized to follow the matched text. For any other
        case pattern (e.g. ``'dO'``) ``word`` is returned unchanged.
    """
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        if text.islower():
            return word.lower()
        # text[:1] instead of text[0] so an empty match cannot raise IndexError.
        if text[:1].isupper():
            return word.capitalize()
        # Fallback: re.sub requires a string from the callback; returning
        # None here would make it raise TypeError.
        return word
    return replace

In [39]:
re.sub('Do', matchtcase('did'), txt, flags=re.IGNORECASE)

'yes but no ,not but yes, Did you Did'

In [40]:
matchtcase('did')

<function __main__.matchtcase.<locals>.replace>

## 2.7非贪婪模式的匹配

In [41]:
import re
re.findall('"(.*)"', '"computer" said "no."')

['computer" said "no.']

In [42]:
re.findall('"(.*?)"', '"computer" said "no."')

['computer', 'no.']

## 2.8换行符的匹配

(?:pattern)	
匹配 pattern 但不获取匹配结果，也就是说这是一个非获取匹配，不进行存储供以后使用。这在使用 "或" 字符 (|) 来组合一个模式的各个部分是很有用。例如， 'industr(?:y|ies) 就是一个比 'industry|industries' 更简略的表达式。

In [43]:
txt = '''Hello
World'''

In [44]:
p = re.compile(r'H(.*?)d')
p.findall(txt)

[]

In [45]:
p = re.compile(r'H((.|\n)*?)d')
p.findall(txt)

[('ello\nWorl', 'l')]

In [46]:
p = re.compile(r'H((?:.|\n)*?)d')
p.findall(txt)

['ello\nWorl']

re.DOTALL可以更好的解决这个问题

In [47]:
p = re.compile(r'H(.*?)d', re.DOTALL)
p.findall(txt)

['ello\nWorl']

## 2.9将Unicode文本统一为规范形式

In [48]:
s1 = 'english is Jalape\u00f1o'
s2 = 'english is Jalapen\u0303o'

In [49]:
print(s1)

english is Jalapeño


In [50]:
print(s2)

english is Jalapeño


In [51]:
s1 == s2

False

In [52]:
print(len(s1), len(s2))

19 20


In [53]:
# unicodedata can bring both spellings to one canonical form
import unicodedata
t1 = unicodedata.normalize('NFC', s1)
# Normalize s2 (the decomposed spelling) here; normalizing s1 twice would make
# the comparison below trivially true and demonstrate nothing.
t2 = unicodedata.normalize('NFC', s2)
t1 == t2

True

In [54]:
ascii(t1)

"'english is Jalape\\xf1o'"

In [55]:
ascii(t2)

"'english is Jalape\\xf1o'"

In [56]:
t3 = unicodedata.normalize('NFD', s1)
# Normalize s2 as well, so the comparison checks that both spellings decompose identically.
t4 = unicodedata.normalize('NFD', s2)
t3 == t4

True

In [57]:
ascii(t3)

"'english is Jalapen\\u0303o'"

In [58]:
# normalize()的第一个参数指定如何完成规范表示：NFC表示字符应该是完全组合的（尽可能使用单个码点），NFD表示字符应该是完全分解的（使用基字符加组合字符）

## 2.10用正则表达式处理Unicode字符

In [59]:
#略

## 2.11从字符串中去掉不需要的字符

In [60]:
s = " hello     word\n"

In [61]:
s.strip()

'hello     word'

In [62]:
print(s.lstrip(), s.rstrip())

hello     word
  hello     word


In [63]:
s.replace(" ", "")

'helloword\n'

生成器表达式：

In [64]:
gen = (n*2 for n in s )

In [65]:
gen

<generator object <genexpr> at 0x0000022F4656A9E8>

## 2.12文本的过滤和清理

In [66]:
s = "python\fis\tawesoñe\r\n"  # english is Jalapeño

In [67]:
s

'python\x0cis\tawesoñe\r\n'

In [68]:
remap = {
    ord('\t'): ' ',
    ord('\f'): ' ',
    ord('\r'): None
}

In [69]:
a = s.translate(remap)

In [70]:
a

'python is awesoñe\n'

In [71]:
ord("\n") # ??

10

利用这种映射思想可以把所有的Unicode组合字符（例如变音符号）都去掉

In [72]:
import unicodedata
import sys
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))
# 这里每一个Unicode字符组合都映射了一个none的字典
b = unicodedata.normalize('NFD', a)
# 这里使用了组合形式表示字符（NFD，参考上面），下步将组合字符中的特殊字符去除

In [73]:
b

'python is awesoñe\n'

In [74]:
b.translate(cmb_chrs)

'python is awesone\n'

In [75]:
ascii('国家鐹矌~')

"'\\u56fd\\u5bb6\\u9439\\u77cc~'"

## 2.13对齐文本字符串

In [76]:
text = "******"

In [77]:
text.rjust(10)

'    ******'

In [78]:
text.ljust(10)

'******    '

In [79]:
text.rjust(10, '=')

'====******'

In [80]:
# format()左、中、右对齐
print(format(text,'>20'))
print(format(text,'<20'))
print(format(text,'^20'))
print(format(text,'-^20'))

              ******
******              
       ******       
-------******-------


In [81]:
"{:<10}{:<20}".format('hello','world')

'hello     world               '

In [82]:
# format好处在于，不仅针对字符串
format(1.23, '^10.3f')

'  1.230   '

## 2.14字符串的连接和合并

In [83]:
".".join(str(i) for i in range(10))

'0.1.2.3.4.5.6.7.8.9'

In [84]:
".".join(i for i in range(10))

TypeError: sequence item 0: expected str instance, int found

## 2.15给字符串的变量名做插值处理

In [85]:
s = "my {name} is {sth}"
s.format(name='name',sth='libo')

'my name is libo'

In [86]:
s = "my {name} is {sth}"
name = 'name'
sth = 'libo'
s.format_map(vars())

'my name is libo'

In [87]:
# 其实vars()也能应用在类上. 即某个实例.name = 'name', .sth = 'libo'

In [88]:
type(vars())

dict

## 2.16固定宽度打印文本

In [89]:
s = '''
TypeError                                 Traceback (most recent call last)
<ipython-input-224-a6334a44ce45> in <module>()
----> 1 ".".join(i for i in range(10))

TypeError: sequence item 0: expected str instance, int found
'''

In [90]:
import textwrap
print(textwrap.fill(s, 20))

 TypeError
Traceback (most
recent call last)
<ipython-input-224-a
6334a44ce45> in
<module>() ----> 1
".".join(i for i in
range(10))
TypeError: sequence
item 0: expected str
instance, int found


In [91]:
import textwrap
print(textwrap.fill(s, 100))

 TypeError                                 Traceback (most recent call last) <ipython-
input-224-a6334a44ce45> in <module>() ----> 1 ".".join(i for i in range(10))  TypeError: sequence
item 0: expected str instance, int found


## 2.17在文本中处理HTML和XML实体
我们想要将&entity;和&#code;这样的HTML或者XML实体替换为它们相应的文本，或者，我们需要生成文本，但是对特定的字符（比如<>&）做转义处理

In [92]:
s = 'Elenments are written as "<targ>text</targ>".'

In [93]:
import html
print(s)

Elenments are written as "<targ>text</targ>".


In [94]:
print(html.escape(s))

Elenments are written as &quot;&lt;targ&gt;text&lt;/targ&gt;&quot;.


In [95]:
print(html.escape(s, quote=False))

Elenments are written as "&lt;targ&gt;text&lt;/targ&gt;".


## 2.18文本分词
我们有一个字符串，想要从左往右解析为标记流（stream of tokens）

In [96]:
text = "foo = 23 +42 * 13"

要对字符串做分词处理，需要做的不仅仅是匹配模式。**我们还需要某种方法来识别出模式的类型。例如：**

In [97]:
token = [ ("NAME", "foo"),("EQ", "="),("NUM","23"),("PLUS","+"),("NUM","42"),("TIMES","*"),("NUM","13") ]

要完成这样的分词工作，第一步是定义出所有可能的标记：可以使用正则表达式中的命名捕获组来实现

In [98]:
import re

NAME = r"(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)"
NUM = r"(?P<NUM>\d+)"
PLUS = r"(?P<PLUS>\+)"
TIMES = r"(?P<TIMES>\*)"
EQ = r"(?P<EQ>=)"
WS = r"(?P<WS>\s+)"

master_pat = re.compile("|".join([NAME, NUM, PLUS, TIMES, EQ, WS]))

调用模式对象master_pat的scanner（）方法完成分词。

In [99]:
scanner = master_pat.scanner("foo =42")

In [100]:
scanner.match()

<_sre.SRE_Match object; span=(0, 3), match='foo'>

In [101]:
print(_.lastgroup, _.group())
print(_)

NAME foo
<_sre.SRE_Match object; span=(0, 3), match='foo'>


In [102]:
scanner.match()

<_sre.SRE_Match object; span=(3, 4), match=' '>

In [103]:
print(_.lastgroup, _.group())
print(_)

WS  
<_sre.SRE_Match object; span=(3, 4), match=' '>


In [104]:
scanner.match()

<_sre.SRE_Match object; span=(4, 5), match='='>

In [105]:
_.lastgroup, _.group()

('EQ', '=')

In [106]:
scanner.match()

<_sre.SRE_Match object; span=(5, 7), match='42'>

In [107]:
_.lastgroup, _.group()
'''
while 1:
    try:
        a = scanner.match()
        print(a.lastgroup, a.group())
    except:
        break

'''

'\nwhile 1:\n    try:\n        a = scanner.match()\n        print(a.lastgroup, a.group())\n    except:\n        break\n\n'

**要使得上面的代码可批量化操作，我们可以做些清理工作然后轻松地将其包含在一个生成器函数中：**

In [108]:
from collections import namedtuple
import re

text = "foo = 23 +42 * 13"
NAME = r"(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)"
NUM = r"(?P<NUM>\d+)"
PLUS = r"(?P<PLUS>\+)"
TIMES = r"(?P<TIMES>\*)"
EQ = r"(?P<EQ>=)"
WS = r"(?P<WS>\s+)" # 命名，可以使用调用group(NAME)来调用，替代group(1)....

master_pat = re.compile("|".join([NAME, NUM, PLUS, TIMES, EQ, WS]))
Token = namedtuple('Token', ['type', 'value'])

def generate_tokens(pat, text):
    """Yield the tokens found in ``text`` by the compiled pattern ``pat``.

    Matching starts at the beginning of ``text`` and continues from the end of
    the previous match; it stops at the first position where ``pat`` does not
    match (or at the end of the text).

    Each match is yielded twice, to show two equivalent representations:
    first as a ``Token`` named tuple, then as a plain dict with the same
    ``type`` / ``value`` fields.
    """
    # scanner.match is a callable: every call returns the next match object,
    # or None once nothing matches at the current position.
    next_match = pat.scanner(text).match
    while True:
        found = next_match()
        if found is None:
            break
        kind = found.lastgroup
        lexeme = found.group()
        yield Token(kind, lexeme)
        yield {"type": kind, "value": lexeme}

In [109]:
# example
for tok in generate_tokens(master_pat, "foo = 42"):
    print(tok)

Token(type='NAME', value='foo')
{'type': 'NAME', 'value': 'foo'}
Token(type='WS', value=' ')
{'type': 'WS', 'value': ' '}
Token(type='EQ', value='=')
{'type': 'EQ', 'value': '='}
Token(type='WS', value=' ')
{'type': 'WS', 'value': ' '}
Token(type='NUM', value='42')
{'type': 'NUM', 'value': '42'}


附注：

In [110]:
# Python namedtuple(命名元组)使用实例

import collections
 
MyTupleClass = collections.namedtuple('MyTupleClass',['name', 'age', 'job'])
obj = MyTupleClass("Tomsom",12,'Cooker')
print(obj.name)
print(obj.age)
print(obj.job)

Tomsom
12
Cooker


## 2.19编写一个递归下降解析器

有了前面的基础可以构建一个***递归下降的表达式计算器***

In [111]:
import re
import collections

#Token specification
NUM = r"(?P<NUM>\d+)"
PLUS = r"(?P<PLUS>\+)"
MINUS = r"(?P<MINUS>\-)"
TIMES = r"(?P<TIMES>\*)"
DIVIDE = r"(?P<DIVIDE>/)"
LPAREN = r"(?P<LPAREN>\()"
RPAREN = r"(?P<RPAREN>\))"
WS = r"(?P<WS>\s+)"

# Built once rather than on every call. WS must be part of the pattern:
# without it the scanner finds no match at the first whitespace character and
# tokenizing silently stops there.
_EXPR_PAT = re.compile(
    "|".join([NUM, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN, WS])
)
_ExprToken = collections.namedtuple("Token", ["type", "value"])


def generate_tokens(text):
    """Yield the arithmetic tokens of ``text`` from left to right.

    Args:
        text: Expression source, e.g. ``"(1 + 1) * 3"``.

    Yields:
        Named tuples with fields ``type`` (one of NUM, PLUS, MINUS, TIMES,
        DIVIDE, LPAREN, RPAREN) and ``value`` (the matched text). Whitespace
        is recognised but not yielded. Scanning stops at the first character
        that is not part of any token.
    """
    scanner = _EXPR_PAT.scanner(text)
    # iter(callable, sentinel): call scanner.match() until it returns None.
    for m in iter(scanner.match, None):
        if m.lastgroup != "WS":
            yield _ExprToken(m.lastgroup, m.group())

In [112]:
class ExpressionEvaluator:
    """Recursive-descent evaluator for arithmetic expressions.

    The methods implement this grammar, one method per rule:

        expr   ::= term { ('+' | '-') term }
        term   ::= factor { ('*' | '/') factor }
        factor ::= NUM | '(' expr ')'

    State kept on the instance while parsing:
        token   -- iterator of tokens (objects with ``type`` and ``value``)
        tok     -- the token most recently consumed
        nexttok -- the look-ahead token, or None at end of input
    """

    def parse(self, text):
        """Tokenize ``text`` with ``generate_tokens`` and return its value."""
        self.token = generate_tokens(text)
        self.tok = None
        self.nexttok = None
        self._advance()  # load the first look-ahead token
        return self.expr()

    def _advance(self):
        """Move one token forward: the look-ahead becomes the current token."""
        self.tok, self.nexttok = self.nexttok, next(self.token, None)

    def _accept(self, toktype):
        """Consume the look-ahead token if its type is ``toktype``.

        Returns True when a token was consumed, False otherwise (including at
        end of input).
        """
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        return False

    def _expect(self, toktype):
        """Consume a token of type ``toktype`` or raise SyntaxError."""
        if not self._accept(toktype):
            raise SyntaxError("Expected " + toktype)

    def expr(self):
        """Evaluate ``term { ('+' | '-') term }`` and return the result."""
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type  # the operator just consumed by _accept
            right = self.term()
            if op == "PLUS":
                exprval += right
            elif op == "MINUS":
                exprval -= right
        return exprval

    def term(self):
        """Evaluate ``factor { ('*' | '/') factor }`` and return the result.

        Division is true division, so the result may be a float.
        """
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type  # the operator just consumed by _accept
            right = self.factor()
            if op == "TIMES":
                termval *= right
            elif op == "DIVIDE":
                termval /= right
        return termval

    def factor(self):
        """Evaluate ``NUM | '(' expr ')'`` and return the result.

        Raises:
            SyntaxError: if the look-ahead token is neither a number nor an
                opening parenthesis, or a closing parenthesis is missing.
        """
        if self._accept("NUM"):
            return int(self.tok.value)
        if self._accept("LPAREN"):
            exprval = self.expr()
            self._expect("RPAREN")
            return exprval
        raise SyntaxError('Expected NUMBER or LPAREN')

In [113]:
a = ExpressionEvaluator()
a.parse("(1+1)*3")

6

## 2.20在字节串上执行文本操作

字节串的操作和文本字符的內建操作基本一致

In [114]:
date = b"hello world"

In [115]:
date[0:3]

b'hel'

In [116]:
date = bytearray(b"hello world") # 字节组
date[0:5]

bytearray(b'hello')

In [117]:
data = b"foo:bar,span"
import re
re.findall(r"[:|,]",data)

TypeError: cannot use a string pattern on a bytes-like object

In [118]:
import re
re.findall(b"[:|,]",data)

[b':', b',']

In [119]:
# 区别
data = b"foo:bar,span"

In [120]:
data[0]

102

In [121]:
data[1]

111

In [122]:
print(date)

bytearray(b'hello world')


In [123]:
print(data)

b'foo:bar,span'


In [124]:
print(data.decode("utf-8"))

foo:bar,span


In [125]:
data = b"李嬷纆"
print(data)

SyntaxError: bytes can only contain ASCII literal characters. (<ipython-input-125-c085bd6538f5>, line 1)