# 2.1. Splitting Strings on Any of Multiple Delimiters


In [2]:
import re

# A separator is a semicolon, a comma, or a whitespace character, followed by
# any amount of additional whitespace.
line = 'asdf fjdk; afed, fjek,asdf, foo'
fields = re.compile(r'[;,\s]\s*').split(line)


# 2.2.Matching Text at the Start or End of a String

In [12]:
# str.startswith() and str.endswith() test the prefix or suffix of a string.
import os

filename = 'spam.txt'
filename.endswith('.txt')
filename.startswith('file:')

url = 'http://www.python.org'
url.startswith('http:')

# Which entries of the current directory are notebooks, and is there at
# least one of them?
filenames = os.listdir('.')
[entry for entry in filenames if entry.endswith('.ipynb')]
any(entry.endswith('.ipynb') for entry in filenames)

from urllib.request import urlopen

def read_data(name):
    """Return the contents of ``name``, which is either a URL or a local path.

    Names starting with ``http:``, ``https:`` or ``ftp:`` are fetched with
    ``urlopen`` and the raw response body is returned; any other name is
    opened as a local file in text mode and its full text is returned.
    """
    # startswith() accepts a tuple, so all prefixes are tested in one call.
    if name.startswith(('http:', 'https:', 'ftp:')):
        # Use the response as a context manager so the connection is closed
        # even when read() raises.
        with urlopen(name) as response:
            return response.read()
    with open(name) as f:
        return f.read()

# startswith() takes a str or a tuple of str, so a list of candidate
# prefixes has to be converted to a tuple first.
url = 'http://www.python.org'
choices = ['http:', 'ftp:']
prefixes = tuple(choices)
url.startswith(prefixes)

# prefix and suffix checking 


True

# 2.3.Matching Strings Using Shell Wildcard Patterns


In [17]:
from fnmatch import fnmatch, fnmatchcase

# fnmatch() uses the same case-sensitivity rule as the system's underlying
# filesystem.
fnmatch('foo.txt', '*.txt')
fnmatch('foo.txt', '?oo.txt')

names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
[candidate for candidate in names if fnmatch(candidate, 'Dat*.csv')]

addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]
# Wildcard patterns are not limited to filenames: keep the street addresses
# that end in ' ST'.
[street for street in addresses if fnmatchcase(street, '* ST')]


['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']

# 2.4.Matching and Searching for Text Patterns


In [31]:
import re

text = 'yeah, but no, but yeah, but no, but yeah'

# Literal text needs nothing more than comparisons and str methods.
text == 'yeah'      # exact match
text.find('no')     # location of the first occurrence

text1 = '11/27/2012'
text2 = 'Nov 27, 2012'

# Simple matching: \d+ means match one or more digits.
print('yes' if re.match(r'\d+/\d+/\d+', text1) else 'no')

# The same pattern, compiled once into a reusable pattern object.
datepat = re.compile(r'\d+/\d+/\d+')
print('yes' if datepat.match(text1) else 'no')

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat.findall(text)

# Capture groups give access to the individual fields of a date.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
m.group(0)
m.groups()

# findall() searches the whole text and returns every match; with groups in
# the pattern each match is a tuple of the captured fields.
for month, day, year in datepat.findall(text):
    print('{}-{}-{}'.format(year, month, day))


yes
yes
2012-11-27
2013-3-13


# 2.5. Searching and Replacing Text

In [37]:
import re
# month_abbr holds the abbreviated month names of the current locale;
# month_abbr[0] is the empty string.
from calendar import month_abbr

text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
# sub() performs substitutions; backslashed digits such as \3 refer to
# capture group numbers in the pattern.
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

# If the same substitution is performed repeatedly, compile the pattern first.
# It is defined in this cell so the code below does not depend on a `datepat`
# left in the kernel by an earlier cell.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')


def change_date(m):
    """Substitution callback: rewrite a month/day/year match as 'day Mon year'.

    ``m`` is a match object whose groups 1, 2 and 3 are the month, day and
    year. A month outside 1..12 has no abbreviation, so such a match is
    returned unchanged instead of raising IndexError (or producing an empty
    month name for 0).
    """
    month = int(m.group(1))
    if not 1 <= month <= 12:
        return m.group(0)
    mon_name = month_abbr[month]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))


datepat.sub(change_date, text)

# subn() additionally reports how many substitutions were made.
newtext, n = datepat.subn(r'\3-\1-\2', text)
print(newtext, n)

Today is 2012-11-27. PyCon starts 2013-3-13. 2


# 2.6. Searching and Replacing Case-Insensitive Text

In [41]:
text = 'UPPER PYTHON, lower python, Mixed Python'
# Case-insensitive text operations need the re module with re.IGNORECASE.
re.findall('python', text, flags=re.IGNORECASE)
re.sub('python', 'snake', text, flags=re.IGNORECASE)


def matchcase(word):
    """Return a re.sub() callback that inserts ``word`` in the case of the match.

    The callback returns ``word`` upper-cased when the matched text is all
    upper case, lower-cased when it is all lower case, capitalized when only
    its first character is upper case, and unchanged otherwise.
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Slice instead of indexing: a pattern that can match the empty
        # string would make matched[0] raise IndexError.
        if matched[:1].isupper():
            return word.capitalize()
        return word
    return replace


re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)


'UPPER SNAKE, lower snake, Mixed Snake'

# 2.7. Specifying a Regular Expression for the Shortest Match

In [45]:
# The pattern below identifies the longest possible match; the goal is to
# change it so that it finds the shortest possible match instead.
text1 = 'Computer says "no."'
text2 = 'Computer says "no." Phone says "yes."'

# The * operator in a regular expression is greedy.
str_pat = re.compile(r'\"(.*)\"')
str_pat.findall(text1)
str_pat.findall(text2)

# Adding the ? modifier after the * operator makes the match non-greedy.
str_pat = re.compile(r'\"(.*?)\"')
str_pat.findall(text2)

# In a pattern, the dot matches any character except a newline.
# ? forces the matching algorithm to look for the shortest possible match instead 

['no.', 'yes.']

# 2.8. Writing a Regular Expression for Multiline Patterns

In [53]:
# Goal: match C-style comments, including those that span several lines.
text1 = '/* this is a comment */'
text2 = '''/* this is a 
mutiline comment */
'''

# First attempt: the dot does not match a newline, so only the single-line
# comment is found.
comment = re.compile(r'/\*(.*?)\*/')
comment.findall(text1)
comment.findall(text2)

# Fix 1: add explicit support for newlines. (?:.|\n) is a non-capturing
# group, so the pattern still has a single capture group.
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
comment.findall(text2)

# Fix 2: keep the simple pattern and pass the re.DOTALL flag.
comment = re.compile(r'/\*(.*?)\*/', flags=re.DOTALL)
comment.findall(text2)

[' this is a \nmutiline comment ']

# 2.9. Normalizing Unicode Text to a Standard Representation

In [67]:
import unicodedata

# In Unicode, certain characters can be represented by more than one valid
# sequence of code points.
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'
s1 == s2
len(s1) == len(s2)

# Multiple representations are a problem for programs that compare strings,
# so bring both strings to one normal form first.
# NFC: fully composed.
t1, t2 = (unicodedata.normalize('NFC', raw) for raw in (s1, s2))
t1 == t2
print(ascii(t1))

# NFD: fully decomposed.
t1, t2 = (unicodedata.normalize('NFD', raw) for raw in (s1, s2))
t1 == t2
print(ascii(t1))

s = '\ufb01'  # a single character
unicodedata.normalize('NFD', s)
# With NFKD the combined letters are broken apart.
unicodedata.normalize('NFKD', s)

# combining() tests whether a character is a combining character; dropping
# those from the decomposed text removes the diacritical marks.
t1 = unicodedata.normalize('NFD', s1)
''.join(ch for ch in t1 if not unicodedata.combining(ch))

'Spicy Jalape\xf1o'
'Spicy Jalapen\u0303o'


'Spicy Jalapeno'

# 2.10. Working with Unicode Characters in Regular Expressions

In [71]:
# \d matches any Unicode digit character, not only ASCII 0-9.
import re
# Raw string: '\d' is not a valid escape in an ordinary string literal.
num = re.compile(r'\d+')
# ASCII digits
num.match('123')

# Arabic digits
num.match('\u0661\u0662\u0663')

pat = re.compile('stra\u00dfe', re.IGNORECASE)
# Written as an escape so the sharp s (U+00DF) cannot be confused with the
# look-alike Greek beta (U+03B2), which the pattern does not match.
s = 'stra\u00dfe'
pat.match(s)

# 2.11. Stripping Unwanted Characters from Strings

In [79]:
# strip() removes characters from the beginning and end of a string;
# lstrip() and rstrip() do the same from the left or right side only.
import re

# Whitespace stripping
s = '    hello world \n'
s.strip()
s.lstrip()
s.rstrip()

# Character stripping
t = '-----hello====='
t.lstrip('-')
t.strip('-=')

# For text in the middle of a string, use replace() or a regular expression
# substitution.
s.replace(' ', '')
# Raw string: '\s' is not a valid escape in an ordinary string literal.
re.sub(r'\s+', ' ', s)


' hello world '

# 2.12. Sanitizing and Cleaning Up Text

In [84]:
import sys
import unicodedata

s = 'pýtĥöñ\fis\tawesome\r\n'

# Step 1: turn tab and form feed into spaces and delete carriage returns
# (a table entry of None removes the character).
remap = {
    ord('\t'): ' ',
    ord('\f'): ' ',
    ord('\r'): None,
}
a = s.translate(remap)

# sys.maxunicode is itself the largest code point, so the range must run to
# sys.maxunicode + 1 to cover every code point.
all_code_points = range(sys.maxunicode + 1)

# Step 2: decompose the text, then delete every combining character.
cmb_chrs = dict.fromkeys(c for c in all_code_points if unicodedata.combining(chr(c)))
b = unicodedata.normalize('NFD', a)
b.translate(cmb_chrs)

# Translation table that maps all Unicode decimal digit characters to their
# equivalent in ASCII.
digitmap = {
    c: ord('0') + unicodedata.digit(chr(c))
    for c in all_code_points
    if unicodedata.category(chr(c)) == 'Nd'
}
digitmap

# Ar

{48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 1632: 48,
 1633: 49,
 1634: 50,
 1635: 51,
 1636: 52,
 1637: 53,
 1638: 54,
 1639: 55,
 1640: 56,
 1641: 57,
 1776: 48,
 1777: 49,
 1778: 50,
 1779: 51,
 1780: 52,
 1781: 53,
 1782: 54,
 1783: 55,
 1784: 56,
 1785: 57,
 1984: 48,
 1985: 49,
 1986: 50,
 1987: 51,
 1988: 52,
 1989: 53,
 1990: 54,
 1991: 55,
 1992: 56,
 1993: 57,
 2406: 48,
 2407: 49,
 2408: 50,
 2409: 51,
 2410: 52,
 2411: 53,
 2412: 54,
 2413: 55,
 2414: 56,
 2415: 57,
 2534: 48,
 2535: 49,
 2536: 50,
 2537: 51,
 2538: 52,
 2539: 53,
 2540: 54,
 2541: 55,
 2542: 56,
 2543: 57,
 2662: 48,
 2663: 49,
 2664: 50,
 2665: 51,
 2666: 52,
 2667: 53,
 2668: 54,
 2669: 55,
 2670: 56,
 2671: 57,
 2790: 48,
 2791: 49,
 2792: 50,
 2793: 51,
 2794: 52,
 2795: 53,
 2796: 54,
 2797: 55,
 2798: 56,
 2799: 57,
 2918: 48,
 2919: 49,
 2920: 50,
 2921: 51,
 2922: 52,
 2923: 53,
 2924: 54,
 2925: 55,
 2926: 56,
 2927: 57,
 3046: 48,
 3047: 49,
 3048: 5

# 2.13. Aligning Text Strings


In [100]:
# Basic alignment: the ljust(), rjust(), and center() methods of strings.
text = 'Hello World'
text.ljust(20)
text.rjust(20)
text.center(20)

# Each of them takes an optional fill character.
text.rjust(20, '=')
text.center(20, '*')

# format() aligns with the <, > and ^ characters followed by a width.
format(text, '>20')
format(text, '<20')
format(text, '^20')

# A fill character goes in front of the alignment character.
format(text, '=>20s')
format(text, '*^20s')

# The same codes work inside str.format() fields and for non-string values.
'{:>10s} {:>10s}'.format('Hello', 'World')
x = 1.23456
format(x, '^10.2f')

'   1.23   '

# 2.14. Combining and Concatenating Strings

In [110]:
# The fastest way to combine a sequence of strings is the join() method of
# the separator string.
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
' '.join(parts)
','.join(parts)
''.join(parts)

# For a couple of strings the + operator is enough; note that every + or +=
# operation creates a new string object.
a = 'Is Chicago'
b = 'Not Chicago?'
a + ' ' + b

# Values that are not strings are converted on the fly while joining.
data = ['ACME', 50, 91.1]
','.join(map(str, data))
def sample():
    """Yield the four fragments 'Is', 'Chicago', 'Not', 'Chicago?' in order."""
    for fragment in ('Is', 'Chicago', 'Not', 'Chicago?'):
        yield fragment


# join() accepts any iterable of strings, including a generator.
''.join(sample())



'IsChicagoNotChicago?'

# 2.15. Interpolating Variables in Strings

In [114]:
s = '{name} has {n} messages.'
s.format(name='Guido', n=37)

# format_map() fills the fields from a mapping; vars() without an argument
# supplies the variables of the current namespace.
name, n = 'Guido', 37
s.format_map(vars())


class Info:
    """Simple record that stores ``name`` and ``n`` as instance attributes."""

    def __init__(self, name, n):
        self.name, self.n = name, n


# vars(obj) exposes the attributes of an instance as a mapping.
a = Info('Guido', 37)
s.format_map(vars(a))

'Guido has 37 messages.'

# 2.16. Reformatting Text to a Fixed Number of Columns

In [119]:
import textwrap  # reformats text for output

s = (
    "Look into my eyes, look into my eyes, the eyes, the eyes, "
    "the eyes, not around the eyes, don't look around the eyes, "
    "look into my eyes, you're under."
)

# Wrap to 70 columns, then to 40 columns with the first line indented, then
# to 40 columns with every line except the first indented.
print(textwrap.fill(s, width=70))
print(textwrap.fill(s, width=40, initial_indent='    '))
print(textwrap.fill(s, width=40, subsequent_indent='    '))

import os
import shutil

# shutil.get_terminal_size() is the high-level query: it falls back to a
# default size when no terminal is attached, whereas os.get_terminal_size()
# raises OSError in that case.
shutil.get_terminal_size().columns  # terminal width in columns

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.
    Look into my eyes, look into my
eyes, the eyes, the eyes, the eyes, not
around the eyes, don't look around the
eyes, look into my eyes, you're under.
Look into my eyes, look into my eyes,
    the eyes, the eyes, the eyes, not
    around the eyes, don't look around
    the eyes, look into my eyes, you're
    under.


132

# 2.17. Handling HTML and XML Entities in Text

In [126]:
import html

s = 'Elements are written as "<tag>text</tag>".'
# Show the raw text, the fully escaped text, and the text escaped with
# quote escaping disabled.
print(s, html.escape(s), html.escape(s, quote=False), sep='\n')

# Emit text as ASCII: characters outside ASCII become character references.
s = 'Spicy Jalapeño'
s.encode('ascii', 'xmlcharrefreplace')

# The opposite direction: replace entities in the text by their characters.
s = 'Spicy &quot;Jalape&#241;o&quot.'
html.unescape(s)


Elements are written as "<tag>text</tag>".
Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".


'Spicy "Jalapeño".'

# 2.18. Tokenizing Text