Splitting String on Any of Multiple Delimiters

In [4]:
#Problem: YOu need to split a string into fields, but the delimiters arent consistent throughout the string



line = 'asdf fjdk; afed, fjek,asdf,foo'

In [5]:
import re

In [6]:
re.split(r'[;,\s]\s*',line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [7]:
fields = re.split(r'(;|,|\s)\s*', line)
fields

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [9]:
values = fields[::2]
delimiters = fields[1::2] + ['']

In [10]:
delimiters

[' ', ';', ',', ',', ',', '']

In [11]:
#Reform the line using the same delimiters

''.join(v+d for v,d in zip(values, delimiters))

'asdf fjdk;afed,fjek,asdf,foo'

In [13]:
#If you dont want the separator characters in the result but still need to use parentheses to group parts of the regular expression pattern, make sure you use a noncapture group, specified as (?:...)



re.split(r'(?:,|;|\s)\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

Matching Text at the Start or End of a String

In [15]:
#Problem you need to check the start or end of a strin gfo r secific text patterns such as fileneame extensions, URL schems, and so on.


filename = 'spam.txt'
filename.endswith('.txt')


url = 'http://www.python.org'
url.startswith('http')

True

In [16]:
import os
filenames = os.listdir('.')

In [18]:
filenames

['.git', 'README.md', 'chap1.ipynb', '.vscode', '.gitignore', 'chap2.ipynb']

In [19]:
[name for name in filenames if name.endswith('.ipynb')]

['chap1.ipynb', 'chap2.ipynb']

In [20]:
any(name.endswith('.py') for name in filenames)

False

In [22]:
from urllib.request import urlopen


def read_data(name):
    if name.startswith(('http:', 'https:','ftp:')):
        return urlopen(name).read()
    else:
        with open(name) as f:
            return f.read()

In [25]:
choices = ['http', 'ftp']
url = 'http://www.python.org'
url.startswith(tuple(choices))

True

In [26]:
#One might be inclined to use regular expressions as an alternative. FOr examle

import re
url = 'http://www.python.org'
re.match('https:|http:|frp:', url)

<re.Match object; span=(0, 5), match='http:'>

In [55]:
#from https://docs.python.org/3/howto/regex.html
import re
p = re.compile(r'\bclass\b')
print(p.search("Fine class"))  #\b is a word limiter and matches only if there is the word
print(p.search("subclass"))

#Also if r was not used then \b would mean backspace character

#Groups

p = re.compile('(ab)*') #matches if only ab pattern is there

p.match('abababab').span()  #8 patterns found

p = re.compile('(a)b')
m = p.match('ab')
m.group()

m.group(0)


#subgroups are numberred form left to right from 1 upward. Groups can be nested to determine the number just count he opening parenthesis characters going from left to right


p = re.compile('(a(b)c)d') #groups are assigned based on the brackets

m = p.match('abcd')
m.group(0)
m.group(1)
m.group(2)   #three gruops will be there


#incase we want to know the groups

m.group(0,1,2)
#also
m.groups()

#For example the following Re detects dobled words in a stirng

p = re.compile(r'\b(\w+)\s+\1\b')
p.search('Paris in the the spring').group()


m = re.match("([abc])+", "abc")
m.groups()

m = re.match("(?:[abc])+", "abc")
m.groups()

#Search for a specific word


p = re.compile(r'(?P<word>\b\w+\b)')
m = p.search('((( Lots of punctuation)))')
m.group('word')




#Additonally, you can retrieve named groups as a dictionary with groupdict()


m.groupdict()   #shows the group dictionary


p = re.compile(r'\b(?P<word>\w+)\s+(?P=word)\b')
p.search('Paris in the the spring').group()



p = re.compile(r'\W+')

p.split('This is a test, short and sweet, of split().')



#if we specify maxsplit

p.split('This is a test, short and sweet, of split().', 3)



#incase you also need to separate the delimiters

p = re.compile(r'\W+')
p2 = re.compile(r'(\W+)') #group by words

p.split("This ... is a test")   #ignores symbols

p2.split("This ... is a tes")  #also takes on white spaces




re.split(r'[\W+]', 'Words, WOrds, WOrds')  #takes on whitespaces only

re.split(r'([\W]+)', 'Words,words,words')


#Search and replace

p = re.compile('(blue|white|red)')
p.sub('colour', 'blue socks and red shoes')
p.sub('color', 'blue socks and red shoes', count=1)  #only one occurence replaced



#subn() works the same but returns a 2 tuple containing the new string value and the number of replacements that were performed

p =  re.compile('(blue|white|red)')
p.subn('color', 'blue socks and red shoes')



#Empty matches are replaced only when thery are not adjacent to a previous empty match

p  = re.compile('x*')
p.sub('-','abxd')




#The following example the replacement functiontranslates decimals into hexadecimal

def hexrepl(match):
    "return the hex string for a decimal number"
    value = int(match.group())
    return hex(value)

p = re.compile(r'\d+')
p.sub(hexrepl, 'Call 6478 for printing, 987 for user code')



#common problems

#match() function only checks if the re matches at the beginning of the stirn gwhile search() will scan forward through the string for a match which will start at 0;


print(re.match('super', 'superstition').span())  #returns

print(re.match('super', 'insuperable'))


#on the other hand search() will sacn forward through the stirn greporting the first match if finds

print(re.search('super','superstition').span())
print(re.search('super', 'insuperable').span())



#Greedy versus NonGreedy


s = '<html><head><title>Title</title>'
len(s)
print(re.match('<.*>', s).span())
print(re.match('<.*>',s).group())


print(re.match('<.*>',s).group())


#using re.VERBOSE



<re.Match object; span=(5, 10), match='class'>
None
(0, 5)
None
(0, 5)
(2, 7)
(0, 32)
<html><head><title>Title</title>
<html><head><title>Title</title>


In [58]:
#Continuing from the cookbook

import re
url = 'http://www.python.org'
re.match(r'http:|https:|ftp:',url).group()

'http:'

Matching Strings Using Shell Wildcard patterns




In [61]:
from fnmatch import fnmatch, fnmatchcase

fnmatch('foo.txt','*.txt')
fnmatch('foo.txt','?oo.txt')
fnmatch('Dat45.csv','Dat[0-9]*')

True

In [63]:
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']

In [66]:
[name for name in names if fnmatch(name, 'Dat*.csv')]


#Also the case sensitivity is base on the system's underlying filesystem

fnmatch('foo.txt','*.TXT')




#if distinction matters then use fnmatchcase

fnmatchcase('foo.txt','*.TXT')

False

In [67]:
addresses = [
'5412 N CLARK ST',
'1060 W ADDISON ST',
'1039 W GRANVILLE AVE',
'2122 N CLARK ST',
'4802 N BROADWAY',
]

In [69]:
from fnmatch import fnmatchcase

[addr for addr in addresses if fnmatchcase(addr, '* ST')]

[addr for addr in addresses if fnmatchcase(addr, '10[0-9][0-9] W*')]


['1060 W ADDISON ST', '1039 W GRANVILLE AVE']

Matching and searching for text patterns

In [70]:
text = 'yeah, but no, but yeah, but no, but yeah'

In [71]:
text == 'yeah'

False

In [72]:
text.startswith('yeah')

True

In [73]:
text.find('no')

10

In [74]:
#for more complilcated matchinguser regular expressions and the re module. To illstrate the basic mechanics of usin regular expressions


text1 = '11/27/2022'
text2 = 'Nov 27, 2012'



import re
if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')



    

yes


In [76]:
#if you are going to perform a lot of matches using the same pattern it ussually pays to precompile the regular expression pattern into a pattern object firest. 


datapat = re.compile(r'\d+/\d+/\d+')
if datapat.match(text1):
    print('yes')
else:
    print('nein')

yes


In [19]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
#datapat.findall(text)

In [5]:
#We can also group it 
import re

datepat=  re.compile(r'(\d+)/(\d+)/(\d+)')

In [6]:
m = datepat.match('11/27/2022')

In [10]:
m.group(1)

'11'

In [15]:
for month,day,year in datepat.findall(text):
    print(f'{month}/{day}/{year}')

11/27/2012
3/13/2013


In [16]:
#if you want o find matches iteratively use the finditer() method instead.


for m in datepat.finditer(text):
    print(m.groups())

('11', '27', '2012')
('3', '13', '2013')


Searching and Replacing Text

In [17]:
#You want to search for and replace a text pattern in a string

text1 = 'yeah, but no, but yeah, but no, but yeah'
text1.replace('yeah', 'yep')

'yep, but no, but yep, but no, but yep'

In [20]:
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

#format the date

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [28]:
from calendar import month_abbr
def change_date(m):
    mon_name = month_abbr[int(m.group(1))]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))

In [29]:
datepat.sub(change_date,text)

'Today is 27 Nov 2012. PyCon starts 13 Mar 2013.'

In [24]:
m.group(3)

'2013'

In [25]:
m

<re.Match object; span=(34, 43), match='3/13/2013'>

Searching and Replacing Case Insensitive Text

In [70]:
text2 = 'UPPER PYTHON, lower python, Mixed Python'
re.findall('python', text2, flags=re.IGNORECASE)

['PYTHON', 'python', 'Python']

In [36]:
re.sub('python','snake',text2,flags=re.IGNORECASE)

#also the case doesnt match that of the replaced text

'UPPER snake, lower snake, Mixed snake'

In [84]:
#so we formulate a function

def matchcase(word):
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()

        elif text.islower():
            return word.lower()

        elif text[0].isupper():
            return word.capitalize()
        else:
            return word
    return replace


In [85]:
re.sub('python', matchcase('snake'), text2, flags=re.IGNORECASE)

'Computer says "no." Phone says "yes."'

In [54]:
m.group()

'3/13/2013'

Specifying a Regular Expression for the shortest match

In [74]:
str_pat = re.compile(r'\"(.*)\"')
text1  =  'Computer says "no."'
str_pat.findall(text1)

text2 = 'Computer says "no." Phone says "yes."'
str_pat.findall(text2)

['no." Phone says "yes.']

In [77]:
str_pat=  re.compile(r'\"(.*?)\"')  #The question mark makes it nongreedy

str_pat.findall(text2)

['no.', 'yes.']

2.8. Writing a Regular Expression for Multiline Patterns

In [2]:
import re
comment = re.compile(r'/\*(.*?)\*/')
text1 = '/* this is a comment */'
text2 = """/* this is a 
multiline comment */"""

In [5]:
comment.findall(text2)

[]

In [6]:
#to fix the problem

comment = re.compile(r'/\*((?:.|\n)*?)\*/')

In [7]:
comment.findall(text2)

[' this is a \nmultiline comment ']

In [8]:
#alson using dotall

comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
comment.findall(text2)

[' this is a \nmultiline comment ']

2.9. Normalizing Unicode Text to a Standard
Representation

In [2]:
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

s1, s2

s1==s2
len(s1),len(s2)

(14, 15)

In [3]:
import unicodedata
t1 = unicodedata.normalize('NFC',s1)
t2 = unicodedata.normalize('NFC',s2)

In [6]:
print(ascii(t1))

'Spicy Jalape\xf1o'


In [7]:
t3 = unicodedata.normalize('NFD', s1)
t4 = unicodedata.normalize('NFD', s2)

In [8]:
t3

'Spicy Jalapeño'

In [9]:
#NFC means the characters should be fully composed (ie use a single code point if possible)

#NFD means the characters should be fully decomposed with the use of combining characters

In [10]:
s = '\ufb01'  #A single character
s

'ﬁ'

In [13]:
unicodedata.normalize('NFKD',s)   #The combined letters are broken apart here

'fi'

In [15]:
t1 = unicodedata.normalize('NFD',s1)
''.join(c for c in t1 if not unicodedata.combining(c))

'Spicy Jalapeno'

2.10. Working with Unicode Characters in Regular
Expressions

In [18]:
import re
num = re.compile('\d+')

#ASCII digits

num.match('123')



#Arabic digits

num.match('\u0661\u0662\u0663')

<re.Match object; span=(0, 3), match='١٢٣'>

In [21]:
arabic= re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')

In [22]:
arabic

re.compile(r'[\u0600-ۿݐ-ݿࢠ-ࣿ]+', re.UNICODE)

In [23]:
pat = re.compile('stra\u00dfe', re.IGNORECASE)

In [24]:
s = 'straße'

In [25]:
pat.match(s)

<re.Match object; span=(0, 6), match='straße'>

In [26]:
pat.match(s.upper())

In [28]:
s.upper()

'STRASSE'

2.11. Stripping Unwanted Characters from Strings

In [29]:
s = '   Hello World    \n'
s.strip() 

'Hello World'

In [30]:
s.lstrip()

'Hello World    \n'

In [31]:
s.rstrip()

'   Hello World'

In [33]:
#Character stripping

t = '---------hello======'
t.lstrip('-')
t.rstrip('=')

'---------hello'

In [36]:
t.strip('-=') #strips both

'hello'

In [37]:

#Be aware that stripping does not apply to any text in the middle of a string

In [39]:
s = '  hello            world   \n'
s = s.strip()
s

'hello            world'

In [44]:
s.replace(' ',' ')

'hello            world'

In [47]:
#using re

import re
re.sub(r'\s+',' ',s)

'hello world'

2.12. Sanitizing and Cleaning Up Text

In [1]:
#Problem : Someone has entered “pýtĥöñ” into a form on your we page and youd like to clean it up somehow

In [7]:
s = 'pýtĥöñ\fis\tawesome\r\n'

In [3]:
s

'pýtĥöñ\x0cis\tawesome\r\n'

In [4]:
remap = {ord('\t') : ' ', ord('\f') : ' ', ord('\r') : None}

In [7]:
a = s.translate(remap)

In [8]:
a

'pýtĥöñ is awesome\n'

In [11]:
import unicodedata
import sys
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))

b = unicodedata.normalize('NFD', a)
b

'pýtĥöñ is awesome\n'

In [12]:
b.translate(cmb_chrs)

'python is awesome\n'

In [13]:
#As another example here is a translation table that maps all Unicode deciamal digit characters to their equivalent in ASCII

In [14]:
digitmap = {c : ord('0') + unicodedata.digit(chr(c)) for c in range(sys.maxunicode) if unicodedata.category(chr(c)) == 'Nd'}

In [15]:
len(digitmap)

660

In [16]:
#Arabic digits

x = '\u0661\u0662\u0663'

In [19]:
x.translate(digitmap)

'123'

In [20]:
a

'pýtĥöñ is awesome\n'

In [21]:
b = unicodedata.normalize('NFD',a)
b.encode('ascii', 'ignore').decode('ascii')

'python is awesome\n'

In [3]:
s = "hi iz right"
ch = {ord('i'):'e', ord('z'):'s'}
a = s.translate(ch)

In [9]:
def clean_spaces(s):
    s = s.replace('\r', '')
    s = s.replace('\t', ' ')
    s = s.replace('\f', ' ')
    return s

In [10]:
s

'pýtĥöñ\x0cis\tawesome\r\n'

In [12]:
sc = clean_spaces(s)

Aligning Text Strings

In [16]:
#You need to format text with some sort of alignnment applied

In [20]:
text = 'Hello World'
text.ljust(20),text.rjust(20)
text.center(29,'_')

'_________Hello World_________'

In [23]:
format(text,'>20')
format(text, '<20')
format(text, '*<20')

'Hello World*********'

In [25]:
'{:>10s} {:>10s}'.format('Hello','World')

'     Hello      World'

In [26]:
#The format method also works with integers and not only strings

In [29]:
#Older code

'%-20s' % ' Hello'

' Hello              '

Combining and Concatenating Stirngs

In [31]:
s = "This is a test script"
sl = s.split(' ') 

In [35]:
' '.join(sl)

'This is a test script'

In [36]:
''.join(sl)

'Thisisatestscript'

In [41]:
a = 'Hello ' 'World'

In [42]:
a

'Hello World'

In [None]:
#To lookout for

#Better method to concatenate multiple strings with a common separator

print( a, b, c, sep = ':')

In [45]:
#joining multiple strings without extracting them directly to the memory first

def sample():
    yield "New York"
    yield "Boston"
    yield "Chicago"
    yield "Milwaukee"

In [55]:
' '.join(sample())

'New York Boston Chicago Milwaukee'

In [56]:
#Also another way to do this is

for i in sample():
    '.'.join(i)

In [68]:
with open('textfile.txt','w+') as f:
    for i in sample():
        f.write(i+'\n')
        

In [88]:
import re
with open('textfile.txt','w+') as f:
    for i in enumerate(sample(), start=1):
        i = str(i)
        i = re.sub(r"[()]","",i)
        f.write(str(i)+'\n')

In [101]:
def combine(source, maxsize):
    parts = []
    size = 0
    for i in source:
        parts.append(i)
        size+= len(i)
        if size > maxsize:
            yield ' '.join(parts)
            parts = []
            size = 0
    yield ' '.join(parts)

In [111]:
f = open('textfile.txt', 'w+')


In [112]:
for part in combine(sample(), 11):
    f.write(part)

In [113]:
f.close()

Interpolating Variables in Strings

In [4]:
s = '{name} has {n} messages.'
s.format(name = 'Guidp',n = '37')

'Guidp has 37 messages.'

In [69]:
name = 'Guido'
n = 37

s.format_map(vars())

AttributeError: 'Template' object has no attribute 'format_map'

In [57]:
class Info:
    def __init__(self, name, n):
        self.name = name
        self.n = n

    def __repr__(self) -> str:
        return '{}{}'.format(self.__class__.__name__,(self.name, self.n))

In [58]:
a = Info('Guido',37)
a

Info('Guido', 37)

In [11]:
s.format_map(vars(a))

'Guido has 37 messages.'

In [60]:
#One downside fo format and format_map is that they do not deal gracefully with missing values. 

#s.format(name='Guido') #Fails

class Safesub(dict):
    def __missing__(self, key):
        return '{' + key + '}'

In [61]:
del n

In [62]:
s.format_map(Safesub(vars()))

'Guido has {n} messages.'

In [63]:
#If you find yourself frequently performing thest steps in your code you could hide the variable substitution process behind a small utility function that employs a so called "frame hack". For example

import sys

def sub(text):
    return text.format_map(Safesub(sys._getframe(1).f_locals))

In [67]:
sys._getframe(1).f_locals

{'self': <ipykernel.zmqshell.ZMQInteractiveShell at 0x7fd74d03a850>,
 'code_obj': <code object <module> at 0x7fd72aa59b30, file "/tmp/ipykernel_6600/1286078319.py", line 1>,
 'result': <ExecutionResult object at 7fd72a988310, execution_count=67 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7fd72a993950, raw_cell="sys._getframe(1).f_locals" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/home/susearc/Documents/github/Python_CookBook_Self/chap2.ipynb#Y265sZmlsZQ%3D%3D> result=None>,
 'async_': False,
 '__tracebackhide__': '__ipython_bottom__',
 'old_excepthook': <bound method IPKernelApp.excepthook of <ipykernel.kernelapp.IPKernelApp object at 0x7fd74fed7610>>,
 'outflag': True}

In [1]:
#Alternative string method is also inplace

import string
name = 'Guido'
n = 37
s = string.Template('$name has $n messages')
s.substitute(vars())

'Guido has 37 messages'

Reformatting Text to a Fixed Number of Columns

In [2]:
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

In [7]:
import textwrap
print(textwrap.fill(s,20,initial_indent='----'))

----Look into my
eyes, look into my
eyes, the eyes, the
eyes, the eyes, not
around the eyes,
don't look around
the eyes, look into
my eyes, you're
under.


Handling HTML and XML Entities in Text

In [8]:
#You want to replace HTML or XML entities such as &entity or &#code; with their corresponding text. Alternatively you need to produce text but escape certain characters

In [9]:
s = 'Elements are writtern as "<tag>text</tag>".'

In [12]:
import html
print(html.escape(s, quote=False))

Elements are writtern as "&lt;tag&gt;text&lt;/tag&gt;".


In [14]:
#If you are trying to emit text as ASCII and want to embed character cod eentitites for non Ascii charactersw you can use the errors = 'xmlcharrefreplace' argument to various I/O related functions to do it

In [16]:
s = 'Spicy Jalapeño'
s.encode('ascii', errors='xmlcharrefreplace')

b'Spicy Jalape&#241;o'

In [1]:
s = 'Spicy &quot;Jalape&#241;o&quot.'
from html.parser import HTMLParser



Tokenizing Text

In [4]:
text = 'foo = 23 + 42 * 10'
#The tokenized text will be like this
tokens = [('NAME', 'foo'), ('EQ','='), ('NUM', '23'),('PLUS','+'),('NUM', '42'), ('TIMES', '*'),('NUM', '10')]




In [5]:
#To do this we can do the following

import re
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-z_0-9]*)'
EQ = r'(?P<EQ>=)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
WS = r'(?P<WS>\s+)'

In [6]:
master_pat = re.compile('|'.join([NAME, EQ, NUM, PLUS, TIMES, WS]))

In [7]:
master_pat

re.compile(r'(?P<NAME>[a-zA-Z_][a-zA-z_0-9]*)|(?P<EQ>=)|(?P<NUM>\d+)|(?P<PLUS>\+)|(?P<TIMES>\*)|(?P<WS>\s+)',
           re.UNICODE)

In [8]:
scanner = master_pat.scanner('foo = 42')
scanner.match()

<re.Match object; span=(0, 3), match='foo'>

In [9]:
_.lastgroup

'NAME'

In [10]:
_.group()

AttributeError: 'str' object has no attribute 'group'

In [11]:
from collections import namedtuple

Token = namedtuple('Token',['type','value'])

In [12]:
def generate_tokens(pat, text):
    scanner = pat.scanner(text)
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())

In [13]:
for tok in generate_tokens(master_pat, 'foo=42'):
    print(tok)

Token(type='NAME', value='foo')
Token(type='EQ', value='=')
Token(type='NUM', value='42')


In [14]:
#The order of teh tokes in the master regular expression also matters. WHen matching re tries to match patterns in the order specified. Thus if a pattern happens to be a substring of a longer pattern. you need to make sure the longer pattern goes first

In [17]:
LT = r'(?P<LT><)'
LE = r'(?P<LE><=)'
EQ = r'(?P<EQ>=)'

In [19]:
master_pat = re.compile('|'.join([LT, LE, EQ]))

In [21]:
#Last but not least you need to wathc out for patterns that form substrings. For example suppose you have two patterns like this:

PRINT = r'(P<PRINT>print)'
NAME = r'(P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'

In [22]:
master_pat  =re.compile('|'.join([PRINT, NAME]))

In [23]:
for tok in generate_tokens(master_pat,'printer'):
    print(tok)


In [24]:
#You need to parse text according to a set of grammar rules and perform actions or build an abstract syntax tree representing the input. The grammar is small so youd prefer to just write the parser yourself as opposed to using some kind of framework.