Splitting String on Any of Multiple Delimiters

In [4]:
# Problem: you need to split a string into fields, but the delimiters aren't consistent throughout the string.



line = 'asdf fjdk; afed, fjek,asdf,foo'

In [5]:
import re

In [6]:
# Split on a semicolon, comma, or whitespace character, followed by any run of extra whitespace.
re.split(r'[;,\s]\s*',line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [7]:
# Wrapping the delimiter alternatives in a capture group makes re.split() return the matched delimiters as well as the values.
fields = re.split(r'(;|,|\s)\s*', line)
fields

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [9]:
# fields alternates value, delimiter, value, ...: even indexes hold the values, odd indexes the delimiters.
values = fields[::2]
# Pad with '' so that the delimiter list is as long as the value list.
delimiters = fields[1::2] + ['']

In [10]:
delimiters

[' ', ';', ',', ',', ',', '']

In [11]:
# Re-form the line using the captured delimiters (whitespace that followed a delimiter was not captured, so it is not restored)

''.join(v+d for v,d in zip(values, delimiters))

'asdf fjdk;afed,fjek,asdf,foo'

In [13]:
# If you don't want the separator characters in the result but still need parentheses to group parts of the regular expression pattern, use a non-capturing group, written as (?:...)



re.split(r'(?:,|;|\s)\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

Matching Text at the Start or End of a String

In [15]:
# Problem: you need to check the start or end of a string for specific text patterns, such as filename extensions, URL schemes, and so on.


filename = 'spam.txt'
filename.endswith('.txt')


url = 'http://www.python.org'
url.startswith('http')

True

In [16]:
import os
filenames = os.listdir('.')

In [18]:
filenames

['.git', 'README.md', 'chap1.ipynb', '.vscode', '.gitignore', 'chap2.ipynb']

In [19]:
[name for name in filenames if name.endswith('.ipynb')]

['chap1.ipynb', 'chap2.ipynb']

In [20]:
any(name.endswith('.py') for name in filenames)

False

In [22]:
from urllib.request import urlopen


def read_data(name):
    """Return the contents of ``name``, which may be a URL or a local path.

    Names starting with ``http:``, ``https:`` or ``ftp:`` are fetched with
    ``urlopen`` and the raw response body is returned (``bytes``). Any other
    name is opened as a local file in text mode and its contents are
    returned (``str``).
    """
    if name.startswith(('http:', 'https:', 'ftp:')):
        # Use the response as a context manager so the connection is closed
        # deterministically, even if read() raises.
        with urlopen(name) as response:
            return response.read()
    with open(name) as f:
        return f.read()

In [25]:
choices = ['http', 'ftp']
url = 'http://www.python.org'
url.startswith(tuple(choices))

True

In [26]:
# One might be inclined to use regular expressions as an alternative. For example:

import re
url = 'http://www.python.org'
# Alternation over the URL schemes of interest; re.match() anchors at the start of the string.
re.match('https:|http:|ftp:', url)

<re.Match object; span=(0, 5), match='http:'>

In [55]:
# From https://docs.python.org/3/howto/regex.html
import re
p = re.compile(r'\bclass\b')
print(p.search("Fine class"))  # \b is a word boundary, so 'class' matches only as a whole word
print(p.search("subclass"))

# Without the r prefix, '\b' in an ordinary string literal would mean the backspace character

# Groups

p = re.compile('(ab)*') # zero or more repetitions of the group 'ab'

p.match('abababab').span()  # (0, 8): all 8 characters (four repetitions of 'ab') are matched

p = re.compile('(a)b')
m = p.match('ab')
m.group()

m.group(0)


# Subgroups are numbered from left to right, from 1 upward. Groups can be nested; to determine the number, just count the opening parenthesis characters, going from left to right


p = re.compile('(a(b)c)d') # group numbers follow the order of the opening parentheses

m = p.match('abcd')
m.group(0)
m.group(1)
m.group(2)   # group 0 is the whole match; groups 1 and 2 are the two parenthesised subgroups


# group() accepts several group numbers at once and returns a tuple of the corresponding values

m.group(0,1,2)
# groups() returns a tuple with the text of every subgroup, from 1 upward
m.groups()

# For example, the following RE detects doubled words in a string (\1 refers back to whatever group 1 matched)

p = re.compile(r'\b(\w+)\s+\1\b')
p.search('Paris in the the spring').group()


# Capturing group: groups() reports what the group matched
m = re.match("([abc])+", "abc")
m.groups()

# Non-capturing group (?:...): same match, but nothing is recorded in groups()
m = re.match("(?:[abc])+", "abc")
m.groups()

# Named groups: (?P<name>...) lets a group be looked up by name as well as by number


p = re.compile(r'(?P<word>\b\w+\b)')
m = p.search('((( Lots of punctuation)))')
m.group('word')




# Additionally, you can retrieve named groups as a dictionary with groupdict()


m.groupdict()   # maps each group name to the text it matched


# (?P=word) is a backreference to the named group, so this pattern finds doubled words
p = re.compile(r'\b(?P<word>\w+)\s+(?P=word)\b')
p.search('Paris in the the spring').group()



# Splitting strings: split on every run of non-word characters
p = re.compile(r'\W+')

p.split('This is a test, short and sweet, of split().')



# If we specify maxsplit, at most that many splits are made and the rest of the string is returned as the final element

p.split('This is a test, short and sweet, of split().', maxsplit=3)



# In case you also need the delimiters: a capturing group makes split() return them too

p = re.compile(r'\W+')
p2 = re.compile(r'(\W+)')  # same pattern, wrapped in a capturing group

p.split("This ... is a test")   # delimiters are dropped

p2.split("This ... is a test")  # delimiters (including whitespace) are kept as list items




# The quantifier must sit outside the character class: inside [...] a '+' is just a literal plus sign,
# which would split on single characters and produce empty strings between ',' and ' '.
re.split(r'[\W]+', 'Words, WOrds, WOrds')  # splits on runs of non-word characters

re.split(r'([\W]+)', 'Words,words,words')  # capturing group: the commas are returned as well


# Search and replace

p = re.compile('(blue|white|red)')
p.sub('colour', 'blue socks and red shoes')
p.sub('color', 'blue socks and red shoes', count=1)  # only the first occurrence is replaced



# subn() works the same way, but returns a 2-tuple containing the new string value and the number of replacements that were performed

p =  re.compile('(blue|white|red)')
p.subn('color', 'blue socks and red shoes')



# Empty matches are replaced only when they are not adjacent to a previous empty match

p  = re.compile('x*')
p.sub('-','abxd')




# In the following example, the replacement function translates decimal numbers into hexadecimal

def hexrepl(match):
    """Convert the decimal number matched by ``match`` into its hex string."""
    return hex(int(match.group()))

# sub() calls hexrepl once per match and substitutes whatever it returns
p = re.compile(r'\d+')
p.sub(hexrepl, 'Call 6478 for printing, 987 for user code')



# Common problems

# match() only checks whether the RE matches at the beginning of the string, so a successful match always starts at 0; search() scans forward through the string for a match.


print(re.match('super', 'superstition').span())  # 'super' is at the start, so this matches

print(re.match('super', 'insuperable'))  # 'super' is not at the start, so match() returns None


# search(), on the other hand, scans forward through the string, reporting the first match it finds

print(re.search('super','superstition').span())
print(re.search('super', 'insuperable').span())



# Greedy versus non-greedy


s = '<html><head><title>Title</title>'
len(s)
# '.*' is greedy: it consumes as much as possible, so the match runs to the last '>' in the string
print(re.match('<.*>', s).span())
print(re.match('<.*>',s).group())


# NOTE(review): this repeats the greedy pattern above; the non-greedy form '<.*?>' would match only '<html>' - confirm which was intended
print(re.match('<.*>',s).group())


# Using re.VERBOSE (no example follows in this cell)



<re.Match object; span=(5, 10), match='class'>
None
(0, 5)
None
(0, 5)
(2, 7)
(0, 32)
<html><head><title>Title</title>
<html><head><title>Title</title>


In [58]:
# Continuing from the cookbook

import re
url = 'http://www.python.org'
# group() returns the text that the pattern matched
re.match(r'http:|https:|ftp:',url).group()

'http:'

Matching Strings Using Shell Wildcard patterns




In [61]:
from fnmatch import fnmatch, fnmatchcase

# Shell-style wildcards: * matches any run of characters, ? any single character, [0-9] any one digit
fnmatch('foo.txt','*.txt')
fnmatch('foo.txt','?oo.txt')
fnmatch('Dat45.csv','Dat[0-9]*')

True

In [63]:
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']

In [66]:
[name for name in names if fnmatch(name, 'Dat*.csv')]


# Also, fnmatch() follows the case-sensitivity rules of the underlying operating system, so this result varies by platform

fnmatch('foo.txt','*.TXT')




# If the distinction matters, use fnmatchcase(), which always compares case-sensitively

fnmatchcase('foo.txt','*.TXT')

False

In [67]:
addresses = [
'5412 N CLARK ST',
'1060 W ADDISON ST',
'1039 W GRANVILLE AVE',
'2122 N CLARK ST',
'4802 N BROADWAY',
]

In [69]:
from fnmatch import fnmatchcase

# Addresses that end in ' ST'
[addr for addr in addresses if fnmatchcase(addr, '* ST')]

# Addresses that start with '10', two more digits, and then ' W'
[addr for addr in addresses if fnmatchcase(addr, '10[0-9][0-9] W*')]


['1060 W ADDISON ST', '1039 W GRANVILLE AVE']

Matching and searching for text patterns

In [70]:
text = 'yeah, but no, but yeah, but no, but yeah'

In [71]:
text == 'yeah'

False

In [72]:
text.startswith('yeah')

True

In [73]:
text.find('no')

10

In [74]:
# For more complicated matching, use regular expressions and the re module.
# The example below illustrates the basic mechanics of using regular expressions.
import re

text1 = '11/27/2022'
text2 = 'Nov 27, 2012'

# re.match() tests for the pattern at the start of the string: digits/digits/digits
print('yes' if re.match(r'\d+/\d+/\d+', text1) else 'no')



    

yes


In [76]:
# If you are going to perform a lot of matches using the same pattern, it usually pays to precompile the regular expression pattern into a pattern object first.


datapat = re.compile(r'\d+/\d+/\d+')
if datapat.match(text1):
    print('yes')
else:
    print('nein')

yes


In [77]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datapat.findall(text)

['11/27/2012', '3/13/2013']