# Simple string searching
- methods on string class
- useful but limited functionality

In [1]:
# index() gives the position where the substring starts
s = "foozapbar"
s.index('zap')

3

In [2]:
# substring test - 'in' gives True/False instead of a position

'zap' in s

True

In [3]:
# prefix test
s.startswith('foo')

True

In [4]:
# suffix test
s.endswith('bar')

True

# Regular Expressions
- very powerful, widely used
- syntax a tad cryptic at first glance
- Python has a fairly standard implementation, similar to what other languages provide
- module is 're'
- [doc](https://docs.python.org/3.5/library/re.html)
- [more readable doc](https://docs.python.org/3.5/howto/regex.html#regex-howto)

In [8]:
import re

# sample text for the regex examples: letters mixed with runs of digits
s = 'zxcvx97848438455ysdfx234yzX333Ycv234'

In [9]:
# '+' means one or more of the previous item, so "5+" matches a run of 5's
match = re.search("5+", s)
# group(0) is the text that matched
match.group(0)

'55'

In [10]:
# this pattern will find substrings
# that start with 'x', end with 'y'
# and have one or more digits in the middle
# '[0-9]' is a character set - it stands for
# any digit char
# '*' means match zero or more of the previous regex
# '+' means match one or more of the previous regex
# 'x' and 'y' stand for themselves

pat = 'x[0-9]+y'


In [11]:
# find all substrings that match the pattern
# note the match is case sensitive, so 'X333Y' is not found


re.findall(pat, s)

['x97848438455y', 'x234y']

In [12]:
# case insensitive search - with re.IGNORECASE 'X333Y' matches as well

re.findall(pat, s, re.IGNORECASE)

['x97848438455y', 'x234y', 'X333Y']

In [13]:
# find substrings built only out of the chars 3, 4, 8
# '[348]' is a character set, '+' takes one or more chars from it

re.findall('[348]+',s)

['8484384', '34', '333', '34']

In [14]:
# show the search string again for reference
s

'zxcvx97848438455ysdfx234yzX333Ycv234'

In [15]:
# replace every match of the pattern with a string

re.sub(pat, 'FOOBAR', s)

'zxcvFOOBARsdfFOOBARzX333Ycv234'

In [16]:
# only replace the first occurrence of the pattern

re.sub(pat, 'FOOBAR', s, count=1)

'zxcvFOOBARsdfx234yzX333Ycv234'

In [17]:
# split on the pattern - the matched text is dropped,
# the pieces between the matches are returned

re.split(pat, s)

['zxcv', 'sdf', 'zX333Ycv234']

In [18]:
# show the search string again - sub and split did not modify it
s

'zxcvx97848438455ysdfx234yzX333Ycv234'

In [19]:
# some re operations return a 'match object'
# if there is no match, None is returned,
# so the result can be tested with 'if'

if re.search('aaaaa', s):
    print('match!')
else:
    print('no match!')

no match!


In [20]:
# can 'group' parts of a match with '()'
# '.' matches any char, '.*' matches zero or more chars
# '[sz]' is a character set - either 's' or 'z'

m = re.search('(y[sz]).*(y[sz])',s)
m

<_sre.SRE_Match object; span=(16, 26), match='ysdfx234yz'>

In [21]:
# group 0 shows everything that matched
# the other groups show what matched inside '()'
# groups() returns all of the '()' groups as a tuple

[m.group(0), m.group(1), m.group(2), m.groups()]

['ysdfx234yz', 'ys', 'yz', ('ys', 'yz')]

In [22]:
# could put the middle part of the match in a group as well
# groups are numbered by their opening '(', left to right

m = re.search('(y[sz])(.*)(y[sz])',s)
m

<_sre.SRE_Match object; span=(16, 26), match='ysdfx234yz'>

In [23]:
# group 2 is now the middle part, the closing 'y[sz]' has moved to group 3
[m.group(0), m.group(1), m.group(2),m.group(3), m.groups()]

['ysdfx234yz', 'ys', 'dfx234', 'yz', ('ys', 'dfx234', 'yz')]

# decrypt with RE

In [24]:
# encoded text: each hidden word comes right after a digit that gives
# its length, the remaining chars are filler
e = '{SVIu6Python-)dKct@\\JK)2is:y:=;;~6reallyMZ-&Bk`*6great!NB!|Krj##'

In [25]:
# '[^0-9]' means any char EXCEPT the digits, the leading '^' inverts
# the meaning of the charset
# so each match is one digit followed by a run of non-digit chars

words = re.findall('[0-9][^0-9]+', e)
words

['6Python-)dKct@\\JK)', '2is:y:=;;~', '6reallyMZ-&Bk`*', '6great!NB!|Krj##']

In [26]:
# the leading digit of each chunk is the length of the hidden word,
# which starts right after that digit
for word in words:
    ln = int(word[0])
    decode = word[1:ln+1]
    print(decode)

Python
is
really
great!


# RE groups
- groups are enclosed by ()
- great for fishing out what matched

In [27]:
# three rows of an HTML file listing; each row holds an icon <img>,
# a file link <a href=...>, a date/time stamp and a file size
s='''
<img src="/icons/unknown.gif" alt="[   ]"> <a href="Problems_chap2.nb">Problems_chap2.nb</a>       2009-04-22 15:16  171K  
<img src="/icons/layout.gif" alt="[   ]"> <a href="Problems_chap2.pdf">Problems_chap2.pdf</a>      2009-10-12 13:15  252K  
<img src="/icons/unknown.gif" alt="[   ]"> <a href="Style07.nb">Style07.nb</a>              2009-04-22 15:16   12K  
'''
# one entry per listing row; splitting on line ends needs no regex, and
# skipping blank lines drops the empty pieces produced by the newlines
# right after the opening and right before the closing triple quotes
urls = [line for line in s.splitlines() if line.strip()]
urls

['<img src="/icons/unknown.gif" alt="[   ]"> <a href="Problems_chap2.nb">Problems_chap2.nb</a>       2009-04-22 15:16  171K  ',
 '<img src="/icons/layout.gif" alt="[   ]"> <a href="Problems_chap2.pdf">Problems_chap2.pdf</a>      2009-10-12 13:15  252K  ',
 '<img src="/icons/unknown.gif" alt="[   ]"> <a href="Style07.nb">Style07.nb</a>              2009-04-22 15:16   12K  ']

In [28]:
# [BKMG] - file length will have a bytes/kilo/mega/giga suffix
# '[^"]+' keeps each captured attribute value inside its own pair of
# quotes, where a greedy '.+' could run on into a later quoted attribute
# re.match gives None for a line that does not fit the pattern,
# so test the result with 'if' before asking for the groups

for u in urls:
    m = re.match('.+src="([^"]+)" .+href="([^"]+)".+ ([0-9]+[BKMG])', u)
    if m:
        print(m.groups())
    else:
        print('no match!')

('/icons/unknown.gif', 'Problems_chap2.nb', '171K')
('/icons/layout.gif', 'Problems_chap2.pdf', '252K')
('/icons/unknown.gif', 'Style07.nb', '12K')
