In [None]:
'''
A regular expression is a text-matching pattern described in a specialized 
syntax. The pattern acts as a set of instructions that are executed against an 
input string to produce the matching portion(s) of that string. Python's 
module for working with regular expressions is re. Typically, re is used to 
match or find strings.
'''

In [None]:
# Visit the following sites to learn more
# http://pymotw.com/2/re/
# http://www.thegeekstuff.com/2014/07/python-regex-examples/
# http://www.diveintopython.net/regular_expressions/
# https://docs.python.org/2/library/re.html

In [3]:
import re

In [4]:
patterns = 'and'
text = 'Python is a dynamically typed language and also has a simple syntax'
print re.search(patterns, text)
if re.search(patterns, text):
    print 'There is a match'
else:
    print 'Found no match'

<_sre.SRE_Match object at 0x0325B8A8>
There is a match


In [5]:
patterns = 'or'
text = 'Python is a dynamically typed language and also has a simple syntax'
print re.search(patterns, text)
if re.search(patterns, text):
    print 'There is a match'
else:
    print 'Found no match'

None
Found no match


In [6]:
patterns = ['and', 'or']
text = 'Python is a dynamically typed language and also has a simple syntax'

for pattern in patterns:
    print 'Trying to find a match for "%s" in "%s" - ' %(pattern,text)
    
    if re.search(pattern, text):
        print 'There is a match'
    else:
        print 'Found no match'

Trying to find a match for "and" in "Python is a dynamically typed language and also has a simple syntax" - 
There is a match
Trying to find a match for "or" in "Python is a dynamically typed language and also has a simple syntax" - 
Found no match


In [7]:
pattern = 'and'
text = 'Python is a dynamically typed language and also has a simple syntax'

compare = re.search(pattern,text)

s = compare.start() # start() returns the starting position of the match
e = compare.end() # end() returns the ending position of the match

print 'Found "%s" in "%s"  from %d to %d ' %(pattern,text,s,e)

Found "and" in "Python is a dynamically typed language and also has a simple syntax"  from 39 to 42 


In [8]:
mynumber = 1034567810378103
pattern = 10
mynumber_str = str(mynumber)
pattern_str = str(pattern)
# findall() function finds all the substrings of the input that match the 
# pattern without overlapping syntax re.findall(pattern, string)
print re.findall(pattern_str,mynumber_str)
count = len(re.findall(pattern_str,mynumber_str))
print 'In the given text, %d occured  %d times' %(pattern, count)

['10', '10', '10']
In the given text, 10 occured  3 times


In [11]:
# finditer() returns an iterator that produces match instances instead of the 
# strings returned by findall
# syntax re.finditer(pattern, string)

text = '1034567810378103'
pattern = '78'
count = 0
print re.finditer(pattern,text)
for match in re.finditer(pattern,text):
    s = match.start() 
    e = match.end() 
    count = count + 1
    print 'The pattern "%s" starts at %d and ends at %d ' %(pattern, s, e)
print 'In the given text, "%s" occured  %d times' %(pattern, count)

<callable-iterator object at 0x032DE2F0>
The pattern "78" starts at 6 and ends at 8 
The pattern "78" starts at 11 and ends at 13 
In the given text, "78" occured  2 times


In [None]:
'''
group() returns the substring that was matched by the regular expression. 
Adding groups (parentheses) to a pattern lets you isolate parts of the matching 
text, extending those capabilities so that a pattern can be used as a simple parser. 
'''

In [3]:
strval1 = 'Barack Obama, Michelle Obama, Joe Biden, Jill Biden'
list1 = strval1.split(',')
print list1

for items in list1:
    firstname = re.match(r'(.*)Obama',items)
    if firstname:
        print firstname.group(0) 
        # returns every element in the list that has Obama in it
        print firstname.group(1) 
        # returns first name of the element in the list that has Obama in it

['Barack Obama', ' Michelle Obama', ' Joe Biden', ' Jill Biden']
Barack Obama
Barack 
 Michelle Obama
 Michelle 


In [18]:
strval = 'San Francisco, San Jose, San Carlos, Sunnyvale, Cupertino'
strval_list = strval.strip().split(',') # converting strval into a list

b = []
for items in strval_list:
    allnames = re.match(r'San(.*)', items.strip()) 
    # returns a subset of the list which starts with San 
    if allnames:
        b.append(allnames.group(1))
print b

[' Francisco', ' Jose', ' Carlos']


In [13]:
# re.compile(pattern) turns a pattern string into a reusable pattern object.
# The object offers the usual operations as methods (searching for matches,
# performing substitutions, ...), so the pattern does not have to be passed
# in again on every call.
# Syntax: re.compile(pattern)
strval = 'San Francisco, San Jose, San Carlos, Sunnyvale, Cupertino'
rec = re.compile('San')
rec.findall(strval)

['San', 'San', 'San']

In [14]:
# Returns an iterator
for items in re.finditer(rec,strval):
    print items

<_sre.SRE_Match object at 0x0325BF00>
<_sre.SRE_Match object at 0x0325BFA8>
<_sre.SRE_Match object at 0x0325BF00>


In [15]:
# Returns an iterator
for items in re.finditer(rec,strval):
    print items.start(),items.end()    

0 3
15 18
25 28


In [16]:
# First method to find and replace 
# The replace() function will replace substrings
# syntax input_text.replace('pattern', 'replacement') 
a = strval.replace('San','S.')
print strval
print a

San Francisco, San Jose, San Carlos, Sunnyvale, Cupertino
S. Francisco, S. Jose, S. Carlos, Sunnyvale, Cupertino


In [None]:
# Second method to find and replace
# The re.sub() function can be used to replace substrings
# syntax re.sub(pattern,replacement,string) 
strval1 = re.sub('San','S.',strval)
print strval1

In [20]:
t = "It's a dog\n'"
print t

t = r'It\'s a dog\n'
print t

It's a dog
'
It\'s a dog\n


In [None]:
'''
\w - Matches word characters: A-Z, a-z, 0-9 or _. Equivalent to [A-Za-z0-9_].
\W - Matches nonword characters.
\s - Matches whitespace. Equivalent to [ \t\n\r\f\v].
\S - Matches nonwhitespace.
\d - Matches digits. Equivalent to [0-9].
\D - Matches nondigits.
^  - Matches the start of the string (or of each line in MULTILINE mode).
'''

In [22]:
# in this example we want to make sure that the user enters valid email address
import re

#phone_check = re.compile(r"\d")
ymail_check = re.compile(r'(\w+@\w+\.(com|net|org|edu))')
while True:
    ymail = raw_input ("Please, enter your email: ")
    if ymail_check.search(ymail):
        print 'you entered a valid email'
        break
    else:
        print "Please enter your email correctly!"

Please, enter your email: d@gmail.co
Please enter your email correctly!
Please, enter your email: d@.com
Please enter your email correctly!
Please, enter your email: ds74809@gmail.com
you entered a valid email


In [None]:
# The re.search() method takes a regular expression pattern and a string and
# searches for that pattern within the string. 
# The syntax is re.search(pattern, string)
import re
name = 'Roosovelt, Eleanor'
a = re.search('(\w+), (\w+)',name)
# (\w+) matches multiple occurrances of A-Za-z0-9_
print a.group(0)
print a.group(1)
print a.group(2)

In [None]:
name = 'Roosovelt, Eleanor'
a = re.search('(?P<lastname>\w+), (?P<firstname>\w+)',name)
# ?P<lastname>\w+ finds pattern that has characters A-Za-z0-9_ and assigns it to 
# lastname
print a.group(0)
print a.group('lastname')
print a.group('firstname')

In [None]:
# There is only one space after , What happens if there is more?
strval = 'Elizabeth Warren, 65'
a = re.search('(?P<firstname>\w+) (?P<lastname>\w+, (?P<age>\d+))',strval)
print a.group(1)
#print a.group('age')

In [27]:
strval = 'Elizabeth Warren,           65'
a = re.search('(?P<firstname>\w+) (?P<lastname>\w+, \s+(?P<age>\d+))',strval)
print a#.group(0)
print a.group('age')

<_sre.SRE_Match object at 0x0325C570>
65


In [None]:
'''
In-class activity: In the paragraph below, find the number of occurrences of 
the words "of", "the" and "food".

Coral reefs are some of the most biologically rich and economically valuable 
ecosystems on Earth. They provide food, jobs, income, and protection to billions 
of people worldwide. However, coral reefs and the magnificent creatures that call 
them home are in danger of disappearing if actions are not taken to protect them. 
They are threatened by an increasing range of impacts including pollution, 
invasive species, diseases, bleaching, and global climate change. The rapid 
decline and loss of these valuable, ancient, and complex ecosystems have 
significant social, economic, and environmental consequences in the 
United States and around the world.
'''

In [34]:
str1 = ''''Coral reefs are some of the most biologically rich and economically valuable 
ecosystems on Earth. They provide food, jobs, income, and protection to billions 
of people worldwide. However, coral reefs and the magnificent creatures that call 
them home are in danger of disappearing if actions are not taken to protect them. 
They are threatened by an increasing range of impacts including pollution, 
invasive species, diseases, bleaching, and global climate change. The rapid 
decline and loss of these valuable, ancient, and complex ecosystems have 
significant social, economic, and environmental consequences in the 
United States and around the world'''
patterns =['of','the','food']
for p in patterns:
    b = re.findall(p,str1)
    print b,p,len


['of', 'of', 'of', 'of', 'of'] of <built-in function len>
['the', 'the', 'the', 'the', 'the', 'the', 'the'] the <built-in function len>
['food'] food <built-in function len>


In [None]:
'''
In-class activity: 
The paragraph below contains typos. The misspelled words are: tping, 
componts, programy and binare. Create a dictionary in which each key is an 
incorrect word and its value is the correct word. Then replace each 
incorrect word with the correct one and print the corrected text.

'''

# Input text for the activity. The misspellings in it (tping, componts,
# programy, binare) are intentional - they are the words to be corrected -
# so the literal must be left exactly as written.
myText = ''' Python is an interpreted, object-oriented, high-level programming 
language with dynamic semantics. Its high-level built in data structures, 
combined with dynamic tping and dynamic binding, make it very attractive for 
Rapid Application Development, as well as for use as a scripting or glue 
language to connect existing componts together. Python's simple, easy to learn 
syntax emphasizes readability and therefore reduces the cost of program 
maintenance. Python supports modules and packages, which encourages programy 
modularity and code reuse. The Python interpreter and the extensive standard 
library are available in source or binare form without charge for all major 
platforms, and can be freely distributed.
'''