In [4]:
# Regex patterns are written as raw strings, so first compare an ordinary string
# with a raw one: "\t" is turned into a tab character, while r"\t" stays a
# backslash followed by the letter t.
print("\tTab", r"\tTab", sep="\n")

	Tab
\tTab


In [5]:
import re

In [20]:
# Sample text that the regex examples search through.
# It is a raw string because the metacharacter line contains a backslash followed
# by a space: in an ordinary literal "\ " is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12). The resulting
# string value is the same either way.
# The first line is the lowercase alphabet (26 letters), so every offset reported
# by the searches depends on its length staying fixed.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
700-555-1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [7]:
# Sample sentence; none of the cells visible here use it yet (presumably kept for later examples).
sentence = "Start a sentence and then bring it to an end"

In [11]:
# Compile the literal pattern 'abc' and scan the sample text with it.
# finditer yields one re.Match object per hit; each match carries a span
# (start and end index) that can be used to slice the searched string.
# Matching is case-sensitive, so the uppercase 'ABC' is not reported and
# exactly one match is found.
pattern = re.compile(r'abc')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 4), match='abc'>


In [10]:
# Slicing the text with the span (1, 4) reported for the 'abc' match returns the matched substring.
print(text_to_search[1:4])

abc


In [None]:
# An unescaped '.' is a metacharacter that matches any character except a
# newline, so this search reports nearly every character of the text.
# To match a literal period, or any other metacharacter
# (. ^ $ * + ? { } [ ] \ | ( )), escape it with a backslash.
pattern = re.compile(r'.')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

In [13]:
# Escaped with a backslash, '\.' matches only literal period characters.
pattern = re.compile(r'\.')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(111, 112), match='.'>
<re.Match object; span=(146, 147), match='.'>
<re.Match object; span=(167, 168), match='.'>
<re.Match object; span=(171, 172), match='.'>
<re.Match object; span=(218, 219), match='.'>
<re.Match object; span=(249, 250), match='.'>
<re.Match object; span=(262, 263), match='.'>


In [None]:
## Pattern searching 

.       - Any Character Except New Line
\d      - Digit (0-9)
\D      - Not a Digit (0-9)
\w      - Word Character (a-z, A-Z, 0-9, _)
\W      - Not a Word Character
\s      - Whitespace (space, tab, newline)
\S      - Not Whitespace (space, tab, newline)

# these are known as anchors
\b      - Word Boundary
\B      - Not a Word Boundary
^       - Beginning of a String
$       - End of a String

[]      - Matches Characters in brackets
[^ ]    - Matches Characters NOT in brackets
|       - Either Or
( )     - Group

Quantifiers:
*       - 0 or More
+       - 1 or More
?       - 0 or One
{3}     - Exact Number
{3,4}   - Range of Numbers (Minimum, Maximum)


#### Sample Regexes ####

[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

In [None]:
# Notice that the capital-letter classes (\D, \W, \S, \B) basically negate whatever their lowercase counterparts (\d, \w, \s, \b) match

In [14]:
# \b is a word-boundary anchor, not a space: 'Ha' matches only where a word
# starts. In 'Ha HaHa' that gives two matches; the 'Ha' in the middle of
# 'HaHa' is skipped because no word boundary precedes it.
pattern = re.compile(r'\bHa')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(66, 68), match='Ha'>
<re.Match object; span=(69, 71), match='Ha'>


In [16]:
# Parse phone numbers out of the text: three digits, a separator, three
# digits, a separator, four digits.
# Note: the separator here is the '.' wildcard, which accepts ANY character,
# so '123*555*1234' matches alongside the '-' and '.' separated numbers.
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [17]:
# A character set lists exactly which characters are allowed at a position:
# [-.] accepts a hyphen or a period only, so the '*'-separated number no
# longer matches. Inside [] the '.' is literal and needs no escaping.
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [18]:
# [89] matches a single '8' or '9', so only numbers starting with 800 or 900 are found.
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [21]:
# Between two characters inside a character set, a hyphen denotes a range:
# [7-9] matches '7', '8' or '9'. At the start of a set, as in [-.], it is a
# plain hyphen.
pattern = re.compile(r'[7-9]00[-.]\d\d\d[-.]\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(190, 202), match='700-555-1234'>
<re.Match object; span=(203, 215), match='800-555-1234'>
<re.Match object; span=(216, 228), match='900-555-1234'>


In [22]:
# Quantifiers replace the repeated \d: \d{3} is exactly three digits and
# \d{4} exactly four. The separators are the '.' wildcard again, so
# '123*555*1234' is matched as well.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='700-555-1234'>
<re.Match object; span=(203, 215), match='800-555-1234'>
<re.Match object; span=(216, 228), match='900-555-1234'>


In [23]:
# Smaller sample holding only the titled names, one per line, each line
# ending in a newline. Used by the 'Mr...' pattern examples.
s = (
    'Mr. Schafer\n'
    'Mr Smith\n'
    'Ms Davis\n'
    'Mrs. Robinson\n'
    'Mr. T\n'
)

In [24]:
# Literal 'Mr' followed by an escaped, and therefore literal, period.
pattern = re.compile(r'Mr\.')
matches = pattern.finditer(s)
for match in matches:
    print(match)

<re.Match object; span=(0, 3), match='Mr.'>
<re.Match object; span=(44, 47), match='Mr.'>


In [25]:
# '?' makes the preceding period optional (zero or one occurrence), so a bare
# 'Mr' matches too, including the 'Mr' at the start of 'Mrs.'.
pattern = re.compile(r'Mr\.?')
matches = pattern.finditer(s)
for match in matches:
    print(match)

<re.Match object; span=(0, 3), match='Mr.'>
<re.Match object; span=(12, 14), match='Mr'>
<re.Match object; span=(30, 32), match='Mr'>
<re.Match object; span=(44, 47), match='Mr.'>


In [32]:
# 'Mr.' with a mandatory period, one whitespace character, a capital letter,
# then one or more word characters. 'Mr. T' is not matched because \w+ needs
# at least one character after the capital letter.
pattern = re.compile(r'Mr\.\s[A-Z]\w+')
matches = pattern.finditer(s)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Mr. Schafer'>


In [30]:
# Same name pattern with the period made optional, so 'Mr Smith' is matched
# in addition to 'Mr. Schafer'.
pattern = re.compile(r'Mr\.?\s[A-Z]\w+')
matches = pattern.finditer(s)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Mr. Schafer'>
<re.Match object; span=(12, 20), match='Mr Smith'>


In [31]:
# \w* allows zero or more word characters after the capital letter, so the
# single-letter name in 'Mr. T' is matched as well.
pattern = re.compile(r'Mr\.?\s[A-Z]\w*')
matches = pattern.finditer(s)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Mr. Schafer'>
<re.Match object; span=(12, 20), match='Mr Smith'>
<re.Match object; span=(44, 49), match='Mr. T'>


In [33]:
# A group with alternation, (r|s|rs), accepts 'Mr', 'Ms' or 'Mrs' after the
# leading 'M', so every titled name in the sample is matched.
pattern = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*')
matches = pattern.finditer(s)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Mr. Schafer'>
<re.Match object; span=(12, 20), match='Mr Smith'>
<re.Match object; span=(21, 29), match='Ms Davis'>
<re.Match object; span=(30, 43), match='Mrs. Robinson'>
<re.Match object; span=(44, 49), match='Mr. T'>
