In [None]:
# Regular Expression

# Reference notes kept in a RAW string literal: in a normal literal the
# sequence \b is stored as a backspace character, and \d, \D, \w, \W, \s,
# \S, \B and \. are invalid escape sequences that recent Python versions
# warn about. The r prefix keeps every backslash exactly as written.
r'''
.       - Any Character Except New Line
\d      - Digit (0-9)
\D      - Not a Digit (0-9)
\w      - Word Character (a-z, A-Z, 0-9, _)
\W      - Not a Word Character
\s      - Whitespace (space, tab, newline)
\S      - Not Whitespace (space, tab, newline)

\b      - Word Boundary
\B      - Not a Word Boundary
^       - Beginning of a String
$       - End of a String

[]      - Matches Characters in brackets
[^ ]    - Matches Characters NOT in brackets
|       - Either Or
( )     - Group

Quantifiers:
*       - 0 or More
+       - 1 or More
?       - 0 or One
{3}     - Exact Number
{3,4}   - Range of Numbers (Minimum, Maximum)


#### Sample Regexs ####

[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

'''


# \b- Word Boundary

In [3]:
import re

In [5]:
# Plain literal search: 'cat' is reported wherever the three letters occur,
# including inside longer words.
data = 'cat catherine catholic wildcat copycat uncatchable'
pattern = re.compile('cat')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(4, 7), match='cat'>
<re.Match object; span=(14, 17), match='cat'>
<re.Match object; span=(27, 30), match='cat'>
<re.Match object; span=(35, 38), match='cat'>
<re.Match object; span=(41, 44), match='cat'>


In [6]:
# Literal 'cat' followed by a space: only occurrences that are immediately
# followed by a blank are reported (the space is part of the match).
data = 'cat catherine catholic wildcat copycat uncatchable'
pattern = re.compile('cat ')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 4), match='cat '>
<re.Match object; span=(27, 31), match='cat '>
<re.Match object; span=(35, 39), match='cat '>


In [8]:
"""
word\b = the character to the left of \b must be a word character and the character to its right must not be one (or the string ends there)
\bword = the character to the left of \b must not be a word character (or the string starts there) and the character to its right must be one
"""


# r'cat\b': 'cat' must be followed by a word boundary, i.e. it has to END a word.
data = 'cat catherine catholic wildcat copycat uncatchable'
pattern = re.compile(r'cat\b')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(27, 30), match='cat'>
<re.Match object; span=(35, 38), match='cat'>


In [9]:
# r'\bcat': a word boundary must precede 'cat', i.e. it has to START a word.
data = 'cat catherine catholic wildcat copycat uncatchable'
pattern = re.compile(r'\bcat')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(4, 7), match='cat'>
<re.Match object; span=(14, 17), match='cat'>


In [11]:
# r'\bcat\b': boundaries on both sides, so only the stand-alone word 'cat' matches.
data = 'cat catherine catholic wildcat copycat uncatchable'
pattern = re.compile(r'\bcat\b')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 3), match='cat'>


In [14]:
# r'\bs': an 's' that sits directly after a word boundary (word-initial 's').
data = 'she sells seashells at sea shore'
pattern = re.compile(r'\bs')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 1), match='s'>
<re.Match object; span=(4, 5), match='s'>
<re.Match object; span=(10, 11), match='s'>
<re.Match object; span=(23, 24), match='s'>
<re.Match object; span=(27, 28), match='s'>


In [15]:
# \B - not a word boundary
# r's\B': an 's' that is NOT followed by a word boundary, i.e. another word
# character comes right after it (word-final 's' is therefore skipped).
data = 'she sells seashells at sea shore'
pattern = re.compile(r's\B')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 1), match='s'>
<re.Match object; span=(4, 5), match='s'>
<re.Match object; span=(10, 11), match='s'>
<re.Match object; span=(13, 14), match='s'>
<re.Match object; span=(23, 24), match='s'>
<re.Match object; span=(27, 28), match='s'>


In [16]:
# r'\Bs\B': no boundary on either side, so the 's' must lie strictly inside a word.
data = 'she sells seashells at sea shore'
pattern = re.compile(r'\Bs\B')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(13, 14), match='s'>


In [17]:
# ^       - Beginning of a String
# 'sell' occurs in the text but not at position 0, so the anchored pattern
# finds nothing and the loop prints no lines.
data = 'she sells seashells at sea shore'
pattern = re.compile(r'^sell')

for hit in pattern.finditer(data):
    print(hit)

In [18]:
# The text starts with 'she ', so the ^-anchored pattern matches once at offset 0.
data = 'she sells seashells at sea shore'
pattern = re.compile(r'^she ')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 4), match='she '>


In [20]:
# $       - End of a String
# The pattern starts with a space: it needs ' ore' right at the end of the
# text. The text ends in 'shore' (an 'h' precedes 'ore'), so nothing matches
# and the loop prints no lines.
data = 'she sells seashells at sea shore'
pattern = re.compile(r' ore$')

for hit in pattern.finditer(data):
    print(hit)

In [21]:
# Sample text used by the following cells.
# Written as a RAW string: the text contains the backslash sequences "\s" and
# "\ " which are invalid escape sequences in a normal literal (recent Python
# versions emit a warning for them). The resulting value is unchanged, so all
# match offsets stay the same.
text_to_search = r'''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ\s
321-555-4321
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
khanafsaan11.com
321-555-4321
123.555.1234
123*555*-1234
123.555.1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
Mr_hello
'''

In [25]:
# Q1 - write a regex to search all the 3 digit numbers
# Exactly three digits with a word boundary on each side, so three-digit
# slices of longer digit runs are not reported.
pattern = re.compile(r'\b\d\d\d\b')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(57, 60), match='321'>
<re.Match object; span=(61, 64), match='555'>
<re.Match object; span=(171, 174), match='321'>
<re.Match object; span=(175, 178), match='555'>
<re.Match object; span=(184, 187), match='123'>
<re.Match object; span=(188, 191), match='555'>
<re.Match object; span=(197, 200), match='123'>
<re.Match object; span=(201, 204), match='555'>
<re.Match object; span=(211, 214), match='123'>
<re.Match object; span=(215, 218), match='555'>
<re.Match object; span=(224, 227), match='800'>
<re.Match object; span=(228, 231), match='555'>
<re.Match object; span=(237, 240), match='900'>
<re.Match object; span=(241, 244), match='555'>


In [27]:
# Q2 - extract a valid phone number = nnn.nnn.nnnn
# The dots are escaped (\.) so they match a literal '.' only.
pattern = re.compile(r'\d\d\d\.\d\d\d\.\d\d\d\d')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(184, 196), match='123.555.1234'>
<re.Match object; span=(211, 223), match='123.555.1234'>


In [28]:
# [] - matches any ONE of the characters listed inside the brackets
# Here: a single '0', '4' or '8'.
pattern = re.compile(r'[048]')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(65, 66), match='4'>
<re.Match object; span=(73, 74), match='4'>
<re.Match object; span=(77, 78), match='8'>
<re.Match object; span=(79, 80), match='0'>
<re.Match object; span=(179, 180), match='4'>
<re.Match object; span=(195, 196), match='4'>
<re.Match object; span=(209, 210), match='4'>
<re.Match object; span=(222, 223), match='4'>
<re.Match object; span=(224, 225), match='8'>
<re.Match object; span=(225, 226), match='0'>
<re.Match object; span=(226, 227), match='0'>
<re.Match object; span=(235, 236), match='4'>
<re.Match object; span=(238, 239), match='0'>
<re.Match object; span=(239, 240), match='0'>
<re.Match object; span=(248, 249), match='4'>


In [29]:
# Every allowed character is spelled out one by one: digits 0-3 and letters a-f.
pattern = re.compile(r'[0123abcdef]')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='2'>
<re.Match object; span=(59, 60), match='1'>
<re.Match object; span=(66, 67), match='3'>
<re.Match object; span=(67, 68), match='2'>
<re.Match object; span=(68, 69), match='1'>
<re.Match object; span=(70, 71), match='1'>
<re.Match object; span=(71, 72), match='2'>
<re.Match object; span=(72, 73), match='3'>
<re.Match object; span=(79, 80), match='0'>
<re.Match object; span=(82, 83), match='a'>
<re.Match object; span=(85, 86), match='a'>
<re.Match object; span=(87, 88), match='a'>
<re.Match object; span=(90, 91), match='e'>
<re.Match object; span=(92, 93), match='a'>
<re.Match object; span=(95, 96), match='a'>
<re.Match object; span=(97, 98), match='a'>


In [32]:
# Significance of '-' between numbers or letters inside brackets:
# it denotes a range, so this class is "a digit 0 to 7 or a letter a to f".
pattern = re.compile(r'[0-7a-f]')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='2'>
<re.Match object; span=(59, 60), match='1'>
<re.Match object; span=(61, 62), match='5'>
<re.Match object; span=(62, 63), match='5'>
<re.Match object; span=(63, 64), match='5'>
<re.Match object; span=(65, 66), match='4'>
<re.Match object; span=(66, 67), match='3'>
<re.Match object; span=(67, 68), match='2'>
<re.Match object; span=(68, 69), match='1'>
<re.Match object; span=(70, 71), match='1'>
<re.Match object; span=(71, 72), match='2'>
<re.Match object; span=(72, 73), match='3'>
<re.Match object; span=(73, 74), match='4'>
<re.Match object; span=(74, 75), match='5'>
<re.Match object; span=(75, 76), match='6'>
<re.Match object; span=(76, 77), match='7'>


In [36]:
# [^] - matches a character NOT listed in the brackets.
# Starting point for comparison: a positive class that lists every wanted
# first letter explicitly ('b' is left out, so 'bat' is skipped).
data = 'cat mat bat sat dat lat'
pattern = re.compile(r'[cmsdl]at')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(4, 7), match='mat'>
<re.Match object; span=(12, 15), match='sat'>
<re.Match object; span=(16, 19), match='dat'>
<re.Match object; span=(20, 23), match='lat'>


In [37]:
# Negated class: any single character except 'b', followed by 'at'.
# Searches the `data` string assigned in an earlier cell.
pattern = re.compile(r'[^b]at')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(4, 7), match='mat'>
<re.Match object; span=(12, 15), match='sat'>
<re.Match object; span=(16, 19), match='dat'>
<re.Match object; span=(20, 23), match='lat'>


In [38]:
# One character that is none of 'a', 'b', 'c' - spaces count as matches too.
data = 'cat mat bat sat dat lat'
pattern = re.compile(r'[^abc]')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(2, 3), match='t'>
<re.Match object; span=(3, 4), match=' '>
<re.Match object; span=(4, 5), match='m'>
<re.Match object; span=(6, 7), match='t'>
<re.Match object; span=(7, 8), match=' '>
<re.Match object; span=(10, 11), match='t'>
<re.Match object; span=(11, 12), match=' '>
<re.Match object; span=(12, 13), match='s'>
<re.Match object; span=(14, 15), match='t'>
<re.Match object; span=(15, 16), match=' '>
<re.Match object; span=(16, 17), match='d'>
<re.Match object; span=(18, 19), match='t'>
<re.Match object; span=(19, 20), match=' '>
<re.Match object; span=(20, 21), match='l'>
<re.Match object; span=(22, 23), match='t'>


In [40]:
# Three one-character classes in a row are the same as the literal 'abc'.
# That sequence never occurs in `data`, so the loop prints nothing.
# NOTE(review): the saved output under this cell lists single-character
# matches, which this pattern cannot produce - it looks stale; re-run the cell.
data = 'cat mat bat sat dat lat'
pattern = re.compile(r'[a][b][c]')

for hit in pattern.finditer(data):
    print(hit)

<re.Match object; span=(2, 3), match='t'>
<re.Match object; span=(3, 4), match=' '>
<re.Match object; span=(4, 5), match='m'>
<re.Match object; span=(6, 7), match='t'>
<re.Match object; span=(7, 8), match=' '>
<re.Match object; span=(10, 11), match='t'>
<re.Match object; span=(11, 12), match=' '>
<re.Match object; span=(12, 13), match='s'>
<re.Match object; span=(14, 15), match='t'>
<re.Match object; span=(15, 16), match=' '>
<re.Match object; span=(16, 17), match='d'>
<re.Match object; span=(18, 19), match='t'>
<re.Match object; span=(19, 20), match=' '>
<re.Match object; span=(20, 21), match='l'>
<re.Match object; span=(22, 23), match='t'>


In [41]:
# Q2 - extract a valid phone number = nnn.nnn.nnnn / nnn-nnn-nnnn
# [.-] accepts either separator: inside brackets '.' is a literal dot and a
# '-' placed last is a literal hyphen.
pattern = re.compile(r'\d\d\d[.-]\d\d\d[.-]\d\d\d\d')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(57, 69), match='321-555-4321'>
<re.Match object; span=(171, 183), match='321-555-4321'>
<re.Match object; span=(184, 196), match='123.555.1234'>
<re.Match object; span=(211, 223), match='123.555.1234'>
<re.Match object; span=(224, 236), match='800-555-1234'>
<re.Match object; span=(237, 249), match='900-555-1234'>


In [43]:
# {n} - exact number of repetitions
# \d{3} and \d{4} replace the hand-repeated \d\d\d and \d\d\d\d.
pattern = re.compile(r'\d{3}[.-]\d{3}[.-]\d{4}')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(57, 69), match='321-555-4321'>
<re.Match object; span=(171, 183), match='321-555-4321'>
<re.Match object; span=(184, 196), match='123.555.1234'>
<re.Match object; span=(211, 223), match='123.555.1234'>
<re.Match object; span=(224, 236), match='800-555-1234'>
<re.Match object; span=(237, 249), match='900-555-1234'>


In [46]:
# {min,max}

# Sample text with two extra phone-like lines (6-digit and 2-digit endings)
# for the range quantifier.
# Written as a RAW string: "\s" and "\ " are invalid escape sequences in a
# normal literal (recent Python versions emit a warning for them). The
# resulting value is unchanged, so all match offsets stay the same.
text_to_search = r'''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ\s
321-555-4321
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
khanafsaan11.com
321-555-4321
321-555-432165
321-555-43
123.555.1234
123*555*-1234
123.555.1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
Mr_hello
'''

In [48]:
# {min,max}: the last group may be anywhere from 2 to 6 digits long.
pattern = re.compile(r'\d{3}[.-]\d{3}[.-]\d{2,6}')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(57, 69), match='321-555-4321'>
<re.Match object; span=(171, 183), match='321-555-4321'>
<re.Match object; span=(184, 198), match='321-555-432165'>
<re.Match object; span=(199, 209), match='321-555-43'>
<re.Match object; span=(210, 222), match='123.555.1234'>
<re.Match object; span=(237, 249), match='123.555.1234'>
<re.Match object; span=(250, 262), match='800-555-1234'>
<re.Match object; span=(263, 275), match='900-555-1234'>


In [None]:
# {min,max}

# Sample text for the cells that follow.
# Written as a RAW string: "\s" and "\ " are invalid escape sequences in a
# normal literal (recent Python versions emit a warning for them). The
# resulting value is unchanged, so all match offsets stay the same.
text_to_search = r'''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ\s
321-555-4321
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
khanafsaan11.com
321-555-4321
321-555-432165
321-555-43
123.555.1234
123*555*-1234
123.555.1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
Mr_hello
'''

In [49]:
# Step 1 of building a title matcher: every capital 'M' in the text.
pattern = re.compile(r'M')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(40, 41), match='M'>
<re.Match object; span=(89, 90), match='M'>
<re.Match object; span=(276, 277), match='M'>
<re.Match object; span=(288, 289), match='M'>
<re.Match object; span=(297, 298), match='M'>
<re.Match object; span=(306, 307), match='M'>
<re.Match object; span=(320, 321), match='M'>
<re.Match object; span=(326, 327), match='M'>


In [50]:
# Step 2: the two-letter sequence 'Mr' (also the start of 'Mrs' and 'Mr_hello').
pattern = re.compile(r'Mr')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(276, 278), match='Mr'>
<re.Match object; span=(288, 290), match='Mr'>
<re.Match object; span=(306, 308), match='Mr'>
<re.Match object; span=(320, 322), match='Mr'>
<re.Match object; span=(326, 328), match='Mr'>


In [51]:
# Step 3: 'Mr' followed by a literal dot (the dot is escaped).
pattern = re.compile(r'Mr\.')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(276, 279), match='Mr.'>
<re.Match object; span=(320, 323), match='Mr.'>


In [52]:
# Step 4: 'Mr.' followed by a single space.
pattern = re.compile(r'Mr\. ')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(276, 280), match='Mr. '>
<re.Match object; span=(320, 324), match='Mr. '>


In [54]:
# Step 5: 'Mr. ' followed by exactly ONE capital letter, so every match is
# five characters long (for example 'Mr. S').
# NOTE(review): the saved output under this cell shows a full surname
# ('Mr. Schafer'), which this pattern cannot produce - it looks stale;
# re-run the cell.
pattern = re.compile(r'Mr\. [A-Z]')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(276, 287), match='Mr. Schafer'>
<re.Match object; span=(320, 325), match='Mr. T'>


In [None]:
# Significance of using *
# [a-z]* extends the match over zero or more lowercase letters after the
# capital, so a whole surname is captured and a bare initial still matches.
pattern = re.compile(r'Mr\. [A-Z][a-z]*')

for hit in pattern.finditer(text_to_search):
    print(hit)

In [55]:
"""
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
Mr_hello

"""

# \.? makes the dot optional, so both 'Mr. Name' and 'Mr Name' are accepted.
pattern = re.compile(r'Mr\.? [A-Z][a-z]*')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(276, 287), match='Mr. Schafer'>
<re.Match object; span=(288, 296), match='Mr Smith'>
<re.Match object; span=(320, 325), match='Mr. T'>


In [56]:
# M[rs] accepts 'Mr' or 'Ms'. A class matches a single character only, so
# the three-letter title 'Mrs' is not covered by this pattern.
pattern = re.compile(r'M[rs]\.? [A-Z][a-z]*')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(276, 287), match='Mr. Schafer'>
<re.Match object; span=(288, 296), match='Mr Smith'>
<re.Match object; span=(297, 305), match='Ms Davis'>
<re.Match object; span=(320, 325), match='Mr. T'>


In [57]:
# A group with alternatives: 'M' followed by 'r', 's' or 'rs', which adds
# 'Mrs' to the titles that are recognised.
pattern = re.compile(r'M(r|s|rs)\.? [A-Z][a-z]*')

for hit in pattern.finditer(text_to_search):
    print(hit)

<re.Match object; span=(276, 287), match='Mr. Schafer'>
<re.Match object; span=(288, 296), match='Mr Smith'>
<re.Match object; span=(297, 305), match='Ms Davis'>
<re.Match object; span=(306, 319), match='Mrs. Robinson'>
<re.Match object; span=(320, 325), match='Mr. T'>


In [None]:
# Title matcher with alternation: 'Mr', 'Ms' or 'Mrs', an optional dot,
# a space, then a capitalised name.
pattern = re.compile(r'M(r|s|rs)\.? [A-Z][a-z]*')

for hit in pattern.finditer(text_to_search):
    print(hit)

In [61]:
# The doubled backslash is an escaped backslash: the string holds a literal
# '\' followed by 't', not a tab character, so print shows afsan\tkhan.
a = 'afsan\\tkhan'
print(a)

afsan\tkhan


In [64]:
# findall returns the matched substrings themselves as a list of str
# (no match objects, no positions).
data = 'cat dog mat sat dat catholic'

a = re.compile('cat').findall(data)
print(a)

['cat', 'cat']


In [None]:
# Library names noted as plain text. They are commented out because a code
# cell would otherwise evaluate them as undefined identifiers and raise
# NameError when the notebook is run top to bottom.
# Numpy
# Pandas
# matplotlib
# seaborn