# Regular Expression

In [1]:
# Regular Expression - is a pattern which is used to extract the information from a given string or data

In [2]:
'''
.       - Any Character Except New Line
\d      - Digit (0-9)
\D      - Not a Digit (0-9)
\w      - Alphanumeric Character (a-z, A-Z, 0-9, _)
\W      - Non Alphanumeric Character
\s      - Whitespace (space, tab, newline)
\S      - Not Whitespace (space, tab, newline)

\b      - Word Boundary
\B      - Not a Word Boundary
^       - Beginning of a String
$       - End of a String

[A-Z]   - Matches one Characters in brackets
[^A-Z]  - Matches Characters NOT in brackets
|       - Either Or
(com|net)     - Group

Quantifiers:
*       - 0 or More
+       - 1 or More
?       - 0 or One
{3}     - Exact Number
{3,4}   - Range of Numbers (Minimum, Maximum)
'''

'\n.       - Any Character Except New Line\n\\d      - Digit (0-9)\n\\D      - Not a Digit (0-9)\n\\w      - Alphanumeric Character (a-z, A-Z, 0-9, _)\n\\W      - Non Alphanumeric Character\n\\s      - Whitespace (space, tab, newline)\n\\S      - Not Whitespace (space, tab, newline)\n\n\x08      - Word Boundary\n\\B      - Not a Word Boundary\n^       - Beginning of a String\n$       - End of a String\n\n[A-Z]   - Matches one Characters in brackets\n[^A-Z]  - Matches Characters NOT in brackets\n|       - Either Or\n(com|net)     - Group\n\nQuantifiers:\n*       - 0 or More\n+       - 1 or More\n?       - 0 or One\n{3}     - Exact Number\n{3,4}   - Range of Numbers (Minimum, Maximum)\n'

In [3]:
# match - always finds the pattern in the begining of the string
#      If the pattern matches then returns match object else returns None
# search - always finds the first occurremce of the pattern
#      If the pattern matches then returns match object else returns None
# finditer - finds all the occurences of the pattern and returns an iterator object
#      If the pattern matches then returns iterator object
# findall - finds all the occurrences of the pattern and returns a list
#      If the pattern matches then returns list object with all matching values else returns []
# sub - substitute an old pattern with new pattern

In [4]:
# Sample Use Case: Extract E-Mail ID's from a text file:
import re
f = open('Regex/sample.txt', 'r')
content = f.read()

# Extract all email id's from a string
pattern = r'[a-z, A-Z]+\.?\w+.?\w+?@\w+\.com'  
# [a-z]: Match any character from a to z or A to Z
# +: Match for additional characters
# \.: Escape sequence for Special Characters ('.' in this case)
# ?: '.' may or may not appear. ? checks for 0 or 1 occurance
# \w+: Match any special character or alphanumeric character
# .: Match for a second '.' (if any)
# \w+: Match for alphanumeric characters after 2nd '.' (if any)
# ?: '.' may or may not appear. ? checks for 0 or 1 occurance
# \w+: Match for alphanumeric characters after 2nd '.' (if any)
# @: Checks for '@' in Email ID
# \w+: Match any alphanumeric character
# \.com: Matches for .com


print(re.findall(pattern, content))
f.close()

['arijit.chowdhury@volvo.com', 'steven@gmail.com', 'a.b.c@yahoo.com', 'xyz@volvo.com']


In [5]:
# match - always finds the pattern in the begining of the string
#         If the pattern matches then returns match object else returns None

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'[a-z]at'
m_obj = re.match(pattern, content)
print(m_obj)
print(m_obj.group())
print(m_obj.span())           # span() returns both start and end indexes as a single tuple
print(m_obj.start())          # start() returns the start index of the matched object
print(m_obj.end())            # end() returns the end index of the matched object
f.close()

<re.Match object; span=(0, 3), match='rat'>
rat
(0, 3)
0
3


In [6]:
# search - always finds the first occurrance of the pattern
#          If the pattern matches then returns match object else returns None

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'[0-9]at'
m_obj = re.search(pattern, content)
print(m_obj)
print(m_obj.group())
print(m_obj.span()[0])        # .span() can be used along with index [0] or [1] to fetch the start and stop index
print(m_obj.span()[1])
print(m_obj.start())
print(m_obj.end())
f.close()

<re.Match object; span=(32, 35), match='7at'>
7at
32
35
32
35


In [7]:
import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\wat'
# \w : Any alphanumeric character
# 'at': Matching characters (exact match)
m_obj = re.findall(pattern, content)
print(m_obj)
f.close()

['rat', 'cat', 'mat', 'bat', 'sat', 'wat', 'mat', '7at', 'hat', 'hat', 'pat', '1at', 'dat', 'tat', 'lat', 'lat', 'rat']


In [8]:
# finditer - finds all the occurences of the matching pattern and returns an iterator object

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\wat'
m_obj = re.finditer(pattern, content)
for i in m_obj:
    print(i.group(), i.span(), end = '\n\n')
f.close()

rat (0, 3)

cat (5, 8)

mat (10, 13)

bat (15, 18)

sat (19, 22)

wat (23, 26)

mat (28, 31)

7at (32, 35)

hat (37, 40)

hat (41, 44)

pat (45, 48)

1at (50, 53)

dat (54, 57)

tat (59, 62)

lat (65, 68)

lat (70, 73)

rat (74, 77)



In [9]:
# sub - substitute an old pattern with new pattern

import re
f = open('Regex/sample.txt', 'r+')
content = f.read()
pattern = r'\wat'
m_obj = re.sub(pattern, '999', content)
print(m_obj)
f.close()

999, 999, 999, 999
999
999, 999
999, 999
999
999, 999
999
s999, c999
f999
999
arijit.chowdhury@volvo.com
steven@gmail.com
a.b.c@yahoo.com
xyz@volvo.com


In [10]:
# Matches every character except new line

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'.'
res = re.finditer(pattern, content)
for i in res:
    print(i)
f.close()

<re.Match object; span=(0, 1), match='r'>
<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='t'>
<re.Match object; span=(3, 4), match=','>
<re.Match object; span=(4, 5), match=' '>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(6, 7), match='a'>
<re.Match object; span=(7, 8), match='t'>
<re.Match object; span=(8, 9), match=','>
<re.Match object; span=(9, 10), match=' '>
<re.Match object; span=(10, 11), match='m'>
<re.Match object; span=(11, 12), match='a'>
<re.Match object; span=(12, 13), match='t'>
<re.Match object; span=(13, 14), match=','>
<re.Match object; span=(14, 15), match=' '>
<re.Match object; span=(15, 16), match='b'>
<re.Match object; span=(16, 17), match='a'>
<re.Match object; span=(17, 18), match='t'>
<re.Match object; span=(19, 20), match='s'>
<re.Match object; span=(20, 21), match='a'>
<re.Match object; span=(21, 22), match='t'>
<re.Match object; span=(23, 24), match='w'>
<re.Match object; span=(24, 25), match='a'>
<re.Mat

In [11]:
# Match all whitespace characters: [\t\n\r\f\v]

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\s'
res = re.finditer(pattern, content)
for i in res:
    print(i)
f.close()

<re.Match object; span=(4, 5), match=' '>
<re.Match object; span=(9, 10), match=' '>
<re.Match object; span=(14, 15), match=' '>
<re.Match object; span=(18, 19), match='\n'>
<re.Match object; span=(22, 23), match='\n'>
<re.Match object; span=(27, 28), match=' '>
<re.Match object; span=(31, 32), match='\n'>
<re.Match object; span=(36, 37), match=' '>
<re.Match object; span=(40, 41), match='\n'>
<re.Match object; span=(44, 45), match='\n'>
<re.Match object; span=(49, 50), match=' '>
<re.Match object; span=(53, 54), match='\n'>
<re.Match object; span=(57, 58), match='\n'>
<re.Match object; span=(63, 64), match=' '>
<re.Match object; span=(68, 69), match='\n'>
<re.Match object; span=(73, 74), match='\n'>
<re.Match object; span=(77, 78), match='\n'>
<re.Match object; span=(104, 105), match='\n'>
<re.Match object; span=(121, 122), match='\n'>
<re.Match object; span=(137, 138), match='\n'>


In [12]:
# Match all non-whitespace characters: [\t\n\r\f\v]

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\S'
res = re.finditer(pattern, content)
for i in res:
    print(i)
f.close()

<re.Match object; span=(0, 1), match='r'>
<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='t'>
<re.Match object; span=(3, 4), match=','>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(6, 7), match='a'>
<re.Match object; span=(7, 8), match='t'>
<re.Match object; span=(8, 9), match=','>
<re.Match object; span=(10, 11), match='m'>
<re.Match object; span=(11, 12), match='a'>
<re.Match object; span=(12, 13), match='t'>
<re.Match object; span=(13, 14), match=','>
<re.Match object; span=(15, 16), match='b'>
<re.Match object; span=(16, 17), match='a'>
<re.Match object; span=(17, 18), match='t'>
<re.Match object; span=(19, 20), match='s'>
<re.Match object; span=(20, 21), match='a'>
<re.Match object; span=(21, 22), match='t'>
<re.Match object; span=(23, 24), match='w'>
<re.Match object; span=(24, 25), match='a'>
<re.Match object; span=(25, 26), match='t'>
<re.Match object; span=(26, 27), match=','>
<re.Match object; span=(28, 29), match='m'>
<re.

In [13]:
# Match any alphanumeric character: [0-9a-zA-Z_]

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\w'
res = re.finditer(pattern, content)
for i in res:
    print(i)
f.close()

<re.Match object; span=(0, 1), match='r'>
<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='t'>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(6, 7), match='a'>
<re.Match object; span=(7, 8), match='t'>
<re.Match object; span=(10, 11), match='m'>
<re.Match object; span=(11, 12), match='a'>
<re.Match object; span=(12, 13), match='t'>
<re.Match object; span=(15, 16), match='b'>
<re.Match object; span=(16, 17), match='a'>
<re.Match object; span=(17, 18), match='t'>
<re.Match object; span=(19, 20), match='s'>
<re.Match object; span=(20, 21), match='a'>
<re.Match object; span=(21, 22), match='t'>
<re.Match object; span=(23, 24), match='w'>
<re.Match object; span=(24, 25), match='a'>
<re.Match object; span=(25, 26), match='t'>
<re.Match object; span=(28, 29), match='m'>
<re.Match object; span=(29, 30), match='a'>
<re.Match object; span=(30, 31), match='t'>
<re.Match object; span=(32, 33), match='7'>
<re.Match object; span=(33, 34), match='a'>


In [14]:
# Match any non-alphanumeric character: [0-9a-zA-Z_]

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\W'
res = re.finditer(pattern, content)
for i in res:
    print(i)
f.close()

<re.Match object; span=(3, 4), match=','>
<re.Match object; span=(4, 5), match=' '>
<re.Match object; span=(8, 9), match=','>
<re.Match object; span=(9, 10), match=' '>
<re.Match object; span=(13, 14), match=','>
<re.Match object; span=(14, 15), match=' '>
<re.Match object; span=(18, 19), match='\n'>
<re.Match object; span=(22, 23), match='\n'>
<re.Match object; span=(26, 27), match=','>
<re.Match object; span=(27, 28), match=' '>
<re.Match object; span=(31, 32), match='\n'>
<re.Match object; span=(35, 36), match=','>
<re.Match object; span=(36, 37), match=' '>
<re.Match object; span=(40, 41), match='\n'>
<re.Match object; span=(44, 45), match='\n'>
<re.Match object; span=(48, 49), match=','>
<re.Match object; span=(49, 50), match=' '>
<re.Match object; span=(53, 54), match='\n'>
<re.Match object; span=(57, 58), match='\n'>
<re.Match object; span=(62, 63), match=','>
<re.Match object; span=(63, 64), match=' '>
<re.Match object; span=(68, 69), match='\n'>
<re.Match object; span=(73, 74)

In [15]:
# Match a digit

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\d'
res = re.finditer(pattern, content)
for i in res:
    print(i)
f.close()

<re.Match object; span=(32, 33), match='7'>
<re.Match object; span=(50, 51), match='1'>


In [16]:
# Match a non-digit

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\D'
res = re.finditer(pattern, content)
for i in res:
    print(i)
f.close()

<re.Match object; span=(0, 1), match='r'>
<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='t'>
<re.Match object; span=(3, 4), match=','>
<re.Match object; span=(4, 5), match=' '>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(6, 7), match='a'>
<re.Match object; span=(7, 8), match='t'>
<re.Match object; span=(8, 9), match=','>
<re.Match object; span=(9, 10), match=' '>
<re.Match object; span=(10, 11), match='m'>
<re.Match object; span=(11, 12), match='a'>
<re.Match object; span=(12, 13), match='t'>
<re.Match object; span=(13, 14), match=','>
<re.Match object; span=(14, 15), match=' '>
<re.Match object; span=(15, 16), match='b'>
<re.Match object; span=(16, 17), match='a'>
<re.Match object; span=(17, 18), match='t'>
<re.Match object; span=(18, 19), match='\n'>
<re.Match object; span=(19, 20), match='s'>
<re.Match object; span=(20, 21), match='a'>
<re.Match object; span=(21, 22), match='t'>
<re.Match object; span=(22, 23), match='\n'>
<re.M

In [17]:
# Match a Dot '.'

import re
f = open('Regex/sample.txt', 'r')
content = f.read()
pattern = r'\.'
res = re.finditer(pattern, content)
for i in res:
    print(i)
f.close()

<re.Match object; span=(84, 85), match='.'>
<re.Match object; span=(100, 101), match='.'>
<re.Match object; span=(117, 118), match='.'>
<re.Match object; span=(123, 124), match='.'>
<re.Match object; span=(125, 126), match='.'>
<re.Match object; span=(133, 134), match='.'>
<re.Match object; span=(147, 148), match='.'>


In [18]:
# Find the span for the exact word 'cat' without any prefix or suffix in the below string:
# 'cat catherine catholic wildcat cat copycat uncatachable'

import re
string = 'cat catherine catholic wildcat cat copycat uncatachable'
pattern = r'\bcat\b'
res = re.finditer(pattern, string)
for i in res:
    print(i)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(31, 34), match='cat'>


In [19]:
# Find a match at the beginning of the string

import re
string = 'cat catherine catholic wildcat cat copycat uncatachable'
pattern = r'^cat'
res = re.finditer(pattern, string)
for i in res:
    print(i)

<re.Match object; span=(0, 3), match='cat'>


In [20]:
# Find a match at the end of the string

import re
string = 'cat catherine catholic wildcat cat copycat uncatachable cat'
pattern = r'cat$'
res = re.finditer(pattern, string)
for i in res:
    print(i)

<re.Match object; span=(56, 59), match='cat'>


In [21]:
# Here is a sample text that will used for demonstration in the next few examples:

text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ\s
321-555-4321
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
guptaabhishek.com
abhishek-8765-gupta@yahoo.com
abhi_gyupta@gamil.abc
321-555-4321
123.555.1234
123*555*1234
123.555.1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
Mr_hello
abhsihek.gupta@gmail.com
abhishek-1234gupta@gmail.org
abhishek@hotmail.net
'''

In [22]:
# Find all alphabets in upper case

pattern = r'[A-Z]'
res = re.finditer(pattern, text_to_search)
# print(res)
for i in res:
    print(i)

<re.Match object; span=(28, 29), match='A'>
<re.Match object; span=(29, 30), match='B'>
<re.Match object; span=(30, 31), match='C'>
<re.Match object; span=(31, 32), match='D'>
<re.Match object; span=(32, 33), match='E'>
<re.Match object; span=(33, 34), match='F'>
<re.Match object; span=(34, 35), match='G'>
<re.Match object; span=(35, 36), match='H'>
<re.Match object; span=(36, 37), match='I'>
<re.Match object; span=(37, 38), match='J'>
<re.Match object; span=(38, 39), match='K'>
<re.Match object; span=(39, 40), match='L'>
<re.Match object; span=(40, 41), match='M'>
<re.Match object; span=(41, 42), match='N'>
<re.Match object; span=(42, 43), match='O'>
<re.Match object; span=(43, 44), match='P'>
<re.Match object; span=(44, 45), match='Q'>
<re.Match object; span=(45, 46), match='R'>
<re.Match object; span=(46, 47), match='S'>
<re.Match object; span=(47, 48), match='T'>
<re.Match object; span=(48, 49), match='U'>
<re.Match object; span=(49, 50), match='V'>
<re.Match object; span=(50, 51),

In [23]:
# Find all alphabets in lower case

pattern = r'[a-z]'
res = re.finditer(pattern, text_to_search)
# print(res)
for i in res:
    print(i)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Match object; span=(23, 24), match='w'>
<re.M

In [24]:
# Find all alphabets and digits:

pattern = r'[A-Za-z0-9]'
res = re.finditer(pattern, text_to_search)
# print(res)
for i in res:
    print(i)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Match object; span=(23, 24), match='w'>
<re.M

In [25]:
# Find everything except alphabets and digits:

pattern = r'[^A-Za-z0-9]'
res = re.finditer(pattern, text_to_search)
# print(res)
for i in res:
    print(i)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(27, 28), match='\n'>
<re.Match object; span=(54, 55), match='\\'>
<re.Match object; span=(56, 57), match='\n'>
<re.Match object; span=(60, 61), match='-'>
<re.Match object; span=(64, 65), match='-'>
<re.Match object; span=(69, 70), match='\n'>
<re.Match object; span=(80, 81), match='\n'>
<re.Match object; span=(83, 84), match=' '>
<re.Match object; span=(88, 89), match='\n'>
<re.Match object; span=(103, 104), match=' '>
<re.Match object; span=(104, 105), match='('>
<re.Match object; span=(109, 110), match=' '>
<re.Match object; span=(112, 113), match=' '>
<re.Match object; span=(115, 116), match=' '>
<re.Match object; span=(123, 124), match=')'>
<re.Match object; span=(124, 125), match=':'>
<re.Match object; span=(125, 126), match='\n'>
<re.Match object; span=(126, 127), match='.'>
<re.Match object; span=(127, 128), match=' '>
<re.Match object; span=(128, 129), match='^'>
<re.Match object; span=(129, 130), match=' '>
<r

In [26]:
# Match for one or more occurances of words starting with alphabets

pattern = r'[A-Za-z]+'
res = re.finditer(pattern, text_to_search)
# print(res)
for i in res:
    print(i)

<re.Match object; span=(1, 27), match='abcdefghijklmnopqurtuvwxyz'>
<re.Match object; span=(28, 54), match='ABCDEFGHIJKLMNOPQRSTUVWXYZ'>
<re.Match object; span=(55, 56), match='s'>
<re.Match object; span=(81, 83), match='Ha'>
<re.Match object; span=(84, 88), match='HaHa'>
<re.Match object; span=(89, 103), match='MetaCharacters'>
<re.Match object; span=(105, 109), match='Need'>
<re.Match object; span=(110, 112), match='to'>
<re.Match object; span=(113, 115), match='be'>
<re.Match object; span=(116, 123), match='escaped'>
<re.Match object; span=(154, 167), match='guptaabhishek'>
<re.Match object; span=(168, 171), match='com'>
<re.Match object; span=(172, 180), match='abhishek'>
<re.Match object; span=(186, 191), match='gupta'>
<re.Match object; span=(192, 197), match='yahoo'>
<re.Match object; span=(198, 201), match='com'>
<re.Match object; span=(202, 206), match='abhi'>
<re.Match object; span=(207, 213), match='gyupta'>
<re.Match object; span=(214, 219), match='gamil'>
<re.Match object;

In [27]:
# Match for zero or more occurances of words starting with alphabets

pattern = r'[A-Za-z]*'
res = re.finditer(pattern, text_to_search)
# print(res)
for i in res:
    print(i)

<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(1, 27), match='abcdefghijklmnopqurtuvwxyz'>
<re.Match object; span=(27, 27), match=''>
<re.Match object; span=(28, 54), match='ABCDEFGHIJKLMNOPQRSTUVWXYZ'>
<re.Match object; span=(54, 54), match=''>
<re.Match object; span=(55, 56), match='s'>
<re.Match object; span=(56, 56), match=''>
<re.Match object; span=(57, 57), match=''>
<re.Match object; span=(58, 58), match=''>
<re.Match object; span=(59, 59), match=''>
<re.Match object; span=(60, 60), match=''>
<re.Match object; span=(61, 61), match=''>
<re.Match object; span=(62, 62), match=''>
<re.Match object; span=(63, 63), match=''>
<re.Match object; span=(64, 64), match=''>
<re.Match object; span=(65, 65), match=''>
<re.Match object; span=(66, 66), match=''>
<re.Match object; span=(67, 67), match=''>
<re.Match object; span=(68, 68), match=''>
<re.Match object; span=(69, 69), match=''>
<re.Match object; span=(70, 70), match=''>
<re.Match object; span=(71, 71), match=''>
<re.

In [28]:
# Match for zero or one occurances of words starting with alphabets

pattern = r'[A-Za-z]?'
res = re.finditer(pattern, text_to_search)
# print(res)
for i in res:
    print(i)

<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Matc

In [29]:
# To match lower case alphabets

pattern = r'\b[a-z]{7}\b'
for i in re.finditer(pattern, text_to_search):
    print(i)

<re.Match object; span=(116, 123), match='escaped'>
<re.Match object; span=(424, 431), match='hotmail'>


In [30]:
# Here is a sample text that will used for demonstration in the next few examples:

text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ\s
321-555-4321
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
guptaabhishek.com
abhishek-8765-gupta@yahoo.com
abhi_gyupta@gamil.abc
321-555-4321
123.555.1234
123*555*1234
123.555.1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
Mr_hello
abhsihek.gupta@gmail.com
abhishek-1234gupta@gmail.org
afsan@hotmail.net
'''

In [31]:
# To match small case alphabets
pattern = r'[a-z]{3,6}'
for i in re.finditer(pattern, text_to_search):
    print(i)

<re.Match object; span=(1, 7), match='abcdef'>
<re.Match object; span=(7, 13), match='ghijkl'>
<re.Match object; span=(13, 19), match='mnopqu'>
<re.Match object; span=(19, 25), match='rtuvwx'>
<re.Match object; span=(90, 93), match='eta'>
<re.Match object; span=(94, 100), match='haract'>
<re.Match object; span=(100, 103), match='ers'>
<re.Match object; span=(106, 109), match='eed'>
<re.Match object; span=(116, 122), match='escape'>
<re.Match object; span=(154, 160), match='guptaa'>
<re.Match object; span=(160, 166), match='bhishe'>
<re.Match object; span=(168, 171), match='com'>
<re.Match object; span=(172, 178), match='abhish'>
<re.Match object; span=(186, 191), match='gupta'>
<re.Match object; span=(192, 197), match='yahoo'>
<re.Match object; span=(198, 201), match='com'>
<re.Match object; span=(202, 206), match='abhi'>
<re.Match object; span=(207, 213), match='gyupta'>
<re.Match object; span=(214, 219), match='gamil'>
<re.Match object; span=(220, 223), match='abc'>
<re.Match object;

In [32]:
# To match exact group of characters
pattern = r'(com|net|org)'
for i in re.finditer(pattern, text_to_search):
    print(i)

<re.Match object; span=(168, 171), match='com'>
<re.Match object; span=(198, 201), match='com'>
<re.Match object; span=(382, 385), match='com'>
<re.Match object; span=(411, 414), match='org'>
<re.Match object; span=(429, 432), match='net'>


In [33]:
string = 'cat bat rat mat hat'
pattern = r'[^r]at'
res = re.finditer(pattern, string)
for i in res:
    print(i)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(4, 7), match='bat'>
<re.Match object; span=(12, 15), match='mat'>
<re.Match object; span=(16, 19), match='hat'>


In [34]:
string = 'cat bat rat mat hat'
pattern = r'[cbmh]at'
res = re.finditer(pattern, string)
for i in res:
    print(i)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(4, 7), match='bat'>
<re.Match object; span=(12, 15), match='mat'>
<re.Match object; span=(16, 19), match='hat'>


In [35]:
# Extract all names from the above string

pattern = r'M(rs|r|s|r_)\.?\s?\w+'
for i in re.finditer(pattern, text_to_search):
    print(i)

<re.Match object; span=(302, 313), match='Mr. Schafer'>
<re.Match object; span=(314, 322), match='Mr Smith'>
<re.Match object; span=(323, 331), match='Ms Davis'>
<re.Match object; span=(332, 345), match='Mrs. Robinson'>
<re.Match object; span=(346, 351), match='Mr. T'>
<re.Match object; span=(352, 360), match='Mr_hello'>


In [36]:
pattern = r'Mr\.?\s\w+'
res = re.finditer(pattern, text_to_search)
for i in res:
    print(i)

<re.Match object; span=(302, 313), match='Mr. Schafer'>
<re.Match object; span=(314, 322), match='Mr Smith'>
<re.Match object; span=(346, 351), match='Mr. T'>


In [37]:
pattern = r'[0-9]{3}[-.*][0-9]{3}[-.*][0-9]{4}'
res = re.finditer(pattern, text_to_search)
for i in res:
    print(i)

<re.Match object; span=(57, 69), match='321-555-4321'>
<re.Match object; span=(224, 236), match='321-555-4321'>
<re.Match object; span=(237, 249), match='123.555.1234'>
<re.Match object; span=(250, 262), match='123*555*1234'>
<re.Match object; span=(263, 275), match='123.555.1234'>
<re.Match object; span=(276, 288), match='800-555-1234'>
<re.Match object; span=(289, 301), match='900-555-1234'>


In [38]:
pattern = r'\d{3}[-.*]\d{3}[-.*]\d{4}'
res = re.finditer(pattern, text_to_search)
for i in res:
    print(i)

<re.Match object; span=(57, 69), match='321-555-4321'>
<re.Match object; span=(224, 236), match='321-555-4321'>
<re.Match object; span=(237, 249), match='123.555.1234'>
<re.Match object; span=(250, 262), match='123*555*1234'>
<re.Match object; span=(263, 275), match='123.555.1234'>
<re.Match object; span=(276, 288), match='800-555-1234'>
<re.Match object; span=(289, 301), match='900-555-1234'>


In [39]:
# Extract all email-id's from the above string

pattern = r'[a-zA-Z]+.*@[a-zA-Z]+\.(com|net|org)'
res = re.finditer(pattern, text_to_search)
for i in res:
    print(i)

<re.Match object; span=(172, 201), match='abhishek-8765-gupta@yahoo.com'>
<re.Match object; span=(361, 385), match='abhsihek.gupta@gmail.com'>
<re.Match object; span=(386, 414), match='abhishek-1234gupta@gmail.org'>
<re.Match object; span=(415, 432), match='afsan@hotmail.net'>
