In [1]:
# Reference taken from: https://pymotw.com/2/re/

import re

In [2]:
# Two ways to look for a substring: the `in` operator and re.search().
base_text = "This is base text to look for everything"
to_search_text = "look"

is_present = to_search_text in base_text
first_match = re.search(to_search_text, base_text)

print(is_present)
print(first_match)
# search() returns a match object: 'look' occupies indexes 21 through 24
# (the reported span end, 25, is exclusive).

True
<_sre.SRE_Match object; span=(21, 25), match='look'>


In [3]:
# The pattern does not have to be a whole word: "look" is found inside "looking".
base_text = "This is base text to looking for everything"
to_search_text = "look"

contains_pattern = to_search_text in base_text
print(contains_pattern)

search_result = re.search(to_search_text, base_text)
print(search_result)
# The match covers indexes 21 through 24 -- only the 'look' part of 'looking'.

True
<_sre.SRE_Match object; span=(21, 25), match='look'>


In [4]:
# A pattern that does not occur anywhere in the text.
base_text = "This is base text to look for everything"
to_search_text = "loot"

found_by_in = to_search_text in base_text
found_by_regex = re.search(to_search_text, base_text)

print(found_by_in)
print(found_by_regex)
# With no match, `in` gives False and search() gives None.

False
None


In [5]:
# The pattern preceded by an extra character: "look" inside "alook".
base_text = "This is base text to alook for everything"
to_search_text = "look"

substring_found = to_search_text in base_text
print(substring_found)

shifted_match = re.search(to_search_text, base_text)
print(shifted_match)
# The match moves one position to the right: indexes 22 through 25
# (span end 26 is exclusive).

True
<_sre.SRE_Match object; span=(22, 26), match='look'>


In [6]:
# Run the search once and reuse the match object instead of scanning the same
# text twice; span() hands back the (start, end) pair in a single call.
# search() returns None when nothing matches, so this cell relies on the
# pattern being present in base_text.
look_match = re.search(to_search_text, base_text)
start_index, end_index = look_match.span()

start_index, end_index

(22, 26)

In [7]:
# Slicing with the match's start/end indexes recovers the matched text.
base_text[start_index:end_index]

'look'

In [8]:
# findall() collects every non-overlapping match as a list of strings.
base_text = "This is base text to look for everything to study"
to_search_text = r"to"

all_matches = re.findall(to_search_text, base_text)
print(all_matches)

['to', 'to']


In [9]:
base_text = "This is base text to look for everything to study"
to_search_text = r"to"

# finditer() yields match objects, so the position of each occurrence is
# available as well as the text.
for occurrence in re.finditer(to_search_text, base_text):
    print(*occurrence.span())

18 20
41 43


In [10]:
base_text = "abbaaabbbbaaaaa"

# Repetition quantifiers applied to the 'b' that follows an 'a'.
greedy_patterns = (
    r'ab*',       # * means 0 or more
    r'ab+',       # + means 1 or more
    r'ab?',       # ? means 0 or 1
    r'ab{2}',     # exactly 2 b after a
    r'ab{2,3}',   # 2 or 3 b after a
)

for pattern in greedy_patterns:
    print(re.findall(pattern, base_text))

['abb', 'a', 'a', 'abbbb', 'a', 'a', 'a', 'a', 'a']
['abb', 'abbbb']
['ab', 'a', 'a', 'ab', 'a', 'a', 'a', 'a', 'a']
['abb', 'abb']
['abb', 'abbb']


In [11]:
# By default a repetition is greedy: it consumes as much of the input as it can
# while still matching. That can produce fewer matches, or matches that contain
# more text than intended. Appending ? to the repetition makes it non-greedy,
# so it consumes as little as possible.

base_text = "abbaaabbbbaaaaa"

lazy_patterns = (
    r'ab*?',       # 0 or more b, as few as possible
    r'ab+?',       # 1 or more b, as few as possible
    r'ab??',       # 0 or 1 b, as few as possible
    r'ab{2}?',     # exactly 2 b after a
    r'ab{2,3}?',   # 2 or 3 b after a, as few as possible
)

for pattern in lazy_patterns:
    print(re.findall(pattern, base_text))

['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a']
['ab', 'ab']
['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a']
['abb', 'abb']
['abb', 'abb']


In [12]:
base_text = "abbaaabbbbaaaaa"

# A character set [...] matches any single character listed inside it.
character_set_patterns = (
    r'[ab]',      # either a or b
    r'a[ab]+',    # a followed by one or more a or b (greedy)
    r'a[ab]+?',   # a followed by one or more a or b (non-greedy)
)

for pattern in character_set_patterns:
    print(re.findall(pattern, base_text))

['a', 'b', 'b', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'a', 'a', 'a', 'a', 'a']
['abbaaabbbbaaaaa']
['ab', 'aa', 'ab', 'aa', 'aa']


In [13]:
# A character set can also exclude characters: a ^ placed first inside the
# brackets means "any character that is NOT in the set that follows".

base_text = 'This is some text -- with punctuation.'

excluding_dash_dot = r'[^-.]+'          # sequences without - and .
excluding_dash_dot_space = r'[^-. ]+'   # sequences without -, . and space

print(re.findall(excluding_dash_dot, base_text))
print(re.findall(excluding_dash_dot_space, base_text))

['This is some text ', ' with punctuation']
['This', 'is', 'some', 'text', 'with', 'punctuation']


In [14]:
base_text = 'This is some text -- with punctuation.'

# Ranges inside a character set, written as first-last.
range_patterns = (
    r'[a-z]+',        # sequences of lower case letters
    r'[A-Z]+',        # sequences of upper case letters
    r'[a-zA-Z]+',     # sequences of lower or upper case letters
    r'[A-Z][a-z]+',   # one upper case letter followed by lower case letters
)

for pattern in range_patterns:
    print(re.findall(pattern, base_text))

['his', 'is', 'some', 'text', 'with', 'punctuation']
['T']
['This', 'is', 'some', 'text', 'with', 'punctuation']
['This']


In [15]:
# The dot (.) metacharacter behaves like a special character set: it matches
# any single character at that position in the pattern.

base_text = 'abbaaabbbbaaaaa'

dot_patterns = (
    r'a.',     # a followed by any one character
    r'b.',     # b followed by any one character
    r'a.*b',   # a followed by anything, ending in b
)

for pattern in dot_patterns:
    print(re.findall(pattern, base_text))


['ab', 'aa', 'ab', 'aa', 'aa']
['bb', 'bb', 'bb']
['abbaaabbbb']


In [16]:
# Note: findall() returns non-overlapping matches. Once a character has been consumed by a match it is not reused;
# the search for the next match resumes at the character immediately after the previous match.


![image.png](attachment:image.png)

In [17]:
base_text = 'This is a prime #1 example!'

# Predefined escape codes for common character classes; the upper-case
# version of each code matches the opposite class.
escape_code_patterns = (
    r'\d+',   # sequence of digits
    r'\D+',   # sequence of non-digits
    r'\s+',   # sequence of whitespace
    r'\S+',   # sequence of non-whitespace
    r'\w+',   # sequence of word characters (alphanumeric)
    r'\W+',   # sequence of non-word characters (non-alphanumeric)
)

for pattern in escape_code_patterns:
    print(re.findall(pattern, base_text))

['1']
['This is a prime #', ' example!']
[' ', ' ', ' ', ' ', ' ']
['This', 'is', 'a', 'prime', '#1', 'example!']
['This', 'is', 'a', 'prime', '1', 'example']
[' ', ' ', ' ', ' #', ' ', '!']


![image.png](attachment:image.png)

In [18]:
base_text = 'This is some text -- with punctuation.'

# Anchors constrain WHERE a match may occur rather than what it contains.
start_anchored = (
    r'^\w+',    # word at start of string
    r'\A\w+',   # word at start of string
)
end_anchored = (
    r'\w+\S*$',    # word at end of string, with optional punctuation
    r'\w+$',       # word at end of string
    r'\w+\S*\Z',   # word at end of string, with optional punctuation
    r'\w+\Z',      # word at end of string
)
boundary_based = (
    r'\bt\w+',   # 't' at the start of a word, plus the word characters after it
    r'\Bt\w+',   # 't' not at the start of a word, plus the word characters after it
    r'\w+h\b',   # word characters followed by an 'h' that ends a word
)

for pattern in start_anchored:
    print(re.findall(pattern, base_text))
print("\n")

for pattern in end_anchored:
    print(re.findall(pattern, base_text))
print("\n")

for pattern in boundary_based:
    print(re.findall(pattern, base_text))

['This']
['This']


['punctuation.']
[]
['punctuation.']
[]


['text']
['th', 'tuation']
['with']


![image.png](attachment:image.png)

In [19]:
text = 'This is some text -- with punctuation.'

# Group 1: a word starting with 't'; group 2: the next word, with one or more
# non-word characters between the two.
word_pair_pattern = r'(\bt\w+)\W+(\w+)'
re.findall(word_pair_pattern, text)

[('text', 'with')]

In [20]:
# The parentheses define capturing groups; for each match, findall() returns a tuple holding the text captured by every group.
# Between the two groups, \W+ requires one or more non-word (non-alphanumeric) characters -- here the ' -- ' separator.

In [21]:
text1 = "deepak123455@gmail.com"
text2 = "deepak123455@gmailcom"

# Three groups of word characters: <user>@<domain>.<suffix>.
# The literal dot is mandatory, so an address without it does not match.
exp = r'(\w+)\@(\w+)\.(\w+)'

for candidate in (text1, text2):
    print(re.match(exp, candidate))

<_sre.SRE_Match object; span=(0, 22), match='deepak123455@gmail.com'>
None


In [22]:
text1 = "deepak123455@gmail.com"
text2 = "deepak123455@gmail.hotmail"
text3 = "deepak123455@gmail.edu"

# The last group uses alternation (|): only "com" or "hotmail" is accepted
# after the dot, so the ".edu" address is rejected.
exp = r'(\w+)\@(\w+)\.(com|hotmail)'

for candidate in (text1, text2, text3):
    print(re.match(exp, candidate))

<_sre.SRE_Match object; span=(0, 22), match='deepak123455@gmail.com'>
<_sre.SRE_Match object; span=(0, 26), match='deepak123455@gmail.hotmail'>
None


In [23]:
text = "deepak.dodeja123455@gmail.edu"
exp = r'(\w+)\@(\w+)\.(com|hotmail)'

# No match: \w+ cannot cross the '.' in the user part, so the '@' is never
# reached from the start of the string (and "edu" is not an accepted suffix).
dotted_user_result = re.match(exp, text)
print(dotted_user_result)

None


In [24]:
text1 = "deepak.dodeja123455@gmail.edu"
text2 = "dee1pak123.dodeja123455@gmail.edu"
text3 = ".dodeja123455@gmail.edu"
text4 = "dee1pak123dodeja123455@gmail.edu"

# The user part must contain a literal dot, with optional word characters on
# either side of it -- so a leading dot is accepted, while a user part with
# no dot at all is rejected.
exp = r'(\w*\.\w*)\@(\w+)\.(com|edu)'

for candidate in (text1, text2, text3, text4):
    print(re.match(exp, candidate))

<_sre.SRE_Match object; span=(0, 29), match='deepak.dodeja123455@gmail.edu'>
<_sre.SRE_Match object; span=(0, 33), match='dee1pak123.dodeja123455@gmail.edu'>
<_sre.SRE_Match object; span=(0, 23), match='.dodeja123455@gmail.edu'>
None


In [25]:
text1 = ".dodeja123455@gmail.edu"
text2 = "dee1pak123dodeja123455@gmail.edu"
text3 = "dee1pak123.dodeja123455@gmail.edu"

# The user part must begin and end with word characters; dots between them
# are optional. A leading dot is therefore rejected, while user parts with or
# without an inner dot are accepted.
exp = r'([\w]+[\.]*[\w]+)\@(\w+)\.(com|edu)'

for candidate in (text1, text2, text3):
    print(re.match(exp, candidate))

None
<_sre.SRE_Match object; span=(0, 32), match='dee1pak123dodeja123455@gmail.edu'>
<_sre.SRE_Match object; span=(0, 33), match='dee1pak123.dodeja123455@gmail.edu'>


![image.png](attachment:image.png)

![image.png](attachment:image.png)