# re module
#### The re module in Python provides functions for working with regular expressions. Regular expressions are text patterns that allow you to search, match, extract, and manipulate strings based on specific rules.


In [2]:
import re

# TODO i want to search for abc
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"abc")
# apply the pattern to the test_string
matches = pattern.finditer(test_string) 
for match in matches:
    print(match)


<re.Match object; span=(3, 6), match='abc'>
<re.Match object; span=(12, 15), match='abc'>


In [3]:
# or we can use it directly in finditer
matches = re.finditer(r"abc",test_string)

for match in matches:
    print(match)

<re.Match object; span=(3, 6), match='abc'>
<re.Match object; span=(12, 15), match='abc'>


In [4]:
# why we use r before the string here it will print and compile the \t to a tab
a = "\t Hello"
print(a)

	 Hello


In [5]:
# Here it will be understood as a raw text
a = r"\t Hello"
print(a)

\t Hello


### findall()
to find all the strings you are looking for

In [6]:
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"abc")
# apply the pattern to the test_string
matches = pattern.findall(test_string) 
for match in matches:
    print(match)

abc
abc


### match()
looking for the pattern in the begining of the string

In [9]:
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"abc")
# apply the pattern to the test_string
matches = pattern.match(test_string) 
print(matches)

None


In [10]:
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"123")
# apply the pattern to the test_string
matches = pattern.match(test_string) 
print(matches)
# because it's in the begining

<re.Match object; span=(0, 3), match='123'>


### search()
searches for the first match

In [12]:
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"abc")
# apply the pattern to the test_string
matches = pattern.search(test_string) 
print(matches)

<re.Match object; span=(3, 6), match='abc'>


### span()
get the first index and last index appear

In [14]:
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"abc")
# apply the pattern to the test_string
matches = pattern.finditer(test_string) 
for match in matches:
    # prints the first element match and the second one
    print(match.span(),match.start(),match.end())


(3, 6) 3 6
(12, 15) 12 15


### group()

In [15]:
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"abc")
# apply the pattern to the test_string
matches = pattern.finditer(test_string) 
for match in matches:
    # prints the first element match and the second one
    print(match.group())

abc
abc


# Meta character

. Any character (except new line character)

^ Starts with "^hello"

\$ Ends with "world$"

\* Zero or more occurrences "aix*"

\+  One or more occurrences "aix+"

{ } Exactly the specified number of occurrences "al{2}"

[ ] A set of characters "[a-m]"

\ Special sequence (or escape special characters) "\d"

| Either or "falls|stays"

( ) Capture and group

In [16]:
# . Any character (except new line character)
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r".")
# apply the pattern to the test_string
matches = pattern.finditer(test_string) 
for match in matches:
    # prints the first element match and the second one
    print(match.group())

1
2
3
a
b
c
4
5
6
7
8
9
a
b
c
1
2
3
A
B
C


In [22]:

test_string = '123abc456789abc123ABC.'

# if we want to search for the . it self we put \ befor it
pattern = re.compile(r"\.")
# apply the pattern to the test_string
matches = pattern.finditer(test_string) 
for match in matches:
    print(match)
  

<re.Match object; span=(21, 22), match='.'>


In [23]:
# ^ Starts with any pattern I choose
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"^123")
# apply the pattern to the test_string
matches = pattern.finditer(test_string) 
for match in matches:
    # prints the first element match and the second one
    print(match)

<re.Match object; span=(0, 3), match='123'>


In [24]:
# $ Ends with any pattern I choose
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc 
# ! case sensetive abc != ABC
pattern = re.compile(r"abc$")
# apply the pattern to the test_string
matches = pattern.finditer(test_string) 
for match in matches:
    # prints the first element match and the second one
    print(match)

In [25]:
# $ Ends with any pattern I choose
test_string = '123abc456789abc123ABC'

# now we made to use re.compile to search for abc
pattern = re.compile(r"ABC$")
# apply the pattern to the test_string
matches = pattern.finditer(test_string) 
for match in matches:
    # prints the first element match and the second one
    print(match)

<re.Match object; span=(18, 21), match='ABC'>


# More special characters:
##### \d : Matches any decimal digit; [0-9]

##### \D : Matches any non-digit character;

##### \s : Matches any whitespace character; (space " ", tab "\t", newline "\n")

##### \S : Matches any non-whitespace character;

##### \w : Matches any alphanumeric  (word) character [a-zA-Z0-9_];

##### \W : Matches any non-alphanumeric

##### \b : Matches where the specified characters are at the beginning or at the end of a word r"\bain" , r"ain\b"

##### \B : Matches where the specified characters are present, but NOT at the beginning of a word (or at the end) of a word r"\Bain" , r"ain\B"

In [26]:
# \d : Matches any decimal digit; [0-9]
test_string = 'hello 123_ heyho hohey'
pattern = re.compile(r"\d")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(6, 7), match='1'>
<re.Match object; span=(7, 8), match='2'>
<re.Match object; span=(8, 9), match='3'>


In [27]:
# \D : Matches any decimal digit; [0-9]
test_string = 'hello 123_ heyho hohey'
pattern = re.compile(r"\D")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>
<re.Match object; span=(5, 6), match=' '>
<re.Match object; span=(9, 10), match='_'>
<re.Match object; span=(10, 11), match=' '>
<re.Match object; span=(11, 12), match='h'>
<re.Match object; span=(12, 13), match='e'>
<re.Match object; span=(13, 14), match='y'>
<re.Match object; span=(14, 15), match='h'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match=' '>
<re.Match object; span=(17, 18), match='h'>
<re.Match object; span=(18, 19), match='o'>
<re.Match object; span=(19, 20), match='h'>
<re.Match object; span=(20, 21), match='e'>
<re.Match object; span=(21, 22), match='y'>


In [18]:
# \s : Matches any whitespace character; (space " ", tab "\t", newline "\n")
test_string = 'hello 123_ heyho hohey'

pattern = re.compile(r"\s")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(5, 6), match=' '>
<re.Match object; span=(10, 11), match=' '>
<re.Match object; span=(16, 17), match=' '>


In [29]:
# \S : Matches any non-whitespace character;
test_string = 'hello 123_ heyho hohey'

pattern = re.compile(r"\S")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>
<re.Match object; span=(6, 7), match='1'>
<re.Match object; span=(7, 8), match='2'>
<re.Match object; span=(8, 9), match='3'>
<re.Match object; span=(9, 10), match='_'>
<re.Match object; span=(11, 12), match='h'>
<re.Match object; span=(12, 13), match='e'>
<re.Match object; span=(13, 14), match='y'>
<re.Match object; span=(14, 15), match='h'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(17, 18), match='h'>
<re.Match object; span=(18, 19), match='o'>
<re.Match object; span=(19, 20), match='h'>
<re.Match object; span=(20, 21), match='e'>
<re.Match object; span=(21, 22), match='y'>


In [30]:
# \w : Matches any alphanumeric  (word) character [a-zA-Z0-9_];
test_string = 'hello 123_ heyho hohey'

pattern = re.compile(r"\w")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>
<re.Match object; span=(6, 7), match='1'>
<re.Match object; span=(7, 8), match='2'>
<re.Match object; span=(8, 9), match='3'>
<re.Match object; span=(9, 10), match='_'>
<re.Match object; span=(11, 12), match='h'>
<re.Match object; span=(12, 13), match='e'>
<re.Match object; span=(13, 14), match='y'>
<re.Match object; span=(14, 15), match='h'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(17, 18), match='h'>
<re.Match object; span=(18, 19), match='o'>
<re.Match object; span=(19, 20), match='h'>
<re.Match object; span=(20, 21), match='e'>
<re.Match object; span=(21, 22), match='y'>


In [32]:
##### \W : Matches any non-alphanumeric
test_string = 'hello 123_ heyho hohey'

pattern = re.compile(r"\W")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(5, 6), match=' '>
<re.Match object; span=(10, 11), match=' '>
<re.Match object; span=(16, 17), match=' '>


In [33]:
# \b : Matches where the specified characters are at the beginning or at the end of a word block including white spaces  r"\bain" , r"ain\b"
# any beginning of a word after white space
test_string = 'hello 123_ heyho hohey'

pattern = re.compile(r"\bhello")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(0, 5), match='hello'>


In [34]:
test_string = 'hello 123_ heyho hohey'

pattern = re.compile(r"\bhey")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(11, 14), match='hey'>


In [36]:
# \B : Matches where the specified characters are present, but NOT at the beginning of a word (or at the end) of a word r"\Bain" , r"ain\B"
test_string = 'hello 123_ heyho hoheys'

pattern = re.compile(r"\Bhey")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(19, 22), match='hey'>


In [38]:
# A set of characters "[a-m]"
# search for a single characters in a set
test_string = 'hello 123_'

pattern = re.compile(r'[lo]')
matches = pattern.finditer(test_string)
for match in matches:
    print(match)

<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>


In [39]:
test_string = 'hello 123_'

pattern = re.compile(r'[helo]')
matches = pattern.finditer(test_string)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>


In [40]:
test_string = 'hello 123_'
# search for a range of character in a string
pattern = re.compile(r'[a-z]')
matches = pattern.finditer(test_string)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>


In [41]:
test_string = 'hello 123_'
# search for a range of numbers in a string
pattern = re.compile(r'[0-9]')
matches = pattern.finditer(test_string)
for match in matches:
    print(match)

<re.Match object; span=(6, 7), match='1'>
<re.Match object; span=(7, 8), match='2'>
<re.Match object; span=(8, 9), match='3'>


In [43]:
test_string = 'hello 123-_'
# search for an actual - in a range
pattern = re.compile(r'[0-9-]')
matches = pattern.finditer(test_string)
for match in matches:
    print(match)

<re.Match object; span=(6, 7), match='1'>
<re.Match object; span=(7, 8), match='2'>
<re.Match object; span=(8, 9), match='3'>
<re.Match object; span=(9, 10), match='-'>


In [44]:
test_string = 'helloHELLO 123_'
# search for a range of character in a string (lower case)
pattern = re.compile(r'[a-z]')
matches = pattern.finditer(test_string)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>


In [45]:
test_string = 'helloHELLO 123_'
# search for a range of character in a string (lower case & upper case)
pattern = re.compile(r'[a-zA-Z]')
matches = pattern.finditer(test_string)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>
<re.Match object; span=(5, 6), match='H'>
<re.Match object; span=(6, 7), match='E'>
<re.Match object; span=(7, 8), match='L'>
<re.Match object; span=(8, 9), match='L'>
<re.Match object; span=(9, 10), match='O'>


In [46]:
test_string = 'helloHELLO 123_'
# search for a range of character in a string (lower case & upper case & numbers) 
pattern = re.compile(r'[a-zA-Z0-9]')
matches = pattern.finditer(test_string)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>
<re.Match object; span=(5, 6), match='H'>
<re.Match object; span=(6, 7), match='E'>
<re.Match object; span=(7, 8), match='L'>
<re.Match object; span=(8, 9), match='L'>
<re.Match object; span=(9, 10), match='O'>
<re.Match object; span=(11, 12), match='1'>
<re.Match object; span=(12, 13), match='2'>
<re.Match object; span=(13, 14), match='3'>


# Quantifier:

##### \* : 0 or more

##### \+ : 1 or more

##### ? : 0 or 1 -> optional character

##### {4} : exact number

##### {4,6} : range of numbers (min, max)


In [48]:
# \d* : Matches any decimal digit; [0-9] and return them together not individual
test_string = 'hello 123_ heyho hohey'
pattern = re.compile(r"\d*")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(1, 1), match=''>
<re.Match object; span=(2, 2), match=''>
<re.Match object; span=(3, 3), match=''>
<re.Match object; span=(4, 4), match=''>
<re.Match object; span=(5, 5), match=''>
<re.Match object; span=(6, 9), match='123'>
<re.Match object; span=(9, 9), match=''>
<re.Match object; span=(10, 10), match=''>
<re.Match object; span=(11, 11), match=''>
<re.Match object; span=(12, 12), match=''>
<re.Match object; span=(13, 13), match=''>
<re.Match object; span=(14, 14), match=''>
<re.Match object; span=(15, 15), match=''>
<re.Match object; span=(16, 16), match=''>
<re.Match object; span=(17, 17), match=''>
<re.Match object; span=(18, 18), match=''>
<re.Match object; span=(19, 19), match=''>
<re.Match object; span=(20, 20), match=''>
<re.Match object; span=(21, 21), match=''>
<re.Match object; span=(22, 22), match=''>


In [49]:
test_string = 'hello 123_ heyho hohey'
pattern = re.compile(r"\d")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(6, 7), match='1'>
<re.Match object; span=(7, 8), match='2'>
<re.Match object; span=(8, 9), match='3'>


In [52]:
test_string = 'hello _123_ heyho hohey'
pattern = re.compile(r"_\d")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(6, 8), match='_1'>


In [53]:
# ? : 0 or 1 -> optional character
test_string = 'hello _123_ heyho hohey'
pattern = re.compile(r"_?\d")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(6, 8), match='_1'>
<re.Match object; span=(8, 9), match='2'>
<re.Match object; span=(9, 10), match='3'>


In [55]:
# {4} : exact number
test_string = 'hello _123_ heyho hohey'
pattern = re.compile(r"\d{3}")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(7, 10), match='123'>


In [57]:
# {4,6} : range of numbers (min, max)
test_string = 'hello _123_ heyho hohey'
pattern = re.compile(r"\d{1,3}")
matches = pattern.finditer(test_string)

for match in matches:
    print(match)

<re.Match object; span=(7, 10), match='123'>


In [5]:
# we want to extract dates with - formate
dates = """
01.04.2020

2020.04.01

2020-04-01

2020-05-23

2020-06-11

2020-07-11

2020-08-11

"""

pattern = re.compile("\d\d\d\d.\d\d.\d\d")
matches = pattern.finditer(dates)

for match in matches:
    print(match)

<re.Match object; span=(13, 23), match='2020.04.01'>
<re.Match object; span=(25, 35), match='2020-04-01'>
<re.Match object; span=(37, 47), match='2020-05-23'>
<re.Match object; span=(49, 59), match='2020-06-11'>
<re.Match object; span=(61, 71), match='2020-07-11'>
<re.Match object; span=(73, 83), match='2020-08-11'>


  pattern = re.compile("\d\d\d\d.\d\d.\d\d")


In [6]:
# we want to extract dates with - formate
dates = """
01.04.2020

2020.04.01

2020-04-01

2020-05-23

2020-06-11

2020-07-11

2020-08-11

"""

pattern = re.compile("\d\d\d\d-\d\d-\d\d")
matches = pattern.finditer(dates)

for match in matches:
    print(match)

<re.Match object; span=(25, 35), match='2020-04-01'>
<re.Match object; span=(37, 47), match='2020-05-23'>
<re.Match object; span=(49, 59), match='2020-06-11'>
<re.Match object; span=(61, 71), match='2020-07-11'>
<re.Match object; span=(73, 83), match='2020-08-11'>


  pattern = re.compile("\d\d\d\d-\d\d-\d\d")


In [10]:
# we want to extract dates with -/ formate
dates = """
01.04.2020

2020.04.01

2020-04-01

2020-05-23

2020-06-11

2020-07-11

2020-08-11

2020/04/04

2020_04_04

2020_04_04

"""

pattern = re.compile("\d\d\d\d[-/]\d\d[-/]\d\d")
matches = pattern.finditer(dates)

for match in matches:
    print(match)

<re.Match object; span=(25, 35), match='2020-04-01'>
<re.Match object; span=(37, 47), match='2020-05-23'>
<re.Match object; span=(49, 59), match='2020-06-11'>
<re.Match object; span=(61, 71), match='2020-07-11'>
<re.Match object; span=(73, 83), match='2020-08-11'>
<re.Match object; span=(85, 95), match='2020/04/04'>


  pattern = re.compile("\d\d\d\d[-/]\d\d[-/]\d\d")


In [15]:
# we want to extract dates 05 and 06
dates = """
01.04.2020

2020.04.01

2020-04-01

2020-05-23

2020-06-11

2020-07-11

2020-08-11

2020/04/04

2020_04_04

2020_04_04

"""

pattern = re.compile("\d\d\d\d[-/]0[56][-/]\d\d")
matches = pattern.finditer(dates)

for match in matches:
    print(match)

<re.Match object; span=(37, 47), match='2020-05-23'>
<re.Match object; span=(49, 59), match='2020-06-11'>


  pattern = re.compile("\d\d\d\d[-/]0[56][-/]\d\d")


In [16]:
# we want to extract dates 05 and 06 and 07
dates = """
01.04.2020

2020.04.01

2020-04-01

2020-05-23

2020-06-11

2020-07-11

2020-08-11

2020/04/04

2020_04_04

2020_04_04

"""

pattern = re.compile("\d\d\d\d[-/]0[5-7][-/]\d\d")
matches = pattern.finditer(dates)

for match in matches:
    print(match)

<re.Match object; span=(37, 47), match='2020-05-23'>
<re.Match object; span=(49, 59), match='2020-06-11'>
<re.Match object; span=(61, 71), match='2020-07-11'>


  pattern = re.compile("\d\d\d\d[-/]0[5-7][-/]\d\d")


In [17]:
# another way to write the code for that
dates = """
01.04.2020

2020.04.01

2020-04-01

2020-05-23

2020-06-11

2020-07-11

2020-08-11

2020/04/04

2020_04_04

2020_04_04

"""

pattern = re.compile("\d{4}[-/]0[5-7][-/]\d{2}")
matches = pattern.finditer(dates)

for match in matches:
    print(match)

<re.Match object; span=(37, 47), match='2020-05-23'>
<re.Match object; span=(49, 59), match='2020-06-11'>
<re.Match object; span=(61, 71), match='2020-07-11'>


  pattern = re.compile("\d{4}[-/]0[5-7][-/]\d{2}")


# Conditions

In [20]:
# matching a specific pattern
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr.T
"""
# matching a specific pattern look here
# \s : Matches any whitespace character; (space " ", tab "\t", newline "\n")
# \w : Matches any alphanumeric  (word) character [a-zA-Z0-9_];
pattern = re.compile(r"Mr\s\w+")
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<re.Match object; span=(28, 38), match='Mr Simpson'>


In [23]:
# matching a specific pattern
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
"""
# matching a specific pattern look here
# \s : Matches any whitespace character; (space " ", tab "\t", newline "\n")
# \w : Matches any alphanumeric  (word) character [a-zA-Z0-9_];
pattern = re.compile(r"Mr\.\s\w+")
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<re.Match object; span=(51, 60), match='Mr. Brown'>
<re.Match object; span=(70, 75), match='Mr. T'>


In [25]:
# matching a specific pattern
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
"""
# matching a specific pattern look here
# \s : Matches any whitespace character; (space " ", tab "\t", newline "\n")
# \w : Matches any alphanumeric  (word) character [a-zA-Z0-9_];
# ? : opthional
pattern = re.compile(r"Mr\.?\s\w+")
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<re.Match object; span=(28, 38), match='Mr Simpson'>
<re.Match object; span=(51, 60), match='Mr. Brown'>
<re.Match object; span=(70, 75), match='Mr. T'>


In [27]:
# matching a specific pattern
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
"""
# matching a specific pattern look here
# \s : Matches any whitespace character; (space " ", tab "\t", newline "\n")
# \w : Matches any alphanumeric  (word) character [a-zA-Z0-9_];
# ? : opthional
# () : conditioned
pattern = re.compile(r"(Mr|Ms|Mrs)\.?\s\w+")
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<re.Match object; span=(28, 38), match='Mr Simpson'>
<re.Match object; span=(39, 50), match='Mrs Simpson'>
<re.Match object; span=(51, 60), match='Mr. Brown'>
<re.Match object; span=(61, 69), match='Ms Smith'>
<re.Match object; span=(70, 75), match='Mr. T'>


In [28]:
# matching a specific pattern
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org
"""

pattern = re.compile(r"[a-zA-z0-9-]+@")
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<re.Match object; span=(76, 91), match='pythonengineer@'>
<re.Match object; span=(101, 117), match='Python-engineer@'>
<re.Match object; span=(124, 143), match='python-engineer123@'>


In [29]:
# matching a specific pattern (domain name and dot .)
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org
"""
# (domain name and dot .)
pattern = re.compile(r"[a-zA-z0-9-]+@[a-zA-Z-]+.")
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<re.Match object; span=(76, 97), match='pythonengineer@gmail.'>
<re.Match object; span=(101, 121), match='Python-engineer@gmx.'>
<re.Match object; span=(124, 153), match='python-engineer123@my-domain.'>


In [30]:
# matching a specific pattern (endings of emails and urls)
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org
"""
# (domain name and dot .)
pattern = re.compile(r"[a-zA-z0-9-]+@[a-zA-Z-]+\.(com|de|org)")
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<re.Match object; span=(76, 100), match='pythonengineer@gmail.com'>
<re.Match object; span=(101, 123), match='Python-engineer@gmx.de'>
<re.Match object; span=(124, 156), match='python-engineer123@my-domain.org'>


In [31]:
# matching a specific pattern (endings of emails and urls) another method
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org
"""
pattern = re.compile(r"[a-zA-z0-9-]+@[a-zA-Z-]+\.[a-zA-Z]+")
matches = pattern.finditer(my_string)
for match in matches:
    print(match)

<re.Match object; span=(76, 100), match='pythonengineer@gmail.com'>
<re.Match object; span=(101, 123), match='Python-engineer@gmx.de'>
<re.Match object; span=(124, 156), match='python-engineer123@my-domain.org'>


In [32]:
# matching a specific pattern (endings of emails and urls) another method
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org
"""
# using group to return the actual string
pattern = re.compile(r"([a-zA-z0-9-]+)@([a-zA-Z-]+)\.([a-zA-Z]+)")
matches = pattern.finditer(my_string)
for match in matches:
    print(match.group())

pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org


In [37]:
# matching a specific pattern (endings of emails and urls) another method
my_string = """
hello world
123
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org
"""
# using group to return the actual string
pattern = re.compile(r"([a-zA-z0-9-]+)@([a-zA-Z-]+)\.([a-zA-Z]+)")
matches = pattern.finditer(my_string)
# choose a group to print
for match in matches:
    #print(match.group(0))
    print(match.group(1))
    #print(match.group(2))
    #print(match.group(3))

pythonengineer
Python-engineer
python-engineer123


# Modification

In [39]:
test_string = '123abc456789abc123ABC'

# split, sub
# using this pattern to be a splitter
pattern = re.compile(r"123")
splitted = pattern.split(test_string)
print(splitted)


['', 'abc456789abc', 'ABC']


In [40]:
test_string = '123abc456789abc123ABC'

# split, sub
# using this pattern to be a splitter
pattern = re.compile(r"abc")
splitted = pattern.split(test_string)
print(splitted)


['123', '456789', '123ABC']


In [41]:
# sub

test_string = 'hello world, you are the best world'

pattern = re.compile(r"world")
# here replace the pattern with the first argument 
subbed_String = pattern.sub("planet",test_string)
print(subbed_String)

hello planet, you are the best planet


# Practice

In [44]:
urls = """
hello
2020-05-20
http://python-engineer.com
https://www.python-engineer.com
http://www.pyeng.net
"""
pattern = re.compile(r"http://www\.([a-zA-Z-]+)\.[a-zA-Z]+")
matches = pattern.finditer(urls)
for match in matches:
    print(match)

<re.Match object; span=(77, 97), match='http://www.pyeng.net'>


In [45]:
urls = """
hello
2020-05-20
http://python-engineer.com
https://www.python-engineer.com
http://www.pyeng.net
"""
pattern = re.compile(r"https?://(www\.)?([a-zA-Z-]+)\.[a-zA-Z]+")
matches = pattern.finditer(urls)
for match in matches:
    print(match)

<re.Match object; span=(18, 44), match='http://python-engineer.com'>
<re.Match object; span=(45, 76), match='https://www.python-engineer.com'>
<re.Match object; span=(77, 97), match='http://www.pyeng.net'>


In [50]:
urls = """
http://python-engineer.com
https://www.python-engineer.com
http://www.pyeng.net
"""
pattern = re.compile(r"https?://(www\.)?([a-zA-Z-]+)(\.[a-zA-Z]+)")
matches = pattern.finditer(urls)
for match in matches:
    print(match)
subbed_urls = pattern.sub("hello",urls)
print(subbed_urls)

<re.Match object; span=(1, 27), match='http://python-engineer.com'>
<re.Match object; span=(28, 59), match='https://www.python-engineer.com'>
<re.Match object; span=(60, 80), match='http://www.pyeng.net'>

hello
hello
hello



In [53]:
urls = """
http://python-engineer.com
https://www.python-engineer.com
http://www.pyeng.net
"""
pattern = re.compile(r"https?://(www\.)?([a-zA-Z-]+)(\.[a-zA-Z]+)")
matches = pattern.finditer(urls)
for match in matches:
    print(match)
# here we slice the groups 
# ([a-zA-Z-]+) \2 group 2
# (\.[a-zA-Z]+) \3 group 3
subbed_urls = pattern.sub(r"\2\3",urls)
print("subbed_urls",subbed_urls)

<re.Match object; span=(1, 27), match='http://python-engineer.com'>
<re.Match object; span=(28, 59), match='https://www.python-engineer.com'>
<re.Match object; span=(60, 80), match='http://www.pyeng.net'>
subbed_urls 
python-engineer.com
python-engineer.com
pyeng.net

