In [1]:
import re

A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.

RegEx can be used to check if a string contains the specified search pattern.

# RegEx Functions
The re module offers a set of functions that allows us to search a string for a match:

Function	Description

1. findall: Returns a list containing all matches

2. search:	Returns a Match object if there is a match anywhere in the string

3. split: Returns a list where the string has been split at each match

4. sub: Replaces one or many matches with a string

In [2]:
#findall: Returns a list containing all matches
txt = "The rain in Spain"
x=re.findall('ai',txt)
print(x)

['ai', 'ai']


# Metacharacters
Metacharacters are characters with a special meaning:

In [3]:
#1. []	A set of characters
x = re.findall("[a-m]", txt)# EMLEMENTS BETWEEN a and m
print(x)
txt = "The rain in Spain 10673451"
x = re.findall("[1-5]", txt)# EMLEMENTS BETWEEN 1 and 5
print(x)


['h', 'e', 'a', 'i', 'i', 'a', 'i']
['1', '3', '4', '5', '1']


In [9]:
# \	Signals a special sequence (can also be used to escape special characters) eg.	"\d"
txt = "The rain in Spain 10673451"
x = re.findall("\d", txt) #find all digit characters
print(x)
txt = "The rain in Spain 1067345@!#1"
x = re.findall("\s", txt) #find all spaces
print(x)
x = re.findall("\S", txt) #find all elements other tthan space
print(x)
x = re.findall("\@", txt) #find all @
print(x)

['1', '0', '6', '7', '3', '4', '5', '1']
[' ', ' ', ' ', ' ']
['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n', '1', '0', '6', '7', '3', '4', '5', '@', '!', '#', '1']
['@']


In [13]:
# .	Any character (except newline character)
txt = "hello planet"
#Search for a sequence that starts with "he", followed by two (any) characters, and an "o":
x = re.findall("he..o", txt)
print(x)
x = re.findall("he.", txt)
print(x)
x = re.findall("he. ", txt)
print(x)
x = re.findall("he..", txt)
print(x)
x = re.findall("he....", txt)
print(x)

['hello']
['hel']
[]
['hell']
['hello ']


In [25]:
# ^	Starts with
txt = "hello planet olleh"
#Check if the string starts with 'hello':
x = re.findall("^hello", txt)
print(x)
if x:
  print("Yes, the string starts with 'hello'")
else:
  print("No match")
x = re.findall("^hello", txt)
print(x)


['hello']
Yes, the string starts with 'hello'
['hello']


In [26]:
# $	Ends with
txt = "hello planet"
#Check if the string ends with 'planet':
x = re.findall("planet$", txt)
if x:
  print("Yes, the string ends with 'planet'")
else:
  print("No match")


Yes, the string ends with 'planet'


In [32]:
# * Zero or more occurrences
txt = "hello planet"
#Search for a sequence that starts with "he", followed by 0 or more  (any) characters, and an "o":
x = re.findall("he.*o", txt)
print(x)
x = re.findall("he*", txt)
print(x)
x = re.findall("he.*", txt)
print(x)
x = re.findall("he. *", txt)
print(x)
x = re.findall("he *", txt)
print(x)

['hello']
['he']
['hello planet']
['hel']
['he']


In [37]:
# +	One or more occurrences
txt = "hello planet"
#Search for a sequence that starts with "he", followed by 1 or more  (any) characters, and an "o":
x = re.findall("he.+o", txt)
print(x)

x = re.findall("he. +o", txt)
print(x)
x = re.findall("he.l+o", txt)
print(x)
x = re.findall("he.+", txt)
print(x)
x = re.findall("he+", txt)
print(x)

['hello']
[]
['hello']
['hello planet']
['he']


In [41]:
# ?	Zero or one occurrences
txt = "hello planet"
#Search for a sequence that starts with "he", followed by 0 or 1  (any) character, and an "o":
x = re.findall("he.?o", txt)
print(x)
#This time we got no match, because there were not zero, not one, but two characters between "he" and the "o"
x = re.findall("he.l?o", txt)
print(x)
x = re.findall("he.ll?", txt)
print(x)
x = re.findall("he.ll?o", txt)
print(x)

[]
['hello']
['hell']
['hello']


In [42]:
# {}	Exactly the specified number of occurrences
txt = "hello planet"
#Search for a sequence that starts with "he", followed excactly 2 (any) characters, and an "o":
x = re.findall("he.{2}o", txt)
print(x)
x = re.findall("he.{2}", txt)
print(x)
x = re.findall("he.{5}", txt)
print(x)

['hello']
['hell']
['hello p']


In [43]:
x = re.findall("he.{5}o", txt)
print(x)

[]


In [44]:
# |	Either or
txt = "The rain in Spain falls mainly in the plain!"

#Check if the string contains either "falls" or "stays":
x = re.findall("falls|stays", txt)
print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")


['falls']
Yes, there is at least one match!


In [47]:
# |	Either or
txt = "The rain in Spain falls mainly in the plain!"

#Check if the string contains either "falls" or "stays":
x = re.findall("falls|stays", txt)
print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")


['falls']
Yes, there is at least one match!


# Special Sequences
A special sequence is a \ followed by one of the characters in the list below, and has a special meaning:

1. \A:	Returns a match if the specified characters are at the beginning of the string	"\AThe"	

2. \b:	Returns a match where the specified characters are at the beginning or at the end of a word
(the "r" in the beginning is making sure that the string is being treated as a "raw string")	r"\bain"

r"ain\b"	

3. \B	Returns a match where the specified characters are present, but NOT at the beginning r"\Bain"
(or at the end) of a word (the "r" in the beginning is making sure that the string is being treated as a "raw string") r"ain\B"	

3. \d	Returns a match where the string contains digits (numbers from 0-9)	"\d"	
4. \D	Returns a match where the string DOES NOT contain digits	"\D"	
5. \s	Returns a match where the string contains a white space character	"\s"	
6. \S	Returns a match where the string DOES NOT contain a white space character	"\S"	
7. \w	Returns a match where the string contains any word characters (characters from a to Z, digits from 0-9, and the underscore _ character)	"\w"	
8. \W	Returns a match where the string DOES NOT contain any word characters	"\W"	
9. \Z	Returns a match if the specified characters are at the end of the string	"Spain\Z"

In [48]:
txt = "The rain in Spain"

#Check if the string ends with "Spain":

x = re.findall("Spain\Z", txt)

print(x)

if x:
  print("Yes, there is a match!")
else:
  print("No match")


['Spain']
Yes, there is a match!


# sets
A set is a set of characters inside a pair of square brackets [] with a special meaning:

1. [arn]	Returns a match where one of the specified characters (a, r, or n) is present	
2. [a-n]	Returns a match for any lower case character, alphabetically between a and n	
3. [^arn]	Returns a match for any character EXCEPT a, r, and n	
4. [0123]	Returns a match where any of the specified digits (0, 1, 2, or 3) are present	
5. [0-9]	Returns a match for any digit between 0 and 9	
6. [0-5][0-9]	Returns a match for any two-digit numbers from 00 and 59	
7. [a-zA-Z]	Returns a match for any character alphabetically between a and z, lower case OR upper case	
8. [+]	In sets, +, *, ., |, (), $,{} has no special meaning, so [+] means: return a match for any + character in the string	


# The findall() Function
The findall() function returns a list containing all matches.

In [49]:
txt = "The rain in Spain"
x = re.findall("ai", txt)
print(x)

['ai', 'ai']


In [50]:
txt = "The rain in Spain"
x = re.findall("Portugal", txt)
print(x)

[]


# search() Function
The search() function searches the string for a match, and returns a Match object if there is a match.

If there is more than one match, only the first occurrence of the match will be returned:

In [60]:
txt = "The rain in Spain"
x = re.search("\s", txt)

print("The first white-space character is located in position:", x.start())
print(x)
print(x.group)

print(x.end())
print(x.groups())
print(x.pos)

The first white-space character is located in position: 3
<re.Match object; span=(3, 4), match=' '>
<built-in method group of re.Match object at 0x000001D3CEAB8A40>
4
()
0


## The split() Function
The split() function returns a list where the string has been split at each match:

In [61]:
txt = "The rain in Spain"
x = re.split("\s", txt)
print(x)

['The', 'rain', 'in', 'Spain']


In [62]:
txt = "The rain in Spain"
x = re.split("\S", txt)
print(x)

['', '', '', ' ', '', '', '', ' ', '', ' ', '', '', '', '', '']


In [65]:
txt = "The rain in Spain"
x = re.split("[in]", txt)
print(x)

['The ra', '', ' ', '', ' Spa', '', '']


In [66]:
txt = "The rain in Spain"
x = re.split("[n]", txt)
print(x)

['The rai', ' i', ' Spai', '']


You can control the number of occurrences by specifying the maxsplit parameter:

In [67]:
txt = "The rain in Spain"
x = re.split("\s", txt, 1)
print(x)

['The', 'rain in Spain']


In [68]:
txt = "The rain in Spain"
x = re.split("\s", txt, 2)
print(x)

['The', 'rain', 'in Spain']


In [69]:
txt = "The rain in Spain"
x = re.split("\s", txt, 3)
print(x)

['The', 'rain', 'in', 'Spain']


# The sub() Function
The sub() function replaces the matches with the text of your choice:

In [70]:
txt = "The rain in Spain"
x = re.sub("\s", "9", txt)#Replace every white-space character with the number 9:
print(x)

The9rain9in9Spain


You can control the number of replacements by specifying the count parameter:

In [71]:
txt = "The rain in Spain"
x = re.sub("\s", "9", txt, 2)#Replace the first 2 occurrences
print(x)

The9rain9in Spain


# Match Object
A Match Object is an object containing information about the search and the result.

Note: If there is no match, the value None will be returned, instead of the Match Object.

In [None]:
txt = "The rain in Spain"#Do a search that will return a Match Object
x = re.search("ai", txt)
print(x) #this will print an object

The Match object has properties and methods used to retrieve information about the search, and the result:

.span() returns a tuple containing the start-, and end positions of the match.

.string returns the string passed into the function

.group() returns the part of the string where there was a match

In [72]:
txt = "The rain in Spain"
x = re.search(r"\bS\w+", txt)#Print the position (start- and end-position) of the first match occurrence.The regular expression looks for any words that starts with an upper case "S":
print(x.span())

(12, 17)


In [73]:
txt = "The rain in Spain"
x = re.search(r"\bS\w+", txt)#Print the string passed into the function
print(x.string)

The rain in Spain


In [74]:
txt = "The rain in Spain"
x = re.search(r"\bS\w+", txt)# Print the part of the string where there was a match.The regular expression looks for any words that starts with an upper case "S"
print(x.group())

Spain


Note: If there is no match, the value None will be returned, instead of the Match Object.

In [76]:
sentence = "Python is awesome"

a = re.match(r"[Pp]ython", sentence)# identify the pattern at starting position

if a:
    print(f'Pattern Found and the pattern is {a.group()}')
else:
    print("Pattern not Found")

Pattern Found and the pattern is Python
