In [1]:
""" Regular Expressions in Python

A regular expression is a sequence of characters that defines a search pattern; it is used mainly to find and replace text in strings.

Here are the common regular expression operations:

(1) Search a string (search and match) --->  re.match() & re.search() 

(2) Finding a string (findall) ----> re.findall()

(3) Break string into sub strings (split) -----> re.split()

(4) Replace part of a string (sub) -------> re.sub() & re.compile """

# The standard-library package "re" is used for all regular expression operations

import re


In [4]:
r""" re.match ---> This method finds a match only if it occurs at the start of the string.

The r prefix in the code below marks the pattern as a raw string, which means backslashes are kept as literal characters instead of starting escape sequences.

For example, '\n' is a newline character, while r'\n' is the two characters \ followed by n.
(This docstring is itself a raw string so that the \n examples above are shown literally.) """

result = re.match(r'John', 'John is playing in the ground.')


In [5]:
""" Print the result to check whether the pattern matched or not """

print(result)

<_sre.SRE_Match object; span=(0, 4), match='John'>


In [6]:
""" start() ---> This method returns the starting position of the matching pattern in the string """

print(result.start())

0


In [7]:
""" end() ---> This method returns the ending position of the matching pattern in the string """

print(result.end())

4


In [9]:
""" group(0) ---> This method returns the entire match  """

print(result.group(0))

John


In [10]:
""" match() only looks at the beginning of the string, so a pattern that occurs later in the string gives no match (None) """

result = re.match(
    r'is',
    'John is playing in the ground.',
)

In [11]:
""" Print the result """

print(result)

None


In [24]:
""" re.search ---> Works like match(), except that it is not restricted to the beginning of the string: it scans the whole string and returns the first match found. """

result = re.search(
    r'ground',
    'John is playing in the ground.',
)


In [25]:
""" Print the result """

print(result)

<_sre.SRE_Match object; span=(23, 29), match='ground'>


In [26]:
""" Print the matching pattern """

print(result.group(0))

ground


In [27]:
""" Print the starting position of the matching pattern in the string """

print(result.start())

23


In [28]:
""" Print the ending position of the matching pattern in the string """

print(result.end())

29


In [29]:
 """ re.findall --> This method helps to get a list of all matching patterns. 
     
It has not constraints of searching from start or end. 

Also, the main difference of this method with search() is that it will return not only the first but all the occurences of the matching pattern in the string. """ 

result = re.findall(r'John', 'John is playing in the ground. John loves playing baseball.')

In [31]:
""" Print the result """

print(result)

['John', 'John']


In [34]:
""" re.split ---> This method splits a string at every occurrence of a given pattern.
In the snippet below the sentence is cut at each 'n', so all possible splits are made. """

result = re.split(
    r'n',
    'John is playing in the ground.',
)

In [35]:
""" Print the result """

print(result)

['Joh', ' is playi', 'g i', ' the grou', 'd.']


In [36]:
""" maxsplit is an argument that limits the number of splits to one, two or as many as required.
Its default value is zero (0), meaning "no limit", which is why the previous example split the string as many times as possible. """

result = re.split(
    r'n',
    'John is playing in the ground.',
    maxsplit=1,
)

In [38]:
""" Print the result """

print(result) 

['Joh', ' is playing in the ground.']


In [39]:
""" re.sub --> This method searches for a pattern and replaces every match with a new substring.

If the pattern is not found, the string is returned unchanged. """

result = re.sub(
    r'ground',
    'field',
    'John is playing in the ground.',
)

In [41]:
""" Print the result """

print(result)

John is playing in the field.


In [43]:
""" re.compile --> This method compiles a regular expression pattern into a pattern object, which can then be used for pattern matching.

Keeping the compiled object around lets the same pattern be searched again without rewriting it. """

# Build the pattern object once by handing the pattern text to compile().
pattern = re.compile(
    'playing'
)

In [44]:
""" Use the compiled pattern object's own findall() method to search a sentence """

result = pattern.findall(
    'John is playing in the ground.'
)

In [46]:
""" Print the result """

print(result)

['playing']


In [47]:
""" Reuse the same compiled pattern object with findall() on a different string """

result = pattern.findall(
    'Sam is playing baseball.'
)

In [48]:
""" Print the result """

print(result)

['playing']


In [50]:
r""" Here are the most commonly used operators that help to build an expression.
(This docstring is a raw string so that sequences such as \w, \b and \n are shown literally instead of being interpreted as string escapes.)

(1) . ---> Matches any single character except newline '\n'

(2) ? ---> Matches 0 or 1 occurrence of the pattern to its left

(3) + ---> 1 or more occurrences of the pattern to its left

(4) * ---> 0 or more occurrences of the pattern to its left

(5) \w ---> Matches an alphanumeric character or underscore. For ASCII text this is equivalent to the set [a-zA-Z0-9_].

(6) \W ---> Matches a non-alphanumeric character. For ASCII text this is equivalent to the set [^a-zA-Z0-9_].

(7) \d ---> Matches a digit. For ASCII text this is equivalent to the set [0-9].

(8) \D ---> Matches any non-digit character. For ASCII text this is equivalent to the set [^0-9].

(9) \s ---> Matches a single white space character. For ASCII text this is equivalent to the set [ \t\n\r\f\v] (note the leading space).

(10) \S ---> Matches any non-white space character. For ASCII text this is equivalent to the set [^ \t\n\r\f\v]

(11) \b ---> Matches the boundary between word and non-word, i.e. matches the empty string but only at the beginning or end of a word.

(12) \B ---> The opposite of \b. Matches the empty string but only when it is not at the beginning or end of a word.

(13) [..] ---> Matches any single character listed in the square brackets

(14) [^..] ---> Matches any single character not listed in the square brackets

(15) \ ---> Escapes characters that have a special meaning. For example: \. to match a period or \+ to match a plus sign.

(16) ^ and $ ---> ^ matches the start of the string and $ matches the end of the string.

(17) {n,m} & {m} ---> {n,m} matches at least n and at most m occurrences of the preceding expression. Omitting n specifies a lower bound of zero and omitting m specifies an infinite upper bound.
                     {m} -> Specifies that exactly m copies of the preceding expression should be matched.

(18) a|b ---> Matches either a or b

(19) () ---> Groups regular expressions and returns the matched text

(20) \t,\n,\r ---> Match tab, newline, carriage return


Some examples using the above mentioned operators are given below. """

""" Extract each character, including spaces, using (1) . (dot) """

result = re.findall(r'.', 'John is playing in the ground.')



In [52]:
""" Print the result. The result includes all characters including spaces ' ' """

print(result)

['J', 'o', 'h', 'n', ' ', 'i', 's', ' ', 'p', 'l', 'a', 'y', 'i', 'n', 'g', ' ', 'i', 'n', ' ', 't', 'h', 'e', ' ', 'g', 'r', 'o', 'u', 'n', 'd', '.']


In [53]:
r""" Extract each word character (letter, digit or underscore) using (5) \w; spaces and punctuation are skipped.
(Raw docstring: in a normal string literal \w is an invalid escape sequence.) """

result = re.findall(r'\w', 'John is playing in the ground.')

In [55]:
""" Print the result. The result includes all characters excluding spaces ' ' and .(period) """

print(result)

['J', 'o', 'h', 'n', 'i', 's', 'p', 'l', 'a', 'y', 'i', 'n', 'g', 'i', 'n', 't', 'h', 'e', 'g', 'r', 'o', 'u', 'n', 'd']


In [57]:
""" Extract each word using (4) *. Because * also accepts zero repetitions, empty matches are returned as well. """

result = re.findall(
    r'\w*',
    'John is playing in the ground.',
)

In [59]:
""" Print the result. The list contains every word plus empty strings '' (not spaces): '*' also allows a zero-length match wherever no word character is found. """

print(result)

['John', '', 'is', '', 'playing', '', 'in', '', 'the', '', 'ground', '', '']


In [62]:
""" Extract each word using (3) +, which needs at least one word character per match, so no empty strings are returned """

result = re.findall(
    r'\w+',
    'John is playing in the ground.',
)


In [68]:
""" Print the result. The result includes all words excluding spaces ' ' """

print(result)

['John', 'is', 'playing', 'in', 'the', 'ground']


In [69]:
""" Extract only the first word using (16) ^, which anchors the match to the start of the string """

result = re.findall(
    r'^\w+',
    'John is playing in the ground.',
)

In [70]:
""" Print the result """

print(result)

['John']


In [75]:
""" Try to extract the last word using (16) $, which anchors the match to the end of the string.
The sentence ends with a period, which is not a word character, so nothing matches and the list is empty. """

result = re.findall(
    r'\w+$',
    'John is playing in the ground.',
)

In [77]:
""" Print the result """

print(result)

[]


In [80]:
r""" Extract consecutive pairs of word characters, excluding spaces, using (5) \w.
(Raw docstring: in a normal string literal \w is an invalid escape sequence.) """

result = re.findall(r'\w\w', 'John is playing in the ground.')

In [81]:
""" Print the result """

print(result)

['Jo', 'hn', 'is', 'pl', 'ay', 'in', 'in', 'th', 'gr', 'ou', 'nd']


In [86]:
r""" Extract the first two characters of each word using the word boundary (11) \b.
(Raw docstring: in a normal string literal \b is the backspace character, not the two characters \ and b.) """

result = re.findall(r'\b\w\w', 'John is playing in the ground.')

In [88]:
""" Print the result """

print(result)

['Jo', 'is', 'pl', 'in', 'th', 'gr']


In [93]:
""" Extract email domains - the '@' sign plus the word characters that follow it (the match stops at the first '.') """

result = re.findall(
    r'@\w+',
    'john.paul@gmail.com, sam_223@yahoo.com, robin134@hotmail.com',
)

In [95]:
""" Print the result, note - .com has been excluded """

print(result)

['@gmail', '@yahoo', '@hotmail']


In [96]:
""" Extract email domains - all characters after '@' including .com """

# The dot is escaped so that it matches only a literal period; an unescaped '.'
# would match any character. This pattern covers domains with a single dot.
result = re.findall(r'@\w+\.\w+', 'john.paul@gmail.com, sam_223@yahoo.com, robin134@hotmail.com')

In [98]:
""" Print the result """

print(result)

['@gmail.com', '@yahoo.com', '@hotmail.com']


In [102]:
""" Extract only the top-level domain (the part after the dot, e.g. com) using (19) ()

When the pattern contains one group, findall returns the text captured by that group instead of the whole match. """

# The dot is escaped so that it matches only a literal period; an unescaped '.'
# would match any character.
result = re.findall(r'@\w+\.(\w+)', 'john.paul@gmail.com, sam_223@yahoo.com, robin134@hotmail.com')

In [105]:
""" Print the result """

print(result)

['com', 'com', 'com']


In [106]:
r""" Extract dates of the form dd-dd-dddd from the string using (7) \d & (17) {m}.
(Raw docstring: in a normal string literal \d is an invalid escape sequence.) """


result = re.findall(r'\d{2}-\d{2}-\d{4}', 'John 10001 12-12-1982, Sam 10002 08-08-2000, Robin 10003 10-10-2010')

In [107]:
""" Print the result """

print(result)

['12-12-1982', '08-08-2000', '10-10-2010']


In [108]:
""" Extract only the year of each date using (19) (): the group around the four-digit part is what findall returns """

result = re.findall(
    r'\d{2}-\d{2}-(\d{4})',
    'John 10001 12-12-1982, Sam 10002 08-08-2000, Robin 10003 10-10-2010',
)

In [109]:
""" Print the result """

print(result)

['1982', '2000', '2010']


In [110]:
""" Extract all words of a string that start with a vowel """

# '*' (zero or more) is used after the leading vowel instead of '+' so that
# one-letter words such as 'a' or 'I' are also found.
result = re.findall(r'\b[aeiouAEIOU]\w*', 'John is playing in the ground.')

In [111]:
""" Print the result """

print(result)

['is', 'in']


In [113]:
""" Validate 10-digit phone numbers that start with either 8 or 9 """

# Named phone_numbers rather than `list`, which would shadow the built-in list type
# for every cell that runs afterwards.
phone_numbers = ['9999999999', '999999-999', '99999x9999']

for val in phone_numbers:
    # fullmatch anchors the pattern at both ends of the string, so the ten-character
    # pattern needs no separate len() check.
    if re.fullmatch(r'[89][0-9]{9}', val):
        print('yes')
    else:
        print('no')
        
        
        

yes
no
no


In [117]:
""" Split a string on any of several delimiters: a whitespace character, a comma or a semicolon """

line = 'abcdef ghij,klmno;pqr st'

# The character class lists every accepted delimiter.
result = re.split(pattern=r'[\s,;]', string=line)

print(result)

['abcdef', 'ghij', 'klmno', 'pqr', 'st']


In [119]:
""" Replace every delimiter (whitespace character, comma or semicolon) with a single space ' ' """

line = 'abcdef ghij,klmno;pqr st'

# Same delimiter class as for splitting; each match is swapped for one space.
result = re.sub(pattern=r'[\s,;]', repl=' ', string=line)

print(result)

abcdef ghij klmno pqr st
