In [2]:
import regex as re

# character class , backslash, alteration, quantifiers, greedy and non greedy, boundary macthers, splitting, substitution

## match, search, findall, finditer

In [4]:
x = re.compile('apple')

In [5]:
x.match('apple is mine')

<regex.Match object; span=(0, 5), match='apple'>

In [8]:
x.search('apple is mine apple',pos=5)

<regex.Match object; span=(14, 19), match='apple'>

In [7]:
x.findall('apple is mine apple')

['apple', 'apple']

In [9]:
x.finditer('apple is mine')

<_regex.Scanner at 0x1936e898af0>

# Predefined Character Classes
There exist some predefined character classes which can be used as a shortcut for some frequently used classes.

Element	Description.    
.	This element matches any character except newline      
\d	This matches any decimal digit; this is equivalent to the class [0-9]           
\D	This matches any non-digit character; this is equivalent to the class [^0-9]            
\s	This matches any whitespace character; this is equivalent to the class [ \t\n\r\f\v]          
\S	This matches any non-whitespace character; this is equivalent to the class [^ \t\n\r\f\v]          
\w	This matches any alphanumeric character; this is equivalent to the class [a-zA-Z0-9_]            
\W	This matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_]

In [16]:
txt = """
Thw first season of Indian Premiere League (IPL) was played in 2008. 
The second season was played in 2009 in South Africa. 
Last season was played in 2018 and won by Chennai Super Kings (CSK).
CSK won the title in 2010 and 2011 as well.
Mumbai Indians (MI) has also won the title 3 times in 2013, 2015 and 2017.
"""

In [17]:
re.findall('\d\d\d\d',txt)

['2008', '2009', '2018', '2010', '2011', '2013', '2015', '2017']

In [18]:
re.findall('Th[e|w]',txt)

['Thw', 'The']

# backslash and alteration


In [19]:
txt1 = """
C:\Windows
C:\Python
C:\Windows\System32
"""

In [21]:
re.escape('C:\Windows\System32')

'C:\\\\Windows\\\\System32'

In [23]:
re.findall('C:\\\\Windows\\\\System32',txt1)

['C:\\Windows\\System32']

In [25]:
re.findall('and|or|the',txt)

['and', 'the', 'and', 'the', 'and']

# Quantifiers

In [104]:
'''Quantifiers are the mechanisms to define how a character, metacharacter, or character set can be repeated.       

Here is the list of 4 basic quantifers:          
 
Symbol   	 Name	         Quantification of previous character           
?	       Question Mark	  Optional (0 or 1 repetitions)         
*	      Asterisk      	  Zero or more times          
+	      Plus Sign	           One or more times           
{n,m}     Curly Braces	       Between n and m times  '''           

'Quantifiers are the mechanisms to define how a character, metacharacter, or character set can be repeated.       \n\nHere is the list of 4 basic quantifers:          \n \nSymbol   \t Name\t         Quantification of previous character           \n?\t       Question Mark\t  Optional (0 or 1 repetitions)         \n*\t      Asterisk      \t  Zero or more times          \n+\t      Plus Sign\t           One or more times           \n{n,m}     Curly Braces\t       Between n and m times  '

In [None]:
'''We can use the curly brackets syntax here with these modifications:

Syntax	Description
{n}	The previous character is repeated exactly n times.
{n,}	The previous character is repeated at least n times.
{,n}	The previous character is repeated at most n times.
{n,m}	The previous character is repeated between n and m times (both inclusive).'''

In [26]:
txt2 = """
I have 2 dogs. One dog is 1 year old and other one is 2 years old. Both dogs are very cute! 
"""

In [27]:
re.findall('dogs?',txt2)

['dogs', 'dog', 'dogs']

Example 2
Find all filenames starting with file and ending with .txt in the given text.

In [28]:
txt = """
file1.txt
file_one.txt
file.txt
fil.txt
file.xml
file-1.txt
"""

In [33]:
re.findall('file\d?\.\w*',txt)

['file1.txt', 'file.txt', 'file.xml']

Example 4
Find years in the given text.

In [35]:
txt = """
The first season of Indian Premiere League (IPL) was played in 2008. 
The second season was played in 2009 in South Africa. 
Last season was played in 2018 and won by Chennai Super Kings (CSK).
CSK won the title in 2010 and 2011 as well.
Mumbai Indians (MI) has also won the title 3 times in 2013, 2015 and 2017.
"""

In [38]:
re.findall('\d{4}',txt)

['2008', '2009', '2018', '2010', '2011', '2013', '2015', '2017']

Example 6
Write a pattern to validate telephone numbers.

Telephone numbers can be of the form: 555-555-5555, 555 555 5555, 5555555555

In [39]:
txt = '555-555-5555, 555 555 5555, 5555555555'

In [46]:
re.findall('\d{3}[-?|\s?|\d]?\d{3}[-?|\s?|\d]?\d{3}',txt)

['555-555-555', '555 555 555', '5555555555']

# Non-Greedy behaviour

In [None]:
'''
The non-greedy (or reluctant) behaviour can be requested by adding an extra question mark to the quantifier.

For example, ??, *? or +?.'''

In [49]:
re.findall('\d.*?\d',txt)

['55',
 '5-5',
 '55',
 '55',
 '55',
 '55',
 '5 5',
 '55',
 '55',
 '55',
 '55',
 '55',
 '55',
 '55',
 '55']

# Greedy behaviour

In [50]:
re.findall('\d.*\d',txt)

['555-555-5555, 555 555 5555, 5555555555']

# boundary matchers

In [None]:
'''Here is a table which shows the list of all boundary matchers available in Python:

Matcher	Description
^	Matches at the beginning of a line
$	Matches at the end of a line
\b	Matches a word boundary
\B	Matches the opposite of \b. Anything that is not a word boundary
\A	Matches the beginning of the input
\Z	Matches the end of the input'''

In [58]:
txt = """
Name:
Age: 0
Roll No.: 15
Grade: S

Name: Ravi
Age: -1
Roll No.: 123 Name: ABC
Grade: K

Name: Ram
Age: N/A
Roll No.: 1
Grade: G
"""

Example 1
Consider a scenario where we want to find all the lines in the given text which start with the pattern Name:.

In [61]:
re.findall('^N.*$',txt,flags=re.M)

['Name:', 'Name: Ravi', 'Name: Ram']

Example 2
Find all the sentences which do not end with a full stop (.) in the given text.

In [63]:
txt = """
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s!
It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages
More recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."""

In [76]:
re.findall('^.*[^\.]$',txt,flags=re.M)

["Lorem Ipsum has been the industry's standard dummy text ever since the 1500s!",
 'It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages']

# Split using RegEx

In [77]:
txt

"\nLorem Ipsum is simply dummy text of the printing and typesetting industry.\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s!\nIt has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.\nIt was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages\nMore recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."

In [79]:
x = re.compile('is')

In [80]:
x.split(txt)

['\nLorem Ipsum ',
 " simply dummy text of the printing and typesetting industry.\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s!\nIt has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.\nIt was popular",
 'ed in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages\nMore recently with desktop publ',
 'hing software like Aldus PageMaker including versions of Lorem Ipsum.']

In [83]:
re.split('\.',txt)

['\nLorem Ipsum is simply dummy text of the printing and typesetting industry',
 "\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s!\nIt has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged",
 '\nIt was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages\nMore recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum',
 '']

In [86]:
re.split(' ',txt)

['\nLorem',
 'Ipsum',
 'is',
 'simply',
 'dummy',
 'text',
 'of',
 'the',
 'printing',
 'and',
 'typesetting',
 'industry.\nLorem',
 'Ipsum',
 'has',
 'been',
 'the',
 "industry's",
 'standard',
 'dummy',
 'text',
 'ever',
 'since',
 'the',
 '1500s!\nIt',
 'has',
 'survived',
 'not',
 'only',
 'five',
 'centuries,',
 'but',
 'also',
 'the',
 'leap',
 'into',
 'electronic',
 'typesetting,',
 'remaining',
 'essentially',
 'unchanged.\nIt',
 'was',
 'popularised',
 'in',
 'the',
 '1960s',
 'with',
 'the',
 'release',
 'of',
 'Letraset',
 'sheets',
 'containing',
 'Lorem',
 'Ipsum',
 'passages\nMore',
 'recently',
 'with',
 'desktop',
 'publishing',
 'software',
 'like',
 'Aldus',
 'PageMaker',
 'including',
 'versions',
 'of',
 'Lorem',
 'Ipsum.']

# Substitution

In [96]:
x = re.compile('\d{2}')
x

regex.Regex('\\d{2}', flags=regex.V0)

In [102]:
x.subn('\%',txt)

("\nLorem Ipsum is simply dummy text of the printing and typesetting industry.\nLorem Ipsum has been the industry's standard dummy text ever since the \\%\\%s!\nIt has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.\nIt was popularised in the \\%\\%s with the release of Letraset sheets containing Lorem Ipsum passages\nMore recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.",
 4)