In [2]:
import re

# Specify pattern using regex

To specify regular expressions, metacharacters are used. In the above example, **^** and **$** are metacharacters.

## Meta-characters

Metacharacters are characters that are interpreted in a special way by a RegEx engine. Here's a list of metacharacters:

**`[] . ^ $ * + ? {} () \ |`**

### [] - Square brackets

Square brackets specify a set of characters you wish to match

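**Note: Inside square brackets, most metacharacters (such as +, $, *) lose their special meaning and are matched literally. The exceptions are `^` as the first character (complement), `-` between two characters (range), `]` and `\`.**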

In [11]:
txt = "a"
pattern = "[a]"
re.match(pattern, txt)

<re.Match object; span=(0, 1), match='a'>

In [13]:
txt = "a"
pattern = "[abc]"
re.match(pattern, txt)

<re.Match object; span=(0, 1), match='a'>

In [16]:
txt = "ac"
pattern = "[abc]"
re.findall(pattern, txt)

['a', 'c']

In [18]:
txt = "hey anav"
pattern = "[abc]"
re.findall(pattern, txt)

['a', 'a']

In [21]:
txt = "hey ANAV"
pattern = "[abc]"
re.findall(pattern, txt)

[]

In [31]:
txt = "hey qwertyuiopasdfghjklzxcvbnm767543543657687:,><>/>.><6"
pattern = "[a-z]"
re.findall(pattern, txt)

['h',
 'e',
 'y',
 'q',
 'w',
 'e',
 'r',
 't',
 'y',
 'u',
 'i',
 'o',
 'p',
 'a',
 's',
 'd',
 'f',
 'g',
 'h',
 'j',
 'k',
 'l',
 'z',
 'x',
 'c',
 'v',
 'b',
 'n',
 'm']

In [36]:
txt = "hey qweryuklzxcv3657687:,>.><6"
pattern = "[0-9][0-9]"
re.findall(pattern, txt)

['36', '57', '68']

In [42]:
# Match a 4-letter user name, a 5-letter host and a 3-letter top-level domain.
# The dot is wrapped in a character set so it is matched literally; a bare "."
# would match ANY character (e.g. "anav@gmailXcom").
txt = "anav@gmail.com"
pattern = "[a-z][a-z][a-z][a-z]@[a-z][a-z][a-z][a-z][a-z][.][a-z][a-z][a-z]"
re.findall(pattern, txt)

['anav@gmail.com']

In [43]:
txt = "hey qweryuklzxcv3657687:,>.><6"
pattern = "[^0-9]" # a caret as the first character complements the character set
re.findall(pattern, txt)

['h',
 'e',
 'y',
 ' ',
 'q',
 'w',
 'e',
 'r',
 'y',
 'u',
 'k',
 'l',
 'z',
 'x',
 'c',
 'v',
 ':',
 ',',
 '>',
 '.',
 '>',
 '<']

In [100]:
# Inside a character set these metacharacters are matched literally.
# Raw string: "\/" is not a valid Python string escape, so the pattern is
# written with r"..." to pass the backslash to the regex engine unchanged.
txt = "123a+aaw+qe+dqw+"
pattern = r"[.*^{}\/+]"
re.findall(pattern, txt)

['+', '+', '+', '+']

### . - Period

A period matches any single character (except newline "\n")

In [48]:
txt = "hello anav"
pattern = "hello....."
re.findall(pattern, txt)

['hello anav']

### ^ - Caret

The caret symbol is used to check if a string starts with a certain character

In [51]:
txt = "Roll no: 21"
pattern = "^Ro"
re.findall(pattern, txt)

['Ro']

In [54]:
txt = "Roll no: 21"
pattern = "^Roll no"
if re.findall(pattern, txt):
    print("Pattern found")
else:
    print("Pattern not found")

Pattern found


### $ - Dollar

The dollar symbol is used to check if a string ends with a certain character

In [56]:
txt = "Roll no: 21"
pattern = "21$"
re.findall(pattern, txt)

['21']

In [60]:
txt = "Roll no: 78"
pattern = "[0-9][0-9]$"
re.findall(pattern, txt)

['78']

In [62]:
txt = "Roll no: 788"
pattern = "[0-9][0-9]$"
re.findall(pattern, txt)

['88']

In [64]:
txt = "Roll no: 8"
pattern = "[0-9][0-9]$"
re.findall(pattern, txt)

[]

In [19]:
txt = "Roll no: 854654"
pattern = "[0-9]+$"
re.findall(pattern, txt)

['854654']

### * - Asterisk

The * symbol matches 0 or more occurrences of the pattern to the left of it.

In [11]:
txt = "man"
pattern = "ma*n"
re.findall(pattern, txt)

['man']

In [12]:
txt = "maaaaaaan" # more than one occurence
pattern = "ma*n"
re.findall(pattern, txt)

['maaaaaaan']

In [14]:
txt = "mn" # 0 occurence
pattern = "ma*n"
re.findall(pattern, txt)

['mn']

In [17]:
txt = "ma"
pattern = "ma*n"
re.findall(pattern, txt)

[]

### + - Plus

The + symbol matches one or more occurrences of the pattern to the left of it.

In [20]:
txt = "man"
pattern = "ma+n"
re.findall(pattern, txt)

['man']

In [23]:
txt = "maaaaan"
pattern = "ma+n"
re.findall(pattern, txt)

['maaaaan']

In [24]:
txt = "mn"
pattern = "ma+n"
re.findall(pattern, txt)

[]

### ? - Question mark

The ? symbol matches 0 or 1 occurrences of the pattern to the left of it.

In [25]:
txt = "man"
pattern = "ma?n"
re.findall(pattern, txt)

['man']

In [28]:
txt = "maan"
pattern = "ma?n"
re.findall(pattern, txt)

[]

In [29]:
txt = "mn"
pattern = "ma?n"
re.findall(pattern, txt)

['mn']

### {} - Braces

Consider this code: {n,m}. This means at least n and at most m repetitions of the pattern to the left of it.

In [30]:
# Select first 5 numbers
txt = "324435"
pattern = "[0-9][0-9][0-9][0-9][0-9]"
re.findall(pattern, txt)

['32443']

In [31]:
# Select first 5 numbers
txt = "324435"
pattern = "[0-9]{5}"
re.findall(pattern, txt)

['32443']

In [34]:
# Match runs of 0 to 5 digits; because 0 repetitions are allowed,
# findall also reports an empty match at the end of the string
txt = "324435"
pattern = "[0-9]{0,5}"
re.findall(pattern, txt)

['32443', '5', '']

In [35]:
# Match runs of 1 to 5 digits (greedy: takes 5 digits first, then the rest)
txt = "324435"
pattern = "[0-9]{1,5}"
re.findall(pattern, txt)

['32443', '5']

In [39]:
# Match runs of 1 to 5 digits; a longer digit string is split into chunks of at most 5
txt = "32443544366"
pattern = "[0-9]{1,5}"
re.findall(pattern, txt)

['32443', '54436', '6']

In [50]:
# Select phone number: "+", 2-digit country code, optional "-", 10 digits.
# Raw string so that "\+" reaches the regex engine as an escaped literal plus
# ("\+" is not a valid Python string escape).
txt = "rgegergthrsgth +91-9911515109 rehaehres"
pattern = r"\+[0-9]{2}-?[0-9]{10}"
re.findall(pattern, txt)

['+91-9911515109']

In [52]:
# Select phone number whose 10 digits may be split 5 + 5 by an optional space.
# Raw string so that "\+" is passed to the regex engine unchanged.
txt = "rgegergthrsgth +91-99115 15109 rehaehres"
pattern = r"\+[0-9]{2}-?[0-9]{5} ?[0-9]{5}"
re.findall(pattern, txt)

['+91-99115 15109']

In [55]:
# Same pattern as for the space-separated number: because the space is
# optional (" ?"), it also matches the number written without a space.
# Raw string so that "\+" is passed to the regex engine unchanged.
txt = "rgegergthrsgth +91-9911515109 rehaehres"
pattern = r"\+[0-9]{2}-?[0-9]{5} ?[0-9]{5}"
re.findall(pattern, txt)

['+91-9911515109']

In [7]:
txt = "324435 12321432 32 4 234"
pattern = "[0-9 ]{3,}"
re.findall(pattern, txt)

['324435 12321432 32 4 234']

In [19]:
txt = "123aaa"
pattern = "[0-9]a{3,}"
re.findall(pattern, txt)

['3aaa']

In [38]:
txt = "abc23132432dfasf"
pattern = "[0-9 a-z]{3,}"
re.findall(pattern, txt)

['abc23132432dfasf']

In [44]:
txt = "123aa.awqedqw"
pattern = ".{10}"
re.findall(pattern, txt)

['123aa.awqe']

In [65]:
# Braces are matched literally inside a character set.
# Raw string: "\/" is not a valid Python string escape.
txt = "123aaaw}qe{{dq}w"
pattern = r"[.*^{}\/+]"
re.findall(pattern, txt)

['}', '{', '{', '}']

In [97]:
# The text contains two literal backslashes and a trailing newline.
# Each backslash is written as "\\" because "\q" is not a valid string escape.
txt = """123aaw\\qed\\qw
"""
# In the raw pattern, "\\" matches one literal backslash and "\n" a newline.
pattern = r"[.*\\^{}\n]"
re.findall(pattern, txt)

['\\', '\\', '\n']

### | - Alternation

In [7]:
txt = "a"
pattern = "a|c"
re.findall(pattern, txt)

['a']

In [9]:
txt = "d"
pattern = "a|c"
re.findall(pattern, txt)

[]

In [17]:
# match either 5 digits or 5 lowercase letters
# ([0-9] rather than [1-9], so that digit runs containing 0 also match)
txt = "qwdwqfsdf"
pattern = "[0-9]{5}|[a-z]{5}"
re.findall(pattern, txt)

['qwdwq']

In [21]:
# match either 5 digits or 5 lowercase letters
# ([0-9] rather than [1-9], so that digit runs containing 0 also match)
txt = "132552132"
pattern = "[0-9]{5}|[a-z]{5}"
re.findall(pattern, txt)

['13255']

In [27]:
txt = "anav@gmail.com"
pattern = "[a-z_0-9]+@[a-z]+[.][a-z]+"
re.findall(pattern, txt)

['anav@gmail.com']

In [37]:
txt = "anav@gmail.net"
pattern = "[a-z_0-9]+@[a-z]+[.]in|[a-z_0-9]+@[a-z]+[.]com|[a-z_0-9]+@[a-z]+[.]net" # With domain names
re.findall(pattern, txt)

['anav@gmail.net']

### () - Group

Parentheses () are used to group sub-patterns.

In [61]:
txt = "ab xz"
pattern = "(a|b|c)xz"
re.search(pattern, txt)

In [62]:
txt = "abxz"
pattern = "(a|b|c)xz"
re.search(pattern, txt)

<re.Match object; span=(1, 4), match='bxz'>

In [63]:
txt = "axz cabxz"
pattern = "(a|b|c)xz"
re.search(pattern, txt)

<re.Match object; span=(0, 3), match='axz'>

**(a|b|c)xz** matches any string that matches either **a** or **b** or **c** followed by **xz**

In [68]:
txt = "anav@gmail.in"
pattern = "[a-z_0-9]+@[a-z]+[.](net|com|in)"
re.search(pattern, txt)

<re.Match object; span=(0, 13), match='anav@gmail.in'>

In [78]:
txt = "axz cabxz"
pattern = "(a|b|c)xz"
re.search(pattern, txt)

<re.Match object; span=(0, 3), match='axz'>

## Special sequences

Special sequences make commonly used patterns easier to write. Here's a list of special sequences:

**`\A \b \B \d \D \s \S \w \W \Z`**

### \A

Matches if the specified characters are at the start of the string, similar to ^

In [7]:
# \A anchors the match at the very start of the string.
# Raw string: "\A" is not a valid Python string escape.
txt = "the sun"
pattern = r"\Athe"
re.findall(pattern, txt)

['the']

### \b

Matches if the specified characters are at the beginning or at the end of the **word**

In [28]:
txt = "football"
pattern = r"\bfoo"
re.findall(pattern, txt)

['foo']

In [29]:
txt = "a football"
pattern = r"\bfoo"
re.findall(pattern, txt)

['foo']

In [32]:
txt = "this is a football 23"
pattern = r"\bfoo"
re.findall(pattern, txt)

['foo']

In [36]:
txt = "this is afootball 23"
pattern = r"\bfoo"
re.findall(pattern, txt)

[]

In [44]:
txt = "this is a football 23"
pattern = r"ball\b"
re.findall(pattern, txt)

['ball']

### \B

Opposite of \b. Matches if the specified characters are not at the beginning or end of a word

In [54]:
txt = "football"
pattern = r"\Bfoo"
re.findall(pattern, txt)

[]

In [58]:
txt = "afootball"
pattern = r"\Bfoo"
re.search(pattern, txt)

<re.Match object; span=(1, 4), match='foo'>

### \d

Equivalent of [0-9], matches any digit

In [64]:
# The literal backslash in the text is written as "\\" ("\}" is not a valid string escape).
txt = "012321fwd432ab<>\\}:"
pattern = r"[0-9]"
re.findall(pattern, txt)

['0', '1', '2', '3', '2', '1', '4', '3', '2']

In [62]:
# The literal backslash in the text is written as "\\" ("\}" is not a valid string escape).
txt = "012321fwd432ab<>\\}:"
pattern = r"\d"
re.findall(pattern, txt)

['0', '1', '2', '3', '2', '1', '4', '3', '2']

### \D

Equivalent of [^0-9], matches any character that is not a decimal digit

In [65]:
# The literal backslash in the text is written as "\\" ("\}" is not a valid string escape).
txt = "012321fwd432ab<>\\}:"
pattern = r"[^0-9]"
re.findall(pattern, txt)

['f', 'w', 'd', 'a', 'b', '<', '>', '\\', '}', ':']

In [66]:
# The literal backslash in the text is written as "\\" ("\}" is not a valid string escape).
txt = "012321fwd432ab<>\\}:"
pattern = r"\D"
re.findall(pattern, txt)

['f', 'w', 'd', 'a', 'b', '<', '>', '\\', '}', ':']

### \s - small s

Matches where a string contains any whitespace character, equivalent to [ \n\r\v\f\t]

list

In [68]:
# No whitespace in this text, so the result is an empty list.
# The literal backslash is written as "\\" ("\}" is not a valid string escape).
txt = "012321fwd432ab<>\\}:"
pattern = r"\s"
re.findall(pattern, txt)

[]

In [72]:
txt = """Hello My name 
 is anav\t,   """
pattern = r"\s"
re.findall(pattern, txt)

[' ', ' ', ' ', '\n', ' ', ' ', '\t', ' ', ' ', ' ']

### \S

Matches any character that is not a whitespace character, equivalent to [^ \n\r\v\f\t]

In [75]:
# \r, \v, \n and \t are real whitespace escapes; the lone backslash before the
# space is written as "\\" because "\ " is not a valid string escape.
txt = "12 31w \rd \v42b\n <>\\ \t}"
pattern = r"\S"
re.findall(pattern, txt)

['1', '2', '3', '1', 'w', 'd', '4', '2', 'b', '<', '>', '\\', '}']

### \w

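Matches any word character (letters, digits and the underscore). For ASCII text this is equivalent to [a-zA-Z0-9_]; with `str` patterns it also matches Unicode letters and digits unless the `re.ASCII` flag is used.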

*Note: Underscore is also considered as an alphanumeric character*

In [79]:
# The literal backslash in the text is written as "\\" ("\}" is not a valid string escape).
txt = "012321 f>wd4_32ab <>\\}:"
pattern = r"[a-zA-Z0-9_]"
re.findall(pattern, txt)

['0', '1', '2', '3', '2', '1', 'f', 'w', 'd', '4', '_', '3', '2', 'a', 'b']

In [77]:
# The literal backslash in the text is written as "\\" ("\}" is not a valid string escape).
txt = "012321 f>wd4_32ab <>\\}:"
pattern = r"\w"
re.findall(pattern, txt)

['0', '1', '2', '3', '2', '1', 'f', 'w', 'd', '4', '_', '3', '2', 'a', 'b']

In [90]:
txt = "anav@gmail.in"
pattern = r"\w+@[a-z]+[.](net|com|in)"
re.search(pattern, txt)

<re.Match object; span=(0, 13), match='anav@gmail.in'>

### \W

Matches any non alphanumeric character, equivalent to [^a-zA-Z0-9_]

In [80]:
# The literal backslash in the text is written as "\\" ("\}" is not a valid string escape).
txt = "012321 f>wd4_32ab <>\\}:"
pattern = r"[^a-zA-Z0-9_]"
re.findall(pattern, txt)

[' ', '>', ' ', '<', '>', '\\', '}', ':']

In [78]:
# The literal backslash in the text is written as "\\" ("\}" is not a valid string escape).
txt = "012321 f>wd4_32ab <>\\}:"
pattern = r"\W"
re.findall(pattern, txt)

[' ', '>', ' ', '<', '>', '\\', '}', ':']

### \Z

Matches if the specified characters are at the end of the string, similar to $

In [83]:
# \Z anchors the match at the very end of the string.
# Raw string: "\Z" is not a valid Python string escape.
txt = "the sun"
pattern = r"sun\Z"
re.findall(pattern, txt)

['sun']

## Python Regex (functions)

### findall

The re.findall method returns a list of strings containing all matches, if a pattern is not found re.findall returns an empty list.

In [92]:
txt = "ab 123 4 567 cd"
pattern = r"\d+"
re.findall(pattern, txt)

['123', '4', '567']

In [93]:
txt = "ab efghijk cd"
pattern = r"\d+"
re.findall(pattern, txt)

[]

### split

The re.split method splits the string where there is a match and returns a list of strings where the splits have occurred. If the pattern is not found, re.split returns a list containing the original string.

In [96]:
txt = "abcsplit_herecdesplit_hereefg"
pattern = r"split_here"
re.split(pattern, txt)

['abc', 'cde', 'efg']

In [94]:
txt = "ab efghijk cd"
pattern = r"\d+"
re.split(pattern, txt)

['ab efghijk cd']

You can pass the maxsplit argument to the re.split method. It's the maximum number of splits that will happen. The default is 0, meaning all possible splits.

In [98]:
txt = "abcsplit_herecdesplit_hereefg"
pattern = r"split_here"
re.split(pattern, txt, maxsplit=2)

['abc', 'cde', 'efg']

In [101]:
txt = "abcsplit_herecdesplit_hereefg"
pattern = r"split_here"
re.split(pattern, txt, maxsplit=1)

['abc', 'cdesplit_hereefg']

### sub

The re.sub method returns a string where matched occurrences are replaced with the content of the "replace" variable. If the pattern is not found, re.sub returns the original string.

Syntax of re.sub

`re.sub(pattern, replace, string)`

In [108]:
txt = "abcreplace_herecdereplace_hereefg"
pattern = r"replace_here"
re.sub(pattern, "hello", txt)

'abchellocdehelloefg'

You can pass the count argument to the re.sub method. It's the maximum number of substitutions that will happen. The default is 0, meaning all possible substitutions.

In [109]:
txt = "abcreplace_herecdereplace_hereefg"
pattern = r"replace_here"
re.sub(pattern, "hello", txt, count=1)

'abchellocdereplace_hereefg'

### subn

The `re.subn()` is similar to `re.sub()` except it returns a tuple of 2 items containing the new string and the number of substitutions made

In [4]:
# Program to remove all whitespaces

# The trailing backslash is a line continuation, so NO newline is inserted
# between "12" and "de"; the only newline is the explicit \n escape.
string = 'abc 12\
de 23 \n f45 6'

# matches every run of one or more whitespace characters
# (raw string: "\s" is not a valid Python string escape)
pattern = r'\s+'

# empty string
replace = ''

# re.subn returns a tuple: (new string, number of substitutions made)
new_string = re.subn(pattern, replace, string)
print(new_string)

('abc12de23f456', 4)


In [11]:
# Program to remove all whitespaces

# multiline string: the triple-quoted literal contains a real newline after
# "12", in addition to the explicit \n escape
string = """abc 12
de 23 \n f45 6"""

# matches every run of one or more whitespace characters
# (raw string: "\s" is not a valid Python string escape)
pattern = r'\s+'

# empty string
replace = ''

# re.subn returns a tuple: (new string, number of substitutions made)
new_string = re.subn(pattern, replace, string)
print(new_string)

('abc12de23f456', 5)


### search

The `re.search()` method takes two arguments: a pattern and a string. The method looks for the first location where the RegEx pattern produces a match with the string.

If the search is successful, `re.search()` returns a match object; if not, it returns None.

`match = re.search(pattern, str)`

In [13]:
# Search for "python" anchored at the start of the string.
# Raw string: "\A" is not a valid Python string escape.
txt = "python is a programming language"
pattern = r"\Apython"

re.search(pattern, txt)

<re.Match object; span=(0, 6), match='python'>

### match object

You can get methods and attributes of a match object using `dir()` function.

Some of the commonly used methods and attributes of match objects are:

#### match.group

The `group()` method returns the part of the string where there is a match.

In [27]:
string = '39801 356, 2102 1111'

# Three digit number followed by space followed by two digit number.
# Each part is wrapped in parentheses so it can be retrieved as a group.
# Raw string: "\d" is not a valid Python string escape.
pattern = r'(\d{3}) (\d{2})'

# match variable contains a Match object (or None if nothing matched).
match = re.search(pattern, string)
match.group()

'801 35'

In [34]:
print(match.group(1))

print(match.group(2))

print(match.group(1, 2))

print(match.groups())

801
35
('801', '35')
('801', '35')


#### **match.start()**, **match.end()** and **match.span()**



The **start()** function returns the index of the start of the matched substring. Similarly, **end()** returns the end index of the matched substring.

In [35]:
print(match.start())
print(match.end())

2
8


The **span()** function returns a tuple containing start and end index of the matched part.

In [39]:
match.span()

(2, 8)

#### **match.re** and **match.string**

The **re** attribute of a matched object returns a regular expression object. Similarly, **string** attribute returns the passed string.

In [41]:
print(match.re)
print(match.string)

re.compile('(\\d{3}) (\\d{2})')
39801 356, 2102 1111


In [43]:
dir(match)

['__class__',
 '__class_getitem__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'end',
 'endpos',
 'expand',
 'group',
 'groupdict',
 'groups',
 'lastgroup',
 'lastindex',
 'pos',
 're',
 'regs',
 'span',
 'start',
 'string']

### options

The re functions take options to modify the behavior of the pattern match. The option flag is added as an extra argument to the `search()` or `findall()` etc., 

e.g. `re.search(pat, str, re.IGNORECASE)`.

- IGNORECASE -- ignore upper/lowercase differences for matching, so 'a' matches both 'a' and 'A'.

- DOTALL -- allow dot (.) to match newline -- normally it matches anything but newline. This can trip you up -- you think `.*` matches everything, but by default it does not go past the end of a line. Note that `\s` (whitespace) includes newlines, so if you want to match a run of whitespace that may include a newline, you can just use `\s*`

- MULTILINE -- Within a string made of many lines, allow `^` and `$` to match the start and end of each line. Normally `^/$` would just match the start and end of the whole string.

## Examples

In [46]:
txt = "piiiiiig"
pattern = "pi+"

re.findall(pattern, txt)

['piiiiii']

In [47]:
txt = "piiiiiig"
pattern = "i+"

re.findall(pattern, txt)

['iiiiii']

In [60]:
txt = "xx1  2   3xx"
txt1 = "xx123xx"
txt2 = "'xx12  3xx"
txt3 = "x1\r2  \n3xx"
pattern = r"\d\s*\d\s*\d"

print(re.findall(pattern, txt))
print(re.findall(pattern, txt1))
print(re.findall(pattern, txt2))
print(re.findall(pattern, txt3))

['1  2   3']
['123']
['12  3']
['1\r2  \n3']


In [73]:
# Find an address at one of the listed providers.
# Raw string so "\w" is passed through unchanged, and the dot before "com" is
# escaped so it matches a literal "." rather than any character.
txt = "purple alice-b@google.com monkey dishwasher"
pattern = r"[\w-]+@(gmail|google|yahoo)\.com"

re.search(pattern, txt)

<re.Match object; span=(7, 25), match='alice-b@google.com'>

In [77]:
# The text is stored in `txt`; naming it `str` would shadow the builtin type
# for every cell that runs afterwards.
txt = 'purple alice-b@google.com monkey dishwasher'
match = re.search(r'([\w.-]+)@([\w.-]+)', txt)
if match:
    print(match.group())   ## 'alice-b@google.com' (the whole match)
    print(match.group(1))  ## 'alice-b' (the username, group 1)
    print(match.group(2))  ## 'google.com' (the host, group 2)

alice-b@google.com
alice-b
google.com


In [79]:
## Suppose we have a text with many email addresses
## (stored in `txt`; naming it `str` would shadow the builtin type)
txt = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'

## Here re.findall() returns a list of all the found email strings
emails = re.findall(r'[\w\.-]+@[\w\.-]+', txt) ## ['alice@google.com', 'bob@abc.com']
for email in emails:
    # do something with each found email string
    print(email)

## With capture groups in the pattern, re.findall() returns one tuple of
## group values per match instead of the whole matched string
tuples = re.findall(r'([\w\.-]+)@([\w\.-]+)', txt)
print(tuples)  ## [('alice', 'google.com'), ('bob', 'abc.com')]
for username, host in tuples:  # unpack instead of shadowing the builtin `tuple`
    print(username)
    print(host)

alice@google.com
bob@abc.com
[('alice', 'google.com'), ('bob', 'abc.com')]
alice
google.com
bob
abc.com


In [82]:
txt = "piiiiiig"
pattern = "pi+"

print(re.findall(pattern, txt))
re.search(pattern, txt)

['piiiiii']


<re.Match object; span=(0, 7), match='piiiiii'>