## Regexp Python

In [1]:
import re

In [84]:
text = """
Hello this is a quick walk through on regular expression in python.
123.456.7897
123*456*7897
123-456-7897
900-567-8909
800-568-1567
+27 652 305 879
+263 568 189 1899
+263-568-189-1899
+263_568_189_1899
+188 789 089 7816
*188 789 089 7816
https://www.google.com
https://whatsapp.com
http://localhost.edu
https://zero-5.li
sizaro_56@gmail.com
zakatMario@yahoo.com
faraicup@word.org
faraio@green.edu
Mr John
Mr. Petter
Mrs T
Ms Gonorio
Mrs Makosi
"""
sent = "Start working with regular expression you will enjoy these."

> We are going to use the above text to match regexp patterns

In [11]:
pattern = re.compile(r'^start', re.I)
matches = re.finditer(pattern, sent)
for match in matches:
    print(match)

<re.Match object; span=(0, 5), match='Start'>


In [14]:
pattern = re.compile(r'these.$', re.I)
matches = re.finditer(pattern, sent)
for match in matches:
    print(match)

<re.Match object; span=(53, 59), match='these.'>


In [17]:
pattern = re.compile(r'^[start|Start]')
matches = re.finditer(pattern, sent)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='S'>


In [24]:
pattern = re.compile(r't{2,3}')
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(135, 137), match='tt'>
<re.Match object; span=(158, 160), match='tt'>
<re.Match object; span=(179, 181), match='tt'>
<re.Match object; span=(200, 202), match='tt'>
<re.Match object; span=(307, 309), match='tt'>


In [25]:
pattern = re.compile(r't{2,3}?')
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(135, 137), match='tt'>
<re.Match object; span=(158, 160), match='tt'>
<re.Match object; span=(179, 181), match='tt'>
<re.Match object; span=(200, 202), match='tt'>
<re.Match object; span=(307, 309), match='tt'>


In [27]:
pattern = re.compile(r'\bMr\b')
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(293, 295), match='Mr'>
<re.Match object; span=(301, 303), match='Mr'>


In [31]:
pattern = re.compile(r'\bMr\B')
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(312, 314), match='Mr'>
<re.Match object; span=(329, 331), match='Mr'>


In [48]:
pattern = re.compile(r'\d{3}-[1-8]{3}.\d{4}')
matches = re.finditer(pattern, text)
for match in matches:
    print(match) # Matches phone number of form ddd-ddd-dddd

<re.Match object; span=(95, 107), match='123-456-7897'>
<re.Match object; span=(108, 120), match='900-567-8909'>
<re.Match object; span=(121, 133), match='800-568-1567'>


#### Names 
> Let's create a pattert that matches names such as Mr ...

In [110]:
matches = re.finditer(r'^M(r)?s?.?\s\w+', text, re.MULTILINE | re.I)

for match in matches:
    start, stop = match.span()
    print(match, ", ",text[start: stop] )

<re.Match object; span=(399, 406), match='Mr John'> ,  Mr John
<re.Match object; span=(407, 417), match='Mr. Petter'> ,  Mr. Petter
<re.Match object; span=(418, 423), match='Mrs T'> ,  Mrs T
<re.Match object; span=(424, 434), match='Ms Gonorio'> ,  Ms Gonorio
<re.Match object; span=(435, 445), match='Mrs Makosi'> ,  Mrs Makosi


#### Phone numbers 
> Let's create a pattert that matches phone numbers

In [111]:
pattern = re.compile(r'[+*.]?\d{2,3}.[0-9]{3}.[0-9]{3,4}')

matches = re.finditer(pattern, text)

for match in matches:
    start, stop = match.span()
    print(match, ", ",text[start: stop] )


<re.Match object; span=(69, 81), match='123.456.7897'> ,  123.456.7897
<re.Match object; span=(82, 94), match='123*456*7897'> ,  123*456*7897
<re.Match object; span=(95, 107), match='123-456-7897'> ,  123-456-7897
<re.Match object; span=(108, 120), match='900-567-8909'> ,  900-567-8909
<re.Match object; span=(121, 133), match='800-568-1567'> ,  800-568-1567
<re.Match object; span=(134, 145), match='+27 652 305'> ,  +27 652 305
<re.Match object; span=(150, 162), match='+263 568 189'> ,  +263 568 189
<re.Match object; span=(168, 180), match='+263-568-189'> ,  +263-568-189
<re.Match object; span=(186, 198), match='+263_568_189'> ,  +263_568_189
<re.Match object; span=(204, 216), match='+188 789 089'> ,  +188 789 089
<re.Match object; span=(222, 234), match='*188 789 089'> ,  *188 789 089


#### Emails
> Let's create a pattert that matches emails

In [127]:
matches = re.finditer(r'([a-zA-Z0-9_])+@[a-zA-Z]+\.\w+', text)
for match in matches:
    start, stop = match.span()
    print(match, ", ",text[start: stop] )

<re.Match object; span=(323, 342), match='sizaro_56@gmail.com'> ,  sizaro_56@gmail.com
<re.Match object; span=(343, 363), match='zakatMario@yahoo.com'> ,  zakatMario@yahoo.com
<re.Match object; span=(364, 381), match='faraicup@word.org'> ,  faraicup@word.org
<re.Match object; span=(382, 398), match='faraio@green.edu'> ,  faraio@green.edu


'\nsizaro_56@gmail.com\nzakatMario@yahoo.com\nfaraicup@word.org\nfaraio@green.edu\n'

#### Names 
> Let's create a pattert that matches urls

In [141]:
matches = re.finditer(r'https?://([a-zA-Z0-9-]?)+(.[a-zA-Z]+)', text)
for match in matches:
    start, stop = match.span()
    print(match, ", ",text[start: stop] )

<re.Match object; span=(240, 258), match='https://www.google'> ,  https://www.google
<re.Match object; span=(263, 283), match='https://whatsapp.com'> ,  https://whatsapp.com
<re.Match object; span=(284, 304), match='http://localhost.edu'> ,  http://localhost.edu
<re.Match object; span=(305, 322), match='https://zero-5.li'> ,  https://zero-5.li


#### findAll(A, B)
> Matches all instances of an expression A in a string B and returns them in a list

In [151]:
matches = re.findall(r'\w+[is]+', "this is a regex this")

", ".join(matches)

'this, is, this'

#### search(A, B)
> Matches the first instance of an expression A in a string B, and returns it as a re match object.

In [155]:
# search for the first email address in the text sting
matches = re.search(r'([a-zA-Z0-9_])+@[a-zA-Z]+\.\w+', text, re.MULTILINE)
matches

<re.Match object; span=(323, 342), match='sizaro_56@gmail.com'>

#### split(A, B)
> Split a string B into a list using the delimiter A

In [157]:
matches = re.split(r'is?', "This is a work of magic.")
matches

['Th', ' ', ' a work of mag', 'c.']

#### re.sub(A, B, C)
> Replace A with B in the string C.

In [159]:
matches = re.sub(r'is', 'at', "This is a work of magic.")
matches

'That at a work of magic.'

#### re.match(A, B)
>Returns the first occurrence of A in B

In [160]:
matches = re.match(r'is', 'at', "This is a work of magic.")
matches

TypeError: unsupported operand type(s) for &: 'str' and 'int'