# Regular Expressions

In [2]:
import re

In [89]:
# Sample input used by the examples below: a name/age/phone line,
# a line containing a date, and a short letter/digit sequence
text = (
    "Jones Cardigan, 33, 0049171465734\n"
    "jones-c.online.com, 22.01.1987\n"
    "a2b3c45abab"
)

In [17]:
# Compile a literal pattern and locate its first occurrence in the text
exp = re.compile(r"Jones")
match = exp.search(text)
match

<re.Match object; span=(0, 5), match='Jones'>

In [18]:
# Index of the first character of the match
match.start()

0

In [19]:
# Index just past the last character of the match
match.end()

5

In [20]:
# The matched substring itself
match.group()

'Jones'

**Metacharacters in Regular Expressions**

In [21]:
# Character class: match one character that is an ASCII letter or a digit
exp = re.compile(r"[a-zA-Z0-9]")
match = exp.search(text)
match

<re.Match object; span=(0, 1), match='J'>

In [57]:
# Negated character class: first character that is neither an ASCII
# letter, a digit, nor whitespace
exp = re.compile(r"[^a-zA-Z0-9\s]")
match = exp.search(text)
match

<re.Match object; span=(14, 15), match=','>

In [22]:
# \s matches one whitespace character (space, tab, line break, ...)
exp = re.compile(r"\s")
match = exp.search(text)
match

<re.Match object; span=(5, 6), match=' '>

In [102]:
# \S matches one character that is not whitespace
exp = re.compile(r"\S")
match = exp.search(text)
match

<re.Match object; span=(0, 1), match='J'>

In [23]:
# . matches any single character; DOTALL lets it match line breaks as well
exp = re.compile(r".", flags=re.DOTALL)
match = exp.search(text)
match

<re.Match object; span=(0, 1), match='J'>

In [24]:
# {5} repeats the preceding token: five consecutive digits
exp = re.compile(r"\d{5}")
match = exp.search(text)
match

<re.Match object; span=(20, 25), match='00491'>

In [25]:
# A quantifier applied to a group: (lowercase letter followed by a digit), twice
exp = re.compile(r"([a-z]\d){2}")
match = exp.search(text)
match

<re.Match object; span=(65, 69), match='a2b3'>

In [26]:
# ? makes the preceding token optional: "a", at most one digit, then "b"
exp = re.compile(r"a\d?b")
match = exp.search(text)
match

<re.Match object; span=(65, 68), match='a2b'>

In [27]:
# Date in the layout xx.xx.xxxx; the dots are escaped so they match literally
exp = re.compile(r"\d{2}\.\d{2}\.\d{4}")
match = exp.search(text)
match

<re.Match object; span=(54, 64), match='22.01.1987'>

In [28]:
# IGNORECASE: the pattern "c" matches both "c" and "C"
exp = re.compile(r"c", flags=re.IGNORECASE)
match = exp.search(text)
match

<re.Match object; span=(6, 7), match='C'>

In [30]:
# .* matches a run of any characters and stops at a line break
exp = re.compile(r".*")
match = exp.search(text)
match

<re.Match object; span=(0, 33), match='Jones Cardigan, 33, 0049171465734'>

In [32]:
# With DOTALL, .* also runs across line breaks
exp = re.compile(r".*", flags=re.DOTALL)
match = exp.search(text)
match.group()

'Jones Cardigan, 33, 0049171465734\njones-c.online.com, 22.01.1987\na2b3c45abab'

In [41]:
# ^ anchors the pattern at the beginning of the whole text
exp = re.compile(r"^Jo")
match = exp.search(text)
match

<re.Match object; span=(0, 2), match='Jo'>

In [49]:
# With MULTILINE, ^ also matches at the beginning of every line
exp = re.compile(r"^jo", flags=re.MULTILINE)
match = exp.search(text)
match

<re.Match object; span=(34, 36), match='jo'>

In [54]:
# $ anchors at the end of the text; with MULTILINE also at the end of each line
exp = re.compile(r"87$", flags=re.MULTILINE)
match = exp.search(text)
match

<re.Match object; span=(62, 64), match='87'>

In [85]:
# * and + are greedy by default: they match the longest possible string
exp = re.compile(r"J.*s", flags=re.DOTALL)
match = exp.search(text)
match

<re.Match object; span=(0, 39), match='Jones Cardigan, 33, 0049171465734\njones'>

In [86]:
# A ? placed after a quantifier makes it non-greedy (shortest possible match)
exp = re.compile(r"J.*?s", flags=re.DOTALL)
match = exp.search(text)
match

<re.Match object; span=(0, 5), match='Jones'>

**Iterate over search results**

In [88]:
# finditer yields one match object per non-overlapping occurrence
exp = re.compile(r"jones", flags=re.IGNORECASE)
for match in exp.finditer(text):
    print(match)

<re.Match object; span=(0, 5), match='Jones'>
<re.Match object; span=(34, 39), match='jones'>


**Extracting groups**

In [95]:
# Capture day, month and year of a date in separate groups:
# group 0 is the whole match, groups 1-3 are the parenthesised parts
exp = re.compile(r"(\d{2})\.(\d{2})\.(\d{4})")
match = exp.search(text)
match.group(0, 1, 2, 3)

('22.01.1987', '22', '01', '1987')

**Replace**

In [97]:
# Rewrite every date as yyyy-mm-dd; sub() calls the function once per match
# and inserts its return value in place of the matched text
exp = re.compile(r"(\d{2})\.(\d{2})\.(\d{4})")
text_sub = exp.sub(
    lambda match: f"{match.group(3)}-{match.group(2)}-{match.group(1)}",
    text,
)
text_sub

'Jones Cardigan, 33, 0049171465734\njones-c.online.com, 1987-01-22\na2b3c45abab'

**Look-behind and Look-ahead**

In [98]:
# Look-behind: (?<=...)   negative: (?<!...)
# Look-ahead:  (?=...)    negative: (?!...)

In [103]:
# Check PW: at least one digit, one letter (upper or lower case) and one
# special character (anything that is not a letter, digit or whitespace),
# 6-12 characters long, no whitespace.
# Each (?=...) look-ahead tests one requirement from the start of the string
# without consuming input; \S{6,12} then enforces the length.
# \Z (instead of $) rejects a trailing line break, which $ would tolerate
# and which would otherwise count as the "special" character.
exp = re.compile(r"^(?=.*[a-zA-Z])(?=.*\d)(?=.*[^a-zA-Z\d\s])\S{6,12}\Z")

In [106]:
# Invalid PW: it contains no special character, so search() returns None
match = exp.search("password12")
print(match)

None


In [107]:
# Valid PW: letters, digits and one special character, 11 characters long
match = exp.search("password$12")
print(match)

<re.Match object; span=(0, 11), match='password$12'>
