In [13]:
import re

# The Basics

* The dot regex . matches an arbitrary character.
* The asterisk regex <pattern>* matches an arbitrary number of the regex
<pattern>. Note that this includes zero matching instances.
* The at-least-one regex <pattern>+ can match an arbitrary number of
<pattern> but must match at least one instance.
* The zero-or-one regex <pattern>? matches either zero or one instances
of <pattern>.
* The nongreedy asterisk regex *? matches as few arbitrary characters as
possible to match the overall regex.
* The regex <pattern>{m} matches exactly m copies of <pattern>.
* The regex <pattern>{m,n} matches between m and n copies of <pattern>.
* The regex <pattern_1>|<pattern_2> matches either <pattern_1>
or <pattern_2>.
* The regex <pattern_1><pattern_2> matches <pattern_1> and
then <pattern_2>.
* The regex (<pattern>) matches <pattern>. The parentheses group regular
expressions so you can control the order of execution (for example,
(<pattern_1><pattern_2>)|<pattern_3> is different from
<pattern_1>(<pattern_2>|<pattern_3>)). The parentheses regex also creates a
matching group, as you’ll see later in the section.

## `.` – matches any single character (except a newline)

In [14]:
# Sample sentence reused by later cells; written line by line so the embedded
# newlines are explicit.
text = (
    "A blockchain, originally block chain,\n"
    "is a growing list of records, called blocks,\n"
    "which are linked using cryptography.\n"
)

# Each dot stands for exactly one arbitrary character: 'b', any three
# characters, then 'k'.
re.compile("b...k").findall(text)

['block', 'block', 'block']

## `*` (asterisk) – zero or more repetitions of the preceding pattern

In [15]:
# ".*" greedily consumes any run of characters (but not newlines), so this
# spans from a 'y' to the last 'y' on the same line.
# By contrast, "s*" would mean "zero or more 's' characters".
re.compile("y.*y").findall(text)

['yptography']

In [16]:
# "c*" applies only to the 'c': "ab" followed by zero or more 'c' characters.
re.compile("abc*").findall("abc ab abcc ad")

['abc', 'ab', 'abcc']

## (?) The Zero-or-one Regex 

In [17]:
# The '?' applies only to the preceding 's', so both "block" and "blocks" match.
re.compile("blocks?").findall(text)

['block', 'block', 'blocks']

### Non-greedy: `*?`

In [18]:
# Tiny HTML snippet for comparing greedy and non-greedy matching.
txt = "<div>hello world</div>"

In [19]:
# Greedy: ".*" runs to the last '>' so the whole string is a single match.
re.compile("<.*>").findall(txt)

['<div>hello world</div>']

In [20]:
# Non-greedy: ".*?" stops at the first '>' it can, so each tag is its own match.
re.compile("<.*?>").findall(txt)

['<div>', '</div>']

In [21]:
# Non-greedy gaps: 'p', as little as possible up to an 'e', then as little as
# possible up to an 'r'.
text = "peter piper picked a peck of pickled peppers"
re.compile("p.*?e.*?r").findall(text)

['peter', 'piper', 'picked a peck of pickled pepper']

## Fixed number of characters: `{m}` and `{m,n}`

* () matches whatever regex is inside.
* {1,30} matches between 1 and 30 occurrences of the previous regex.
* (.{1,30}) matches between 1 and 30 arbitrary characters.

In [22]:
# Two sample sentences for the bounded-repetition demo; both begin with "crypto".
text_1 = "crypto-bot that is trading Bitcoin and other currencies"
text_2 = (
    "cryptographic encryption methods that can be cracked "
    "easily with quantum computers"
)

In [24]:
# "crypto", then 1 to 30 arbitrary characters (captured), then "coin".
pattern = re.compile("crypto(.{1,30})coin")

# match() only tries to match at the beginning of each string.
for sample in (text_1, text_2):
    print(pattern.match(sample))

<re.Match object; span=(0, 34), match='crypto-bot that is trading Bitcoin'>
None


## Match, search, findall

In [27]:
text = '''
"One can never have enough socks", said Dumbledore.
"Another Christmas has come and gone and I didn't
get a single pair. People will insist on giving me books."
Christmas Quote
'''
regex = 'Christ.*'

# Same pattern, three entry points: match() only tries the start of the string,
# search() returns the first hit anywhere, findall() returns every hit.
for finder in (re.match, re.search, re.findall):
    print(finder(regex, text))

None
<re.Match object; span=(62, 102), match="Christmas has come and gone and I didn't">
["Christmas has come and gone and I didn't", 'Christmas Quote']


# Matching groups

In [38]:
# Sample HTML document with four <a> links; the matching-group cells below
# search it for finxter links whose line also contains "test" or "puzzle".
page = '''
<!DOCTYPE html>
<html>
<body>
<h1>My Programming Links</h1>
<a href="https://app.finxter.com/">test your Python skills</a>
<a href="https://blog.finxter.com/recursion/">Learn recursion</a>
<a href="https://nostarch.com/">Great books from NoStarchPress</a>
<a href="http://finxter.com/">Solve more Python puzzles</a>
</body>
</html>
'''

In [39]:
# Anchor tags that mention "finxter" and later contain "test" or "puzzle".
# With a single capturing group, findall() returns only that group's text.
pattern = re.compile("<a.*?finxter.*?(test|puzzle).*?>")
pattern.findall(page)

['test', 'puzzle']

In [40]:
# Wrapping the whole expression in an outer group makes findall() return
# (entire match, inner group) tuples.
pattern = re.compile("(<a.*?finxter.*?(test|puzzle).*?>)")
pattern.findall(page)

[('<a href="https://app.finxter.com/">test your Python skills</a>', 'test'),
 ('<a href="http://finxter.com/">Solve more Python puzzles</a>', 'puzzle')]

## Nested groups

In [41]:
string = 'helloworld'

# One capturing group versus a group nested inside an outer group.
regex_1, regex_2 = 'hello(world)', '(hello(world))'

# findall() returns plain strings for one group and tuples for several.
res_1, res_2 = (re.findall(rx, string) for rx in (regex_1, regex_2))

for found in (res_1, res_2):
    print(found)

['world']
[('helloworld', 'world')]


## Digits or letters

* [a-e] -> a, b, c, d, e  
* [0-3] -> 0, 1, 2, 3
* [0-3a-c] -> 0, 1, 2, 3, a, b, c
* [^0-3a-c] -> any character except 0, 1, 2, 3, a, b, c

In [43]:
report = '''
If you invested $1 in the year 1801, you would have $18087791.41 today.
This is a 7.967% return on investment.
But if you invested only $0.25 in 1801, you would end up with $4521947.8525.
'''

# Raw string: "\$" and "\." are not valid Python string escapes, so a plain
# literal triggers an invalid-escape warning on recent Python versions.
# The optional fractional part is non-capturing, so findall() returns the whole
# match directly. It requires at least one digit after the dot, so a
# sentence-ending period after a whole amount (e.g. "$5.") is not swallowed.
dollar_regex = r'\$[0-9]+(?:\.[0-9]+)?'
dollars = re.findall(dollar_regex, report)

In [44]:
# Show the dollar amounts extracted from the report.
print(dollars)

['$1', '$18087791.41', '$0.25', '$4521947.8525']


In [58]:
# Sample text mixing http:// and https:// links with a few malformed ones
# (no dot in the host, a single slash after "http:").
article = '''
The algorithm has important practical applications
http://blog.finxter.com/applications/
in many basic data structures such as sets, trees,
dictionaries, bags, bag trees, bag dictionaries,
hash sets, https://blog.finxter.com/sets-in-python/
hash tables, maps, and arrays. http://blog.finxter.com/
http://not-a-valid-url
http:/bla.ba.com
http://bo.bo.bo.bo.bo.bo/
http://bo.bo.bo.bo.bo.bo/333483--33343-/
'''

In [59]:
# Find plain-http links: "http://", a host part, a literal dot, then the rest
# (lowercase letters, digits, '_', '-', '/').
# Raw string so the backslash in "\." reaches the regex engine without relying
# on invalid Python string escapes. Inside a character class '.' and '/' need
# no escaping, and '-' is literal when placed last.
stale_links = re.findall(r'http://[a-z0-9_.-]+\.[a-z0-9_/-]+', article)
stale_links

['http://blog.finxter.com/applications/',
 'http://blog.finxter.com/',
 'http://bo.bo.bo.bo.bo.bo/',
 'http://bo.bo.bo.bo.bo.bo/333483--33343-/']

In [61]:
# "x{3,5}y": between three and five 'x' characters followed by 'y'.
for sample in ('xy', 'xxxy', 'xxxxxy', 'xxxxxxxxxxxxxxxxxy'):
    print(re.findall('x{3,5}y', sample))

[]
['xxxy']
['xxxxxy']
['xxxxxy']


## fullmatch  
checks whether the
regex matches the full string as the name suggests.

In [62]:
inputs = ['18:29', '23:55', '123', 'ab:de', '18:299', '99:99']


def input_ok(x):
    """Return True if ``x`` is exactly two digits, a colon, and two digits.

    Only the shape is checked, not the value range, so '99:99' passes.
    """
    # fullmatch() returns None unless the entire string matches; test identity
    # with ``is not None`` rather than comparing with ``!= None``.
    return re.fullmatch('[0-9]{2}:[0-9]{2}', x) is not None


for x in inputs:
    print(input_ok(x))

True
True
False
False
False
True


In [63]:
inputs = ['18:29', '23:55', '123', 'ab:de', '18:299', '99:99']


def input_ok(x):
    """Return True if ``x`` is a 24-hour time of the form HH:MM.

    Hours must be 00-23 and minutes 00-59, so '99:99' is rejected.
    """
    # fullmatch() returns None unless the entire string matches; test identity
    # with ``is not None`` rather than comparing with ``!= None``.
    return re.fullmatch('([01][0-9]|2[0-3]):[0-5][0-9]', x) is not None


for x in inputs:
    print(input_ok(x))

True
True
False
False
False
False


## named group (variables)  
`(?P\<name\>...)`

In [66]:
# The named group "quote" captures the opening quote character (' or "), and
# (?P=quote) requires that same character again to close the quotation.
pattern = '(?P<quote>[\'"]).*(?P=quote)'
text = 'She said "hi"'
quoted = re.compile(pattern).search(text)
print(quoted)

<re.Match object; span=(9, 13), match='"hi"'>


* `\s` matches any whitespace character (equivalent to `[\r\n\t\f\v ]`)

In [74]:
text = '''
It was a bright cold day in April, and the clocks were
striking thirteen. Winston Smith, his chin nuzzled into
his breast in an effort to escape the vile wind, slipped
quickly through the glass doors of Victory Mansions,
though not quickly enough to prevent a swirl of gritty
dust from entering along with him.
-- George Orwell, 1984
'''
# Find whitespace-delimited tokens that contain the same character twice in a
# row: group "x" captures one non-whitespace character and (?P=x) demands it
# again immediately. findall() returns (whole token, doubled character) tuples.
# Raw string: "\s" / "\S" are not valid Python string escapes, so a plain
# literal triggers an invalid-escape warning; "\S" is equivalent to "[^\s]".
duplicates = re.findall(r'(\S*(?P<x>\S)(?P=x)\S*)', text)
duplicates

[('thirteen.', 'e'),
 ('nuzzled', 'z'),
 ('effort', 'f'),
 ('slipped', 'p'),
 ('glass', 's'),
 ('glass', 's'),
 ('doors', 'o'),
 ('gritty', 't'),
 ('--', '-'),
 ('Orwell,', 'l')]

In [77]:
text = 'if you use words too often words become used'

# Detect a lowercase word that is repeated after at most 10 intervening words.
# The gap is "([a-z]+\s+){0,10}": up to ten (word, whitespace) pairs. Writing
# it as a character class "[a-z+\s+]" would instead allow at most ten single
# characters (including a literal '+') between the two occurrences.
# The text is padded with spaces so the first and last words are also
# surrounded by whitespace. Raw string keeps "\s" intact for the regex engine.
pattern = r"\s(?P<repeat>[a-z]+)\s+([a-z]+\s+){0,10}(?P=repeat)\s"
print(re.search(pattern, " " + text + " "))

<re.Match object; span=(11, 34), match=' words too often words '>


In [78]:
# Look for a word (1-10 word characters) immediately followed by itself.
# The raw string is essential: in a normal string literal "\b" is the
# backspace character and "\1" is the character with code 1, so the regex
# engine would never see a word boundary or a backreference.
re.search(r"\b(\w{1,10})\b\s+\b\1\b", text)

In [80]:
txt = "hello world hello"
# A word (1-10 word characters) immediately followed by the same word.
# The raw string is essential: in a normal string literal "\b" is the
# backspace character and "\1" is the character with code 1, so the regex
# engine would never see a word boundary or a backreference.
# Note: only *adjacent* repeats match, so this sample ("hello world hello")
# yields None, while e.g. "hello hello world" would match "hello hello".
pattern = r"\b(\w{1,10})\b\s+\b\1\b"
print(re.search(pattern, " " + txt + " "))

[]


# Modifying text using sub

## Exclusion expression  
Negative Lookahead: `expr(?!cond)`

In [83]:
text = '''
Alice Wonderland married John Doe.
The new name of former 'Alice Wonderland' is Alice Doe.
Alice Wonderland replaces her old name 'Wonderland' with her new name 'Doe'.
Alice's sister Jane Wonderland still keeps her old name.
'''
# Negative lookahead: match "Alice Wonderland" only when it is NOT directly
# followed by a single quote, so the quoted former name is left alone.
maiden_name = re.compile("Alice Wonderland(?!\')")
updated_text = maiden_name.sub('ALICE DOE', text)
print(updated_text)


ALICE DOE married John Doe.
The new name of former 'Alice Wonderland' is Alice Doe.
ALICE DOE replaces her old name 'Wonderland' with her new name 'Doe'.
Alice's sister Jane Wonderland still keeps her old name.

