In [1]:
import re
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Exercise 1

def is_vowel(string):
    """ return if the input is a single vowel character"""
    
    # Starts with a character from this character class
    # Ends with a character from this character class
    regex = r'^[aeiouAEIOU]$'
    return bool(re.search(regex, string))

assert is_vowel("A") == True
assert is_vowel("e") == True
assert is_vowel("b") == False
assert is_vowel("ee") == False
assert is_vowel("aie") == False

In [7]:
# Exercise 1 w/ only Python
def is_vowel2(string):
    string = string.lower()
    return string in ["a", "e", "i", "o", "u"]

assert is_vowel2("A") == True
assert is_vowel2("e") == True
assert is_vowel2("b") == False
assert is_vowel2("ee") == False
assert is_vowel2("aie") == False

True

In [12]:
# Exercise 2
# A valid username starts with a lowercase letter, 
# and only consists of lowercase letters, numbers, or the _ character. 
# It should also be no longer than 32 characters. 
# The function should return True or False

def is_valid_username(string):
    return bool(re.search(r'^[a-z][a-z0-9_]{,31}$', string))

assert is_valid_username("jane_janeway76") == True
assert is_valid_username("bob_bobberson") == True
assert is_valid_username("Robert') DROP TABLE Students;--") == False
assert is_valid_username("quincy__!") == False

In [None]:
# Exercise 2 solved by breaking the logic into little functions and using them together
# My preference is to build tiny functions that do one thing and then string them together
def starts_with_lowercase(string):
    return bool(re.search(r'^[a-z]', string))

def is_only_lowercase_alphanumeric_or_underscore(string):
    # character class holds any a-z, 0-9, underscore
    # match until the last character in this character class
    return bool(re.search(r'[a-z0-9_]$', string))
    
def is_valid_username2(string):
    return starts_with_lowercase(string) and is_only_lowercase_alphanumeric_or_underscore(string) and len(string) <= 32

assert is_valid_username2("jane_janeway76") == True
assert is_valid_username2("bob_bobberson") == True
assert is_valid_username2("Robert') DROP TABLE Students;--") == False
assert is_valid_username2("quincy__!") == False

In [13]:
# Exercise 3

# Write a regular expression to capture phone numbers. 
# It should match all of the following:
# (210) 867 5309
# +1 210.867.5309
# 867-5309
# 210-867-5309
numbers = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
]

# Parts of a phone number
# Country Code +1, Area Code 210, Exchange Code 226, Line number 3232

phone_number_re = re.compile(r'''
^
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
\D*
$
''', re.VERBOSE)

# Iterate through the list of strings, producing a dictionary containing named groups from each string
phone_numbers = [re.search(phone_number_re, number).groupdict() for number in numbers]
phone_numbers

# And what can you do once you have a list of dictionaries? That's right! Make a Dataframe!
df = pd.DataFrame(phone_numbers)
df

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309


In [18]:
# Zero or more of SOMETHING
re.search(r"\w*", "bob")

<re.Match object; span=(0, 3), match='bob'>

In [19]:
# . means anything, Zero or more of anything
re.search(r".*", "bob")

<re.Match object; span=(0, 3), match='bob'>

In [20]:
re.search(r"\w?", "bob")

<re.Match object; span=(0, 1), match='b'>

In [15]:
re.search(phone_number_re, numbers[0]).groupdict()

{'country_code': None,
 'area_code': '210',
 'exchange_code': '867',
 'line_number': '5309'}

In [21]:
# Exercise 4
# Convert the dates below to the standardized year-month-day format.
dates = [
    '02/04/19',
    '02/05/19',
    '02/06/19',
    '02/07/19',
    '02/08/19',
    '02/09/19',
    '02/10/19',
]

dates = [re.sub(r'(\d+)/(\d+)/(\d+)', r'20\3-\1-\2', date) for date in dates]
dates

['2019-02-04',
 '2019-02-05',
 '2019-02-06',
 '2019-02-07',
 '2019-02-08',
 '2019-02-09',
 '2019-02-10']

In [25]:
# Exercise 5

# GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
# POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
# GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58

string = """
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
"""

# method, endpoint, date, protocol, http_status_code, some_number, "user_agent", ip_address

regex = r'''
(?P<method>[A-Z]+)
\s
(?P<path>.*)
\s
\[(?P<timestamp>.*)\]
\s
HTTP/1.1
\s
{(?P<status>\d+)}
\s
(?P<bytes_sent>\d+)
\s
"(?P<user_agent>.*)"
\s+
(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
'''

regex = re.compile(regex, re.VERBOSE)

# Lines is a pandas series
lines = pd.Series(string.strip().split('\n'))
lines

0    GET /api/v1/sales?page=86 [16/Apr/2019:193452+...
1    POST /users_accounts/file-upload [16/Apr/2019:...
2    GET /api/v1/items?page=3 [16/Apr/2019:193453+0...
dtype: object

In [27]:
df = lines.str.extract(regex)
df

Unnamed: 0,method,path,timestamp,status,bytes_sent,user_agent,ip
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,200,510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,429,3561,python-requests/2.21.0,97.105.19.58


In [34]:
# Exercise 6

# Find a list of words on your mac at /usr/share/dict/words. 

# Use this file to answer the following questions:
# How many words have at least 3 vowels?
# How many words have at least 3 vowels in a row?
# How many words have at least 4 consonants in a row?
# How many words start and end with the same letter?
# How many words start and end with a vowel?
# How many words contain the same letter 3 times in a row?
# What other interesting patterns in words can you find?

# Wrangle
df = pd.read_csv('/usr/share/dict/words', header=None).dropna()
words = df[0]
words = words.str.lower()
words

0                  a
1                  a
2                 aa
3                aal
4              aalii
             ...    
235881        zythem
235882        zythia
235883        zythum
235884       zyzomys
235885    zyzzogeton
Name: 0, Length: 235884, dtype: object

In [39]:
# How many words have at least 3 vowels?
at_least_3_vowels = (words.str.count(r"[aeiou]")  >= 3)
words[at_least_3_vowels].head()
at_least_3_vowels.sum()

191365

In [40]:
# How many words have at least 3 vowels in a row?
words.str.count(r"[aeiou]{3}").sum()

6251

In [41]:
# How many words have at least 4 consonants in a row?
# ^ inside of a character class []
# Match pattern not a or e o i or o or u
words.str.count(r"[^aeiou]{4}").sum()

19640

In [52]:
# Words with at least 6 consonants in a row
words[words.str.contains(r"[^aeiouy]{6}")]

12492     archchronicler
21118        bergschrund
64716      eschscholtzia
73886     fruchtschiefer
104560       latchstring
105552        lengthsman
122114        nachschlag
151826      postphthisic
227486       veldtschoen
Name: 0, dtype: object

In [53]:
# How many words start and end with the same letter?
# (.) is capture group 1
# ^(.) starts with any character
# .* mean any character repeated 0 or more times
# \1$ means ends with whatever the first capture group matched
words.str.contains(r'^(.).*\1$').sum()

11452

In [54]:
# How many words start and end with a vowel?
words.str.contains(r'^[aeiou].*[aeiou]$').sum()

14657

In [55]:
# How many words contain the same letter 3 times in a row?
words[words.str.contains(r'(.)\1\1')]

24988             bossship
50636      demigoddessship
78498          goddessship
82997     headmistressship
140481       patronessship
230262            wallless
231688           whenceeer
Name: 0, dtype: object

In [56]:
# What other interesting patterns in words can you find?
# Find the words that contain "q" but not "qu"
# q means we're literally looking to match a q
# [^u] means not the "u" character
words[words.str.contains(r"q([^u]|$)")]

97907         iraq
97908        iraqi
97909      iraqian
108449      louiqa
116731       miqra
122607    nastaliq
150881     pontacq
161159           q
161160           q
161161      qasida
161162        qere
161163        qeri
161164      qintar
161165    qoheleth
161166        qoph
173530       saqib
180565        shoq
198373       tareq
235046      zaqqum
Name: 0, dtype: object