In [106]:
import re
import pandas as pd

## Exercise 1

Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.



In [107]:
def is_vowel(string):
    return bool(re.search(r"^[aeiou]$", string, re.IGNORECASE))
    
assert is_vowel("a") == True
assert is_vowel("E") == True
assert is_vowel("aaa") == False
assert is_vowel("aeiou") == False

## Exercise 2

Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.

In [109]:
# Why do we need the $ in the above pattern?
# because the regex below matches up to the capital letter but we're not saying everything needs to be lowercase
re.search(r"^[a-z][a-z0-9_]{,31}", "aaaCODEUPCODEUPaaaaaaaaaaaaaaaaaaaaaaaaaa")

<re.Match object; span=(0, 3), match='aaa'>

In [108]:
# starts with a lowercase letter
# is only lowercase letters, numbers, or _
# should be no longer than 32 characters
# return a boolean

def is_valid_username(string):
    pattern = r"^[a-z][a-z0-9_]{,31}$"
    return bool(re.search(pattern, string))

assert is_valid_username("codeup") == True
assert is_valid_username("codeup123") == True
assert is_valid_username("123Codeup") == False
assert is_valid_username("CodeupCodeup!") == False

## Exercise 3

Write a regular expression to capture phone numbers. It should match all of the following:

- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

#### Problem solving process:
- Put the subject strings in order of increasing complexity
- Solve them one at a time and build an iterative solution

In [66]:
re.search(r"\(?\d{3}\)?.?\d{3}.?\d{4}", "210-867-5309")

<re.Match object; span=(0, 12), match='210-867-5309'>

In [72]:
re.search(r"\(?\d{3}\)?.?\d{3}.?\d{4}", "210.867.5309")

<re.Match object; span=(0, 10), match='2108675309'>

In [73]:
re.search(r"(\(?\d{3}\)?)?.?\d{3}.?\d{4}", "(210) 867-5309")

<re.Match object; span=(0, 14), match='(210) 867-5309'>

In [75]:
re.search(r"(\(?\d{3}\)?)?.?\d{3}.?\d{4}", "867-5309")

<re.Match object; span=(0, 8), match='867-5309'>

In [78]:
re.search(r"(\(?\d{3}\)?)?.?\d{3}.?\d{4}", "8675309")

<re.Match object; span=(0, 7), match='8675309'>

In [79]:
re.search(r"(\(?\d{3}\)?)?.?\d{3}.?\d{4}", "210.867.5309")

<re.Match object; span=(0, 12), match='210.867.5309'>

In [80]:
# But what about the international code +1
re.search(r"(\(?\d{3}\)?)?.?\d{3}.?\d{4}", "+1 210.867.5309")

<re.Match object; span=(3, 15), match='210.867.5309'>

In [110]:
re.search(r"(\+\d+)?.?(\(?\d{3}\)?)?.?\d{3}.?\d{4}", "+1 210.867.5309")

<re.Match object; span=(0, 15), match='+1 210.867.5309'>

## Exercise 3 using a DataFrame

In [100]:
# another approach
# The \D*? means zero or more of anything that's not a digit (including parentheses)
phone_regex = re.compile(
"""
^
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
""", re.VERBOSE)

In [96]:
df = pd.DataFrame()
df['number'] = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
    '2108675309',
]

In [113]:
# extract turns named capture groups into dataframe columns
# NaNs for no match
df.number.str.extract(phone_regex)

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309
4,,210.0,867,5309


In [99]:
df = pd.concat([df, df.number.str.extract(phone_regex)], axis=1)
df

Unnamed: 0,number,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309
4,2108675309,,210.0,867,5309


## Exercise 4

Use regular expressions to convert the dates below to the standardized year-month-day format.

```
02/04/19
02/05/19
02/06/19
02/07/19
02/08/19
02/09/19
02/10/19
```

## Exercise 5

Write a regex to extract the various parts of these logfile lines:


`GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58`

`POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58`

`GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58`

## Bonus Exercise
You can find a list of words on your mac at `/usr/share/dict/words`. Use this file to answer the following questions:

- How many words have at least 3 vowels?
- How many words have at least 3 vowels in a row?
- How many words have at least 4 consonants in a row?
- How many words start and end with the same letter?
- How many words start and end with a vowel?
- How many words contain the same letter 3 times in a row?
- What other interesting patterns in words can you find?

In [117]:
words = pd.read_csv("/usr/share/dict/words")
words.sample(5)

Unnamed: 0,A
11170,aphanesite
217083,ungeometricalness
66783,exotospore
92682,inbuilt
65636,eurobin


In [118]:
re.search(r"😀", "😃😁😀")

<re.Match object; span=(2, 3), match='😀'>