# Regex Exercises

# 1.  Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. 
 - While not explicitly mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [6]:
import pandas as pd
import re

In [7]:
def is_vowel(string):

    """
    Return True if the passed string is exactly one vowel (a, e, i, o, u).

    The input is lower-cased before matching, so the check is
    case-insensitive. re.fullmatch is used rather than re.search with
    '^...$' because '$' also matches just before a trailing newline, which
    would wrongly accept a vowel followed by a newline character.

    """
    return bool(re.fullmatch(r'[aeiou]', string.lower()))

In [10]:
# A single lowercase vowel satisfies the pattern.
is_vowel('a')

True

In [11]:
# Uppercase input is lower-cased inside is_vowel before matching.
is_vowel('A')

True

In [12]:
# A consonant is not in the [aeiou] character class.
is_vowel('b')

False

# 2. Write a function named is_valid_username that accepts a string as input. 

- A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. 
- It should also be no longer than 32 characters. 
- The function should return either True or False depending on whether the passed string is a valid username.

In [19]:
def is_valid_username(string):
    """
    Return True if the passed string is a valid username, else False.

    A valid username starts with a lowercase letter, continues with only
    lowercase letters, digits or underscores, and is at most 32 characters
    long (one leading letter plus up to 31 further characters).

    re.fullmatch is used rather than re.search with '^...$' because '$' also
    matches just before a trailing newline, which would wrongly accept a
    username that ends in a newline character.
    """
    return bool(re.fullmatch(r'[a-z][a-z0-9_]{0,31}', string))

In [21]:
# Rejected: the first character must be a lowercase letter.
is_valid_username('Alexia')

False

In [22]:
# Accepted: all lowercase letters and within the length limit.
is_valid_username('alexia')

True

# 3. Write a regular expression to capture phone numbers. It should match all of the following:

- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

In [26]:
# Sample inputs: one entry for each format the pattern has to accept.
numbers = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
]

# Anatomy of a phone number such as "+1 210 867 5309":
#   country code (+1), area code (210), exchange code (867), line number (5309).
# The country code and area code are optional; any run of non-digit
# characters is accepted as a separator between the parts.
phone_re = re.compile(r'''
^
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
\D*
$
''', re.VERBOSE)

# Match every sample and keep its named groups as a dict.
phone_numbers = [match.groupdict() for match in map(phone_re.search, numbers)]
phone_numbers

[{'country_code': None,
  'area_code': '210',
  'exchange_code': '867',
  'line_number': '5309'},
 {'country_code': '+1',
  'area_code': '210',
  'exchange_code': '867',
  'line_number': '5309'},
 {'country_code': None,
  'area_code': None,
  'exchange_code': '867',
  'line_number': '5309'},
 {'country_code': None,
  'area_code': '210',
  'exchange_code': '867',
  'line_number': '5309'}]

In [27]:
# Tabulate the parsed parts: one row per phone number, one column per dict key.
df = pd.DataFrame(data=phone_numbers)
df

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309


# 4. Use regular expressions to convert the dates below to the standardized year-month-day format.

In [28]:
# Dates as supplied: month/day/two-digit-year.
dates = [
    '02/04/19', '02/05/19', '02/06/19', '02/07/19',
    '02/08/19', '02/09/19', '02/10/19',
]

# Capture the three numeric fields and re-emit them as 20YY-MM-DD.
dates = [
    re.sub(pattern=r'(\d+)/(\d+)/(\d+)', repl=r'20\3-\1-\2', string=raw_date)
    for raw_date in dates
]
dates

['2019-02-04',
 '2019-02-05',
 '2019-02-06',
 '2019-02-07',
 '2019-02-08',
 '2019-02-09',
 '2019-02-10']

# 5. Write a regex to extract the various parts of these logfile lines:

GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58

In [29]:
# Raw sample log: one request per line, wrapped in a leading/trailing newline.
string = """
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
"""

# Fields captured from each line, in order:
#   method, path, timestamp, status, bytes_sent, user_agent, ip
# The protocol is matched literally as HTTP/1.1 and is not captured.
regex = r'''
(?P<method>[A-Z]+)
\s
(?P<path>.*)
\s
\[(?P<timestamp>.*)\]
\s
HTTP/1\.1                 # dot escaped so only a literal "." matches
\s
\{(?P<status>\d+)\}       # braces escaped to make the literal match explicit
\s
(?P<bytes_sent>\d+)
\s
"(?P<user_agent>.*)"
\s+
(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
'''

# Rebind `regex` to the compiled pattern; VERBOSE ignores the layout
# whitespace and the inline comments above.
regex = re.compile(regex, re.VERBOSE)

# One Series element per log line (outer blank lines stripped first).
logs = pd.Series(string.strip().split('\n'))
logs

0    GET /api/v1/sales?page=86 [16/Apr/2019:193452+...
1    POST /users_accounts/file-upload [16/Apr/2019:...
2    GET /api/v1/items?page=3 [16/Apr/2019:193453+0...
dtype: object

In [33]:
# Apply the compiled pattern to every log line; each named group becomes a column.
df = logs.str.extract(pat=regex)
df

Unnamed: 0,method,path,timestamp,status,bytes_sent,user_agent,ip
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,200,510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,429,3561,python-requests/2.21.0,97.105.19.58
