# **Regular Expressions (Regex) Examples**

## General info
1. Some documentation
    * [re library](https://docs.python.org/3/library/re.html)
    * [Character Classes or Character Sets](https://www.regular-expressions.info/charclass.html)


2. Test regular expressions interactively
    * [regex101](https://regex101.com/)
    

3. Play with regex
    * [regexcrossword](https://regexcrossword.com/)
    
    
4. More info
    * https://docs.python.org/3/howto/regex.html
    * https://docs.python.org/3/library/re.html
    * https://docs.python.org/3/howto/regex.html#greedy-versus-non-greedy
    
## Methods used

* [re.search()](#Search())
* [re.findall()](#Findall())
* [re.sub()](#Sub())

In [14]:
import re 

## **Search()**

In [15]:
def repeating_letter_a(text):
    """Report whether ``text`` contains the letter "a" at least twice.

    Lowercase and uppercase both count, and the two occurrences may be
    separated by any characters, including line breaks.

    Args:
        text: The string to inspect.

    Returns:
        True if at least two "a"/"A" characters are present, False otherwise.
    """
    # re.DOTALL lets "." cross newlines, so occurrences that sit on
    # different lines of the text are still counted.
    result = re.search(r"[aA].*[aA]", text, re.DOTALL)
    return result is not None


# TEST <repeating_letter_a()>
test_list = ["banana", "pineapple", "pineapple", "Animal Kingdom", "A is for apple"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", repeating_letter_a(text))
    print("-"*40)

-> Input : banana
-> Output: True
----------------------------------------
-> Input : pineapple
-> Output: False
----------------------------------------
-> Input : pineapple
-> Output: False
----------------------------------------
-> Input : Animal Kingdom
-> Output: True
----------------------------------------
-> Input : A is for apple
-> Output: True
----------------------------------------


In [16]:
def check_character_groups(text):
    """Tell whether ``text`` holds two word groups split by whitespace.

    A word group is a run of letters, digits, or underscores; the two
    groups must be separated by one or more whitespace characters only.

    Returns:
        True when such a pair is found anywhere in ``text``, else False.
    """
    return re.search(r"\w+\s+\w+", text) is not None


# TEST <check_character_groups()>
test_list = ["One", "123  Ready Set GO", "username user_01", "shopping_list: milk, bread, eggs."]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", check_character_groups(text))
    print("-"*40)

-> Input : One
-> Output: False
----------------------------------------
-> Input : 123  Ready Set GO
-> Output: True
----------------------------------------
-> Input : username user_01
-> Output: True
----------------------------------------
-> Input : shopping_list: milk, bread, eggs.
-> Output: False
----------------------------------------


In [17]:
def check_sentence(text):
    """Check whether ``text`` looks like a standard sentence.

    A standard sentence starts with an uppercase letter, continues with
    one or more lowercase letters or spaces, and ends with a period,
    question mark, or exclamation point.

    Args:
        text: The string to validate.

    Returns:
        True if the whole string has that shape, False otherwise.
    """
    # fullmatch anchors both ends strictly; a "$" anchor would also accept
    # a sentence followed by a trailing newline.
    result = re.fullmatch(r"[A-Z][a-z ]+[.?!]", text)
    return result is not None


# TEST <check_sentence()>
test_list = ["Is this is a sentence?", "is this is a sentence?", "Hello", "1-2-3-GO!", "A star is born."]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", check_sentence(text))
    print("-"*40)

-> Input : Is this is a sentence?
-> Output: True
----------------------------------------
-> Input : is this is a sentence?
-> Output: False
----------------------------------------
-> Input : Hello
-> Output: False
----------------------------------------
-> Input : 1-2-3-GO!
-> Output: False
----------------------------------------
-> Input : A star is born.
-> Output: True
----------------------------------------


In [19]:
def extract_pid(log_line):
    """Extract the process id and the uppercase word that follows it.

    Looks for ``[<digits>]: <UPPERCASE>`` in ``log_line``.

    Returns:
        A string of the form "<pid> (<MESSAGE>)", or None when the line
        does not contain that pattern.
    """
    match = re.search(r"\[(\d+)\]: ([A-Z]+)", log_line)
    if match is not None:
        # groups() yields (pid, message) in pattern order.
        return "{} ({})".format(*match.groups())
    return None


# TEST <extract_pid()>
test_list = ["July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade", 
             "99 elephants in a [cage]", 
             "A string that also has numbers [34567] but no uppercase message", 
             "July 31 08:08:08 mycomputer new_process[67890]: RUNNING Performing backup"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", extract_pid(text))
    print("-"*40)

-> Input : July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade
-> Output: 12345 (ERROR)
----------------------------------------
-> Input : 99 elephants in a [cage]
-> Output: None
----------------------------------------
-> Input : A string that also has numbers [34567] but no uppercase message
-> Output: None
----------------------------------------
-> Input : July 31 08:08:08 mycomputer new_process[67890]: RUNNING Performing backup
-> Output: 67890 (RUNNING)
----------------------------------------


In [21]:
def check_web_address(text):
    """Check whether ``text`` qualifies as a top-level web address.

    The address must consist of one or more alphanumeric characters
    (letters, digits, underscores), periods, dashes, or plus signs,
    followed by a period and a letters-only top-level domain such as
    ".com", ".info", or ".edu".

    Args:
        text: The candidate address.

    Returns:
        True if the whole string has that shape, False otherwise.
    """
    # "+" (not "*") on both parts: an empty host (".com") or an empty
    # top-level domain ("gmail.") is not a valid address. fullmatch also
    # rejects a trailing newline that "$" would let through.
    pattern = r"[\w.\-+]+\.[a-zA-Z]+"
    result = re.fullmatch(pattern, text)
    return result is not None


# TEST <check_web_address()>
test_list = ["gmail.com", "www@google", "www.learning.org", "web-address.com/homepage", "My_Favorite-Blog.US"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", check_web_address(text))
    print("-"*40)

-> Input : gmail.com
-> Output: True
----------------------------------------
-> Input : www@google
-> Output: False
----------------------------------------
-> Input : www.learning.org
-> Output: True
----------------------------------------
-> Input : web-address.com/homepage
-> Output: False
----------------------------------------
-> Input : My_Favorite-Blog.US
-> Output: True
----------------------------------------


In [22]:
def check_time(text):
    """Check whether ``text`` is a time in 12-hour clock format.

    The expected format is: an hour between 1 and 12 with no leading
    zero, a colon, minutes between 00 and 59, an optional single space,
    and then AM or PM in upper or lower case.

    Args:
        text: The candidate time string.

    Returns:
        True if the whole string matches the format, False otherwise.
    """
    # The hour is spelled out as "10-12 or 1-9"; a looser "1*[0-9]" would
    # also accept 0, 13-19 and values such as 111. fullmatch rejects any
    # text trailing the AM/PM marker.
    pattern = r"(1[0-2]|[1-9]):[0-5][0-9] ?[aApP][mM]"
    result = re.fullmatch(pattern, text)
    return result is not None


# TEST <check_time()>
test_list = ["12:45pm", "9:59 AM", "6:60am", "five o'clock"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", check_time(text))
    print("-"*40)

-> Input : 12:45pm
-> Output: True
----------------------------------------
-> Input : 9:59 AM
-> Output: True
----------------------------------------
-> Input : 6:60am
-> Output: False
----------------------------------------
-> Input : five o'clock
-> Output: False
----------------------------------------


In [26]:
def contains_acronym(text):
    """Detect a parenthesised acronym-like token in ``text``.

    A token qualifies when it has two or more letters or digits between
    parentheses and its first character is an uppercase letter or a digit.
    For example, "Instant messaging (IM) is a set of communication
    technologies" qualifies because of "(IM)".

    Returns:
        True if at least one such token is present, False otherwise.
    """
    return bool(re.search(r"\([A-Z0-9][a-zA-Z0-9]+\)", text))


# TEST <contains_acronym()>
test_list = ["Instant messaging (IM) is a set of communication technologies used for text-based communication", 
             "American Standard Code for Information Interchange (ASCII) is a character encoding standard for electronic communication", 
             "Please do NOT enter without permission!", 
             "PostScript is a fourth-generation programming language (4GL)", 
             "Have fun using a self-contained underwater breathing apparatus (Scuba)!"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", contains_acronym(text))
    print("-"*40)

-> Input : Instant messaging (IM) is a set of communication technologies used for text-based communication
-> Output: True
----------------------------------------
-> Input : American Standard Code for Information Interchange (ASCII) is a character encoding standard for electronic communication
-> Output: True
----------------------------------------
-> Input : Please do NOT enter without permission!
-> Output: False
----------------------------------------
-> Input : PostScript is a fourth-generation programming language (4GL)
-> Output: True
----------------------------------------
-> Input : Have fun using a self-contained underwater breathing apparatus (Scuba)!
-> Output: True
----------------------------------------


In [27]:
def check_zip_code(text):
    """Check whether ``text`` includes a possible U.S. zip code.

    A zip code is exactly 5 digits, optionally followed by a dash and 4
    more digits. It must be preceded by at least one whitespace
    character, so it cannot be at the very start of the text.

    Args:
        text: The text to search.

    Returns:
        True if a zip code is found, False otherwise.
    """
    # "\s" enforces the preceding whitespace (and therefore "not at the
    # start"); the negative lookahead stops longer digit runs such as
    # phone numbers from being read as a 5-digit zip code.
    result = re.search(r"\s[0-9]{5}(?:-[0-9]{4})?(?![0-9])", text)
    return result is not None


# TEST <check_zip_code()>
test_list = ["The zip codes for New York are 10001 thru 11104.", 
             "90210 is a TV show", 
             "Their address is: 123 Main Street, Anytown, AZ 85258-0001.", 
             "The Parliament of Canada is at 111 Wellington St, Ottawa, ON K1A0A9."]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", check_zip_code(text))
    print("-"*40)

-> Input : The zip codes for New York are 10001 thru 11104.
-> Output: True
----------------------------------------
-> Input : 90210 is a TV show
-> Output: False
----------------------------------------
-> Input : Their address is: 123 Main Street, Anytown, AZ 85258-0001.
-> Output: True
----------------------------------------
-> Input : The Parliament of Canada is at 111 Wellington St, Ottawa, ON K1A0A9.
-> Output: False
----------------------------------------


In [29]:
def multi_vowel_words(text):
    """Return all words with 3 or more consecutive vowels (a, e, i, o, u).

    Vowels are matched in either case, so "QUEUE" is found as well as
    "queue".

    Args:
        text: The text to search.

    Returns:
        A list of the matching words in order of appearance; empty when
        no word qualifies.
    """
    # Raw string: "\w" in a plain string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    pattern = r"\w*[aeiou]{3}\w*"
    result = re.findall(pattern, text, re.IGNORECASE)
    return result


# TEST <multi_vowel_words()>
test_list = ["Life is beautiful", 
             "Obviously, the queen is courageous and gracious.", 
             "The rambunctious children had to sit quietly and await their delicious dinner.", 
             "The order of a data queue is First In First Out (FIFO)", 
             "Hello world!"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", multi_vowel_words(text))
    print("-"*40)

-> Input : Life is beautiful
-> Output: ['beautiful']
----------------------------------------
-> Input : Obviously, the queen is courageous and gracious.
-> Output: ['Obviously', 'queen', 'courageous', 'gracious']
----------------------------------------
-> Input : The rambunctious children had to sit quietly and await their delicious dinner.
-> Output: ['rambunctious', 'quietly', 'delicious']
----------------------------------------
-> Input : The order of a data queue is First In First Out (FIFO)
-> Output: ['queue']
----------------------------------------
-> Input : Hello world!
-> Output: []
----------------------------------------


## **Findall()**

In [20]:
def long_words(text):
    """Collect every run of 7 or more word characters found in ``text``.

    Returns:
        A list of the matching words in order of appearance; empty when
        there are none.
    """
    return re.findall(r"\w{7,}", text)


# TEST <long_words()>
test_list = ["I like to drink coffee in the morning.", 
             "I also have a taste for hot chocolate in the afternoon.", 
             "I never drink tea late at night."]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", long_words(text))
    print("-"*40)

-> Input : I like to drink coffee in the morning.
-> Output: ['morning']
----------------------------------------
-> Input : I also have a taste for hot chocolate in the afternoon.
-> Output: ['chocolate', 'afternoon']
----------------------------------------
-> Input : I never drink tea late at night.
-> Output: []
----------------------------------------


## **Sub()**

In [28]:
def transform_record(record):
    """Rewrite the phone number of a CSV employee record in international format.

    Each record has a name field, followed by a phone number field and a
    role field. The U.S. phone number is prefixed with "+1-"; the name
    and role fields are left untouched.

    Args:
        record: One CSV line of the form "name,phone,role".

    Returns:
        The record with "+1-" in front of the phone number. A record
        without a digits-and-dashes field between two commas is returned
        unchanged.
    """
    # Anchor on the surrounding commas so that only the phone field is
    # rewritten; digits or dashes inside the name or role (for example
    # "Mary-Jane" or "Level-2") must not receive a "+1-" prefix.
    new_record = re.sub(r",([0-9][0-9-]*),", r",+1-\1,", record)
    return new_record


# TEST <transform_record()>
test_list = ["Sabrina Green,802-867-5309,System Administrator", 
             "Eli Jones,684-3481127,IT specialist", 
             "Melody Daniels,846-687-7436,Programmer", 
             "Charlie Rivera,698-746-3357,Web Developer"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", transform_record(text))
    print("-"*40)

-> Input : Sabrina Green,802-867-5309,System Administrator
-> Output: Sabrina Green,+1-802-867-5309,System Administrator
----------------------------------------
-> Input : Eli Jones,684-3481127,IT specialist
-> Output: Eli Jones,+1-684-3481127,IT specialist
----------------------------------------
-> Input : Melody Daniels,846-687-7436,Programmer
-> Output: Melody Daniels,+1-846-687-7436,Programmer
----------------------------------------
-> Input : Charlie Rivera,698-746-3357,Web Developer
-> Output: Charlie Rivera,+1-698-746-3357,Web Developer
----------------------------------------


In [30]:
def transform_comments(line_of_code):
    """Turn Python comment markers in a line into C single-line markers.

    Every run of one or more hash marks (#, ##, ###, ...) is replaced by
    a single "//". Hash marks are assumed to appear only as comment
    indicators; a hash embedded in a Python command is not treated
    specially.

    Returns:
        The line with each hash run replaced; lines without a hash mark
        come back unchanged.
    """
    hash_run = r"([#]+)"
    c_marker = r"//"
    return re.sub(hash_run, c_marker, line_of_code)


# TEST <transform_comments()>
test_list = ["### Start of program", 
             "  number = 0   ## Initialize the variable", 
             "  number += 1   # Increment the variable", 
             "  return(number)"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", transform_comments(text))
    print("-"*40)

-> Input : ### Start of program
-> Output: // Start of program
----------------------------------------
-> Input :   number = 0   ## Initialize the variable
-> Output:   number = 0   // Initialize the variable
----------------------------------------
-> Input :   number += 1   # Increment the variable
-> Output:   number += 1   // Increment the variable
----------------------------------------
-> Input :   return(number)
-> Output:   return(number)
----------------------------------------


In [31]:
def convert_phone_number(phone):
    """Reformat U.S. phone numbers from XXX-XXX-XXXX to (XXX) XXX-XXXX.

    Every occurrence of 3 digits, a dash, 3 digits, a dash, and 4 digits
    that is delimited by word boundaries is rewritten; all other text is
    left as it is.

    Returns:
        The input text with each matching phone number converted.
    """
    us_number = r"\b([0-9]{3})-([0-9]{3})-([0-9]{4})\b"
    formal_layout = r"(\1) \2-\3"
    return re.sub(us_number, formal_layout, phone)


# TEST <convert_phone_number()>
test_list = ["My number is 212-345-9999.", 
             "Please call 888-555-1234", 
             "123-123-12345", 
             "Phone number of Buckingham Palace is +44 303 123 7300"]

for text in test_list:
    print("-> Input :", text)
    print("-> Output:", convert_phone_number(text))
    print("-"*40)

-> Input : My number is 212-345-9999.
-> Output: My number is (212) 345-9999.
----------------------------------------
-> Input : Please call 888-555-1234
-> Output: Please call (888) 555-1234
----------------------------------------
-> Input : 123-123-12345
-> Output: 123-123-12345
----------------------------------------
-> Input : Phone number of Buckingham Palace is +44 303 123 7300
-> Output: Phone number of Buckingham Palace is +44 303 123 7300
----------------------------------------
