# Regex

In [63]:
import re

# Sample text searched by the regex examples in this notebook.
# NOTE: this is a regular (non-raw) string, so the literal backslash in the
# metacharacter line is written as '\\'. The resulting text still contains a
# single backslash, and every character offset is unchanged, but the source no
# longer relies on the invalid escape sequence '\ ' (a SyntaxWarning on
# Python 3.12+).
text_to_search = '''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
12345678

Ha HaHa

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \\ | ( ) 

coreyms.com

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900.555-1234


Mr. Schafer
Mr. Taylor
Ms Davis
Mrs. Robinson
Mr. T

cat
mat
pat
pat

'''

# Short sentence used for the anchor (^ / $) examples.
sentence = 'Start a sentence and then bring it to the end'


# A raw string in Python is a string prefixed with an r. That tells Python not to handle backslashes in any special
# way. Usually backslashes are used to specify tabs, new lines, etc.

# print(r'\tTab')
# print('\tTab')


# .     - Any character except a new line
# \d    - Digit (0-9)
# \D    - Not a digit (0-9)
# \w    - Word character (a-z, A-Z, 0-9, _)
# \W    - Not a word character 
# \s    - Whitespace (space, tab, newline)
# \S    - Not Whitespace (space, tab, newline)

# \b    - Word boundary 
# \B    - Not a word boundary 
# ^     - Beginning of a string
# $     - End of a string

# []    - Matches characters in brackets
# [^ ]  - Matches characters NOT in brackets


In [29]:
# Compile a literal pattern. Matching is case-sensitive, so the capital 'ABC' is not matched.
pattern = re.compile(r'abc')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

# Slicing the text with the span indices of the match returns the matched substring.
print(text_to_search[1:4])

# The span is the beginning and end index of the match (the end index is exclusive).

<re.Match object; span=(1, 4), match='abc'>
abc


In [30]:
# A '.' with no backslash matches any character except a newline; '\.' matches a literal dot.
# pattern = re.compile(r'\.')
# Metacharacters must be escaped with a backslash in the pattern to be matched literally.
pattern = re.compile(r'coreyms\.com')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)
    

<re.Match object; span=(141, 152), match='coreyms.com'>


In [31]:
# \b requires a word boundary before 'Ha', so in 'Ha HaHa' only the first and
# second 'Ha' are matched.
# With an uppercase \B instead, only the last 'Ha' would match, because it does
# not have a word boundary before it.
pattern = re.compile(r'\bHa')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)


<re.Match object; span=(65, 67), match='Ha'>
<re.Match object; span=(68, 70), match='Ha'>


In [36]:
# '$' anchors the match at the end of the string (or just before a trailing
# newline). No re.MULTILINE flag is passed, so it does not match at the end of
# each line.
pattern = re.compile(r'a$')     

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

# Matching Phone Numbers with Wildcard Separators in a Text File

In [50]:
# Three digits, any character, three digits, any character, four digits.
# A '.' with no backslash matches any character except a newline, so any
# separator character is accepted between the digit groups.
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

# with open('/Users/sammarcustaylor/Desktop/data.txt', 'r') as f:
#     contents = f.read()
    
#     matches = pattern.finditer(contents)

# for match in matches:
#     print(match)

<re.Match object; span=(154, 166), match='321-555-4321'>
<re.Match object; span=(167, 179), match='123.555.1234'>


In [52]:
# If we wanted to match phone numbers with only specific separators, we would use
# a character set.

# The character set [-.] says that we only want to match phone numbers whose separators are a dash or a dot.
# This would exclude the third number (123*555*1234).

# Same digit layout as before, but each separator must be one of the characters
# in the set [-.], i.e. a dash or a dot.
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')        

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)



<re.Match object; span=(154, 166), match='321-555-4321'>
<re.Match object; span=(167, 179), match='123.555.1234'>


In [59]:
# [89]00 restricts the first three digits to 800 or 900; each separator may be
# a dash or a dot.
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')        

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<re.Match object; span=(193, 205), match='800-555-1234'>
<re.Match object; span=(206, 218), match='900.555-1234'>


# Using Dashes -

In [62]:
# A caret (^) as the first character of a set negates the set.
# Here the dashes define the ranges a-z and A-Z, so this pattern matches every
# single character that is NOT a letter.
pattern = re.compile(r'[^a-zA-Z]')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(27, 28), match='\n'>
<re.Match object; span=(54, 55), match='\n'>
<re.Match object; span=(55, 56), match='1'>
<re.Match object; span=(56, 57), match='2'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='4'>
<re.Match object; span=(59, 60), match='5'>
<re.Match object; span=(60, 61), match='6'>
<re.Match object; span=(61, 62), match='7'>
<re.Match object; span=(62, 63), match='8'>
<re.Match object; span=(63, 64), match='\n'>
<re.Match object; span=(64, 65), match='\n'>
<re.Match object; span=(67, 68), match=' '>
<re.Match object; span=(72, 73), match='\n'>
<re.Match object; span=(73, 74), match='\n'>
<re.Match object; span=(88, 89), match=' '>
<re.Match object; span=(89, 90), match='('>
<re.Match object; span=(94, 95), match=' '>
<re.Match object; span=(97, 98), match=' '>
<re.Match object; span=(100, 101), match=' '>
<re.Match object; span=(108, 109), match=')'>
<re.Match object; span=

In [67]:
# A caret (^) as the first character of a set negates the set.
# [^b]at matches any single character other than 'b', followed by 'at'
# (e.g. 'cat', 'mat' and 'pat', but not 'bat').
pattern = re.compile(r'[^b]at')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<re.Match object; span=(274, 277), match='cat'>
<re.Match object; span=(278, 281), match='mat'>
<re.Match object; span=(282, 285), match='pat'>
<re.Match object; span=(286, 289), match='pat'>


# Quantifiers

# *       - 0 or more
# +       - 1 or more
# ?       - 0 or one
# {3}     - Exact number
# {3,4}   - Range of numbers (minimum, maximum)

In [69]:
# A quantifier matches multiple characters at once: it states how many digits
# we are looking for without having to type them all out.
# Equivalent to r'\d\d\d.\d\d\d.\d\d\d\d'.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<re.Match object; span=(154, 166), match='321-555-4321'>
<re.Match object; span=(167, 179), match='123.555.1234'>
<re.Match object; span=(180, 192), match='123*555*1234'>
<re.Match object; span=(193, 205), match='800-555-1234'>
<re.Match object; span=(206, 218), match='900.555-1234'>


In [80]:
# Write a pattern that matches prefixes and entire names that come afterward. 
# Mr. Schafer
# Mr. Taylor
# Ms Davis
# Mrs. Robinson
# Mr. T

# question mark quantifier says that we want to match zero or one of those characters. 
# Therefore Mr. or Mr

# pattern = re.compile(r'Mr\.?\s[A-Z]\w+')     # The problem with this one is it does not match on Mr. T
# pattern = re.compile(r'Mr\.?\s[A-Z]\w*')  # Pass in asterisk to get Mr. T - 0 or more.  

# (Mr|Ms|Mrs) is a group of alternative prefixes, \.? makes the period optional,
# \s matches the whitespace after the prefix, and [A-Z]\w* matches an uppercase
# letter followed by zero or more word characters (so 'Mr. T' also matches).
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*') 

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<re.Match object; span=(221, 232), match='Mr. Schafer'>
<re.Match object; span=(233, 243), match='Mr. Taylor'>
<re.Match object; span=(244, 252), match='Ms Davis'>
<re.Match object; span=(253, 266), match='Mrs. Robinson'>
<re.Match object; span=(267, 272), match='Mr. T'>


# Regex to Match Email Addresses

In [102]:
# Sample e-mail addresses, one per line.
emails = '''
CoreyMSchafer@gmail.com
corey.shafer@university.edu
corey-321-schafer@my-work.net
'''

# Using large character sets.
# Local part:  [a-zA-Z0-9_.+-]+  one or more lower/upper case letters, digits,
#              or any of _ . + -
# Then the literal @ sign.
# Domain:      [a-zA-Z0-9-]+     one or more letters, digits or hyphens, up to
#              the dot, which is escaped with a backslash.
# After the dot: [a-zA-Z0-9.-]+  one or more letters, digits, hyphens or
#              further dots.
#
# NOTE: the ranges must be written A-Z. The range A-z also covers the ASCII
# characters between 'Z' and 'a' ([ \ ] ^ _ `), which would be accepted as part
# of the domain. The hyphen is placed last in each set so that it is
# unambiguously a literal and not a range operator.
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+')

matches = pattern.finditer(emails)

for match in matches:
    print(match)


<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 52), match='corey.shafer@university.edu'>
<re.Match object; span=(53, 82), match='corey-321-schafer@my-work.net'>


# Capture info from Groups

In [130]:
# grab only the domain name

# https? we place a question mark because the s is optional 0 or 1.
# (www\.)? we place a group with a question mark because it is optional. 
# \w+\.  backslash w to match a word character, a plus to match one or more. All the way up to the dot. 
# We do the same with the last portion and break up the pieces into groups.

# group method on the match object that allow you to select which group we want to select.

# Regex has a sub method that allows us to perform a substitution. 
# subbed_urls uses the pattern below to replace each matched URL in urls with its group 2 and group 3.


# Sample URLs, one per line, with and without the "www." prefix.
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://nasa.gov
'''

# Group 1: optional "www." prefix.
# Group 2: domain name.
# Group 3: top-level domain, including its leading dot.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# Replace every matched URL with groups 2 and 3 only (domain name + top-level domain).
subbed_urls = re.sub(pattern, r'\2\3', urls)

print(subbed_urls)

# matches = pattern.finditer(urls)

# for match in matches:
#     print(match.group(2))


google.com
coreyms.com
youtube.com
nasa.gov



# Other methods of regex

In [132]:
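#matches = pattern.findall(urls)      Returns a list of the matched strings (or tuples of groups when the pattern has
# groups). finditer yields match objects instead, which come with added functionality such as selecting groups.

#matches = pattern.match(urls)       Only matches at the beginning of the string (not at the beginning of each line).

#matches = pattern.search(urls)      Returns the first match it finds anywhere in the string (or None).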

sentence = 'Start a sentence and then bring it to an end'

# Flags change how a pattern matches; re.I (short for re.IGNORECASE) makes the
# match case-insensitive, so 'start' also matches 'Start'.
pattern = re.compile(r'start', flags=re.I)

# search() returns the first match found anywhere in the string, or None.
matches = re.search(pattern, sentence)
print(matches)

<re.Match object; span=(0, 5), match='Start'>
