Site to play with regex: https://regex101.com/

## 1- Importing Regex Module

In [1]:
# Imporintg Regex library
import re

### 1.1 Creating a custom string

In [85]:
text_to_search = '''
abcdefghijklmnopqurtuvwxyz
Yo Yo Yo
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
Carrot MyCarrot
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
cat
mat
rat
bat
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

### 1.2 Applying regex to find desired pattern

In [3]:
# "re" stands for regex. "r" before XYZ makes it Raw-String
search = re.compile(r"XYZ")

# Finditer tries to search for the Indices of the pattern we want to find
matches = search.finditer(text_to_search)

# Looping through the matches to get the information of exact indices from the string of our pattern
for match in matches:
    print(match)

<re.Match object; span=(51, 54), match='XYZ'>


## 2- Meta-characters: 
**. ^ $ * + ? { } [ ] \ | ( )**
<br>They need to be escaped otherwise we get weird results

In [4]:
# In order to escape meta characters we use backslash \
search = re.compile(r"\.")
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(111, 112), match='.'>
<re.Match object; span=(146, 147), match='.'>
<re.Match object; span=(167, 168), match='.'>
<re.Match object; span=(171, 172), match='.'>
<re.Match object; span=(218, 219), match='.'>
<re.Match object; span=(249, 250), match='.'>
<re.Match object; span=(262, 263), match='.'>


### 2.1 For finding a URL

In [5]:
# In order to escape meta characters we use backslash \
search = re.compile(r"coreyms\.com")
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(139, 150), match='coreyms.com'>


## 3- Special Characters
* .       - Period Means Any Character Except New Line
* \d      - Digit (0-9)
* \D      - Not a Digit (0-9)
* \w      - Word Character (a-z, A-Z, 0-9, _)
* \W      - Not a Word Character
* \s      - Whitespace (space, tab, newline)
* \S      - Not Whitespace (space, tab, newline)
* 
* \b      - Word Boundary (No characters before it)
* \B      - Not a Word Boundary (Has characters before it)
* ^       - Begining of a string
* $       - End of a string
* 
* [ ]     - Matches characters inside the bracket
* [^ ]    - Not Matches characters inside the bracket
* |       - Either OR
* ( )     - Group
### **Quantifiers**
* \*      - 0 ore more
* \+      - 1 or more
* ?       - 0 or 1
* {3}     - Exact number of times
* {3,4}   - Range of numbers


### 3.1 Word Boundary \b is used to get characters that've no characters before them.

In [41]:
# NOTE: \B does the opposite of \b i.e., "\BHa"
search = re.compile(r"\bHa")
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(66, 68), match='Ha'>
<re.Match object; span=(69, 71), match='Ha'>


### 3.2 The carrot ^ can be used to match the beginning of a string

In [42]:
# NOTE: $ does the opposite of ^ i.e., "end$"
sentence = "Start of everything eventually comes to an end"
search = re.compile(r"^Start")
matches = search.finditer(sentence)
for match in matches:
    print(match)

<re.Match object; span=(0, 5), match='Start'>


### 3.3 Using character set [ ] while finding phone numbers

In [6]:
# In order to escape meta characters we use backslash \
# Below the [] is the character set: Where we can use custom characters to be found
search = re.compile(r"\d\d\d[-.]\d\d\d[.-]\d\d\d") 
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(151, 162), match='321-555-432'>
<re.Match object; span=(164, 175), match='123.555.123'>
<re.Match object; span=(190, 201), match='800-555-123'>
<re.Match object; span=(203, 214), match='900-555-123'>


### 3.4 Grabbing all phone numbers from a Text file

In [73]:
with open("data.txt", "r") as f:
    text = f.read()
    search = re.compile(r"\d\d\d.\d\d\d.\d\d\d")
    matches = search.finditer(text)
    for match in matches:
     print(match)
    f.close()

<re.Match object; span=(12, 23), match='615-555-716'>
<re.Match object; span=(102, 113), match='800-555-566'>
<re.Match object; span=(191, 202), match='560-555-515'>
<re.Match object; span=(281, 292), match='900-555-934'>
<re.Match object; span=(378, 389), match='714-555-740'>
<re.Match object; span=(467, 478), match='800-555-677'>
<re.Match object; span=(557, 568), match='783-555-479'>
<re.Match object; span=(647, 658), match='516-555-461'>
<re.Match object; span=(740, 751), match='127-555-186'>
<re.Match object; span=(831, 842), match='608-555-493'>
<re.Match object; span=(917, 928), match='568-555-605'>
<re.Match object; span=(1005, 1016), match='292-555-187'>
<re.Match object; span=(1093, 1104), match='900-555-320'>
<re.Match object; span=(1182, 1193), match='614-555-116'>
<re.Match object; span=(1273, 1284), match='530-555-267'>
<re.Match object; span=(1359, 1370), match='470-555-275'>
<re.Match object; span=(1443, 1454), match='800-555-608'>
<re.Match object; span=(1530, 1541), m

### 3.5 Grabbing all phone numbers that have a common prefix

In [17]:
# Below the [] is the character set: Where we can use custom characters to be found
search = re.compile(r"[89]00[-.]\d\d\d[.-]\d\d\d") 
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(190, 201), match='800-555-123'>
<re.Match object; span=(203, 214), match='900-555-123'>


### 3.6 Grabbing all phone numbers that have a common prefix from a file

In [72]:
with open("data.txt", "r") as f:
    text = f.read()
    search = re.compile(r"[89]00[-.]\d\d\d[-.]\d\d\d")
    matches = search.finditer(text)
    for match in matches:
     print(match)
    f.close()

<re.Match object; span=(102, 113), match='800-555-566'>
<re.Match object; span=(281, 292), match='900-555-934'>
<re.Match object; span=(467, 478), match='800-555-677'>
<re.Match object; span=(1093, 1104), match='900-555-320'>
<re.Match object; span=(1443, 1454), match='800-555-608'>
<re.Match object; span=(1794, 1805), match='800-555-710'>
<re.Match object; span=(2055, 2066), match='900-555-511'>
<re.Match object; span=(2830, 2841), match='900-555-542'>
<re.Match object; span=(3290, 3301), match='800-555-881'>
<re.Match object; span=(3977, 3988), match='900-555-959'>
<re.Match object; span=(4951, 4962), match='800-555-242'>
<re.Match object; span=(5572, 5583), match='900-555-356'>
<re.Match object; span=(6195, 6206), match='800-555-321'>
<re.Match object; span=(6897, 6908), match='900-555-775'>
<re.Match object; span=(7872, 7883), match='800-555-137'>
<re.Match object; span=(8751, 8762), match='900-555-642'>


### 3.7 Using a range for digits and characters

In [24]:
# Below the - acts as a range
search = re.compile(r"[1-5]") 
# If you want a range for characters simply write [a-z] or [a-zA-Z] for all upper and lowers
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(55, 56), match='1'>
<re.Match object; span=(56, 57), match='2'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='4'>
<re.Match object; span=(59, 60), match='5'>
<re.Match object; span=(151, 152), match='3'>
<re.Match object; span=(152, 153), match='2'>
<re.Match object; span=(153, 154), match='1'>
<re.Match object; span=(155, 156), match='5'>
<re.Match object; span=(156, 157), match='5'>
<re.Match object; span=(157, 158), match='5'>
<re.Match object; span=(159, 160), match='4'>
<re.Match object; span=(160, 161), match='3'>
<re.Match object; span=(161, 162), match='2'>
<re.Match object; span=(162, 163), match='1'>
<re.Match object; span=(164, 165), match='1'>
<re.Match object; span=(165, 166), match='2'>
<re.Match object; span=(166, 167), match='3'>
<re.Match object; span=(168, 169), match='5'>
<re.Match object; span=(169, 170), match='5'>
<re.Match object; span=(170, 171), match='5'>
<re.Match object; span=(172, 173), match='1'

### 3.8 Carrot ^ as a NOT when used in [ ]

In [43]:
# Below the - acts as a range
search = re.compile(r"[^a-z]") 
# If you want a range for characters simply write [a-z] or [a-zA-Z] for all upper and lowers
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(27, 28), match='\n'>
<re.Match object; span=(28, 29), match='A'>
<re.Match object; span=(29, 30), match='B'>
<re.Match object; span=(30, 31), match='C'>
<re.Match object; span=(31, 32), match='D'>
<re.Match object; span=(32, 33), match='E'>
<re.Match object; span=(33, 34), match='F'>
<re.Match object; span=(34, 35), match='G'>
<re.Match object; span=(35, 36), match='H'>
<re.Match object; span=(36, 37), match='I'>
<re.Match object; span=(37, 38), match='J'>
<re.Match object; span=(38, 39), match='K'>
<re.Match object; span=(39, 40), match='L'>
<re.Match object; span=(40, 41), match='M'>
<re.Match object; span=(41, 42), match='N'>
<re.Match object; span=(42, 43), match='O'>
<re.Match object; span=(43, 44), match='P'>
<re.Match object; span=(44, 45), match='Q'>
<re.Match object; span=(45, 46), match='R'>
<re.Match object; span=(46, 47), match='S'>
<re.Match object; span=(47, 48), match='T'>
<re.Match object; span=(48, 49),

### 3.9 Skipping "bat" and only fetching "cat, mat, rat"

In [45]:
search = re.compile(r"[^b]at") 
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(232, 235), match='cat'>
<re.Match object; span=(236, 239), match='mat'>
<re.Match object; span=(240, 243), match='rat'>


### 3.10 Using exact number of times { } to simplify our code

In [70]:
with open("data.txt", "r") as f:
    read = f.read()
    # Instead of multiple \d\d\d we used \d{3}
    search = re.compile(r"\d{3}.\d{3}.\d{4}") 
    matches = search.finditer(read)
    for match in matches:
        print(match)
    f.close()

<re.Match object; span=(12, 24), match='615-555-7164'>
<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(191, 203), match='560-555-5153'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(378, 390), match='714-555-7405'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(557, 569), match='783-555-4799'>
<re.Match object; span=(647, 659), match='516-555-4615'>
<re.Match object; span=(740, 752), match='127-555-1867'>
<re.Match object; span=(831, 843), match='608-555-4938'>
<re.Match object; span=(917, 929), match='568-555-6051'>
<re.Match object; span=(1005, 1017), match='292-555-1875'>
<re.Match object; span=(1093, 1105), match='900-555-3205'>
<re.Match object; span=(1182, 1194), match='614-555-1166'>
<re.Match object; span=(1273, 1285), match='530-555-2676'>
<re.Match object; span=(1359, 1371), match='470-555-2750'>
<re.Match object; span=(1443, 1455), match='800-555-6089'>
<re.Match object; spa

### 3.11 Fetching Mr Name from the text

In [58]:
# Using ? to get 0 or 1 quantity of dot "."
# \s is whitespace
# \w is all a-z, A-Z, _
# Asterick * means 0 or more quantity afterwards
search = re.compile(r"Mr\.?\s\w*") 
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(248, 259), match='Mr. Schafer'>
<re.Match object; span=(260, 268), match='Mr Smith'>
<re.Match object; span=(292, 297), match='Mr. T'>


### 3.12 Fetching Mr and Mrs Name from the text

In [60]:
# | is OR
# ( ) is a group
search = re.compile(r"M(r|s|rs)\.?\s\w*") # Another way (Mr|Ms|Mrs)
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(248, 259), match='Mr. Schafer'>
<re.Match object; span=(260, 268), match='Mr Smith'>
<re.Match object; span=(269, 277), match='Ms Davis'>
<re.Match object; span=(278, 291), match='Mrs. Robinson'>
<re.Match object; span=(292, 297), match='Mr. T'>


## 4- Practical Implementation

### 4.1 Fetch all the email addresses

In [105]:
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

search = re.compile(r".*@.*(\w|\.)*")
matches = search.finditer(emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


### 4.2 Replace URLS with domain name and top domain level only

In [57]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

pattern = re.compile(r"https?://(www\.)?(\w+\.\w+)")

matches = pattern.finditer(urls)

for match in matches:
    # group 2 is the group we created in the compile function at the last (\w+\.\w+)
    print(match.group(2))

google.com
coreyms.com
youtube.com
nasa.gov


In [62]:
pattern = re.compile(r"https?://(www\.)?(\w+\.\w+)")
# substitute URLS with group 2
subbed_urls = pattern.sub(r"\2", urls)

print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



### 4.3 Use Different functions of Regex

#### Findall function

In [70]:
# Findall.....it only fetches the pattern and not the indices along with it
search = re.compile(r"\d{3}.\d{3}.\d{4}")
matches = search.findall(text_to_search)
for match in matches:
    print(match)

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234


#### Finditer function

In [75]:
# Finditer.....its fetches the indices of the pattern
search = re.compile(r"\d{3}.\d{3}.\d{4}")
matches = search.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(167, 179), match='321-555-4321'>
<re.Match object; span=(180, 192), match='123.555.1234'>
<re.Match object; span=(193, 205), match='123*555*1234'>
<re.Match object; span=(206, 218), match='800-555-1234'>
<re.Match object; span=(219, 231), match='900-555-1234'>


#### Match function

In [89]:
# Match.....Fetches only one pattern from beginning of a string. Not iterable
sentence = "My name is Arsalan Ali"
search = re.compile(r"My")
matches = search.match(sentence)
print(matches)

<re.Match object; span=(0, 2), match='My'>


#### Search function

In [90]:
# Search.....Fetches only one pattern from anywhere in the string. Not iterable
sentence = "My name is Arsalan Ali"
search = re.compile(r"Arsalan")
matches = search.search(sentence)
print(matches)

<re.Match object; span=(11, 18), match='Arsalan'>


#### Flags in Regular Expressions
* **IGNORECASE:** Ignores the case sensitivity of a string
* **I:** Its the same as IGNORECASE

In [91]:
sentence = "My name is Arsalan Ali"
search = re.compile(r"arsalan", re.I)
matches = search.search(sentence)
print(matches)

<re.Match object; span=(11, 18), match='Arsalan'>


In [93]:
search = re.compile(r"\d\d\d[-.*]\d\d\d[-.*]\d\d\d") 
matches = search.findall(text_to_search)
for match in matches:
    print(match)

321-555-432
123.555.123
123*555*123
800-555-123
900-555-123
