# Mounting Google Drive

In [1]:
# Mount Google Drive into the Colab filesystem at /content/Drive
from google.colab import drive
drive.mount("/content/Drive")

Mounted at /content/Drive


# Regular Expressions

In [2]:
import re

## Searching for a Specific String

In [3]:
# The literal word we are looking for, and the sentence to look in
pattern = "phone"
text = "The phone number of the agent is 408-555-1234. Call soon!"

# re.search scans the text and returns a Match object for the first occurrence
my_match = re.search(pattern, text)

# A Match records where it was found: span() is the (start, end) pair,
# with `end` being the index just past the last matched character
print(
    f"\nspan  - {my_match.span()}"
    f"\nstart - {my_match.start()}"
    f"\nend   - {my_match.end()}\n"
)


span  - (4, 9)
start - 4
end   - 9



In [4]:
# This time the word occurs twice in the sentence
pattern = "phone"
text = "my phone is a new phone"

# re.search stops at the first occurrence; later ones are not reported
my_match = re.search(pattern, text)

# Position of that first occurrence only
print(
    "",
    f"span  - {my_match.span()}",
    f"start - {my_match.start()}",
    f"end   - {my_match.end()}",
    "",
    sep="\n",
)


span  - (3, 8)
start - 3
end   - 8



In [5]:
# re.findall returns every non-overlapping match as a list, so its length is the occurrence count
print(f"Pattern repeated : {len(re.findall(pattern, text))}")

Pattern repeated : 2


In [6]:
# re.finditer yields one Match object per occurrence; show where each one sits in the text
for match in re.finditer(pattern, text):
    print(match.span())

(3, 8)
(18, 23)


## Searching for a General Pattern e.g. Phone Numbers

### Identifiers for Characters in Patterns

In [7]:
# Sample sentence containing a single phone number in ddd-ddd-dddd form;
# the next cell searches it with a digit-based pattern
text = "My telephone number is 777-555-1234"

In [8]:
# Regex pattern: \d matches one digit, so this spells out ddd-ddd-dddd digit by digit
pattern = r"\d\d\d-\d\d\d-\d\d\d\d"

# re.search returns a Match object for the first place the pattern occurs in `text`
phone_number = re.search(pattern, text)

# group() is the matched substring, span() its (start, end) position in `text`
print("Match     :", phone_number.group())
print("Location  :", phone_number.span())

Match     : 777-555-1234
Location  : (23, 35)


In [9]:
# A sentence with two phone numbers: re.finditer yields a Match for each of them in turn
text = "My telephone number is 777-555-1234 and the office telephone number is 444-111-4321"

for phone_number in re.finditer(pattern, text):
    print("Match     :", phone_number.group())
    # The extra "\n" argument leaves a blank line between successive results
    print("Location  :", phone_number.span(), "\n")

Match     : 777-555-1234
Location  : (23, 35) 

Match     : 444-111-4321
Location  : (71, 83) 



### Quantifiers for Pattern Recognition

In [10]:
# Regex pattern: {n} repeats the preceding token exactly n times,
# so \d{3} is shorthand for \d\d\d
pattern = r"\d{3}-\d{3}-\d{4}"

# re.search returns a Match object for the first place the pattern occurs in `text`
phone_number = re.search(pattern, text)

# group() is the matched substring, span() its (start, end) position in `text`
print("Match     :", phone_number.group())
print("Location  :", phone_number.span())

Match     : 777-555-1234
Location  : (23, 35)


## Groups

In [11]:
# Regex Pattern: each pair of parentheses forms a capture group, so the three
# digit runs can be retrieved individually from the resulting Match
pattern = r"(\d{3})-(\d{3})-(\d{4})"

# Searching for the pattern (only the first occurrence in `text` is returned)
phone_number = re.search(pattern, text)

In [12]:
# Index 0 (the same as .group() with no argument) is the entire matched text
print(phone_number[0])

777-555-1234


In [13]:
# Capture groups are numbered from 1 in the order their parentheses open;
# indexing the Match with [1] is equivalent to .group(1)
print(phone_number[1])

777


In [14]:
# Second capture group of the pattern
print(phone_number[2])

555


In [15]:
# Third capture group of the pattern
print(phone_number[3])

1234


### Using the Pipe Operator

In [16]:
# Sentence to search
text = "This man was here"

# "|" means "or" inside a pattern: this one matches either "man" or "woman"
pattern = r"man|woman"

# First occurrence of either alternative
mymatch = re.search(pattern, text)

# The Match repr shows both the span and the matched text
print(mymatch)

<re.Match object; span=(5, 8), match='man'>


In [17]:
# `pattern` is reused from the previous cell; only the sentence changes.
# The search reports the leftmost match, so "woman" (starting at index 5) is
# returned rather than the "man" contained inside it.
text = "This woman was here"

# Searching for the pattern
mymatch = re.search(pattern, text)

# Printing the returned data
print(mymatch)

<re.Match object; span=(5, 10), match='woman'>


### Using the Wildcard Character

In [18]:
# "." is a wildcard for any single character (except a newline), so this
# pattern matches any one character followed by "at"
pattern = r".at"
text = "The cat sat in the hat"

# Collect every non-overlapping match as a list of strings
mymatch = re.findall(pattern, text)

# Show the matches, followed by a blank line
print(f"{mymatch} \n")

['cat', 'sat', 'hat'] 



In [19]:
# Two wildcards: any two characters (not necessarily letters) followed by "at"
pattern = r"..at"
text = "copycat wildcat habitat preheat"

# Every non-overlapping match, as a list of strings
mymatch = re.findall(pattern, text)

# Show the matches, followed by a blank line
print(mymatch, end=" \n\n")

['ycat', 'dcat', 'itat', 'heat'] 



In [20]:
# "^" anchors the pattern to the start of the text: match a digit only if it comes first
pattern = r"^\d"
text = "0 is neither prime nor composite"

# At most one match is possible because of the anchor
mymatch = re.findall(pattern, text)

# Show the matches, followed by a blank line
print(f"{mymatch} \n")

['0'] 



In [21]:
# "$" anchors the pattern to the end of the text: match a digit only if it comes last.
# \d is a single digit, so only the final "1" of "11" is returned.
pattern = r"\d$"
text = "The smallest two digit prime number is 11"

mymatch = re.findall(pattern, text)

# Show the matches, followed by a blank line
print(str(mymatch) + " \n")

['1'] 



In [22]:
# "[^...]" is a negated set: match runs of characters that are neither digits nor spaces,
# which drops the numbers and splits the rest into words
pattern = r"[^\d ]+"
text = "there are 3 numbers 34 inside 5 this sentence"

words = re.findall(pattern, text)

# Stitch the surviving words back together with single spaces
sentence = " ".join(words)

# Word list, a blank line, then the rebuilt sentence
print(words, "\n\n" + sentence)

['there', 'are', 'numbers', 'inside', 'this', 'sentence'] 

there are numbers inside this sentence


In [23]:
# Removing the punctuation: match runs of characters that are not "!", ".", "?" or a space.
# Only those three marks are stripped; any other punctuation would be kept.
pattern = r"[^!.? ]+"
text = "This is a string! but it has puncutation. How to remove it?"

words = re.findall(pattern, text)

# Rebuild the sentence from the surviving words
sentence = " ".join(words)

# Word list, a blank line, then the rebuilt sentence
print(f"{words} \n\n{sentence}")

['This', 'is', 'a', 'string', 'but', 'it', 'has', 'puncutation', 'How', 'to', 'remove', 'it'] 

This is a string but it has puncutation How to remove it


In [24]:
# One or more word characters, a hyphen, then one or more word characters:
# finds hyphenated words whatever the length of either half
pattern = r"[\w]+-[\w]+"
text = "Only find the hypen-words in this sentence. But you do not know how long-ish they are"

mymatch = re.findall(pattern, text)

print(mymatch)

['hypen-words', 'long-ish']


## Parentheses for Multiple Options

In [25]:
# Goal: find words that start with "cat" and end with one of 'fish', 'nap', or 'claw'.
# The first two sentences contain such a word; the third ("caterpillar") does not.
text1 = "Hello, would you like some catfish?"
text2 = "Hello, would you like to take a catnap?"
text3 = "Hello, have you seen this caterpillar?"

In [28]:
# Parentheses limit the "|" alternatives to the suffix: "cat" followed by fish, nap, or claw
re.search(r"cat(fish|nap|claw)", text1)

<re.Match object; span=(27, 34), match='catfish'>

In [29]:
# Same pattern against the second sentence
re.search(r"cat(fish|nap|claw)", text2)

<re.Match object; span=(32, 38), match='catnap'>

In [30]:
# Same pattern against the third sentence; re.search returns None when nothing matches
re.search(r"cat(fish|nap|claw)", text3)