# Regular Expression in Python

- text search: re.search() followed by object.start()/object.end() to find the start and end index of the search string in the sentence.

In [1]:
import re

## Common Functions

In [2]:
# Sample sentence that is searched by the next cells
text = "The phone number of the agent is 405-555-1234 call soon !"

In [3]:
# re.search scans `text` for `pattern` and hands back a match object describing where it was found
pattern = "phone"
my_match = re.search(pattern, text)

In [4]:
# Report where the match sits in the text: span() is the (start, end) pair,
# start() and end() give each index on its own.
print(f"Start and End of Match {my_match.span()}")
print(f"Start of the Match {my_match.start()}")
print(f"End of the Match {my_match.end()}")

Start and End of Match (4, 9)
Start of the Match 4
End of the Match 9


In [5]:
# findall collects every occurrence of the pattern and returns them as a list of strings
pattern = "phone"
text = "A phone is a phone."
all_match = re.findall(pattern, text)

In [6]:
# Display the list built by findall: one string per occurrence of the pattern
all_match

['phone', 'phone']

In [7]:
# finditer yields one match object per occurrence; span() gives its (start, end) indices
for match in re.finditer(pattern, text):
    print(match.span())

(2, 7)
(13, 18)


## Identifiers and Quantifiers for Characters

In [8]:
# Sample sentence containing a phone number for the digit-pattern examples
text = "My telephone number is 777-555-1234"

In [9]:
# Pattern syntax is \<identifier>{<quantifier>}: \d is a digit and {n} means "exactly n times",
# so this reads as 3 digits, a hyphen, 3 digits, a hyphen, 4 digits.
pattern = r"\d{3}-\d{3}-\d{4}"
phone_number = re.search(pattern, text)
phone_number

<_sre.SRE_Match object; span=(23, 35), match='777-555-1234'>

In [10]:
# group() with no argument returns the full matched text as a string
phone_number.group()

'777-555-1234'

## Groups

In [11]:
# Each pair of parentheses around \<identifier>{<quantifier>} forms one capture group,
# numbered from 1 in order of appearance.
pattern = r"(\d{3})-(\d{3})-(\d{4})"
my_match = re.search(pattern, text)

In [12]:
# Pull individual capture groups out of the match by their 1-based index
print(f"Group 1 {my_match.group(1)}")
print(f"Group 2 {my_match.group(2)}")

Group 1 777
Group 2 555


## Pipe Operator and Wildcard

In [13]:
# The pipe "|" separates alternatives; search returns the first place where either one matches.
# The pattern is a raw string (r"..."), like every other pattern here; the original f-prefix
# had no placeholders and was a typo.
re.search(r"man|women", "A man and lots of women")

<_sre.SRE_Match object; span=(2, 5), match='man'>

In [14]:
# "." is a wildcard standing for exactly one character, so ".at" only yields three-character matches
re.findall(r".at", "The cat in the hat sat fuckat")

['cat', 'hat', 'sat', 'kat']

_Caution: each `.` matches exactly one character, so `.at` always returns three-character matches. For a longer word ending in "at", only its last three characters are returned (`kat` in the example above)._

In [15]:
# ^ anchors the pattern to the start of the string
# $ anchors the pattern to the end of the string
# Here: a single digit that is the last character of the string
re.findall(r"\d$", "This ends with a number abc2")

['2']

In [16]:
# \w+ extends the match back over the word characters before the final digit, returning the whole last token
re.findall(r"\w+\d$", "1x This ends with a number abc2")

['abc2']

In [17]:
# ^\d matches a digit only when it is the very first character of the string
re.findall(r"^\d", "1x This ends with a number abc2")

['1']

In [18]:
# ^\d\w+ matches the leading digit together with the word characters that follow it
re.findall(r"^\d\w+", "1x This ends with a number abc2")

['1x']

# Excluding Character with []
- `[^...]` matches any single character that is NOT in the listed class; inside the brackets `^` means negation, not "start of the string".
- `[^...]+` matches runs of such characters, so the separate characters are joined back into longer chunks.

In [19]:
text = "There are 3 nums 34 not 5."

# [^\d] is a negated character class: every match is one character that is not a digit
non_digit_chars = re.findall(r"[^\d]", text)
print(non_digit_chars)

['T', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ', ' ', 'n', 'u', 'm', 's', ' ', ' ', 'n', 'o', 't', ' ', '.']


In [20]:
# Adding + matches runs of consecutive non-digits, so the characters come back joined into chunks
print(re.findall(r"[^\d]+", text))

['There are ', ' nums ', ' not ', '.']


In [21]:
# Sample sentence for removing punctuation
text = "This is a string! but it has punc. how to remove it?"

In [22]:
# The class excludes "!", ".", "?" and the space, so every match is a bare word
print(re.findall(r"[^!.? ]+", text))

['This', 'is', 'a', 'string', 'but', 'it', 'has', 'punc', 'how', 'to', 'remove', 'it']


# Grouping after Search []+

In [23]:
# [] defines a character class and + repeats it one or more times, so []+ is not limited to exclusion ([^...]); it can also collect runs of allowed characters

In [24]:
# Sample sentence containing hyphenated words
text = "Only Hype-words. That is a shit long-dash."

In [25]:
# Runs of word characters only: the hyphen is not part of the class, so hyphenated words are split
re.findall(r"[\w]+", text)

['Only', 'Hype', 'words', 'That', 'is', 'a', 'shit', 'long', 'dash']

In [26]:
# A run of word characters immediately followed by a hyphen
re.findall(r"[\w]+-", text)

['Hype-', 'long-']

In [27]:
# [\w] is a character class matching one alphanumeric (word) character and + repeats it,
# so this pattern matches word-hyphen-word.
re.findall(r"[\w]+-[\w]+", text)

['Hype-words', 'long-dash']