# <u>Regular Expressions</u>

In [15]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [2]:
import re

In [16]:
pattern = 'phone'

In [6]:
re.search(pattern,text)

<re.Match object; span=(12, 17), match='phone'>

In [13]:
# Notebook-order fix: this cell was executed as In [13], i.e. before
# `pattern = 'phone'` (In [16]), so the outputs shown below belong to the
# 'phone' pattern. Read top to bottom, a pattern that does not occur in `text`
# (such as 'NOT IN TEXT') makes re.search() return None, and the following
# match.span() call would raise AttributeError. Use the pattern that is
# actually present so the next cells reproduce the outputs shown.
pattern = 'phone'

In [18]:
match = re.search(pattern,text)

In [19]:
match.span()

(12, 17)

In [20]:
match.start()

12

In [21]:
match.end()

17

### Unfortunately, `re.search` returns only the first match. It does not, by default, return all the matches.

In [22]:
text = " phone one phone twice"

In [23]:
pattern = 'phone'

In [24]:
re.search(pattern,text)

<re.Match object; span=(1, 6), match='phone'>

In [25]:
matches = re.findall('phone',text)

In [26]:
matches

['phone', 'phone']

In [27]:
len(matches)

2

In [33]:
# finditer yields one Match object per occurrence; print each (start, end) span.
for match in re.finditer('phone',text):
    print(match.span())

(1, 6)
(11, 16)


In [34]:
# group() with no arguments returns the matched text of each occurrence.
for match in re.finditer('phone',text):
    print(match.group())

phone
phone


# <u>Using the pattern codes</u>

| Character | Description | Example Pattern Code | Example Match |
| --- | --- | --- | --- |
| \d | A digit | file_\d\d | file_25 |
| \w | Alphanumeric (letter, digit or underscore) | \w-\w\w\w | A-b_1 |
| \s | White space | a\sb\sc | a b c |
| \D | A non digit | \D\D\D | ABC |
| \W | Non-Alphanumeric | \W\W\W\W\W | *-+=) |
| \S | Non-Whitespace | \S\S\S\S | Yoyo |

# <u>Quantifiers</u>

| Character | Description | Example Pattern Code | Example Match |
| --- | --- | --- | --- |
| + | Occurs one or more times | Version \w-\w+ | Version A-b1_1 |
| {3} | Occurs Exactly 3 Times | \D{3} | abc |
| {2,4} | Occurs 2 to 4 Times | \d{2,4} | 123 |
| {3,} | Occurs 3 or more times | \w{3,} | anycharacters |
| * | Occurs 0 or more times | A*B*C* | AAACC |
| ? | Once or none (zero or one time) | Plurals? | Plural |

In [61]:
import re

text = 'My phone number is +91-99162-53336'
# The quantified pattern is shorthand for the spelled-out form
# r'\W\d\d-\d\d\d\d\d-\d\d\d\d\d': one non-word character, then groups of
# 2, 5 and 5 digits separated by hyphens.
phone = re.compile(r'\W\d{2}-\d{5}-\d{5}').search(text)
# Show the Match object first, then its (start, end) offsets within `text`.
print(phone, phone.span(), sep='\n')

<re.Match object; span=(19, 34), match='+91-99162-53336'>
(19, 34)


In [50]:
# Find a dotted-quad address: four groups of 1-3 digits separated by dots.
text1 = '10.150.23.45'
# Each dot is escaped: an unescaped '.' is the wildcard and would accept any
# character between the digit groups (e.g. '10a150b23c45').
output = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',text1)
print(output)

<re.Match object; span=(0, 12), match='10.150.23.45'>


In [78]:
# Capture the three parts of a 3-3-4 digit phone number in separate groups.
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
text = 'My phone number is 408-555-7777'
results = phone_pattern.search(text)
# group() is the whole match; groups() supplies groups 1, 2 and 3 in order.
print(results.group(), *results.groups(), sep='\n')

408-555-7777
408
555
7777


In [79]:
# Same grouping idea for a number with a leading non-word character and a
# 2-5-5 digit layout.
text = 'My phone number is +91-99162-53336'
phone_pattern = re.compile(r'(\W\d{2})-(\d{5})-(\d{5})')
results = phone_pattern.search(text)
# group(0, 1, 2, 3) returns the whole match followed by each captured group;
# print them one per line.
print('\n'.join(results.group(0, 1, 2, 3)))

+91-99162-53336
+91
99162
53336


# <u>Additional Regex Syntax</u>

In [87]:
mat = re.search(r'cat','The cat is here')

Let's say we want to match either cat or dog. Then we can use the pipe operator, which symbolises OR.

## OR Operator >> |

In [89]:
re.search(r'cat|dog','The dog is here')

<re.Match object; span=(4, 7), match='dog'>

## Wild Card Operator >> .

In [92]:
# This example finds every occurrence of the literal substring 'at'.
re.findall(r'at','The cat in the hat sat there.')

['at', 'at', 'at']

In [93]:
# The dot matches any single character, so this returns each 'at' together
# with the one character in front of it.
re.findall(r'.at','The cat in the hat sat there.')

['cat', 'hat', 'sat']

In [95]:
re.findall(r'...at','The cat in the hat splat.')

['e cat', 'e hat', 'splat']

## Starts with using ^

In [99]:
# ^ anchors the pattern at the start of the string: return the leading digit
# if the string starts with one.
re.findall(r'^\d','1 is a number and so is 2 as well')

['1']

## Ends with using $

In [101]:
re.findall(r'\d$','number is 2')

['2']

## Exclusions

In [103]:
phrase = 'there 3 are numbers 34 that 5 cannot be defined'

In [106]:
# [^\d]+ matches runs of one or more characters that are not digits, so
# findall returns the pieces of text between the numbers.
pattern=r'[^\d]+'

In [110]:
re.findall(pattern,phrase)

['there ', ' are numbers ', ' that ', ' cannot be defined']

In [111]:
' '.join(re.findall(pattern,phrase))

'there   are numbers   that   cannot be defined'

In [112]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [113]:
clean = re.findall(r'[^!.?]+',test_phrase)

In [114]:
' '.join(clean)

'This is a string  But it has punctuation  How can we remove it'

In [115]:
test = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

In [118]:
pattern = r'[\w]+-[\w]+'

In [120]:
re.findall(pattern,test)

['hypen-words', 'long-ish']

In [121]:
t = 'Hello, would you like some catfish?'
u = "Hello, would like to take a catnap?"
v = "Hello, have you seen this caterpillar?"

In [125]:
re.search(r'cat(fish|nap|erpillar)',v)

<re.Match object; span=(26, 37), match='caterpillar'>