In [None]:
# Regular Expressions

#re.search
#re.findall
#re.compile

'''Regular expressions (sometimes called regex for short) allow a user to search for strings
   using almost any sort of rule they can come up with. For example, finding all capital letters
   in a string, or finding a phone number in a document.

   Regular expressions are notorious for their seemingly strange syntax. This strange syntax is a
   byproduct of their flexibility. Regular expressions have to be able to match any string
   pattern you can imagine, which is why they have a complex pattern format.'''

In [3]:
# Searching for Basic Patterns
# For a fixed substring, Python's `in` operator is enough; no regex is needed yet.
text = "The person's phone number is 408-555-1234. Call soon!"
'phone' in text

True

In [4]:
# format for regular expressions

In [5]:
import re

In [6]:
pattern = 'phone'

In [7]:
re.search(pattern,text)

<re.Match object; span=(13, 18), match='phone'>

In [8]:
pattern = "NOT IN TEXT"

In [9]:
re.search(pattern,text)

In [None]:
'''re.search() takes the pattern, scans the text, and returns a Match object.
   If the pattern is not found, None is returned (in a Jupyter notebook this just means that
   nothing is displayed below the cell).'''

In [11]:
# Keep the Match object around so its attributes can be inspected afterwards
pattern = "phone"
match = re.search(pattern, text)

In [12]:
match

<re.Match object; span=(13, 18), match='phone'>

In [13]:
'''
Notice the span: the Match object also exposes the start and end index of the match.'''

match.span()

(13, 18)

In [14]:
match.start()

13

In [15]:
match.end()

18

In [16]:
# pattern occurs more than once

In [17]:
text = "my phone is a new phone"

In [18]:
# re.search stops at the first occurrence, even when the pattern appears more than once
match = re.search("phone",text)

In [19]:
match.span()

(3, 8)

In [20]:
matches = re.findall("phone",text)

In [21]:
matches

['phone', 'phone']

In [22]:
len(matches)

2

In [23]:
# To get actual match objects, use the iterator.
# The loop variable is deliberately not called `match`, so the Match object
# stored by the earlier re.search cell is not overwritten by this loop.

for phone_match in re.finditer("phone", text):
    print(phone_match.span())

(3, 8)
(18, 23)


In [24]:
# To find out the actual text that matched, you can use the .group() method.
# The loop variable is deliberately not called `match`, so the Match object
# stored by the earlier re.search cell is not overwritten by this loop.

for phone_match in re.finditer("phone", text):
    print(phone_match.group())

phone
phone


In [None]:
# Identifiers for Characters in Patterns

In [None]:
r'''Character Identifiers

The identifiers below each stand for a whole class of characters. Later we can use them along
with quantifiers to define how many of each we expect.

Character	Description	Example Pattern Code	Example Match
\d	A digit	file_\d\d	file_25
\w	Alphanumeric (letter, digit or underscore)	\w-\w\w\w	A-b_1
\s	White space	a\sb\sc	a b c
\D	A non digit	\D\D\D	ABC
\W	Non-alphanumeric	\W\W\W\W\W	*-+=)
\S	Non-whitespace	\S\S\S\S	Yoyo'''

In [None]:
r'''Quantifiers

Character	Description	Example Pattern Code	Example Match
+	Occurs one or more times	Version \w-\w+	Version A-b1_1
{3}	Occurs exactly 3 times	\D{3}	abc
{2,4}	Occurs 2 to 4 times	\d{2,4}	123
{3,}	Occurs 3 or more times	\w{3,}	anycharacters
*	Occurs zero or more times	A*B*C*	AAACC
?	Once or none	plurals?	plural'''

In [35]:
# Spell the phone number out one \d per digit, with literal hyphens in between
text = "My telephone number is 408-555-1234"

phone = re.search(
    r'\d\d\d-\d\d\d-\d\d\d\d',
    text,
)

In [36]:
phone.group()

'408-555-1234'

In [37]:
# or, more compactly: {n} repeats the preceding \d exactly n times

re.search(r'\d{3}-\d{3}-\d{4}',text)

<re.Match object; span=(23, 35), match='408-555-1234'>

In [None]:
# Groups

In [48]:
# Each parenthesised part of the pattern becomes its own numbered group
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

# Run the search through the compiled pattern's own method
results = phone_pattern.search(text)

In [50]:
# Individual groups can then be retrieved by position.
# Remember that groups are the parts of the pattern wrapped in parentheses ().
# Note that group numbering starts at 1; passing in 0 returns the entire match.

results.group(1)

'408'

In [51]:
results.group(2)

'555'

In [52]:
results.group(3)

'1234'

In [None]:
# Additional Regex Syntax

In [None]:
# Or operator |

# Use the pipe operator to express an "or" between alternatives.

In [53]:
re.search(r"man|woman","This man was here.")

<re.Match object; span=(5, 8), match='man'>

In [None]:
# The Wildcard Character

# The "." wildcard acts as a placeholder that matches any single character in that position (by default, anything except a newline).

In [54]:
re.findall(r".at","The cat in the hat sat here.")

['cat', 'hat', 'sat']

In [55]:
# Want words that end with "at".

# One or more non-whitespace characters followed by 'at'. The trailing \b
# (word boundary) makes sure 'at' really is the end of the word, so a longer
# word such as "splatter" does not yield the partial match 'splat'.
re.findall(r'\S+at\b',"The bat went splat")

['bat', 'splat']

In [None]:
# Starts with and Ends With

'''
We can use the ^ to signal starts with, and the $ to signal ends with

Note that this is for the entire string, not individual words!

'''

In [56]:
# Ends with a number
re.findall(r'\d$','This ends with a number 2')

['2']

In [58]:
# Starts with a number
re.findall(r'^\d','1 is a whole number.')

['1']

In [None]:
# Exclusion

'''
To exclude characters, put the ^ symbol first inside a set of brackets [].
The pattern then matches any character that is NOT listed inside the brackets.

'''

In [64]:
phrase = "there are 3 numbers 34 inside 5 this sentence."

# [^\d] matches one non-digit character at a time, so the result is a list of single characters
re.findall(r'[^\d]',phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 '.']

In [65]:
# Double negative: [^\D] excludes every non-digit, which leaves only the digits (same as \d)
re.findall(r'[^\D]',phrase)

['3', '3', '4', '5']

In [66]:
# To get the words back together, add a + after the bracket set: it matches whole runs of
# consecutive non-digits, so the text comes back in chunks with the digits removed
# instead of one character at a time.

re.findall(r'[^\d]+',phrase)

['there are ', ' numbers ', ' inside ', ' this sentence.']

In [67]:
# To remove punctuation from a sentence:
# keep only runs of characters that are not '!', '.', '?' or a space.

test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

re.findall('[^!.? ]+',test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [68]:
# Join the extracted words with single spaces to rebuild the sentence without punctuation
clean = ' '.join(re.findall('[^!.? ]+',test_phrase))

In [69]:
clean

'This is a string But it has punctuation How can we remove it'

In [None]:
# Brackets for Grouping

In [70]:
# To find hyphenated words:
# one or more word characters, a literal hyphen, then one or more word characters.

text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

re.findall(r'[\w]+-[\w]+',text)

['hypen-words', 'long-ish']

In [None]:
# Parentheses for Multiple Options

In [74]:
# Goal: find words made of 'cat' followed by one of 'fish', 'nap' or 'claw'.
# Three sample sentences to try the pattern against.

text = "Hello, would you like some catfish?"
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [75]:
re.search(r'cat(fish|nap|claw)',text)

<re.Match object; span=(27, 34), match='catfish'>

In [76]:
re.search(r'cat(fish|nap|claw)',texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [77]:
# None returned: 'caterpillar' contains 'cat', but it is not followed by 'fish', 'nap' or 'claw'
re.search(r'cat(fish|nap|claw)',textthree)