这里说的是一个查找的问题，查找问题分为两类：
1. 知道要查找的内容，比如要找一个电话号码，我们知道电话号码是1234，就直接用 in 来找
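2. 不知道要查找的具体内容，只知道格式（format）。这种情况下，我们就要用 regular expression 在文档中搜索符合该 pattern 的内容，
such as r'\d{3}-\d{3}-\d{4}'.
\d matches any single digit; the backslash in front of d tells the regex engine that this is a special sequence rather than the literal letter 'd'. The r prefix makes the pattern a raw string, so Python itself leaves the backslash untouched.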

In [1]:
text='The phone number of the agent is 408-555-1234. Call soon!'

In [2]:
'phone' in text

True

In [3]:
import re  #regular expression library in python

In [4]:
pattern='phone'

In [5]:
re.search(pattern,text)

<re.Match object; span=(4, 9), match='phone'>

In [6]:
my_match=re.search(pattern,text)

In [7]:
my_match.span()

(4, 9)

In [8]:
my_match.start()

4

In [9]:
my_match.end()

9

###### the pattern appears more than once

In [10]:
text2='My phone is a new phone'

In [11]:
match=re.search(pattern,text2)
match

<re.Match object; span=(3, 8), match='phone'>

In [12]:
match.span()  # re.search stops at the first occurrence, so this is the span of the first match only

(3, 8)

In [13]:
all_matches=re.findall('phone',text2)
len(all_matches)

2

In [14]:
# re.finditer yields one match object per non-overlapping occurrence,
# so this prints the span of every 'phone' in text2, not just the first.
for match in re.finditer('phone',text2):
    print(match.span())

(3, 8)
(18, 23)


For more details, check the Python regular expressions notebook.

### python pattern search

In [15]:
text3='My telephone number is 777-555-1234'

In [16]:
pattern=r'\d\d\d-\d\d\d-\d\d\d\d'

In [17]:
phone_number=re.search(pattern,text3)
phone_number

<re.Match object; span=(23, 35), match='777-555-1234'>

In [18]:
phone_number.group()

'777-555-1234'

### quantifiers
Quantifiers define how many times the preceding element must repeat, e.g. \d{3} matches exactly three digits.

In [19]:
pattern=r'\d{3}-\d{3}-\d{4}'

In [20]:
re.search(pattern,text3)

<re.Match object; span=(23, 35), match='777-555-1234'>

### grab separate groups
Capture separate groups by wrapping each part of the pattern in a pair of parentheses; group(n) then returns the n-th group.

In [21]:
pattern=r'(\d{3})-(\d{3})-(\d{4})'

In [23]:
mymatch=re.search(pattern,text3)

In [24]:
mymatch.group()

'777-555-1234'

In [25]:
mymatch

<re.Match object; span=(23, 35), match='777-555-1234'>

In [26]:
mymatch.group(1)

'777'

In [27]:
mymatch.group(3)

'1234'

### | operator 

In [28]:
re.search(r'man|woman',"this is man here")  #match by man

<re.Match object; span=(8, 11), match='man'>

In [29]:
re.search(r'man|woman',"this is woman here")  #match by woman

<re.Match object; span=(8, 13), match='woman'>

### . wildcard (matches any single character except a newline)

In [32]:
re.findall(r".at","The cat is in the hat sat plat")

['cat', 'hat', 'sat', 'lat']

In [33]:
re.findall(r"..at","The cat is in the hat sat plat")

[' cat', ' hat', ' sat', 'plat']

### $ ends with

In [35]:
re.findall(r"\d$",'This ends with a number 2')

['2']

###  ^ starts with

In [36]:
re.findall(r"^\d","1 is the lonest number")

['1']

### [^ ] exclude characters

In [37]:
phrase="There are 3 numbers 34 inside 5 this sentence"

In [39]:
re.findall(r"[^\d]",phrase)

['T',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [40]:
re.findall(r"[^\d]+",phrase)

['There are ', ' numbers ', ' inside ', ' this sentence']

In [50]:
test_phrase="This is a string! but it has punctuation. how to remove it ? "

In [51]:
re.findall(r"[^!.? ]+",test_phrase)

['This',
 'is',
 'a',
 'string',
 'but',
 'it',
 'has',
 'punctuation',
 'how',
 'to',
 'remove',
 'it']

In [52]:
re.findall(r"[^!.?]+",test_phrase) # without the space in the character class, spaces are kept, so the text is only split at the punctuation marks

['This is a string', ' but it has punctuation', ' how to remove it ', ' ']

In [53]:
mylist=re.findall(r"[^!.? ]+",test_phrase)

In [54]:
' '.join(mylist)

'This is a string but it has punctuation how to remove it'

### \w alphanumeric (letters, digits and underscore)

In [57]:
text="Only find the hyphen-words. were are the long-ish dash words?"

In [58]:
re.findall(r'[\w]+-[\w]+',text)

['hyphen-words', 'long-ish']