# Regular Expressions

## Procurar padrões básicos

In [1]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [2]:
# procurar o padrão phone em text
'phone' in text

True

In [3]:
# importar as regular expressions
import re

In [4]:
pattern = 'phone'

In [5]:
# procurar o padrão phone em text utilizando o módulo re do Python
re.search(pattern,text)

<re.Match object; span=(12, 17), match='phone'>

In [6]:
# definir um padrão que não existe em text (NOT IN TEXT)
pattern = "NOT IN TEXT"

In [7]:
# Como o padrão não existe, re.search devolve None e nada é mostrado
re.search(pattern,text)

In [8]:
pattern = 'phone'

In [9]:
match = re.search(pattern,text)

In [10]:
match

<re.Match object; span=(12, 17), match='phone'>

In [11]:
# O padrão ocupa os índices 12 a 16 (o fim, 17, é exclusivo)
match.span()

(12, 17)

In [12]:
# O padrão começa no índice 12
match.start()

12

In [13]:
# O padrão acaba imediatamente antes do índice 17 (fim exclusivo)
match.end()

17

In [14]:
text = "my phone is a new phone"

In [15]:
match = re.search("phone",text)

In [16]:
# re.search só retorna a primeira ocorrência do padrão
match.span()

(3, 8)

In [17]:
# Para procurar todas as ocorrências do padrão
matches = re.findall("phone",text)
matches

['phone', 'phone']

In [18]:
len(matches)

2

In [19]:
# A posição de todas as ocorrências encontradas
for match in re.finditer("phone",text):
    print(match.span())

(3, 8)
(18, 23)


In [20]:
match.group()

'phone'

## Padrões

### Identificadores para caracteres em padrões

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [21]:
text = "My telephone number is 408-555-1234"

In [22]:
texts = "My telephone number is 408-555-1234 and 408-555-1234"

In [23]:
# Procurar os dígitos do telefone
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)

In [24]:
phones = re.findall(r'\d\d\d-\d\d\d-\d\d\d\d',texts)

In [25]:
phone.group()

'408-555-1234'

In [27]:
phones

['408-555-1234', '408-555-1234']

### Quantificadores

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C\*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [29]:
# Em vez de utilizar a repetição de \d pode-se utilizar os quantificadores para indicar quantas vezes se repete o \d.
re.search(r'\d{3}-\d{3}-\d{4}',text)

<re.Match object; span=(23, 35), match='408-555-1234'>

### Grupos

In [30]:
# Quando se quer fazer duas tarefas, por exemplo encontrar números de telefone e extrair o código de área (3 primeiros dígitos).
# Pode-se usar os grupos para qualquer tarefa que envolva agrupar expressões regulares e mais tarde fazer a separação.

phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [31]:
results = re.search(phone_pattern,text)

In [32]:
# O resultado inteiro do grupo
results.group()

'408-555-1234'

In [33]:
# retornar o resultado por posição no grupo
# Os grupos são separados por parênteses: (\d{3})-(\d{3})-(\d{4}) São 3 grupos
# A ordenação dos grupos começa em 1. Passar 0 retorna a correspondência inteira.
results.group(1)

'408'

In [34]:
results.group(2)

'555'

In [35]:
results.group(3)

'1234'

## Regex Syntax adicional

### Operador Or |

In [36]:
# Procurar man ou woman na frase
re.search(r"man|woman","This man was here.")

<re.Match object; span=(5, 8), match='man'>

In [37]:
re.search(r"man|woman","This woman was here.")

<re.Match object; span=(5, 10), match='woman'>

### The Wildcard Character

In [39]:
# O ponto (.) é um wildcard: corresponde a qualquer carácter, aqui seguido de "at"
re.findall(r".at","The cat in the hat sat here.")

['cat', 'hat', 'sat']

In [40]:
# Só retorna 3 caracteres: cada ponto corresponde a um único carácter
re.findall(r".at","The bat went splat")

['bat', 'lat']

In [41]:
# Para retornar mais caracteres antes de at
# Pode apanhar caracteres que não devia, como espaços ('e bat')
re.findall(r"...at","The bat went splat")

['e bat', 'splat']

In [44]:
# Para retornar apenas sequências sem espaços seguidas de "at" (\S+ = um ou mais caracteres que não são espaço)
re.findall(r'\S+at',"The bat went splat satp")

['bat', 'splat', 'sat']

### Começa com e termina com

In [29]:
# Termina com um dígito
re.findall(r'\d$','This ends with a number 2')

['2']

In [48]:
# Começa com um dígito
re.findall(r'^\d','1a is the loneliest number.')

['1']

### Excluir

In [49]:
phrase = "there are 3 numbers 34 inside 5 this sentence."

In [50]:
# Para excluir os dígitos: [^\d] corresponde a qualquer carácter que não seja um dígito
re.findall(r'[^\d]',phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 '.']

In [51]:
# Acrescentar + para agrupar os caracteres consecutivos que não são dígitos
re.findall(r'[^\d]+',phrase)

['there are ', ' numbers ', ' inside ', ' this sentence.']

In [52]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [53]:
# Remover a pontuação (o espaço também é excluído, por isso o resultado fica separado por palavras)
re.findall('[^!.? ]+',test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [54]:
# Voltar a juntar
clean = ' '.join(re.findall('[^!.? ]+',test_phrase))
clean

'This is a string But it has punctuation How can we remove it'

### Brackets para agrupamento

In [55]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

In [56]:
# Procurar palavras separadas por hífen
# [\w]+ palavra alfanumérica de qualquer tamanho
# - hífen
# [\w]+ palavra alfanumérica de qualquer tamanho
re.findall(r'[\w]+-[\w]+',text)

['hypen-words', 'long-ish']

### Parênteses para múltiplas hipóteses

In [57]:
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [61]:
# Procurar palavras que começam com cat e terminam com uma das seguintes hipóteses: 'fish', 'nap' ou 'claw'
re.search(r'cat(fish|nap|claw)',text)

<re.Match object; span=(27, 34), match='catfish'>

In [59]:
re.search(r'cat(fish|nap|claw)',texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [60]:
# Sem nenhum retorno
re.search(r'cat(fish|nap|claw)',textthree)