In [2]:
import re

In [3]:
text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Ha HaHa

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

coreyms.com

321-555-4321
123.555.1234
123*555*123
800-555-1234
900-555-1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
abc
'''

In [4]:
sentence = 'Start a sentence and then bring it to an end'

**Raw Strings**

Strings prefixed with an r<br>
Not to handle backslashes in any way.

In [5]:
# i.e.
#print('\tTab')
# 	Tab
print(r'\tTab')

\tTab


### Compiling

Allows us to seperate patterns into a variable and reuse variables to perform multiple searches.

In [6]:
pattern = re.compile(r'abc')

In [7]:
matches = pattern.finditer(text_to_search)

In [8]:
for match in matches:
    print(match)

<re.Match object; span=(1, 4), match='abc'>
<re.Match object; span=(270, 273), match='abc'>


^ This tells us that it only found the the pattern of string 'abc'<br>
And also gives us the index of each location i.e [1] -  [4] and [271]  -  [274]

In [9]:
print(text_to_search[271:274])

bc



**More on Compiling**

In [10]:
import re

# Target String one
str1 = "Emma's luck numbers are 251 761 231 451"

# pattern to find three consecutive digits
string_pattern = r"\d{3}"
# compile string pattern to re.Pattern object
regex_pattern = re.compile(string_pattern)

# print the type of compiled pattern
print(type(regex_pattern))

# find all the matches in string one
result = regex_pattern.findall(str1)
print(result)

<class 're.Pattern'>
['251', '761', '231', '451']


**Special Characters**

<re.Match object; span=(142, 153), match='coreyms.com'>


## Character Matching

In [12]:
# .       - Any Character Except New Line
# d       - Digit (0-9)
# \D      - Not a Digit (0-9)
# \w      - Word Character (a-z, A-Z, 0-9, _)
# \W      - Not a Word Character
# \s      - Whitespace (space, tab, newline)
# \S      - Not Whitespace (space, tab, newline)

# \b      - Word Boundary
# \B      - Not a Word Boundary
# ^       - Beginning of a String
# $       - End of a String

# []      - Matches Characters in brackets
# [^ ]    - Matches Characters NOT in brackets
# |       - Either Or
# ( )     - Group

#Quantifiers:
# *       - 0 or More
# +       - 1 or More
# ?       - 0 or One
# {3}     - Exact Number
# {3,4}   - Range of Numbers (Minimum, Maximum)

**Word Boundary**

Indicated by whitespaces and non alphanumerical character.

In [13]:
pattern = re.compile(r'\bHa')

matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(67, 69), match='Ha'>
<re.Match object; span=(70, 72), match='Ha'>


Starts by finding a '/b' then a 'Ha'<br>Matches "Ha HaHa" matches the 1st and 2nd Ha, but not the third in HaHa

**Beginning  / End of String**

In [1]:
sentence = 'Start a sentence and then bring it to an end'

In [14]:
pattern = re.compile(r'^Star') # The beginning of sentence is Start and so it finds Star at [0] - [4]

matches = pattern.finditer(sentence)
for match in matches:
    print(match)

<re.Match object; span=(0, 4), match='Star'>


In [15]:
pattern = re.compile(r'nd$') # The beginning of sentence is Start and so it finds Star at [0] - [4]

matches = pattern.finditer(sentence)
for match in matches:
    print(match)

<re.Match object; span=(42, 44), match='nd'>


**Matching Digits**

In [16]:
pattern = re.compile(r'\d{3}\W\d{3}\W\d{3,4}')

matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(155, 167), match='321-555-4321'>
<re.Match object; span=(168, 180), match='123.555.1234'>
<re.Match object; span=(181, 192), match='123*555*123'>
<re.Match object; span=(193, 205), match='800-555-1234'>
<re.Match object; span=(206, 218), match='900-555-1234'>


In nicer form

In [17]:
list_org = re.findall(r'\d{3}\W\d{3}\W\d{3,4}', text_to_search)
list_org

['321-555-4321', '123.555.1234', '123*555*123', '800-555-1234', '900-555-1234']

In [18]:
list_clean = re.findall(r'\d{3}\W\d{3}\W\d{3,4}', text_to_search)
list_clean

['321-555-4321', '123.555.1234', '123*555*123', '800-555-1234', '900-555-1234']

### Quick Example of Cleaning Data

In [19]:
for count, x in enumerate(list_clean):
    list_clean[count] = x.replace('-', '').replace('.', '').replace('*','')

In [24]:
import pandas as pd
d = {'Original': list_org, 'Cleaned': list_clean}
df = pd.DataFrame(d)
df

Unnamed: 0,Original,Cleaned
0,321-555-4321,3215554321
1,123.555.1234,1235551234
2,123*555*123,123555123
3,800-555-1234,8005551234
4,900-555-1234,9005551234


#### Further Testing

In [25]:
test = re.compile(r'test')

In [26]:
test.findall('test test test')

['test', 'test', 'test']

### More examples

In [27]:
Numbers = '''
321--555-4321
123.555.1234
123*555*123
800-555---1234
900-..555-1234'''

In [28]:
re.findall(r'\d{3}[-.]{1,3}\d{3}[-.]{1,3}\d{3,4}', Numbers)

['321--555-4321', '123.555.1234', '800-555---1234', '900-..555-1234']

**Searching through a range**

In [29]:
number = '12345'
re.findall(r'[1-2][3-5]', number)

['23']

Alphabet Example

In [30]:
Sentence = 'This is a test'
re.findall(r'[a-zA-Z]', Sentence)

['T', 'h', 'i', 's', 'i', 's', 'a', 't', 'e', 's', 't']

### Groups

In [35]:
pattern = re.compile(r'M[rs]s?\.?\s[a-zA-Z]*')

matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(220, 231), match='Mr. Schafer'>
<re.Match object; span=(232, 240), match='Mr Smith'>
<re.Match object; span=(241, 249), match='Ms Davis'>
<re.Match object; span=(250, 263), match='Mrs. Robinson'>
<re.Match object; span=(264, 269), match='Mr. T'>


In [36]:
pattern = re.compile(r'M(r|s|rs)\.?\s[a-zA-Z]*')

matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(220, 231), match='Mr. Schafer'>
<re.Match object; span=(232, 240), match='Mr Smith'>
<re.Match object; span=(241, 249), match='Ms Davis'>
<re.Match object; span=(250, 263), match='Mrs. Robinson'>
<re.Match object; span=(264, 269), match='Mr. T'>


In [4]:
import re
pattern = re.compile(r'[:;][-~]?[D/)]')
text = ':)'
matches = pattern.finditer(text)
for match in matches:
    print(match)

<re.Match object; span=(0, 2), match=':)'>


In [None]:
Sentence = 'This is a test'
re.findall(r'[a-zA-Z]', Sentence)

In [8]:
pattern = re.compile(r'[:;][-~]?[D/)]')
text = ':('
if pattern.match(text):
    print('test')