# Meta-characters: Character matches

. : wildcard, matches a single character

^ : start of a string

$ : end of a string

[] : matches one of the set of characters within []

[a-z] : matches one of the range of characters a, b, …, z

[^abc] : matches a character that is not a, b, or c

a|b : matches either a or b, where a and b are strings

() : Scoping for operators

\ : Escape character for special characters (\t, \n, \b)


# Meta-characters: Character symbols

\b : Matches word boundary

\d : Any digit, equivalent to [0-9]

\D : Any non-digit, equivalent to [^0-9]

\s : Any whitespace, equivalent to [ \t\n\r\f\v]

\S : Any non-whitespace, equivalent to [^ \t\n\r\f\v]

\w : Alphanumeric character, equivalent to [a-zA-Z0-9_]

\W : Non-alphanumeric, equivalent to [^a-zA-Z0-9_]

# Meta-characters: Repetitions

"*" : matches zero or more occurrences

"+" : matches one or more occurrences

? : matches zero or one occurrence

{n} : exactly n repetitions, n≥ 0

{n,} : at least n repetitions

{,n} : at most n repetitions

{m,n} : at least m and at most n repetitions

# Examples :

In [2]:

# Recall the callout regular expression: an '@' followed by one or more
# word characters (letters, digits or underscore).

import re

text10 = "”Ethics: are built right into the ideals and objectives of the United Nations” #UNSG @ NY Society for Ethical Culture bit.ly/2guVelr @UN @UN_Women"
# Split on single spaces so that every token can be tested on its own.
text11 = text10.split(' ')

# Both patterns are raw strings: a plain '@\w+' literal relies on an invalid
# escape sequence, which newer Python versions warn about.
# t uses the explicit character class, t1 the \w shorthand in its place.
t = [w for w in text11 if re.search(r'@[A-Za-z0-9_]+', w)]
t1 = [w for w in text11 if re.search(r'@\w+', w)]
print(t, t1)


['@UN', '@UN_Women'] ['@UN', '@UN_Women']


In [40]:
# Write a regular expression to search for anything in square brackets: pattern1
# The quantifier is non-greedy (.*?) so the match ends at the first closing
# bracket; a greedy .* would run from the first '[' to the last ']' on the line
# and swallow several bracketed groups at once.
pattern1 = r"\[.*?\]"

# Use re.search to find the first text in square brackets
# (re.search returns None when the text contains no bracketed span)
print(re.search(pattern1, text10))

None


In [51]:
# Look for script notation -- a run of word characters followed by a colon --
# at the very start of text10 and print the resulting match object.
# Import necessary modules
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

pattern2 = r"[\w]+:"
# re.match only tries the beginning of the string, so the result is None
# whenever the text does not open with a word character.
notation_match = re.match(pattern2, text10)
print(notation_match)

None


# Finding specific characters

In [11]:
text12 = "ouagadougou"
# The class [aeiou] picks out every vowel; the negated class [^aeiou]
# picks out every remaining character.
vowel_finder = re.compile(r'[aeiou]')
non_vowel_finder = re.compile(r'[^aeiou]')
t = vowel_finder.findall(text12)
t1 = non_vowel_finder.findall(text12)
(t, t1)

(['o', 'u', 'a', 'a', 'o', 'u', 'o', 'u'], ['g', 'd', 'g'])

# Regular expression for Dates

In [15]:
dateStr = '23-10-2002\n23/10/2002\n23/10/02\n 10/23/2002\n 23 Oct 2002\n 23 October 2002\n Oct 23, 2002\nOctober 23, 2002\n'

# Numeric dates: two digits, a '/' or '-' separator, two digits, another
# separator and a four-digit year (so the two-digit year '23/10/02' is skipped).
numeric_date = re.compile(r'\d{2}[/-]\d{2}[/-]\d{4}')

# Dates spelled with a month name: optional leading day, abbreviated or full
# month, optional "day, " part, then a four-digit year.
month_name_date = re.compile(r'(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{4}')

t = numeric_date.findall(dateStr)
t2 = month_name_date.findall(dateStr)
(t, t2)

(['23-10-2002', '23/10/2002', '10/23/2002'],
 ['23 Oct 2002', '23 October 2002', 'Oct 23, 2002', 'October 23, 2002'])

In [3]:
my_string = "Let's write RegEx!  Won't that be fun?  I sure think so.  Can you find 4 sentences?  Or perhaps, all 19 words?"

# Patterns used below, gathered in one place.
sentence_endings = r"[.?!]"      # one sentence-ending punctuation mark
capitalized_words = r"[A-Z]\w+"  # an uppercase letter followed by word characters
spaces = r"\s+"                  # one or more whitespace characters
digits = r"\d+"                  # one or more consecutive digits

# Split my_string at every sentence ending and print the pieces
sentence_parts = re.compile(sentence_endings).split(my_string)
print(sentence_parts)

# Collect every capitalized word in my_string and print them
capitalized_found = re.compile(capitalized_words).findall(my_string)
print(capitalized_found)

# Split my_string on runs of whitespace and print the tokens
whitespace_tokens = re.compile(spaces).split(my_string)
print(whitespace_tokens)

# Collect every run of digits in my_string and print them
digit_runs = re.compile(digits).findall(my_string)
print(digit_runs)


["Let's write RegEx", "  Won't that be fun", '  I sure think so', '  Can you find 4 sentences', '  Or perhaps, all 19 words', '']
['Let', 'RegEx', 'Won', 'Can', 'Or']
["Let's", 'write', 'RegEx!', "Won't", 'that', 'be', 'fun?', 'I', 'sure', 'think', 'so.', 'Can', 'you', 'find', '4', 'sentences?', 'Or', 'perhaps,', 'all', '19', 'words?']
['4', '19']


# Working with Text Data in pandas

In [21]:
import pandas as pd

# Five sample sentences, one per weekday, each mentioning at least one clock time.
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15 pm at the latest.",
    "Friday: Take the train at 08:10 am, arrive at 09:00am.",
]

# Single-column frame: the sentences live in the 'text' column.
df = pd.DataFrame({'text': time_sentences})
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [22]:
# Character count of every string in df['text']
text_column = df['text']
text_column.str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [23]:
# Token count per string in df['text']: split each string on whitespace,
# then take the length of each resulting list
tokens_per_row = df['text'].str.split()
tokens_per_row.str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [24]:
# Boolean result per row: True where the text contains the word 'appointment'
search_word = 'appointment'
df['text'].str.contains(search_word)

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [25]:
# Number of single-digit matches (\d) in each string
single_digit = r'\d'
df['text'].str.count(single_digit)

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [26]:
# List every individual digit occurring in each string
single_digit = r'\d'
df['text'].str.findall(single_digit)

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

In [27]:
# Capture hours and minutes as two separate groups; each row yields a list
# with one (hour, minute) tuple per time found
hour_minute = r'(\d?\d):(\d\d)'
df['text'].str.findall(hour_minute)

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [28]:
# replace weekdays with '???'
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to literal matching, under which the pattern would be
# searched for verbatim and no weekday would be replaced.
df['text'].str.replace(r'\w+day\b', '???', regex=True)

0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [29]:
# replace weekdays with 3 letter abbreviations
# The callable receives each regex match object and returns the first three
# characters of the captured weekday. A callable replacement is only accepted
# when regex=True, which is no longer the default as of pandas 2.0.
df['text'].str.replace(r'(\w+day\b)', lambda x: x.groups()[0][:3], regex=True)

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [30]:
# Build one new column per capture group (hour, minute), using only the
# first match found in each string
hour_minute = r'(\d?\d):(\d\d)'
df['text'].str.extract(hour_minute)

Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [33]:

# For every time in every string, extract four groups: the whole time,
# the hours, the minutes, and the am/pm period
full_time = r'((\d?\d):(\d\d) ?([ap]m))'
df['text'].str.extractall(full_time)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [34]:
# Same extraction as with unnamed groups, but each group is named
# (time, hour, minute, period) so the result columns carry those names
named_time = r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))'
df['text'].str.extractall(named_time)

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


# Tweet preprocessing 