# Regular expressions

In [83]:
import re


In [84]:
# re.search() returns a match object (truthy) when the pattern occurs anywhere
# in the string, and None (falsy) otherwise.
text = "This is a good day"

print("wonderful" if re.search("good", text) else "Alas :c")


wonderful


In [85]:
# Tokenizing: separate a string into substrings wherever the pattern matches.
# The text starts with "Amy", so the first element of the result is ''.

text = "Amy works dilligently, Amy gets good grades. Our student Amy is succesful"

re.split("Amy", text)

['',
 ' works dilligently, ',
 ' gets good grades. Our student ',
 ' is succesful']

In [86]:
# findall() returns every match of the pattern as a list of strings
# (`text` comes from the previous cell).
re.findall("Amy", text)

['Amy', 'Amy', 'Amy']

In [87]:
# Anchors specify the start or end of the string you are trying to match:
# ^ anchors the pattern to the start, $ anchors it to the end.
text = "Amy works dilligently, Amy gets good grades. Our student Amy is succesful"
re.search("^Amy", text)

<re.Match object; span=(0, 3), match='Amy'>

In [88]:
# A re.Match object is truthy when a match was found, and it reports which text was matched and where (span).


# Patterns and Character Classes

In [89]:
# A string of grades used by the following cells.
grades="AABCCBABAAAAAABC"

# A single literal character: find every B.
re.findall("B", grades)


['B', 'B', 'B', 'B']

In [90]:
# Set operator []: matches any ONE of the listed characters (here A OR B)
re.findall("[AB]", grades)

['A', 'A', 'B', 'B', 'A', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'B']

In [91]:
# A set can hold a range: [a-z] matches any single lower-case letter.
# Here: find an A followed by a B OR a C
re.findall("[A][B-C]", grades)

['AB', 'AB', 'AB']

In [92]:
# Pipe | = OR: match "AB" or "AC" (same result as the set-based pattern [A][B-C])
re.findall("AB|AC", grades)

['AB', 'AB', 'AB']

In [93]:
# ^ as the first character inside [] means NOT; there it loses its start-of-string meaning
re.findall("[^A]", grades)

['B', 'C', 'C', 'B', 'B', 'B', 'C']

In [94]:
# Match a character at the beginning of the string that is not an A.
# The result is [] because the string starts with A.
re.findall("^[^A]", grades)

[]

# Quantifiers


In [97]:
# Quantifiers: the number of times a pattern must repeat before it counts as a match
# e{m,n}: m = min matches, n = max matches
re.findall("A{2,10}", grades)

['AA', 'AAAAAA']

In [103]:
# This pattern looks for exactly one A followed by exactly one more A, i.e. "AA"
re.findall("A{1,1}A{1,1}", grades)

['AA', 'AA', 'AA', 'AA']

In [104]:
# You cannot add spaces inside the braces: with "{2, 2}" the quantifier is no
# longer recognized, and nothing matches here.
re.findall("A{2, 2}", grades)

[]

In [105]:
# If no quantifier is specified, each element is matched exactly once, i.e. {1,1}
re.findall("AA", grades)

['AA', 'AA', 'AA', 'AA']

In [106]:
# If only one number is given in the braces, it is used as both the min and the max
re.findall("A{2}", grades)

['AA', 'AA', 'AA', 'AA']

In [107]:
# Decreasing trend: a run of As, followed by a run of Bs, followed by a run of Cs
re.findall("A{1,10}B{1,10}C{1,10}", grades)

['AABCC', 'AAAAAABC']

In [108]:
# * -> match zero or more times
# ? -> match zero or one time
# + -> match one or more times
# The encoding is passed explicitly so the read does not depend on the platform default.
with open("datasets/ferpa.txt", "r", encoding="utf-8") as file:
    wiki = file.read()


In [109]:
# Headers in the FERPA text all end with "[edit]", followed by a newline character.
# The pattern is a raw string (r"...") so that \[ and \] reach the regex engine
# untouched instead of being parsed as invalid string escapes.
re.findall(r"[a-zA-Z]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [110]:
# The previous pattern only got the last word of each header.
# \w -> any word character (raw string keeps the backslashes for the regex engine)
re.findall(r"[\w]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [119]:
# Metacharacters
# \s -> any whitespace character
# \w -> any word character; does not include spaces
# Get a string of any length followed by [edit]
re.findall(r"[\w]*\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [112]:
# Add the space character to the set.
# Get a string of any length made of word characters and spaces, followed by [edit]
re.findall(r"[\w ]*\[edit\]", wiki)

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [120]:
# Remove the trailing "[edit]" from each title: split on "[" and keep the first piece.
# Raw strings keep \w, \[ and \] intact for the regex engine.
for title in re.findall(r"[\w ]*\[edit\]", wiki):
    print(re.split(r"[\[]", title)[0])

Overview
Access to public records
Student medical records


# Groups

In [121]:
# Match groups of patterns
# () groups part of a pattern; with groups, findall() returns one tuple per match
re.findall(r"([\w ]*)(\[edit\])", wiki)

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [122]:
# finditer() returns an iterator of match objects (not a list);
# .groups() gives all captured groups of a match as a tuple
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [125]:
# Iterate over the match objects again,
# printing only the first group (the title) of each match
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.group(1))

Overview
Access to public records
Student medical records


In [129]:
# Label groups
# Named groups can be read by name and returned as a dictionary
# (?P<name>...): name -> dictionary key
for item in re.finditer(r"(?P<title>[\w ]*)(?P<edit_link>\[edit\])", wiki):
    print(item.group('title'))

Overview
Access to public records
Student medical records


In [130]:
# NOTE: `item` is the loop variable left over from the previous cell, so this
# shows the dictionary for the LAST match only; run that cell first.
print(item.groupdict())

{'title': 'Student medical records', 'edit_link': '[edit]'}


In [131]:
# \w -> any word character
# .  -> any single character except a newline
# \d -> any digit
# \s -> any whitespace character

# Look-ahead and Look-behind

In [134]:
# (?=...) is a look-ahead: we require [edit] to follow, but it is not part of the match
for item in re.finditer(r"(?P<title>[\w]+)(?=\[edit\])", wiki):
    print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(2732, 2739), match='records'>
<re.Match object; span=(3708, 3715), match='records'>


# Example : Wikipedia Data

In [147]:
# Data about Buddhist universities in the United States.
# The encoding is passed by keyword instead of the positional (buffering, encoding) pair.
with open("datasets/buddhist.txt", "r", encoding="utf-8") as file:
    wiki = file.read()
wiki

'Buddhist universities and colleges in the United States\nFrom Wikipedia, the free encyclopedia\nJump to navigationJump to search\n\nThis article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.\nFind sources: "Buddhist universities and colleges in the United States" – news · newspapers · books · scholar · JSTOR (December 2009) (Learn how and when to remove this template message)\nThere are several Buddhist universities in the United States. Some of these have existed for decades and are accredited. Others are relatively new and are either in the process of being accredited or else have no formal accreditation. The list includes:\n\nDhammakaya Open University – located in Azusa, California, part of the Thai Wat Phra Dhammakaya[1]\nDharmakirti College – located in Tucson, Arizona Now called Awam Tibetan Buddhist Institute (http://awaminstitute.org/)\nDharma Realm Buddh

In [152]:
# Verbose mode of Python regex: write the pattern over several lines with comments,
# which increases readability. Unescaped whitespace in the pattern is ignored, so
# literal spaces must be escaped ("\ "). The pattern is a raw string so that "\ " and
# "\w" reach the regex engine untouched instead of being parsed as string escapes.
pattern=r"""
(?P<title>.*)        #the university title
(–\ located\ in\ )   #an indicator of the location
(?P<city>\w*)        #city the university is in
(,\ )                #separator for the state
(?P<state>\w*)       #the state the city is located in"""

# Now when we call finditer() we just pass the re.VERBOSE flag as the last parameter, this makes it much
# easier to understand large regexes!
for item in re.finditer(pattern,wiki,re.VERBOSE):
    # We can get the dictionary returned for the item with .groupdict()
    print(item.groupdict())

{'title': 'Dhammakaya Open University ', 'city': 'Azusa', 'state': 'California'}
{'title': 'Dharmakirti College ', 'city': 'Tucson', 'state': 'Arizona'}
{'title': 'Dharma Realm Buddhist University ', 'city': 'Ukiah', 'state': 'California'}
{'title': 'Ewam Buddhist Institute ', 'city': 'Arlee', 'state': 'Montana'}
{'title': 'Institute of Buddhist Studies ', 'city': 'Berkeley', 'state': 'California'}
{'title': 'Maitripa College ', 'city': 'Portland', 'state': 'Oregon'}
{'title': 'University of the West ', 'city': 'Rosemead', 'state': 'California'}
{'title': 'Won Institute of Graduate Studies ', 'city': 'Glenside', 'state': 'Pennsylvania'}


# Example: New York Times and Hashtags

In [153]:
# Load the New York Times health tweets.
# The encoding is passed by keyword instead of the positional (buffering, encoding) pair.
with open("datasets/nytimeshealth.txt", "r", encoding="utf-8") as file:
    health = file.read()
health

Customers Are Generally Happy http://nyti.ms/1q0OTGZ\n487095289883656192|Thu Jul 10 04:45:41 +0000 2014|Race Is On to Profit From Rise of Urgent Care http://nyti.ms/1sA0YA4\n487078036408254464|Thu Jul 10 03:37:08 +0000 2014|America’s young people, as a group, are becoming more out of shape with every passing year http://nyti.ms/1mIZ80m\n487041531824459776|Thu Jul 10 01:12:04 +0000 2014|Drug-resistant tuberculosis is on the rise http://nyti.ms/1xU0ZSr via @nytopinion\n487018872982097921|Wed Jul 09 23:42:02 +0000 2014|Lettuce turnip the beet http://nyti.ms/1xTUoYa\n486984329260974080|Wed Jul 09 21:24:46 +0000 2014|Europe Fines Servier in Pay-for-Delay Crackdown http://nyti.ms/VYT09l\n486965947119702016|Wed Jul 09 20:11:43 +0000 2014|Staple closures are faster, but suture closures are safer after a C-section http://nyti.ms/1xTFtx3\n486935846335950849|Wed Jul 09 18:12:07 +0000 2014|No, contraception coverage does not usually pay for itself http://nyti.ms/1mJoeMl\n486920209425760256|Wed Jul

In [164]:
# List all the hashtags: a "#" followed by word characters, with a look-ahead
# requiring whitespace right after the tag.
# \w already includes digits, so the extra \d in the set was redundant.
# Raw string keeps \w and \s intact for the regex engine.

pattern=r"(?P<hashtag>#\w*)(?=\s)"

for item in re.finditer(pattern, health):
    print(item.group('hashtag'))

#askwell
#pregnancy
#Colorado
#VegetarianThanksgiving
#FallPrevention
#Ebola
#Ebola
#ebola
#Ebola
#Ebola
#EbolaHysteria
#AskNYT
#Ebola
#Ebola
#Liberia
#Excalibur
#ebola
#Ebola
#dallas
#nobelprize2014
#ebola
#ebola
#monrovia
#ebola
#nobelprize2014
#ebola
#nobelprize2014
#Medicine
#Ebola
#Monrovia
#Ebola
#smell
#Ebola
#Ebola
#Ebola
#Monrovia
#Ebola
#ebola
#monrovia
#liberia
#benzos
#ClimateChange
#Whole
#Wheat
#Focaccia
#Tomatoes
#Olives
#Recipes
#Health
#Ebola
#Monrovia
#Liberia
#Ebola
#Ebola
#Liberia
#Ebola
#blood
#Ebola
#organtrafficking
#EbolaOutbreak
#SierraLeone
#Freetown
#SierraLeone
#ebolaoutbreak
#kenema
#ebola
#Ebola
#ebola
#ebola
#Ebola
#ASMR
#AIDS2014
#AIDS
#MH17
#benzos
