# Regex manipulation

In [1]:
import re

In [2]:
# match() checks for a match at the beginning of the string and returns a bool
# serach() checks for a match anywhere in the string and returns a bool

# Example
text = "This is a good day"

if re.search("good", text):
    print("It is good!")
else:
    print("It may not be good")

It is good!


In [3]:
# findall() and split() parse the string and return chunks.
text = "Amy works digilgently. Amy gets good grades. Our student Amy is succesful."

# Splitting this on all occurences of Amy
re.split("Amy", text)

['',
 ' works digilgently. ',
 ' gets good grades. Our student ',
 ' is succesful.']

In [4]:
# To count all occurences
re.findall("Amy", text)

['Amy', 'Amy', 'Amy']

In [5]:
# For the more complex patterns, we have Anchors, which specify the start or the end of the string you're trying to match. The caret character ^ (Ctrl+Shift+u -> 5e)
# means the start and the dollar sign $ character means the end. 
# ^ means the string must start with the specified string and $ means that it must end with it. ^ goes before the string and $ after the string.

# Example
text = "Amy works diligently. Amy gets good grades. Our student Amy is succesful."

# Let's see if it starts with Amy
re.search("^Amy", text)

#search() returns a re.Match object which always has a value of true. It also tells you what pattern was matched, as well as the location as in the span.

<re.Match object; span=(0, 3), match='Amy'>

In [6]:
#Testing the dollar sign operator
re.search("is succesful.$", text)

<re.Match object; span=(60, 73), match='is succesful.'>

In [7]:
re.search("grades.$", text)

## Patterns and character clases

In [8]:
grades = "ACAAAABCBCBAA"

#Finding all B's
re.findall("B", grades)

['B', 'B', 'B']

In [9]:
# To find all A's and B's we can't do "AB" because this will match all A's followed by B's so we do this:
re.findall("[AB]", grades)

['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']

In [10]:
# This is the set operator. We can also do ranges with it, for example, all lower case letters are indicated like [a-z].
# For example, to find all A's followed by a B or a C we do:
re.findall("[A][B-C]", grades)

['AC', 'AB']

In [11]:
# There's also the | pipe operator which indicates OR.

re.findall("AB|AC", grades)

['AC', 'AB']

In [12]:
# We can use the ^ caret operator with the set operator [] to negate results.

re.findall("[^A]", grades)

['C', 'B', 'C', 'B', 'C', 'B']

In [13]:
# When we use these kinds of operators inside the set operator, they lose their meaning in a sense. For example the following matches all strings 
# that don't start with an A, which is none of them because the string starts with an A

re.findall("^[^A]", grades) 

# When using the set operator we are doing character based matching. So we are matching individual characters in an OR method.

[]

## Quantifiers

In [14]:
# Quantifiers are the number of times you want a pattern to be matched in order to match.
# The most basic quantifier is expressed as e(m,n), where e is the expression or the character to match, m is the minimum number of times you want it to be matched
# and n is the maximum number of times the item could be matched.

# Example. How many times has this student been on a back to back A's streak?
re.findall("A{2,10}", grades)

['AAAA', 'AA']

In [15]:
# We can try to use single values and repeating the pattern
re.findall("A{1,1}A{1,1}", grades)

['AA', 'AA', 'AA']

In [16]:
re.findall("A{1,1}A{1,1}A{1,1}",grades)

['AAA']

In [17]:
print(grades)

ACAAAABCBCBAA


In [18]:
# If you have an empty space in between the braces you'll get an empty result
re.findall("A{2, 2}", grades)

[]

In [19]:
# If we don't include a quantifier, the default is {1,1}
re.findall("AA", grades)

['AA', 'AA', 'AA']

In [20]:
# With just one number in the braces, the number is both m and n
re.findall("A{2}", grades)

['AA', 'AA', 'AA']

In [21]:
# We could also find a decreasing trend in a student's grade
re.findall("A{1,10}B{1,10}C{1,10}", grades)

['AAAABC']

In [52]:
# There are other special quantifiers such as: * to match 0 or more times, a question mark ? to match one or more times, or a plus sign + to match one or more times

# Example
with open("ferpa.txt", "r") as file:
    wiki = file.read()

wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [55]:
# As all the headers have the words [edit] followed by a newline character, to get a list of all the headers, we can do the following:
re.findall("[a-zA-Z]{1,100}\[edit\]", wiki)


['Overview[edit]', 'records[edit]', 'records[edit]']

In [24]:
# To improve on this and get all the headers, we can do something different. Using \w allows any character including digits and numbers.
re.findall("[\w]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [25]:
# \w is a metacharacter, and indicates a special patern of any letter or digit. There are others, such as \s which matches any whitespace character
# We can shorten the syntax using other quantifiers such as *.
re.findall("[\w]*\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [26]:
# We can also use spaces to include the space character.
re.findall("[\w ]*\[edit\]", wiki)

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [27]:
# Now it's possible to make a list of titles from the wikipedia page by iterating through this and applying another regex.
for title in re.findall("[\w ]*\[edit\]", wiki):
    # Now we'll take the intermediate result and split on the square bracket [ just taking the first result
    print(re.split("[\[]", title)[0])

Overview
Access to public records
Student medical records


## Groups

In [28]:
# We can also match a group of patterns instead of a single one, which are called groups.
re.findall("([\w ]*)(\[edit\])", wiki)

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [29]:
# If we want a list of match objects we can use finditer()
for item in re.finditer("([\w ]*)(\[edit\])", wiki):
    print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [30]:
# As groups() returns a tuple, we can get an individual group using group(number), where group(0) is the whole match, and the rest of the numbers are the portions
# we are interesed in.
for item in re.finditer("([\w ]*)(\[edit\])", wiki):
    print(item.group(1))

Overview
Access to public records
Student medical records


In [31]:
# Another thing that can be done with regex groups are labeling groups. Looking at the as a dictionary. (?P<name>), where the parenthesis starts the group,
# the ?P indicates that this is an extension to basic regexes, and <name> is the dictionary key we want to use.

for item in re.finditer("(?P<title>[\w ]*)(?P<edit_link>\[edit\])", wiki):
    print(item.groupdict()['title'])

Overview
Access to public records
Student medical records


In [32]:
# The edit link is still there. We can print the dictionary entry that includes it as well:
print(item.groupdict())

{'title': 'Student medical records', 'edit_link': '[edit]'}


In [33]:
# There are other kinds of shorthands for characters like \w. Such as a . for any single character which is not a newiine.
# \d for any digit
# \s for any whitespace character, like spaces and tabs.

## Look-ahead and Look-behind

In [34]:
# For this, the pattern we are trying to match is either before or after the text we are trying to isolate

for item in re.finditer("(?P<title>[\w ]+)(?=\[edit\])", wiki):
    # This regex is saying is to match two groups, the first named title, will have any amount of whitespapce or regular word characters, the second will
    # will be the characters [edit] but we don't want this edit in our output match objects
    print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(2715, 2739), match='Access to public records'>
<re.Match object; span=(3692, 3715), match='Student medical records'>


## Example: Wikipedia data

In [35]:
with open("buddhist.txt", "r") as file:
    wiki = file.read()

wiki

'Buddhist universities and colleges in the United States\nFrom Wikipedia, the free encyclopedia\nJump to navigationJump to search\n\nThis article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.\nFind sources: "Buddhist universities and colleges in the United States" – news · newspapers · books · scholar · JSTOR (December 2009) (Learn how and when to remove this template message)\nThere are several Buddhist universities in the United States. Some of these have existed for decades and are accredited. Others are relatively new and are either in the process of being accredited or else have no formal accreditation. The list includes:\n\nDhammakaya Open University – located in Azusa, California, part of the Thai Wat Phra Dhammakaya[1]\nDharmakirti College – located in Tucson, Arizona Now called Awam Tibetan Buddhist Institute (http://awaminstitute.org/)\nDharma Realm Buddh

In [36]:
# Regex in verbose mode, which allows for multiline regexes and comments
pattern="""
(?P<title>.*)       #The university title
(–\ located\ in\ )  #an indicator of the location
(?P<city>\w*)       #the city the university is in
(.\ )               #separator for the state
(?P<state>\w*)      #the state the city is located in"""

for item in re.finditer(pattern,wiki,re.VERBOSE): # re.VERBOSE as the last flag to make it read this multiline regex
    print(item.groupdict())

{'title': 'Dhammakaya Open University ', 'city': 'Azusa', 'state': 'California'}
{'title': 'Dharmakirti College ', 'city': 'Tucson', 'state': 'Arizona'}
{'title': 'Dharma Realm Buddhist University ', 'city': 'Ukiah', 'state': 'California'}
{'title': 'Ewam Buddhist Institute ', 'city': 'Arlee', 'state': 'Montana'}
{'title': 'Institute of Buddhist Studies ', 'city': 'Berkeley', 'state': 'California'}
{'title': 'Maitripa College ', 'city': 'Portland', 'state': 'Oregon'}
{'title': 'Soka University of America ', 'city': 'Alis', 'state': 'Viejo'}
{'title': 'University of the West ', 'city': 'Rosemead', 'state': 'California'}
{'title': 'Won Institute of Graduate Studies ', 'city': 'Glenside', 'state': 'Pennsylvania'}


In [None]:
with open("nytimeshealth.txt", "r") as file:
    health=file.read()
health

In [38]:
# In this case, there are tweets with fields separated by pipes |. Let's try and get a list of the hastags.

# The pattern is a # then any alphanumeric character and ends with any whitespace
pattern = '#[\w\d]*(?=\s)'

# The ending is a lookahead, because we are not interesting in matching a whitespace in the return value
re.findall(pattern, health)

['#askwell',
 '#pregnancy',
 '#Colorado',
 '#VegetarianThanksgiving',
 '#FallPrevention',
 '#Ebola',
 '#Ebola',
 '#ebola',
 '#Ebola',
 '#Ebola',
 '#EbolaHysteria',
 '#AskNYT',
 '#Ebola',
 '#Ebola',
 '#Liberia',
 '#Excalibur',
 '#ebola',
 '#Ebola',
 '#dallas',
 '#nobelprize2014',
 '#ebola',
 '#ebola',
 '#monrovia',
 '#ebola',
 '#nobelprize2014',
 '#ebola',
 '#nobelprize2014',
 '#Medicine',
 '#Ebola',
 '#Monrovia',
 '#Ebola',
 '#smell',
 '#Ebola',
 '#Ebola',
 '#Ebola',
 '#Monrovia',
 '#Ebola',
 '#ebola',
 '#monrovia',
 '#liberia',
 '#benzos',
 '#ClimateChange',
 '#Whole',
 '#Wheat',
 '#Focaccia',
 '#Tomatoes',
 '#Olives',
 '#Recipes',
 '#Health',
 '#Ebola',
 '#Monrovia',
 '#Liberia',
 '#Ebola',
 '#Ebola',
 '#Liberia',
 '#Ebola',
 '#blood',
 '#Ebola',
 '#organtrafficking',
 '#EbolaOutbreak',
 '#SierraLeone',
 '#Freetown',
 '#SierraLeone',
 '#ebolaoutbreak',
 '#kenema',
 '#ebola',
 '#Ebola',
 '#ebola',
 '#ebola',
 '#Ebola',
 '#ASMR',
 '#AIDS2014',
 '#AIDS',
 '#MH17',
 '#benzos']

## Regex practice session

In [108]:
# What is the correct regular expression to match a URL with letters, numbers, underscores and dots? A valid URL defined in this problem must meet the following requirements:

# The URL consists of two or more strings made of letters, numbers, and underscores.
# A dot is used in between the strings.
# No two dots are allowed to appear consecutively.


text = "For example, your regex should match URLs like: www.aBC.com , abc.com , ab_c.de8f.com tH1sUrl.com But your regex should not match: abc , abc..com"

re.findall("[www\.]?[\w\.]+com", text)

['www.aBC.com', 'abc.com', 'ab_c.de8f.com', 'tH1sUrl.com', 'abc..com']