# Chapter 6: Sample Notebook

This notebook contains all code from Chapter 6: _Introduction to Regular Expressions_.

In [1]:
import re

## 6.1 Looking for Patterns in Text

In [2]:
# make Python's regular-expression module available
import re

text = "OI for FY 2019 was 12.4 billion, up more than eight percent from OI in FY 2018."

# re.search -> Match object for the first "OI" (None when there is no match)
x1 = re.search(r"OI", text)
print(f'Result of re.search:\n{x1}\n')

# re.findall -> list holding every occurrence of "OI"
x2 = re.findall(r"OI", text)
print(f'Result of re.findall:\n{x2}\n')

# re.split -> pieces of the text obtained by cutting at each ","
x3 = re.split(r",", text)
print(f'Result of re.split:\n{x3}\n')

# re.sub -> copy of the text with every "OI" replaced by "Operating Income"
x4 = re.sub(r"OI", "Operating Income", text)
print(f'Result of re.sub:\n{x4}')

Result of re.search:
<re.Match object; span=(0, 2), match='OI'>

Result of re.findall:
['OI', 'OI']

Result of re.split:
['OI for FY 2019 was 12.4 billion', ' up more than eight percent from OI in FY 2018.']

Result of re.sub:
Operating Income for FY 2019 was 12.4 billion, up more than eight percent from Operating Income in FY 2018.


In [3]:
# One sentence containing two differently-cased spellings of "MD&A"
mdna_sentence = "This year's MD&a Section is located... Please refer to our md&A section on page..."

# Option 1: leave the text untouched and make the match itself case-insensitive
x1 = re.findall(r'MD&A', mdna_sentence, re.IGNORECASE)

# Option 2: lower-case the text first, then look for the lower-case spelling
x2 = re.findall(r'md&a', mdna_sentence.lower())

print(x1)
print(x2)

['MD&a', 'md&A']
['md&a', 'md&a']


## 6.2 Characters and Character Sets

### Special Characters

Match the dollar sign in the string by using a backslash in the pattern. This works.

In [4]:
# "$" and "." are both regex metacharacters, so each is escaped with a backslash.
# Leaving "." unescaped would let it stand for any character (e.g. "$4x99").
price_pattern = r"\$4\.99"
print(re.search(price_pattern, "Apple's Earnings per Share for the three months ended in December 2019 was $4.99,"))

<re.Match object; span=(75, 80), match='$4.99'>


Omit the backslash in the pattern. This does not work, because an unescaped `$` is the end-of-string anchor rather than a literal dollar sign.

In [5]:
# Without a backslash, "$" is the end-of-string anchor, not a literal dollar
# sign; nothing can follow the end of the string, so the search returns None.
eps_sentence = "Apple's Earnings per Share for the three months ended in December 2019 was $4.99,"
print(re.search(r"$4.99", eps_sentence))

None


Match the backslash in the string by escaping it with a second backslash in the pattern. This works.

In [6]:
# The subject is written as a raw string so its backslash is taken literally;
# in a plain string "\A" is an invalid escape sequence that newer Python
# versions warn about. The resulting text is the same: Form S-1, backslash, A.
form_text = r"Form S-1\A"

# In the pattern, "\\" stands for exactly one literal backslash.
print(re.search(r"S-1\\A", form_text))

<re.Match object; span=(5, 10), match='S-1\\A'>


Omit the escaping backslash in the pattern. This does not work, because `\A` is then read as the start-of-string anchor.

In [7]:
# The subject is written as a raw string so its backslash is taken literally;
# in a plain string "\A" is an invalid escape sequence that newer Python
# versions warn about. The resulting text is the same: Form S-1, backslash, A.
form_text = r"Form S-1\A"

# With a single backslash the pattern contains "\A", the start-of-string
# anchor, which cannot hold after "S-1" has been consumed, so the result is None.
print(re.search(r"S-1\A", form_text))

None


### Character Sets in Regex

In [8]:
text = "This project has increased our revenues by more than 70% in FY 2019."

# every individual digit, one list entry per digit
x1 = re.findall(r'[0-9]', text)

# negated set: every character that is NOT a letter, space, period, or comma
# (for this text that leaves the digits and the "%" sign)
x2 = re.findall(r'[^a-zA-Z \.,]', text)

# every two-digit number immediately followed by "%"
x3 = re.findall(r'\d\d%', text)

# one result per line, in the order they were computed
print(x1, x2, x3, sep="\n")

['7', '0', '2', '0', '1', '9']
['7', '0', '%', '2', '0', '1', '9']
['70%']


## 6.3 Anchors and Boundaries in Regex

In [9]:
# "^" anchors the pattern to the start of the string, and this string
# does begin with "inf", so a match is found
starts_with_inf = r"^inf"
print(re.search(starts_with_inf, "information retrieval"))

<re.Match object; span=(0, 3), match='inf'>


In [10]:
# Here "inf" appears only in the middle of the string, so the "^" anchor
# prevents a match and None is returned
starts_with_inf = r"^inf"
print(re.search(starts_with_inf, "retrieval of information"))

None


In [11]:
# "\b" on both sides requires "high" to stand alone as a complete word,
# which it does in this string
whole_word_high = r"\bhigh\b"
print(re.search(whole_word_high, "high income"))

<re.Match object; span=(0, 4), match='high'>


In [12]:
# In "higher" the letters "high" are followed by more word characters, so the
# closing "\b" fails and there is no match
whole_word_high = r"\bhigh\b"
print(re.search(whole_word_high, "higher income"))

None


In [13]:
# "\B" is the opposite of "\b": it requires that "high" is NOT followed by a
# word boundary, i.e. that the word continues -- true for "higher"
high_as_prefix = r"\bhigh\B"
print(re.search(high_as_prefix, "higher income"))

<re.Match object; span=(0, 4), match='high'>


In [14]:
# Here "high" is a complete word, so a word boundary follows it and the
# trailing "\B" cannot be satisfied: no match
high_as_prefix = r"\bhigh\B"
print(re.search(high_as_prefix, "high income"))

None


## 6.4 Quantifiers in Regex

In [15]:
# "^" pins the match to the start of the string, "\w+" takes one or more word
# characters, and "\b" ends the match at a word boundary -- i.e. the first word
first_word = r"^\w+\b"
print(re.search(first_word, "high income"))

<re.Match object; span=(0, 4), match='high'>


In [16]:
# "19" followed by exactly two more digits ({2}), delimited by word boundaries:
# any whole number from 1900 through 1999
year_19xx = r"\b19\d{2}\b"
print(re.search(year_19xx, "1975"))

<re.Match object; span=(0, 4), match='1975'>


In [17]:
# {2,3} accepts two or three digits; the word boundaries make sure the digits
# form a complete number rather than part of a longer one
two_or_three_digits = r"\b\d{2,3}\b"
print(re.findall(two_or_three_digits, "The financial crisis occurred in 08 and 09"))

['08', '09']


## 6.5 Groups in Regex

In [18]:
# The parentheses form capture group 1: a "$" followed by one or more digits,
# commas, or periods. The text around the group must match but is not captured.
assets_pattern = r"Total Assets = (\$[\d,\.]+)\b"
m = re.match(assets_pattern, "Total Assets = $10,000,000")

# group(0) is the entire matched text
print(f'FULL MATCH:\n{m.group(0)}')
print()
# group(1) is only what the parentheses captured
print(f'GROUP MATCH:\n{m.group(1)}')

FULL MATCH:
Total Assets = $10,000,000

GROUP MATCH:
$10,000,000


In [19]:
# Capture group 1 is the e-mail address: a local part, "@", a domain, and the
# literal suffix ".edu". The dot before "edu" is escaped so that it matches only
# a real period; unescaped it would accept any character (e.g. "abc@ABCxedu").
email_pattern = r"Email: ([\w\.-]+@[\w\.-]+\.edu)"
m = re.match(email_pattern, "Email: abc@ABC.edu")

# group(0) is the entire matched text
print(f'FULL MATCH:\n{m.group(0)}')
print()
# group(1) is only the captured address
print(f'GROUP MATCH:\n{m.group(1)}')

FULL MATCH:
Email: abc@ABC.edu

GROUP MATCH:
abc@ABC.edu


In [20]:
# Inside the group, "|" is alternation: the form type may be either
# "10-K" or "10-Q", and whichever one matched is captured as group 1
form_type_pattern = r"Form (10-K|10-Q)"
m = re.match(form_type_pattern, "Form 10-K")

# group(0) is the entire matched text
print(f'FULL MATCH:\n{m.group(0)}')
print()
# group(1) is the alternative that was matched
print(f'GROUP MATCH:\n{m.group(1)}')

FULL MATCH:
Form 10-K

GROUP MATCH:
10-K


In [21]:
# "(\w+)" captures a word, "\s" matches the whitespace after it, and the
# backreference "\1" requires the very same word again: a doubled word
doubled_word = r"\b(\w+)\s\1\b"
print(re.search(doubled_word, "above above"))

<re.Match object; span=(0, 11), match='above above'>


## 6.6 Lookahead and Lookbehind in Regex

In [22]:
# Positive lookahead "(?=...)": "filename" matches only if ".txt" comes right
# after it. The lookahead itself consumes nothing, so the match is "filename".
followed_by_txt = r"filename(?=\.txt)"
print(re.search(followed_by_txt, "filename.txt"))
print(re.search(followed_by_txt, "filename.csv"))

<re.Match object; span=(0, 8), match='filename'>
None


In [23]:
# Negative lookahead "(?!...)": "filename" matches only if it is NOT
# immediately followed by ".txt"
not_followed_by_txt = r"filename(?!\.txt)"
print(re.search(not_followed_by_txt, "filename.txt"))
print(re.search(not_followed_by_txt, "filename.csv"))

None
<re.Match object; span=(0, 8), match='filename'>


In [24]:
# Positive lookbehind "(?<=...)": four digits match only if the text right
# before them is "year" plus one whitespace character. Only the digits are
# part of the match.
after_year = r"(?<=year\s)\d{4}"
print(re.search(after_year, "year 2020"))
print(re.search(after_year, "series 2020"))

<re.Match object; span=(5, 9), match='2020'>
None


In [25]:
# Negative lookbehind "(?<!...)": four digits match only if the text right
# before them is NOT "year" plus one whitespace character
not_after_year = r"(?<!year\s)\d{4}"
print(re.search(not_after_year, "year 2020"))
print(re.search(not_after_year, "series 2020"))

None
<re.Match object; span=(7, 11), match='2020'>


## 6.7 Examples of Regex for different textual analysis tasks

#### Example 1: Character Sets

In [26]:
# sample disclosure text containing percentages in two notations
text = "This project has resulted in over 70% of our 2019 revenues to date. As a result, our operating income increased by 9%, while our operating expenses increased by 12%. We had a 12.5 percent increase in regional sales."

# A run of digits and periods, followed either by "%" or by whitespace and the
# whole word "percent". The "(?:...)" group is non-capturing, so re.findall
# returns the full matched text instead of just the group's content.
percent_pattern = r'[\d\.]+(?:\%|\s\bpercent\b)'
x = re.findall(percent_pattern, text)

print(x)

['70%', '9%', '12%', '12.5 percent']


#### Example 2: Character Sets, Quantifiers, Groups, Lookbehinds

In [27]:
# sample of the standard header block found at the top of SEC filings
header = """<SEC-HEADER>0000080424-18-000100.hdr.sgml : 20181019
<ACCEPTANCE-DATETIME>20181019161731
ACCESSION NUMBER:		0000080424-18-000100
CONFORMED SUBMISSION TYPE:	10-Q
PUBLIC DOCUMENT COUNT:		68
CONFORMED PERIOD OF REPORT:	20180930
FILED AS OF DATE:		20181019
DATE AS OF CHANGE:		20181019
FILER:
	COMPANY DATA:	
		COMPANY CONFORMED NAME:			PROCTER & GAMBLE Co
		CENTRAL INDEX KEY:			0000080424
		STANDARD INDUSTRIAL CLASSIFICATION:	SOAP, DETERGENT, CLEANING PREPARATIONS, PERFUMES, COSMETICS [2840]
		IRS NUMBER:				310411980
		STATE OF INCORPORATION:			OH
		FISCAL YEAR END:			0630
[...]
</SEC-HEADER>"""

# CIK: the label "CENTRAL INDEX KEY:", then any amount of whitespace (\s*),
# then a group capturing exactly ten digits ({10}).
# Because the pattern contains a group, re.findall returns only the group's
# text rather than the full match.
cik = re.findall(r"CENTRAL INDEX KEY:\s*(\d{10})", header)

# Company name: the label "COMPANY CONFORMED NAME:", optional whitespace, then
# a group capturing one or more characters up to "$".
# With re.MULTILINE, "^" and "$" match at the start and end of every line
# instead of only at the start and end of the whole string, so the capture
# stops at the end of the label's own line.
company_name = re.findall(r"COMPANY CONFORMED NAME:\s*(.+)$", header, flags=re.MULTILINE)

# Filing date: the label "FILED AS OF DATE:", optional whitespace, then a group
# capturing eight digits (the dates in this header are written as YYYYMMDD).
filing_date = re.findall(r'FILED AS OF DATE:\s*(\d{8})', header)

# SIC code: the positive lookbehind requires the match to begin directly after
# "STANDARD INDUSTRIAL CLASSIFICATION:". The greedy ".+" then runs across the
# rest of the line and backtracks just far enough for the group to capture the
# last run of four digits on that line.
sic = re.findall(r'(?<=STANDARD INDUSTRIAL CLASSIFICATION:).+(\d{4})', header)

# one extracted field per line
print(cik, company_name, filing_date, sic, sep="\n")

['0000080424']
['PROCTER & GAMBLE Co']
['20181019']
['2840']


#### Example 3: Word Boundaries and Quantifiers

In [28]:
# excerpt of risk-factor language used as the input text
text = "An investment in our common stock involves a high degree of risk. You should carefully consider the risks summarized below. The risks are discussed more fully in the Risk Factors section of this prospectus immediately following this prospectus summary. These risks include, but are not limited to, the following [...] These operations are risky [...] Marcroeconomic fluctuations increase the riskiness of our operations. As indicated in Section 2.1, our company's long-term risks include [...]"

# Words beginning with "risk": a word boundary, the letters "risk", zero or
# more further word characters, and a closing word boundary.
# re.IGNORECASE makes the match case-insensitive, so "Risk" is found as well.
risk_words = re.findall(r"\brisk\w*\b", text, flags=re.IGNORECASE)
# number of matches, i.e. how many words start with "risk"
risk_words_freq = len(risk_words)

# All words: runs of letters in which apostrophes and hyphens are allowed,
# so "company's" and "long-term" each count as a single word.
all_words = re.findall(r"\b[a-zA-Z\'\-]+\b", text)
# total number of words in the text
all_words_freq = len(all_words)

# share of risk-related words, expressed as a percentage of all words
text_riskiness = 100 * risk_words_freq / all_words_freq

# NOTE(review): the first label says "Number" but the line prints the matched
# words themselves; risk_words_freq holds the count. Left unchanged so the
# recorded output stays valid -- confirm which of the two is intended.
print(f'Number of risky words: {risk_words}')
print(f'Percent of risky words: {text_riskiness:.4f}%')

Number of risky words: ['risk', 'risks', 'risks', 'Risk', 'risks', 'risky', 'riskiness', 'risks']
Percent of risky words: 11.4286%
