# Chapter 8: Sample Notebook

This notebook contains all code from Chapter 8: _Quantifying Text Complexity_.

In [1]:
import re

## 8.2 Calculating Text Length

In [2]:
def identify_words(input_text:str):
    """Return every word found in ``input_text``, in order of appearance.

    A word is a run of ASCII letters, apostrophes and hyphens delimited by
    regex word boundaries; digits and other punctuation are never part of
    a returned token.
    """
    word_pattern = r"\b[a-zA-Z\'\-]+\b"
    # the pattern has no capture groups, so each match's full text is the word
    return [match.group(0) for match in re.finditer(word_pattern, input_text)]
    
def count_words(input_text:str):
    """Return how many words ``identify_words`` extracts from ``input_text``."""
    # the word count is the length of the extracted word list
    return len(identify_words(input_text))

In [3]:
# excerpt from Microsoft Corporation's 2016 10-K.
# `text` is the sample passage reused by the sentence-count and fog-index cells below
text = """We acquire other companies and intangible assets and may not realize all the economic benefit from those acquisitions, which could cause an impairment of goodwill or intangibles. We review our amortizable intangible assets for impairment when events or changes in circumstances indicate the carrying value may not be recoverable. We test goodwill for impairment at least annually. Factors that may be a change in circumstances, indicating that the carrying value of our goodwill or amortizable intangible assets may not be recoverable, include a decline in our stock price and market capitalization, reduced future cash flow estimates, and slower growth rates in industry segments in which we participate. We may be required to record a significant charge on our consolidated financial statements during the period in which any impairment of our goodwill or amortizable intangible assets is determined, negatively affecting our results of operations."""

# count the words in the excerpt and report the result
text_length = count_words(text)
print(f"Number of words in text: {text_length}")

Number of words in text: 143


In [5]:
def identify_sentences(input_text:str):
    """Return the sentences found in ``input_text``, in order of appearance.

    A sentence starts at an uppercase ASCII letter on a word boundary and
    ends with ``.``, ``!`` or ``?``. A period directly followed by a digit
    (as in ``3.5``) may be consumed as part of the sentence instead of
    ending it. Text that never reaches a terminator yields no sentence.
    """
    sentence_pattern = r"\b[A-Z](?:[^\.!?]|\.\d)*[\.!?]"
    # the only group is non-capturing, so each match's full text is the sentence
    return [match.group(0) for match in re.finditer(sentence_pattern, input_text)]
    
def count_sentences(input_text:str):
    """Return how many sentences ``identify_sentences`` extracts from ``input_text``."""
    # the sentence count is the length of the extracted sentence list
    return len(identify_sentences(input_text))

# count the sentences in the sample excerpt and report the result
num_sentences = count_sentences(text)
print(f"Number of sentences in text: {num_sentences}")

Number of sentences in text: 5


## 8.3 Measuring Text Readability Using the Fog Index

### 8.3.1 Writing a Function to Calculate the Fog Index

In [6]:
# Compiled once and shared by count_syllables. Each match is a vowel
# (a, e, i, o, u, y; case-insensitive) that sits at the start of the string or
# directly after a non-vowel character, i.e. the first vowel of a vowel group.
# The (?!e$) lookahead skips an "e" that is the final character of the word.
re_syllables = re.compile(r'(^|[^aeuoiy])(?!e$)[aeouiy]', re.IGNORECASE)

def count_syllables(word:str):
    """Estimate the number of syllables in ``word``.

    The estimate is the number of ``re_syllables`` matches: one per vowel
    group, not counting a word-final "e". It can be 0 (e.g. for "the").
    """
    # tally matches lazily instead of materialising the list of matches
    return sum(1 for _ in re_syllables.finditer(word))

def is_complex_word(word:str):
    """Return True when ``count_syllables`` reports at least three syllables for ``word``."""
    syllable_total = count_syllables(word)
    return 3 <= syllable_total

In [7]:
# sanity-check the syllable counter and the complexity test on three sample words
# "Text": a short word
print("Number of syllables in word \"Text\":",count_syllables("Text"))
print("Is word \"Text\" complex:", is_complex_word("Text"))

# "analysis": a longer word with several vowel groups
print("Number of syllables in word \"analysis\":",count_syllables("analysis"))
print("Is word \"analysis\" complex?:",is_complex_word("analysis"))

# "procedure": ends in "e", which the syllable pattern does not count
print("Number of syllables in word \"procedure\":",count_syllables("procedure"))
print("Is word \"procedure\" complex?:",is_complex_word("procedure"))

Number of syllables in word "Text": 1
Is word "Text" complex: False
Number of syllables in word "analysis": 4
Is word "analysis" complex?: True
Number of syllables in word "procedure": 3
Is word "procedure" complex?: True


In [10]:
def calculate_fog(text:str):
    """Calculates the fog index for a given text (string).

    The index is ``0.4 * (words per sentence + 100 * share of complex words)``,
    where sentences come from ``identify_sentences``, words from
    ``identify_words`` and a complex word is one ``is_complex_word`` accepts.

    Args:
        text: the text to score.

    Returns:
        The fog index as a float. Text in which no word is detected scores
        0.0. Text that contains words but no detectable sentence (for
        example, no terminal punctuation) is scored as a single sentence.
        Neither case raises ``ZeroDivisionError``.
    """
    # extracts all sentences and all words from the input text
    sentences = identify_sentences(text)
    words = identify_words(text)

    word_count = len(words)
    if word_count == 0:
        # nothing to measure; also avoids dividing by a zero word count
        return 0.0

    # number of words that pass the is_complex_word filter
    complex_count = sum(1 for word in words if is_complex_word(word))
    # words were found, so treat the text as at least one sentence
    sentence_count = max(len(sentences), 1)

    # true division on ints yields the same floats as the explicit float() casts
    return 0.4 * (word_count / sentence_count + 100 * complex_count / word_count)

In [14]:
# compute the fog index of the sample excerpt with the hand-written function
fog_score = calculate_fog(text)
print("The fog index is", fog_score)

The fog index is 21.78965034965035


### 8.3.2 Using Python Packages to Calculate the Fog Index

In [15]:
# Readability class provides methods to compute various readability metrics
# NOTE(review): `readability` is not part of the standard library — presumably the
# py-readability-metrics package; confirm it is installed in the kernel's environment
from readability import Readability

# create a new Readability object with the example text as an input
r = Readability(text)

# calculate and output the fog index
# note: this rebinds fog_score, which held the float returned by calculate_fog,
# to whatever r.gunning_fog() returns
fog_score = r.gunning_fog()
print(fog_score)

score: 21.78965034965035, grade_level: 'college_graduate'
