In [1]:
import spacy
from textstat.textstat import textstatistics
from textstat import textstat

Generral steps to apply the formula:-
<ul>
    <li>Select several 100-word samples throughout the text. </li>
    <li>Compute the average sentence length in words (divide the number of words by the number of sentences).</li> 
    <li>Compute the percentage of words NOT on the Dale–Chall word list of 3, 000 easy words.</li> 
    <li>Compute the respective equations</li> 
</ul> 

In [3]:
def break_sentences(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    return list(doc.sents)

In [4]:
def word_count(text):
    sentences = break_sentences(text)
    words = 0
    for sentence in sentences:
        words += len([token for token in sentence])
    return words

In [5]:
def sentence_count(text):
    sentences = break_sentences(text)
    return len(sentences)

In [6]:
def avg_sentence_length(text):
    words = word_count(text)
    sentences = sentence_count(text)
    average_sentence_length = float(words / sentences)
    return average_sentence_length

In [7]:
def syllables_count(word):
    return textstatistics().syllable_count(word)

In [9]:
def difficult_words(text):
     
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    
    words = []
    sentences = break_sentences(text)
    for sentence in sentences:
        words += [str(token) for token in sentence]
 
    # difficult words are those with syllables >= 2 
    # easy_word_set is provide by Textstat as a list of common words
    diff_words_set = set()
     
    for word in words:
        syllable_count = syllables_count(word)
        if word not in nlp.Defaults.stop_words and syllable_count >= 2:
            diff_words_set.add(word)
 
    return len(diff_words_set)

In [10]:
# A word is polysyllablic if it has more than 3 syllables. 
def poly_syllable_count(text):
    count = 0
    words = []
    sentences = break_sentences(text)
    for sentence in sentences:
        words += [token for token in sentence]
     
 
    for word in words:
        syllable_count = syllables_count(word)
        if syllable_count >= 3:
            count += 1
    return count

### Gunning Fog
The Gunning fog index measures the readability of English writing. The index estimates the years of formal education needed to understand the text on a first reading. A fog index of 12 requires the reading level of a U.S. high school senior (around 18 years old). <br> <br>
Grade level= 0.4 * ( (average sentence length) + (percentage of Hard Words))<br>
&nbsp; &nbsp; Here, Hard Words = words with more than two syllables. 

In [11]:
def gunning_fog(text):
    per_diff_words = (difficult_words(text) / word_count(text) * 100) + 5
    grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
    return grade

In [12]:
text1 = "account termination policy youtube will terminate a user s access to the service if under appropriate circumstances the user is determined to be a repeat infringer. youtube reserves the right to decide whether content violates these terms of service for reasons other than copyright infringement such as but not limited to pornography obscenity or excessive length. youtube may at any time without prior notice and in its sole discretion remove such content and or terminate a user s account for submitting such material in violation of these terms of service."
text2 = "if you infringe copyright multiple times we close your account. if you are in violation of our community guidelines we may do that immediately."
score_text1 = gunning_fog(text1)
score_text2 = gunning_fog(text2)
print(score_text1, score_text2)

27.30322580645161 19.50769230769231


### Flesch-Kincaid Grade Level
The U.S. Army uses Flesch-Kincaid Grade Level for assessing the difficulty of technical manuals. The commonwealth of Pennsylvania uses Flesch-Kincaid Grade Level for scoring automobile insurance policies to ensure their texts are no higher than a ninth grade level of reading difficulty. Many other U.S. states also use Flesch-Kincaid Grade Level to score other legal documents such as business policies and financial forms.

In [14]:
def flesch_kincaid(text):
    num_words = word_count(text)
    num_sentences = sentence_count(text)
    num_syllables = syllables_count(text)
    score =  0.39 * (num_words / num_sentences) + 11.8 * (num_syllables / num_words) - 15.59
    return score

In [15]:
text1 = "account termination policy youtube will terminate a user s access to the service if under appropriate circumstances the user is determined to be a repeat infringer. youtube reserves the right to decide whether content violates these terms of service for reasons other than copyright infringement such as but not limited to pornography obscenity or excessive length. youtube may at any time without prior notice and in its sole discretion remove such content and or terminate a user s account for submitting such material in violation of these terms of service."
text2 = "if you infringe copyright multiple times we close your account. if you are in violation of our community guidelines we may do that immediately."
score_text1 = flesch_kincaid(text1)
score_text2 = flesch_kincaid(text2)
print(score_text1, score_text2)

15.786021505376347 7.633846153846157


### Automated Readability Index (ARI)
Unlike the other indices, the ARI, along with the Coleman-Liau, relies on a factor of characters per word, instead of the usual syllables per word. ARI is widely used on all types of texts.

In [16]:
def ARI(text):
    num_characters = len(text)
    num_words = word_count(text)
    num_sentences = sentence_count(text)
    score = 4.71 * (num_characters / num_words) + 0.5 * (num_words / num_sentences) - 21.43
    return score

In [17]:
text1 = "account termination policy youtube will terminate a user s access to the service if under appropriate circumstances the user is determined to be a repeat infringer. youtube reserves the right to decide whether content violates these terms of service for reasons other than copyright infringement such as but not limited to pornography obscenity or excessive length. youtube may at any time without prior notice and in its sole discretion remove such content and or terminate a user s account for submitting such material in violation of these terms of service."
text2 = "if you infringe copyright multiple times we close your account. if you are in violation of our community guidelines we may do that immediately."
score_text1 = ARI(text1)
score_text2 = ARI(text2)
print(score_text1, score_text2)

22.431290322580644 10.975000000000001
