In [1]:
!pip install textstat



# English

In [2]:
import textstat

In [3]:
# Use English as the language for the textstat calls that follow.
textstat.set_lang("en")

In [4]:
# Sample English paragraph scored by the cells below. It is written line by
# line so that the leading newline and the trailing spaces at the line ends
# are explicit.
text = (
    "\n"
    "Data science is the main focus of most sciences and studies right now, \n"
    "it needs a lot of things like AI, programming, statistics, \n"
    "business understanding, effective presentation skills and much more. \n"
    "That's why it's not easy to understand or study. But we can do it, we are doing it.\n"
    "Data science has become the standard solving problem framework for academia and \n"
    "the industry and it's going to be like that for a while. But we need to remember \n"
    "where we are coming from, who we are and where we are going.\n"
)

In [5]:
# Total number of syllables counted across the whole text.
textstat.syllable_count(text)

126

In [6]:
# Lexicon (word) count; removepunct=True asks textstat to strip punctuation
# before counting.
textstat.lexicon_count(text, removepunct=True)

91

In [7]:
# Number of sentences textstat detects in the text.
textstat.sentence_count(text)

4

In [8]:
# Flesch Reading Ease score: higher values indicate text that is easier to read.
textstat.flesch_reading_ease(text)

65.25

A score of around 65 means this text is of "standard" reading difficulty.

In [9]:
# Flesch-Kincaid Grade Level: the score is expressed as a US school grade level.
textstat.flesch_kincaid_grade(text)

9.8

A grade level of 9.8 means the text should be readable by roughly a ninth- or tenth-grade student, which is consistent with the "standard" Flesch Reading Ease score above.

In [10]:
# Gunning FOG index (Fog Scale), reported as a US school grade level.
textstat.gunning_fog(text)

11.76

Meaning that a high school junior (11th grade) can read this.

In [11]:
# SMOG Index: a grade-level estimate similar to Gunning FOG.
# NOTE: the textstat docs state SMOG is only statistically valid for texts of
# 30+ sentences; this sample has far fewer, so treat the value as indicative.
textstat.smog_index(text)

11.2

In [12]:
# Automated Readability Index: approximates the US grade level needed to
# understand the text.
textstat.automated_readability_index(text)

11.8

Meaning that an eleventh-grade student can read it.

In [13]:
# Coleman-Liau Index: another grade-level estimate of the text.
textstat.coleman_liau_index(text)

8.94

In [14]:
# Linsear Write Formula: grade-level estimate of the text.
textstat.linsear_write_formula(text)

10.7

In [15]:
# Dale-Chall Readability Score; per the textstat docs it is based on how many
# words fall outside a list of commonly used English words.
textstat.dale_chall_readability_score(text)

7.54

Meaning that an average 9th or 10th-grade student can read it.

In [16]:
# Readability consensus across the individual tests. With float_output=False
# the estimated grade range comes back as a string.
textstat.text_standard(text, float_output=False)

'11th and 12th grade'

Meaning that, in general, someone who has finished 11th or 12th grade could understand this piece.

In [17]:
# Estimated time to read the text, in seconds (the textstat docs describe this
# as a fixed per-character reading speed).
textstat.reading_time(text)

6.08

In [18]:
# Run all at once
import inspect

# inspect.getmembers() returns (name, value) pairs sorted by name; with the
# ismethod predicate it yields the bound methods textstat exposes. Query the
# module once and slice, instead of re-running getmembers() for every index;
# a slice also tolerates a textstat version with fewer members, where indexing
# would raise IndexError.
# NOTE(review): the 1..27 window is tied to the textstat version used here. It
# matches the names printed by the run-all loop (avg_character_per_word through
# text_standard) and leaves out the member at index 0 — confirm that is intended.
_textstat_methods = inspect.getmembers(textstat, predicate=inspect.ismethod)
funcs = ["textstat." + name for name, _ in _textstat_methods[1:28]]

In [19]:
for elem in funcs:
    # Each entry has the form "textstat.<name>"; resolve it with getattr rather
    # than eval so that no constructed string is executed as code.
    name = elem.split(".")[1]
    method = getattr(textstat, name)
    # funcs includes set_lang, which this loop calls with the sample text as its
    # argument; reset the language before every call so each metric runs with "en".
    textstat.set_lang("en")
    print(name)
    print(method(text))
    print(" ")

avg_character_per_word
4.64
 
avg_letter_per_word
4.47
 
avg_sentence_length
22.8
 
avg_sentence_per_word
0.04
 
avg_syllables_per_word
1.4
 
char_count
422
 
coleman_liau_index
8.94
 
dale_chall_readability_score
7.54
 
dale_chall_readability_score_v2
7.54
 
difficult_words
16
 
difficult_words_list
['data', 'programming', 'presentation', 'problem', 'industry', 'focus', 'framework', 'statistics', 'understanding', 'standard', 'doing', 'science', 'studies', 'solving', 'sciences', 'effective']
 
flesch_kincaid_grade
9.8
 
flesch_reading_ease
65.25
 
gunning_fog
11.76
 
letter_count
407
 
lexicon_count
91
 
linsear_write_formula
10.7
 
lix
42.58
 
polysyllabcount
8
 
reading_time
6.08
 
rix
4.5
 
sentence_count
4
 
set_lang
None
 
smog_index
11.2
 
spache_readability
5.5588379120879114
 
syllable_count
126
 
text_standard
11th and 12th grade
 


# Spanish - Español

In [20]:
# Spanish version of the sample paragraph, written line by line so that the
# leading newline and the trailing spaces at the line ends are explicit.
text = (
    "\n"
    "La ciencia de datos es el foco principal de la mayoría de las ciencias y estudios en este momento, \n"
    "necesita muchas cosas como inteligencia artificial, programación, estadísticas, \n"
    "comprensión del negocio, habilidades de presentación efectivas y mucho más. \n"
    "Por eso no es fácil de entender o estudiar. Pero podemos hacerlo, lo estamos haciendo.\n"
    "La ciencia de datos se ha convertido en el marco de resolución de \n"
    "problemas estándar para la academia y la industria y va a ser así \n"
    "por un tiempo. Pero debemos recordar de dónde venimos, \n"
    "quiénes somos y hacia dónde vamos.\n"
)

In [21]:
# Switch textstat to Spanish for the calls that follow.
textstat.set_lang("es")

## Note: The only readability function implemented for Spanish is the Fernández Huerta readability formula, which is a variant of the Flesch Reading Ease formula

In [22]:
# Same call as for English; with the language set to "es" the score is computed
# for Spanish text.
textstat.flesch_reading_ease(text)

61.75

In [23]:
# Estimated time to read the Spanish text, in seconds.
textstat.reading_time(text)

6.92

In [24]:
# List the words textstat flags as difficult. This works only so-so in Spanish:
# the result includes very common words such as 'pero', 'como', 'para' and 'este'.
textstat.difficult_words_list(text)

['venimos',
 'resolución',
 'muchas',
 'estadísticas',
 'hacerlo',
 'dónde',
 'mucho',
 'pero',
 'estudios',
 'presentación',
 'ciencia',
 'datos',
 'comprensión',
 'mayoría',
 'negocio',
 'como',
 'vamos',
 'quiénes',
 'momento',
 'inteligencia',
 'programación',
 'industria',
 'habilidades',
 'convertido',
 'ciencias',
 'efectivas',
 'estamos',
 'marco',
 'estándar',
 'recordar',
 'cosas',
 'estudiar',
 'principal',
 'artificial',
 'fácil',
 'necesita',
 'hacia',
 'entender',
 'debemos',
 'academia',
 'tiempo',
 'para',
 'somos',
 'problemas',
 'haciendo',
 'foco',
 'podemos',
 'este']

# Check spelling

In [25]:
!pip install autocorrect



In [26]:
# English sample paragraph with three deliberate typos for the spell-checker:
#   focus        -> focsu
#   presentation -> presentatio
#   framework    -> framwork
# Written line by line so the leading newline and trailing spaces are explicit.
text = (
    "\n"
    "Data science is the main focsu of most sciences and studies right now, \n"
    "it needs a lot of things like AI, programming, statistics, \n"
    "business understanding, effective presentatio skills and much more. \n"
    "That's why it's not easy to understand or study. But we can do it, we are doing it.\n"
    "Data science has become the standard solving problem framwork for academia and \n"
    "the industry and it's going to be like that for a while. But we need to remember \n"
    "where we are coming from, who we are and where we are going.\n"
)

In [27]:
from autocorrect import Speller

# Build an English spell-checker; the Speller instance is called directly on a string.
check = Speller(lang='en')

# Returns the corrected text. In the recorded output the three typos are fixed,
# and the leading "Data" of two sentences also comes back lower-cased as "data".
check(text)

"\ndata science is the main focus of most sciences and studies right now, \nit needs a lot of things like AI, programming, statistics, \nbusiness understanding, effective presentation skills and much more. \nThat's why it's not easy to understand or study. But we can do it, we are doing it.\ndata science has become the standard solving problem framework for academia and \nthe industry and it's going to be like that for a while. But we need to remember \nwhere we are coming from, who we are and where we are going.\n"