# Clarity metrics
Included in this notebook:
- Average sentence length (by number of words)
- Average word length (by number of syllables)
- Readability index Szigriszt-Pazos
- Usage of common words

In [63]:
import pandas as pd
import numpy as np
import re

from wordfreq import word_frequency, top_n_list, get_language_info # Documentation: https://github.com/LuminosoInsight/wordfreq/
import textstat # Documentation: https://github.com/shivam5992/textstat
from textstat import szigriszt_pazos
textstat.set_lang('es')

from stop_words import get_stop_words
from sklearn.preprocessing import MinMaxScaler

In [64]:
# Load articles of authors
data = pd.read_csv('../Data/Data_clean_csv/clean_dataframe.csv')

with open('../Data/Data_clean_txt/Denisse Dresser.txt', 'r', encoding='utf8') as f:
    dresser_content = f.read()
    
with open('../Data/Data_clean_txt/Enrique Krauze.txt', 'r', encoding='utf8') as f:
    krauze_content = f.read()
    
with open('../Data/Data_clean_txt/John Ackerman.txt', 'r', encoding='utf8') as f:
    ackerman_content = f.read()
    
with open('../Data/Data_clean_txt/Ricardo Raphael.txt', 'r', encoding='utf8') as f:
    raphael_content = f.read()
    
with open('../Data/Data_clean_txt/Valeria Moy.txt', 'r', encoding='utf8') as f:
    moy_content = f.read()

In [65]:
# Remove empty article
data = data.drop(1216).reset_index(drop=True)

## Average sentence length (by number of words)

In [66]:
def html_cleaner(text):
    """ Removes html expressions and line breaks"""
    
    text = re.sub(r'(\n|\r)', '', text)
    
    # Remove signature of Ricardo Raphael's articles
    text = re.sub(r'@ricardomraphael', '', text)
    
    # Remove italics
    text = re.sub(r'<i>', '', text)
    text = re.sub(r'<\\i>', '', text)
    
    # Remove bold
    text = re.sub(r'<b>', '', text)
    text = re.sub(r'<\\b>', '', text)
    
    #Remove multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\s,\s', ', ', text)
      
    return text

In [67]:
def avg_words_per_sentence(text):
    """Returns average words per sentence of a text"""
    
    # Clean html expressions and line breaks
    text = html_cleaner(text)
    
    #Split into sentences
    sentence_regex = re.compile('[\.|\?\s?|!]\s')
    sentences = sentence_regex.split(text)
    
    #Remove empty sentences
    for sentence in sentences:
        if not sentence:
            sentences.remove(sentence)
    
    #Count words in each string
    words_per_sentence = [len(re.findall(r'\w+', sentence)) for sentence in sentences]
        
    #Get average words per sentence
    avg_words_per_sentence = sum(words_per_sentence) / len(words_per_sentence)
        
    return avg_words_per_sentence

## Average word length (by number of syllables)

In [68]:
def punctuation_cleaner(text):
    """Removes all punctuation and special characters from a text"""
    
    text = re.sub(r'[^A-Za-z\sáéíóúñ]+', '', text)
    
    return text

In [69]:
def avg_syllables_per_word(text):
    """Returns average syllables per word of a text"""
    
    # Clean html expressions, line breaks, and punctuation
    text = html_cleaner(text)
    text = punctuation_cleaner(text)
    
    #Remove initialisms and acronyms
    text = re.sub(r'\b[A-ZÑ]{2,}\b', '', text)
    
    # Remove Roman numerals
    text = re.sub(r'\b[IVXL]+\b', '', text)
    
    #Remove multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\s,\s', ', ', text)
    text = re.sub(r'\s+$', '', text)
    
    # Lowercase all words
    text = text.lower()
    
    # Calculate average number of syllables per word
    words = re.findall(r'\b\w+\b', text)
    syllables_per_word = [textstat.syllable_count(word) for word in words]
    avg_syllables_per_word = sum(syllables_per_word) / len(words)
    
    return avg_syllables_per_word

## Readability index Szigriszt-Pazos

This index is a Spanish adaptation of the Flesch readability-ease test, which considers average words per sentence and average syllables per word.

See https://legible.es/blog/perspicuidad-szigriszt-pazos/

| Score | Difficulty | Education level |
| ----- | ---------- | --------------- |
|0-15| Very hard|University graduates
|16-35| Hard| University graduates
|36-50| Somewhat hard| College
|51-65| Normal| 13 to 15-year-old students
|66-75| Somewhat easy| 12 year-old students
|76-85| Easy| 11 year-old students
|86-100| Very easy| 6 to 10 year-olds

In [70]:
def szigriszt_pazos_adapted(text):
    """Returns szigriszt-Pazos index of the articles of an author"""
    
    # Clean html expressions and line breaks
    text = html_cleaner(text)
    
    # Get szigriszt_pazos index
    index = szigriszt_pazos(text)
    
    return index

## Usage of common words

Higher score means the words used are more common.

In [99]:
# Define stop words
stop_words = get_stop_words('es')

# Add additional stopwords
more_stopwords = ['el', 'la', 'los', 'las', 'un', 'uno', 'una', 'unos', 'unas']
                  
for word in more_stopwords:
    stop_words.append(word)

In [137]:
def word_frequency_score(text):
    """Returns a score of how frequent are the words used in the text"""
    
    # Clean html expressions, line breaks, and punctuation
    text = html_cleaner(text)
    text = punctuation_cleaner(text)
    
    # Remove Roman numerals
    text = re.sub(r'\b[IVXL]+\b', '', text)
    
    # Lowercase all words
    text = text.lower()
    
    # Get words and remove stopwords
    stop_words = get_stop_words('es')
    words = re.findall(r'\b\w+\b', text)
    words = [word for word in words if word not in stop_words]
    
    # Calculate individual frequency of each word
    words_freq = [word_frequency(word, 'es') for word in words]   
    
    #Sum frequencies and divide by the number of words
    score = sum(words_freq) / len(words_freq) * 100
    
    return score

In [73]:
def rescale(values, new_min=0, new_max=100):
    """Changes the values of a variable to another scale"""
    
    output = []
    old_min, old_max = min(values), max(values)

    for v in values:
        new_v = (new_max - new_min) / (old_max - old_min) * (v - old_min) + new_min
        output.append(new_v)

    return output

# Clarity metrics of all authors

In [109]:
data_clarity = data.copy()
data_clarity['sentence length'] = data_clarity.body.apply(avg_words_per_sentence).round(2)
data_clarity['word length'] = data_clarity.body.apply(avg_syllables_per_word).round(2)
data_clarity['szigriszt-pazos'] = data_clarity.body.apply(szigriszt_pazos_adapted).round(1)

In [113]:
data_clarity['word frequency score'] = data_clarity.body.apply(word_frequency_score)

# Transform 'word frequency score' to a 0-100 scale for easier interpretation
data_clarity['word frequency score'] = rescale(data_clarity['word frequency score'])

In [117]:
data_clarity.head(4)

Unnamed: 0,author,title,date,body,source,link,sentence length,word length,szigriszt-pazos,word frequency score
0,Enrique Krauze,El jurista bondadoso,2021/03/08,"La justicia es ciega, lleva en una mano la bal...",Personal website,https://enriquekrauze.com.mx/el-jurista-bondad...,22.0,1.97,63.5,32.482504
1,Enrique Krauze,La hambruna recordada,2021/02/22,Toda revolución debe verse en el espejo de la ...,Personal website,https://enriquekrauze.com.mx/la-hambruna-recor...,18.02,1.92,69.2,33.938613
2,Enrique Krauze,Una historia de Covid,2021/02/08,Martha creció con su numerosa familia en una c...,Personal website,https://enriquekrauze.com.mx/una-historia-de-c...,12.16,1.87,77.4,59.145363
3,Enrique Krauze,El espejo de Weimar,2021/01/25,Toda democracia en el siglo XXI debe verse en ...,Personal website,https://enriquekrauze.com.mx/el-espejo-de-weimar/,19.86,2.08,57.8,25.38097


In [114]:
data_clarity.groupby('author')[['sentence length', 'word length', 'szigriszt-pazos', 'word frequency score']].mean().round(2)

Unnamed: 0_level_0,sentence length,word length,szigriszt-pazos,word frequency score
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Denisse Dresser,18.61,2.05,52.09,42.88
Enrique Krauze,21.71,2.02,57.33,41.66
John Ackerman,40.69,2.07,32.78,40.82
Ricardo Raphael,23.6,2.03,55.45,41.86
Valeria Moy,19.11,1.98,64.92,54.68


# Export database

In [260]:
# Save as csv
data_clarity.to_csv('../Data/Data_clean_csv/data_clarity_metrics', index=False)

# Metrics for generated articles of Denisse Dresser

In [138]:
generated_dresser = pd.read_csv('../Data/Data_clean_csv/generated_dresser_articles.csv')
generated_dresser = generated_dresser.drop('Unnamed: 0', axis=1)

In [139]:
generated_dresser['sentence length'] = generated_dresser.body.apply(avg_words_per_sentence).round(2)
generated_dresser['word length'] = generated_dresser.body.apply(avg_syllables_per_word).round(2)
generated_dresser['szigriszt-pazos'] = generated_dresser.body.apply(szigriszt_pazos_adapted).round(1)
generated_dresser['word frequency score'] = generated_dresser.body.apply(word_frequency_score)
generated_dresser['word frequency score'] = rescale(generated_dresser['word frequency score'])

In [141]:
generated_dresser.head(4)

Unnamed: 0,body,sentence length,word length,szigriszt-pazos,word frequency score
0,"La historia no se repite , pero sí instruye , ...",24.82,2.13,45.1,1.136378
1,"La historia no se repite , pero sí instruye , ...",22.58,1.94,54.0,42.989595
2,"En estos tiempos aciagos , el realismo es un i...",17.06,1.89,64.8,38.728735
3,El Presidente no son - porque la elección está...,23.08,1.78,66.5,78.768934


In [148]:
generated_dresser.mean().round(2)

sentence length         22.24
word length              1.96
szigriszt-pazos         57.13
word frequency score    32.40
dtype: float64

In [147]:
# Remove outliers from word frequency score mean
generated_dresser.loc[generated_dresser['word frequency score'] > 10, 'word frequency score'].mean().round(2)

49.59