<a href="https://colab.research.google.com/github/Arseniy-Polyakov/data_bases_course/blob/main/text_readability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Данный ноутбук посвящен расчету количественных метрик оценки сложности текстов на разных лингвистических уровнях; полученные данные собираются для базы данных.

Для сбора данных будем использовать платформу Kaggle, в частности датасет английских текстов, размеченных по уровням CEFR. Для них считаем метрики на каждом лингвистическом уровне (фонологический, грамматический, лексический), а также количественные метрики, отображающие диапазоны сложности текстов, например FRE (коэффициент Флеша), SMOG, LIX и т. д.

Импортируем библиотеки и модули для сбора, обработки и анализа текстовых данных

In [None]:
import json
import os
import random
import re

import kagglehub
import nltk
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm

Загружаем стоп-слова для английского языка и словарь меток для частеречной разметки

In [None]:
# Fetch the NLTK resources this notebook relies on: tokenizer data, the
# stop-word lists, the universal POS tag mapping and the English tagger.
for resource in (
    "punkt_tab",
    "stopwords",
    "universal_tagset",
    "averaged_perceptron_tagger_eng",
):
  nltk.download(resource)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

Инициализируем переменную со стоп-словами

In [None]:
# English stop-word list from NLTK; grammatical_level() uses it to exclude
# stop words from the denominator of its noun/adjective ratios.
stop_words = stopwords.words("english")

Загружаем датасет

In [None]:
# Download the CEFR-levelled English texts dataset via kagglehub and report
# the local directory it was placed in.
path = kagglehub.dataset_download(
    "amontgomerie/cefr-levelled-english-texts"
)
print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/cefr-levelled-english-texts


In [None]:
# Read the CSV from the directory kagglehub reported rather than from a
# hard-coded location, so the cell keeps working if the download path differs.
df = pd.read_csv(f"{path}/cefr_leveled_texts.csv")
df.head()

Unnamed: 0,text,label
0,Hi!\nI've been meaning to write for ages and f...,B2
1,﻿It was not so much how hard people found the ...,B2
2,Keith recently came back from a trip to Chicag...,B2
3,"The Griffith Observatory is a planetarium, and...",B2
4,-LRB- The Hollywood Reporter -RRB- It's offici...,B2


0. Экспортируем в формате csv файл с текстами и их id

In [None]:
# Draw the 7-digit ids without replacement: independent random.randint calls
# can repeat, and text_id is the column shared by every exported CSV file.
ids = [str(text_id) for text_id in random.sample(range(1000000, 10000000), len(df))]
df["text_id"] = ids

# DataFrame.to_csv does not create missing directories.
os.makedirs("/content/csv", exist_ok=True)

# Texts together with their ids.
df_general = df[["text", "text_id"]]
df_general.to_csv("/content/csv/general.csv")

# Ids only.
df_general_no_texts = df["text_id"]
df_general_no_texts.to_csv("/content/csv/general_no_texts.csv")

In [None]:
# Raw texts as a plain Python list, in dataset order.
texts = list(df["text"])

Пишем функцию препроцессинга текста (удаление знаков препинания, цифр, символов других алфавитов)

In [None]:
def preprocessing(text: str) -> str:
  """Normalise a raw text for the word-level metrics.

  Line breaks become spaces, the text is lower-cased, and every character
  other than a-z, apostrophe, hyphen and whitespace is removed.

  Args:
    text: Raw text.

  Returns:
    The cleaned, lower-cased text.
  """
  # Replace line breaks with a space instead of dropping them, so the last
  # word of one line is not glued to the first word of the next line.
  single_line = text.replace("\n", " ")
  return re.sub(r"[^a-z'\-\s]", "", single_line.lower())

In [None]:
# Cleaned version of every text, in the same order as `texts`.
texts_preprocessed = list(map(preprocessing, texts))

I. Фонологический уровень (односложные, двусложные, трехсложные, четырехсложные слова, а также слова из пяти и более слогов). Экспортируем финальный csv-файл для базы данных

In [None]:
def syllables_counting(word):
  """Estimate the number of syllables in an English word.

  Heuristic: count the vowel letters (y included), subtract one for each
  listed digraph/trigraph present in the word, subtract one for a final "e"
  (unless the word is a listed exception) and one for a leading "y".

  Args:
    word: A single word; letter case is ignored.

  Returns:
    The estimated syllable count: at least 1 for any token that contains a
    letter, 0 for tokens without letters (including the empty string).
  """
  word = word.lower()
  if not any(symbol.isalpha() for symbol in word):
    return 0

  vowels = ("a", "o", "e", "y", "u", "i")
  digrafs_and_trigrafs = ("ai", "ay", "ea", "ee", "ei", "ey", "oa", "oe", "oo", "ou",
                          "ua", "ue", "ui", "uy", "eau", "aye", "iou")
  limitations = ("the", "he", "she", "me", "we", "cafe", "apostrophe", "cliche")

  vowels_count = sum(1 for symbol in word if symbol in vowels)
  # Each listed sequence counts once if present. A trigraph also contains a
  # listed digraph ("eau" has "ea"), so it removes two of its three vowels.
  digrafs_and_trigrafs_count = sum(1 for item in digrafs_and_trigrafs if item in word)

  final_count = vowels_count - digrafs_and_trigrafs_count
  if word.endswith("e") and word not in limitations:
    final_count -= 1
  if word.startswith("y"):
    final_count -= 1
  # The subtractions can reach 0 or below for short words such as "be" or
  # "tree"; a word with letters is treated as having at least one syllable.
  return max(final_count, 1)

In [None]:
def phonological_level(text: str) -> dict:
  """Count the words of a text by estimated syllable length.

  Args:
    text: Text whose words are separated by whitespace.

  Returns:
    Dict with the number of words estimated to have one, two, three, four,
    and five or more syllables.
  """
  # buckets[0] .. buckets[3] hold the 1-4 syllable words, buckets[4] holds 5+.
  buckets = [0, 0, 0, 0, 0]
  for word in text.split():
    # One estimate per word. The estimator can return 0 or a negative number
    # for short words; floor at 1 so they are not counted as "five and more".
    syllables = max(syllables_counting(word), 1)
    buckets[min(syllables, 5) - 1] += 1

  phonological_level_df = {
      "one_syllable_words": buckets[0],
      "two_syllable_words": buckets[1],
      "three_syllables_words": buckets[2],
      "four_syllables_words": buckets[3],
      "five_and_more_syllables_words": buckets[4]
  }
  return phonological_level_df

In [None]:
# Compute the syllable statistics once per text, then split them into one
# list per output column.
phonological_stats = [phonological_level(text) for text in tqdm(texts_preprocessed)]

one_syllable_dataset = [stat["one_syllable_words"] for stat in phonological_stats]
two_syllables_dataset = [stat["two_syllable_words"] for stat in phonological_stats]
three_syllables_dataset = [stat["three_syllables_words"] for stat in phonological_stats]
four_syllables_dataset = [stat["four_syllables_words"] for stat in phonological_stats]
five_and_more_syllables_dataset = [stat["five_and_more_syllables_words"] for stat in phonological_stats]

# One row per text, keyed by text_id.
df_phonological = pd.DataFrame(
    {
        "text_id": ids,
        "one_syllable_words": one_syllable_dataset,
        "two_syllables_words": two_syllables_dataset,
        "three_syllables_words": three_syllables_dataset,
        "four_syllables_words": four_syllables_dataset,
        "five_and_more_syllables_words": five_and_more_syllables_dataset,
    },
    index=range(len(texts_preprocessed)),
)
df_phonological.to_csv("/content/csv/phonological_level.csv")


100%|██████████| 1494/1494 [00:02<00:00, 520.75it/s]


II. Грамматический уровень (частеречная разметка, номинативность и дескриптивность). Экспортируем финальный csv-файл для базы данных

In [None]:
def grammatical_level(text: list) -> dict:
  """Count parts of speech in a tokenised text and derive two ratios.

  Args:
    text: List of word tokens.

  Returns:
    Dict with "nominativity" (nouns divided by the number of tokens that are
    not stop words) and "descriptivity" (adjectives divided by the same
    number), both rounded to two decimals, followed by the raw counts of
    nouns, adjectives, verbs, adverbs, pronouns and numerals. Both ratios are
    NaN when no non-stop-word tokens remain.
  """
  # Universal POS tag -> key of the corresponding counter; other tags are ignored.
  tag_to_key = {
      "NOUN": "nouns",
      "ADJ": "adjectives",
      "VERB": "verbs",
      "ADV": "adverbs",
      "PRON": "pronouns",
      "NUM": "numerals"
  }
  pos_count = dict.fromkeys(tag_to_key.values(), 0)
  for _, tag in nltk.tag.pos_tag(text, tagset="universal"):
    key = tag_to_key.get(tag)
    if key is not None:
      pos_count[key] += 1

  # Set membership is O(1); the module-level stop_words is a list.
  stop_word_set = set(stop_words)
  content_tokens = sum(1 for token in text if token not in stop_word_set)

  if content_tokens:
    nominativity = round(pos_count["nouns"] / content_tokens, 2)
    descriptivity = round(pos_count["adjectives"] / content_tokens, 2)
  else:
    # Empty text or stop words only: the ratios are undefined.
    nominativity = descriptivity = float("nan")

  grammatical_level_stat = {
      "nominativity": nominativity,
      "descriptivity": descriptivity,
      **pos_count
  }
  return grammatical_level_stat

In [None]:
# Compute the grammatical statistics once per text (tokens are obtained by
# whitespace splitting), then split them into one list per output column.
grammatical_stats = [grammatical_level(text.split()) for text in tqdm(texts_preprocessed)]

nominativity_dataset = [stat["nominativity"] for stat in grammatical_stats]
descriptivity_dataset = [stat["descriptivity"] for stat in grammatical_stats]
nouns_dataset = [stat["nouns"] for stat in grammatical_stats]
adjectives_dataset = [stat["adjectives"] for stat in grammatical_stats]
verbs_dataset = [stat["verbs"] for stat in grammatical_stats]
adverbs_dataset = [stat["adverbs"] for stat in grammatical_stats]
pronouns_dataset = [stat["pronouns"] for stat in grammatical_stats]
numerals_dataset = [stat["numerals"] for stat in grammatical_stats]

# One row per text, keyed by text_id.
df_grammatical = pd.DataFrame(
    {
        "text_id": ids,
        "nominativity": nominativity_dataset,
        "descriptivity": descriptivity_dataset,
        "nouns": nouns_dataset,
        "adjectives": adjectives_dataset,
        "verbs": verbs_dataset,
        "adverbs": adverbs_dataset,
        "pronouns": pronouns_dataset,
        "numerals": numerals_dataset,
    },
    index=range(len(texts_preprocessed)),
)
df_grammatical.to_csv("/content/csv/grammatical_level.csv")

100%|██████████| 1494/1494 [00:24<00:00, 60.35it/s]


III. Лексический уровень (TTR, RTTR, CTTR). Экспортируем финальный csv-файл для базы данных

In [None]:
def lexical_level(text: list) -> dict:
  """Compute type-token ratio variants for a tokenised text.

  Args:
    text: List of word tokens.

  Returns:
    Dict with "ttr" (types / tokens), "rttr" (types / sqrt(tokens)) and
    "cttr" (types / sqrt(2 * tokens)), each rounded to two decimals. All
    three are NaN for an empty token list, where the ratios are undefined.
  """
  token_count = len(text)
  if token_count == 0:
    undefined = float("nan")
    return {"ttr": undefined, "rttr": undefined, "cttr": undefined}

  type_count = len(set(text))
  return {
      "ttr": round(type_count / token_count, 2),
      "rttr": round(type_count / (token_count**0.5), 2),
      "cttr": round(type_count / (token_count*2)**0.5, 2)
  }

In [None]:
# CEFR labels in dataset order; stored next to the lexical metrics.
cefr_level_dataset = df["label"].tolist()

# Compute the lexical statistics once per text (tokens are obtained by
# whitespace splitting), then split them into one list per output column.
lexical_stats = [lexical_level(text.split()) for text in tqdm(texts_preprocessed)]

ttr_dataset = [stat["ttr"] for stat in lexical_stats]
rttr_dataset = [stat["rttr"] for stat in lexical_stats]
cttr_dataset = [stat["cttr"] for stat in lexical_stats]

# One row per text, keyed by text_id.
df_lexical = pd.DataFrame(
    {
        "text_id": ids,
        "ttr": ttr_dataset,
        "rttr": rttr_dataset,
        "cttr": cttr_dataset,
        "cefr_level": cefr_level_dataset,
    },
    index=range(len(texts_preprocessed)),
)
df_lexical.to_csv("/content/csv/lexical_level.csv")

100%|██████████| 1494/1494 [00:00<00:00, 17680.66it/s]


IV. Статистические метрики (FKGL, Gunning Fog, SMOG, ARI, Spache, Dale–Chall, Powers–Sumner–Kearl, Coleman–Liau, LIX, RIX). Экспортируем финальный csv-файл для базы данных

In [None]:
def flesch_kincaid_readability_index(text: str) -> float:
  """Compute the Flesch-Kincaid grade level of a text.

  Sentences are counted on the text as given; words are tokenised after
  removing every character that is not an ASCII letter or whitespace.

  Args:
    text: Text to score.

  Returns:
    0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59,
    rounded to two decimals.
  """
  sentence_total = len(nltk.sent_tokenize(text))
  letters_only = re.sub(r"[^A-Za-z\s]", "", text)
  tokens = nltk.word_tokenize(letters_only)
  word_total = len(tokens)
  syllable_total = sum(syllables_counting(token) for token in tokens)
  grade = 0.39*(word_total/sentence_total) + (11.8*syllable_total/word_total) - 15.59
  return round(grade, 2)

In [None]:
def gunning_fog_index(text: str) -> float:
  """Compute the Gunning fog index of a text.

  Sentences are counted on the text as given; words are tokenised after
  removing every character that is not an ASCII letter or whitespace. A word
  is complex when its estimated syllable count is three or more.

  Args:
    text: Text to score.

  Returns:
    0.4 * (words / sentences + 100 * complex_words / words), rounded to two
    decimals.
  """
  sentences = nltk.sent_tokenize(text)
  text = re.sub(r"[^A-Za-z\s]", "", text)
  words = nltk.word_tokenize(text)
  complex_count = sum(1 for word in words if syllables_counting(word) >= 3)
  average_sentence_length = len(words) / len(sentences)
  percent_complex_words = 100 * complex_count / len(words)
  # The 0.4 factor applies to the sum of both terms, not only to the
  # average sentence length.
  FOG = round(0.4 * (average_sentence_length + percent_complex_words), 2)
  return FOG

In [None]:
def simple_measure_of_gobbledygook(text: str) -> float:
  """Compute a SMOG-style score from the number of polysyllabic words.

  Words are tokenised after removing every character that is not an ASCII
  letter or whitespace; a word is polysyllabic when its estimated syllable
  count is three or more.

  NOTE(review): the commonly published SMOG formula normalises the count to
  30 sentences and does not round the square root to tens — confirm which
  variant the database is meant to store.

  Args:
    text: Text to score.

  Returns:
    3 plus the square root of the polysyllabic-word count, with the root
    first rounded to an integer and then to the nearest ten.
  """
  text = re.sub(r"[^A-Za-z\s]", "", text)
  words = nltk.word_tokenize(text)
  complex_count = sum(1 for word in words if syllables_counting(word) >= 3)
  root = round(complex_count**0.5, 0)
  part = root % 10
  if part < 5:
    # Round down to the previous ten.
    rounded_ten = root - part
  else:
    # Round up to the next ten: add what is missing to reach it, not the
    # remainder itself (7 -> 10, not 14).
    rounded_ten = root + (10 - part)
  SMOG = 3 + rounded_ten
  return SMOG

In [None]:
def automated_readability_index(text: str) -> float:
  """Compute the Automated Readability Index of a text.

  Sentences are counted on the text as given; words and letters are counted
  after removing every character that is not an ASCII letter or whitespace.

  Args:
    text: Text to score.

  Returns:
    4.71 * (letters / words) + 0.5 * (words / sentences) - 21.43, rounded to
    two decimals.
  """
  sentence_total = len(nltk.sent_tokenize(text))
  letters_and_spaces = re.sub(r"[^A-Za-z\s]", "", text)
  word_total = len(nltk.word_tokenize(letters_and_spaces))
  letter_total = len(re.sub(r"[^A-Za-z]", "", letters_and_spaces))
  score = 4.71*(letter_total/word_total) + 0.5*(word_total/sentence_total) - 21.43
  return round(score, 2)

In [None]:
def spache_formula(text: str) -> float:
  """Compute the Spache readability score of a text.

  The familiar-word list is read from /content/spache_words.txt as
  whitespace-separated entries. Sentences are counted on the text as given;
  words are tokenised after removing every character that is not an ASCII
  letter or whitespace.

  Args:
    text: Text to score.

  Returns:
    0.121 * average sentence length + 0.082 * percentage of unique
    unfamiliar words + 0.659, rounded to two decimals.
  """
  with open("/content/spache_words.txt", "rt", encoding="utf-8") as file:
    # A set gives constant-time lookups; lower-casing both sides keeps
    # capitalised tokens from being counted as unfamiliar.
    spache_words = {entry.lower() for entry in file.read().split()}
  sentences = nltk.sent_tokenize(text)
  text = re.sub(r"[^A-Za-z\s]", "", text)
  words = nltk.word_tokenize(text)
  average_sentence_length = round(len(words) / len(sentences), 2)
  # Each unfamiliar word is counted once, however often it repeats.
  unique_unfamiliar_words = {word.lower() for word in words} - spache_words
  percent_unfamiliar_words = (len(unique_unfamiliar_words) * 100) / len(words)
  spache = round((0.121*average_sentence_length) + (0.082*percent_unfamiliar_words) + 0.659, 2)
  return spache

In [None]:
def dale_chall(text: str) -> float:
  """Compute the Dale-Chall readability score of a text.

  The familiar-word list is read from /content/dale_chall.txt as
  whitespace-separated entries. Sentences are counted on the text as given;
  words are tokenised after removing every character that is not an ASCII
  letter or whitespace.

  Args:
    text: Text to score.

  Returns:
    0.1579 * percentage of difficult words + 0.0496 * average sentence
    length, plus 3.6365 when more than 5% of the words are difficult,
    rounded to two decimals.
  """
  with open("/content/dale_chall.txt", "rt", encoding="utf-8") as file:
    # A set gives constant-time lookups; lower-casing both sides keeps
    # capitalised tokens from being counted as difficult.
    familiar_words = {entry.lower() for entry in file.read().split()}
  sentences = nltk.sent_tokenize(text)
  text = re.sub(r"[^A-Za-z\s]", "", text)
  words = nltk.word_tokenize(text)
  difficult_count = sum(1 for word in words if word.lower() not in familiar_words)
  percent_difficult_words = 100 * difficult_count / len(words)
  average_sentence_length = len(words) / len(sentences)
  # The two terms carry separate coefficients; the sentence-length term is
  # not scaled by 0.1579.
  score = 0.1579*percent_difficult_words + 0.0496*average_sentence_length
  if percent_difficult_words > 5:
    # Standard adjustment of the raw score for texts with many difficult words.
    score += 3.6365
  return round(score, 2)

In [None]:
def powers_sumner_kearl(text: str) -> float:
  """Compute the Powers-Sumner-Kearl score of a text.

  Sentences are counted on the text as given; words are tokenised after
  removing every character that is not an ASCII letter or whitespace.

  NOTE(review): the +2.7971 constant matches the reading-age variant of the
  formula; the grade-level variant uses -2.2029 — confirm which one is wanted.

  Args:
    text: Text to score.

  Returns:
    0.0778 * average sentence length + 0.0455 * syllables per 100 words
    + 2.7971, rounded to two decimals.
  """
  sentences = nltk.sent_tokenize(text)
  text = re.sub(r"[^A-Za-z\s]", "", text)
  words = nltk.word_tokenize(text)
  average_sentence_length = round(len(words)/len(sentences), 2)
  syllables = sum(syllables_counting(word) for word in words)
  # The formula takes syllables per 100 words; the raw total would make the
  # score grow with the length of the text.
  syllables_per_100_words = syllables * 100 / len(words)
  score = round((0.0778*average_sentence_length) + (0.0455*syllables_per_100_words) + 2.7971, 2)
  return score

In [None]:
def coleman_liau_index(text: str) -> float:
  """Compute the Coleman-Liau index of a text.

  Sentences are counted on the text as given; words are tokenised after
  removing every character that is not an ASCII letter or whitespace.

  Args:
    text: Text to score.

  Returns:
    0.0588 * letters per 100 words - 0.296 * sentences per 100 words - 15.8,
    rounded to two decimals.
  """
  sentence_total = len(nltk.sent_tokenize(text))
  tokens = nltk.word_tokenize(re.sub(r"[^A-Za-z\s]", "", text))
  letter_total = sum(len(token) for token in tokens)
  letters_per_100_words = letter_total / (len(tokens) / 100)
  sentences_per_100_words = sentence_total / (len(tokens) / 100)
  return round((0.0588*letters_per_100_words) - (0.296*sentences_per_100_words) - 15.8, 2)

In [None]:
def lix(text: str) -> float:
  """Compute the LIX readability score of a text.

  Long words (seven letters or more) and the word total are counted after
  removing every character that is not an ASCII letter or whitespace; the
  average sentence length is taken from tokenising each sentence as given.

  Args:
    text: Text to score.

  Returns:
    Percentage of long words plus the average number of tokens per sentence,
    rounded to two decimals.
  """
  sentences = nltk.sent_tokenize(text)
  tokens = nltk.word_tokenize(re.sub(r"[^A-Za-z\s]", "", text))
  long_word_total = sum(1 for token in tokens if len(token) > 6)
  tokens_in_sentences = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
  mean_sentence_length = tokens_in_sentences / len(sentences)
  return round((long_word_total * 100 / len(tokens)) + mean_sentence_length, 2)

In [None]:
def rix(text: str) -> float:
  """Compute the RIX readability score of a text.

  Sentences are counted on the text as given; long words (seven letters or
  more) are counted after removing every character that is not an ASCII
  letter or whitespace.

  Args:
    text: Text to score.

  Returns:
    Number of long words divided by the number of sentences, rounded to two
    decimals.
  """
  sentence_total = len(nltk.sent_tokenize(text))
  tokens = nltk.word_tokenize(re.sub(r"[^A-Za-z\s]", "", text))
  long_word_total = sum(1 for token in tokens if len(token) > 6)
  return round(long_word_total / sentence_total, 2)

In [None]:
def statistical_level(text: str) -> dict:
  """Compute every readability metric for one text.

  Args:
    text: Text to score.

  Returns:
    Dict mapping each metric key to the value returned by the corresponding
    function. The "fre" key holds the result of
    flesch_kincaid_readability_index.
  """
  # (output key, metric function) pairs, evaluated in this order.
  metrics = (
      ("fre", flesch_kincaid_readability_index),
      ("gunning_fog_index", gunning_fog_index),
      ("smog", simple_measure_of_gobbledygook),
      ("ari", automated_readability_index),
      ("spache_formula", spache_formula),
      ("dale_chall", dale_chall),
      ("powers_sumner_kearl", powers_sumner_kearl),
      ("coleman_liau_index", coleman_liau_index),
      ("lix", lix),
      ("rix", rix),
  )
  return {key: metric(text) for key, metric in metrics}

In [None]:
# NOTE(review): preprocessing() removes ".", "!" and "?", so nltk.sent_tokenize
# likely sees each preprocessed text as a single sentence — confirm whether the
# sentence-based metrics should be computed from the raw texts instead.
statistical_stats = [statistical_level(text) for text in tqdm(texts_preprocessed)]

# One list per output column.
flesch_kincaid_readability_index_dataset = [stat["fre"] for stat in statistical_stats]
gunning_fog_index_dataset = [stat["gunning_fog_index"] for stat in statistical_stats]
simple_measure_of_gobbledygook_dataset = [stat["smog"] for stat in statistical_stats]
automated_readability_index_dataset = [stat["ari"] for stat in statistical_stats]
spache_formula_dataset = [stat["spache_formula"] for stat in statistical_stats]
dale_chall_dataset = [stat["dale_chall"] for stat in statistical_stats]
powers_sumner_kearl_dataset = [stat["powers_sumner_kearl"] for stat in statistical_stats]
coleman_liau_index_dataset = [stat["coleman_liau_index"] for stat in statistical_stats]
lix_dataset = [stat["lix"] for stat in statistical_stats]
rix_dataset = [stat["rix"] for stat in statistical_stats]

# One row per text, keyed by text_id.
df_statistical = pd.DataFrame(ids,
                              index=range(len(texts_preprocessed)),
                              columns=["text_id"])

df_statistical["fre"] = flesch_kincaid_readability_index_dataset
df_statistical["gunning_fog_index"] = gunning_fog_index_dataset
df_statistical["smog"] = simple_measure_of_gobbledygook_dataset
df_statistical["ari"] = automated_readability_index_dataset
df_statistical["spache_formula"] = spache_formula_dataset
df_statistical["dale_chall"] = dale_chall_dataset
df_statistical["powers_sumner_kearl"] = powers_sumner_kearl_dataset
df_statistical["coleman_liau_index"] = coleman_liau_index_dataset
# The "lix" column takes the LIX scores, not the Coleman-Liau values.
df_statistical["lix"] = lix_dataset
df_statistical["rix"] = rix_dataset

df_statistical.to_csv("/content/csv/statistical_level.csv")

100%|██████████| 1494/1494 [00:58<00:00, 25.40it/s]


Источники:
1. Поляков, А. М. Количественные критерии оценки сложности текста для методических целей / А. М. Поляков // Наука в мегаполисе Science in a Megapolis. – 2024. – № 3(59). – EDN FOFGKS. [Ссылка](https://mgpu-media.ru/issues/issue-59/literaturovedenie-i-yazykoznanie/kolichestvennye-kriterii-otsenki-slozhnosti-teksta-dlya-metodicheskikh-tselej.html?ysclid=mcaxhunvbp187574973)
2. Поляков, А. М. Обработка текста на лексическом и семантическом уровнях для методических целей / А. М. Поляков // Фундаментальная и прикладная лингвистика : Материалы I Межвузовской студенческой конференции, Москва, 30 октября 2023 года. – Москва: Издательство МГТУ им. Н.Э. Баумана, 2024. – С. 54-56. – EDN SYCUTX.
3. https://github.com/Arseniy-Polyakov/English-Texts-Complexity-Evaluation/tree/main
4. https://github.com/Arseniy-Polyakov/LLM_vs_script_readability_formulas
