Todo list:
- ~~book reading function~~
- ~~lexical diversity calculation function~~
- parts of speech counting function (by word's initial form)
- parts of speech counting function (by tags, part of speech types)
- ~~text dynamics calculation function~~


# Imports and installs

In [1]:
# Notebook setup: install the third-party packages, import them and fetch NLTK data.
# `%pip` is an IPython magic, so this cell only runs inside a Jupyter/IPython kernel.
%pip install -q nltk pymorphy2
import nltk
import pymorphy2
# NOTE(review): the visible cells only use nltk's RegexpTokenizer and pymorphy2;
# confirm whether these three NLTK resources are still needed before removing them.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /home/andrew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andrew/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/andrew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Defining a few constants

In [2]:
# Punctuation marks and symbols, kept as an immutable tuple of strings.
# NOTE(review): not referenced by any of the visible cells.
PUNCT = (
    '.', ',', ':', ';',
    '\'', '"', '-',
    '(', ')',
    '!', '?', '...',
    '$', '№',
)

# Text file reading function

In [3]:
def get_book(name):
    """Read the text file *name* and return its contents as one string.

    The file is decoded as UTF-8 explicitly, so Cyrillic books are read the
    same way regardless of the platform's default locale encoding.

    Every line read from the file is followed by one additional "\\n" in the
    returned string (lines therefore end up separated by a blank line); this
    matches what the function has always returned.

    Args:
        name: Path of the text file to read.

    Returns:
        The file contents with an extra newline appended after every line.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(name, "r", encoding="utf-8") as raw:
        # join() avoids the quadratic cost of growing a string with += in a loop.
        return "".join(line + "\n" for line in raw)

# Lexical diversity calculation function

In [4]:
def lex_diversity(word_list):
    """Return the share of distinct words among all words in *word_list*.

    Args:
        word_list: Sized collection of hashable tokens (e.g. a list of str).

    Returns:
        ``len(set(word_list)) / len(word_list)`` as a float, or ``0.0`` when
        *word_list* is empty (instead of raising ZeroDivisionError).
    """
    total = len(word_list)
    if total == 0:
        # An empty text has no vocabulary; avoid dividing by zero.
        return 0.0
    return len(set(word_list)) / total

# Text dynamics calculation function

In [10]:
def text_dynamics(total_tags: list):
    """Return the fraction of entries in *total_tags* that equal "VERB".

    Args:
        total_tags: List of part-of-speech tags, one per token.

    Returns:
        Number of "VERB" entries divided by the list length, or ``0.0`` when
        the list is empty (instead of raising ZeroDivisionError).
    """
    if len(total_tags) == 0:
        # No tags at all: report zero dynamics rather than dividing by zero.
        return 0.0
    verbs = total_tags.count("VERB")
    return verbs / len(total_tags)

def text_dynamics_dict(tags_dict):
    """Return the share of "VERB" occurrences among all counted tags.

    Args:
        tags_dict: Mapping from part-of-speech tag to its occurrence count.

    Returns:
        ``tags_dict["VERB"]`` divided by the sum of all counts. Returns
        ``0.0`` when the mapping is empty or all counts sum to zero, and
        treats a missing "VERB" key as a count of zero (instead of raising
        ZeroDivisionError / KeyError).
    """
    total = sum(tags_dict.values())
    if total == 0:
        # Nothing was counted; avoid dividing by zero.
        return 0.0
    # A text without verbs simply has no "VERB" entry in the counts.
    verbs = tags_dict.get("VERB", 0)
    return verbs / total

# Morph analyzer init

In [6]:
# Module-level pymorphy2 analyzer, created once and used by tagger() below.
morph = pymorphy2.MorphAnalyzer()
def tagger(tokenized):
    """Pair every token with a part-of-speech tag.

    For each token the module-level ``morph`` analyzer is asked for its
    parses and the ``tag.POS`` value of the first parse is used.

    Args:
        tokenized: Iterable of word tokens.

    Returns:
        A list of ``(token, POS)`` tuples in input order.
    """
    return [(word, morph.parse(word)[0].tag.POS) for word in tokenized]

# Parts of speech counting function

In [7]:
def count_speech_parts(tagged):
    """Count how many times each tag occurs in *tagged*.

    Args:
        tagged: Iterable of pairs whose second element is the tag
            (the shape produced by ``tagger``).

    Returns:
        A dict mapping each tag to its number of occurrences, with keys in
        order of first appearance.
    """
    counts = {}
    for pair in tagged:
        pos = pair[1]
        counts[pos] = counts.get(pos, 0) + 1
    return counts

def count_unique_words(tokenized):
    """Count how many times each token occurs in *tokenized*.

    Args:
        tokenized: Iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to its number of occurrences, with
        keys in order of first appearance.
    """
    frequencies = {}
    for word in tokenized:
        frequencies[word] = frequencies.get(word, 0) + 1
    return frequencies

# Syntax analysis functions

### Now onto the fun part
# Putting everything together

In [13]:
# Analyse every book in the list and write a short report to "<name>_out".
list_of_books = ["Собачье сердце (фрагмент)",]
for name in list_of_books:
    text = get_book(name)
    # Word tokens only: the \w+ pattern leaves punctuation out.
    tokenized_no_punct = nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(text)
    # Tag the text once and reuse the result for the part-of-speech counts.
    tagged = tagger(tokenized_no_punct)

    # Compute every metric before opening the output file, so a failure here
    # does not leave a half-written report behind.
    diversity = lex_diversity(tokenized_no_punct)
    words_count = count_unique_words(tokenized_no_punct)
    # Ascending by frequency; sorted() is stable, so ties keep first-seen order.
    words_count_sorted = dict(sorted(words_count.items(), key=lambda item: item[1]))
    # Most frequent first. Slicing (rather than indexing -1/-2/-3) also works
    # when the text has fewer than three distinct words.
    top_words = list(words_count_sorted)[-3:][::-1]
    speech_part_count = count_speech_parts(tagged)
    dynamics = text_dynamics_dict(speech_part_count)

    # Explicit UTF-8 so the Cyrillic words are written the same on every platform.
    with open(name+"_out", "w", encoding="utf-8") as file:
        file.write(f"Lexical diversity: {diversity}\n")
        file.write("Top-3 words: \n")
        for word in top_words:
            file.write(f"\t{word}\n")
        file.write(f"Text dynamics: {dynamics}")