Todo list:
- ~~book reading function~~
- morphologic analysis:
    - ~~lexical diversity calculation function~~
    - ~~parts of speech counting function (by word's initial form)~~
    - ~~parts of speech counting function (by tags, part of speech types)~~
    - ~~text dynamics calculation function~~
    - *top-1 for every speech part
- syntax analysis
    - reduce the amount of text
    - ~~most popular root word~~
- semantic analysis
    - stop-word cleanup
    - *automatic tag generation
- graphic demonstration
    - tag cloud

# Imports and installs

In [11]:
%pip install -q nltk pymorphy2
import nltk
import pymorphy2
import spacy
import numpy as np

# Download the NLTK resources one by one, then load the Russian spaCy pipeline
# into the notebook-wide `nlp` object.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)
nlp = spacy.load('ru_core_news_sm')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /home/andrew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andrew/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/andrew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Defining a few constants

In [12]:
# Punctuation marks and symbols treated as non-word tokens.
PUNCT = (
    '.', ',', ':', ';', '\'', '"', '-',
    '(', ')', '!', '?', '...', '$', '№',
)

# Preprocess class definition

In [28]:
class Preproc:
    """Load one book and compute morphological and syntactic statistics for it.

    Attributes set by ``__init__``:
        text: the book text as returned by ``get_book_text``.
        tokens: ``nltk.word_tokenize`` output (punctuation included).
        tokens_no_punct: ``\\w+`` regexp tokens (punctuation dropped).
        morph: the pymorphy2 analyzer used by ``make_tags``.
        nlp: the spaCy ``ru_core_news_sm`` pipeline.
        doc: the spaCy document built from ``text``.
        lex_diversity_coeff: see ``lex_diversity``.
        tagged: see ``make_tags``.

    Attributes set on demand by the ``count_*`` / ``process_stopwords``
    methods are documented on those methods.
    """

    # pymorphy2 POS tags that count_unique_words() skips (function words).
    _SKIPPED_POS = ("CONJ", "PREP", "PRCL")

    # The analyzers are expensive to build, so one copy is created lazily and
    # shared by every Preproc instance instead of being reloaded per book.
    _shared_morph = None
    _shared_nlp = None

    def __init__(self, book_name):
        """Read ``book_name`` and run tokenization, tagging and spaCy parsing."""
        self.text = self.get_book_text(book_name)
        self.tokens = nltk.word_tokenize(self.text)
        self.morph = self._get_morph()
        self.tokens_no_punct = nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(self.text)
        self.nlp = self._get_nlp()
        # Use the instance's own pipeline; the original relied on a
        # notebook-level global named `nlp` being defined.
        self.doc = self.nlp(self.text)
        self.lex_diversity()
        self.make_tags()

    @classmethod
    def _get_morph(cls):
        """Return the shared pymorphy2 analyzer, creating it on first use."""
        if cls._shared_morph is None:
            cls._shared_morph = pymorphy2.MorphAnalyzer()
        return cls._shared_morph

    @classmethod
    def _get_nlp(cls):
        """Return the shared spaCy pipeline, loading it on first use."""
        if cls._shared_nlp is None:
            cls._shared_nlp = spacy.load('ru_core_news_sm')
        return cls._shared_nlp

    def _ensure_tagged(self):
        """Tag the tokens only if ``self.tagged`` has not been built yet."""
        if not hasattr(self, "tagged"):
            self.make_tags()

    def process_stopwords(self, overwrite=False):
        """Filter Russian stop words out of ``self.tokens``.

        The filtered list is always stored in ``self.tokens_no_stopwords``.
        Matching is case-insensitive (tokens are lower-cased before the
        lookup), so capitalised stop words are removed as well.

        Args:
            overwrite: when true, ``self.tokens`` is additionally replaced by
                the filtered list. NOTE(review): the original left this branch
                empty; replacing ``self.tokens`` is the assumed intent.
                ``lex_diversity_coeff`` is not recomputed automatically.
        """
        # Imported locally under a different name: the original assigned to a
        # local called `stopwords` before reading it, which raised
        # UnboundLocalError on every call.
        from nltk.corpus import stopwords as stopwords_corpus

        try:
            russian_words = stopwords_corpus.words("russian")
        except LookupError:
            # The corpus is not among the resources downloaded at setup time.
            nltk.download("stopwords")
            russian_words = stopwords_corpus.words("russian")

        stop_set = set(russian_words)
        filtered = [token for token in self.tokens if token.lower() not in stop_set]
        self.tokens_no_stopwords = filtered
        if overwrite:
            self.tokens = filtered

    def get_book_text(self, name):
        """Return the contents of file ``name`` with "\\n" appended to every line.

        Lines keep their own line ending, so the result has a blank line after
        each original line (behaviour kept from the original implementation).
        The file is decoded as UTF-8 regardless of the platform default.
        """
        with open(name, "r", encoding="utf-8") as raw:
            return "".join(line + "\n" for line in raw)

    def lex_diversity(self):
        """Store unique-token / total-token ratio in ``self.lex_diversity_coeff``.

        An empty token list yields 0.0 instead of raising ZeroDivisionError.
        """
        if not self.tokens:
            self.lex_diversity_coeff = 0.0
            return
        self.lex_diversity_coeff = len(set(self.tokens)) / len(self.tokens)

    def make_tags(self):
        """(Re)build and return ``self.tagged``: ``(token, POS)`` pairs.

        POS is taken from the first parse pymorphy2 returns for the token.
        Each distinct token is analysed once and the result reused for its
        repetitions.
        """
        pos_by_token = {}
        for token in self.tokens_no_punct:
            if token not in pos_by_token:
                pos_by_token[token] = self.morph.parse(token)[0].tag.POS
        # TODO: use spaCy to find all proper nouns
        self.tagged = [(token, pos_by_token[token]) for token in self.tokens_no_punct]
        return self.tagged

    @staticmethod
    def text_dynamics(total_tags: list):
        """Return the share of "VERB" entries in ``total_tags`` (0.0 if empty).

        Declared static because it never used instance state; the original
        lacked both ``self`` and the decorator, so it could not be called on
        an instance.
        """
        if not total_tags:
            return 0.0
        return total_tags.count("VERB") / len(total_tags)

    def text_dynamics_dict(self):
        """Return the share of tokens tagged "VERB" among all tagged tokens.

        Refreshes ``self.tags_count`` first. Returns 0.0 when there are no
        tokens or no "VERB" tag instead of raising.
        """
        self.count_speech_parts()
        total = sum(self.tags_count.values())
        if not total:
            return 0.0
        return self.tags_count.get("VERB", 0) / total

    def count_speech_parts(self):
        """Store a ``{POS: occurrences}`` dict in ``self.tags_count``.

        Reuses ``self.tagged`` when it already exists; call ``make_tags()``
        first to force re-tagging.
        """
        self._ensure_tagged()
        counts = {}
        for _, pos in self.tagged:
            counts[pos] = counts.get(pos, 0) + 1
        self.tags_count = counts

    def count_unique_words(self):
        """Count word occurrences, skipping conjunctions, prepositions, particles.

        Sets ``self.word_count`` (``{word: occurrences}``) and
        ``self.word_count_sorted`` (same items in ascending order of count, so
        the most frequent word is last). Reuses ``self.tagged`` when present.
        """
        self._ensure_tagged()
        counts = {}
        for word, pos in self.tagged:
            if pos in self._SKIPPED_POS:
                continue
            counts[word] = counts.get(word, 0) + 1
        self.word_count = counts
        self.word_count_sorted = dict(sorted(self.word_count.items(), key=lambda item: item[1]))

    def compare_books(self, book1, book2):
        """Compare two book files with this instance's spaCy pipeline.

        The original was marked ``@staticmethod`` while still taking ``self``,
        so ``instance.compare_books(a, b)`` failed with a missing argument; it
        is now a regular method.

        Returns:
            A tuple ``(doc1.similarity(doc2), cosine)`` where ``cosine`` is the
            cosine of the angle between the two document vectors, computed
            with numpy.
        """
        text1 = self.get_book_text(book1)
        text2 = self.get_book_text(book2)

        doc1 = self.nlp(text1)
        doc2 = self.nlp(text2)
        cosine = np.dot(doc1.vector, doc2.vector) / (
            np.linalg.norm(doc1.vector) * np.linalg.norm(doc2.vector)
        )
        return (doc1.similarity(doc2), cosine)

    def count_root_speech_parts(self):
        """Count spaCy coarse POS tags over ``self.doc``.

        Tokens tagged "SPACE" or "PUNCT" are skipped. Sets ``self.root_count``
        and ``self.root_count_sorted`` (ascending by count, most frequent last).

        NOTE(review): despite the name, every token is counted, not only
        dependency roots; confirm whether a root-only filter was intended.
        """
        counts = {}
        for token in self.doc:
            if token.pos_ in ("SPACE", "PUNCT"):
                continue
            counts[token.pos_] = counts.get(token.pos_, 0) + 1
        self.root_count = counts
        self.root_count_sorted = dict(sorted(self.root_count.items(), key=lambda item: item[1]))

# Text file reading function

In [14]:
def get_book(name):
    """Return the contents of file ``name`` with "\\n" appended to every line.

    Lines keep their own line ending, so the result contains a blank line
    after each original line (behaviour kept from the original version).
    The file is decoded as UTF-8 explicitly so Cyrillic text is read the same
    way on every platform.
    """
    with open(name, "r", encoding="utf-8") as raw:
        # join() avoids rebuilding the string once per line.
        return "".join(line + "\n" for line in raw)

# Lexical diversity calculation function

In [15]:
def lex_diversity(word_list):
    """Return the ratio of distinct words to total words in ``word_list``.

    An empty list yields 0.0 instead of raising ZeroDivisionError.
    """
    if not word_list:
        return 0.0
    return len(set(word_list)) / len(word_list)

# Text dynamics calculation function

In [16]:
def text_dynamics(total_tags: list):
    """Return the share of "VERB" entries in ``total_tags``.

    An empty list yields 0.0 instead of raising ZeroDivisionError.
    """
    if not total_tags:
        return 0.0
    return total_tags.count("VERB") / len(total_tags)

def text_dynamics_dict(tags_dict):
    """Return the share of "VERB" occurrences in a ``{POS: count}`` dict.

    Returns 0.0 when the dict is empty or has no "VERB" key, where the
    original raised ZeroDivisionError / KeyError.
    """
    total = sum(tags_dict.values())
    if not total:
        return 0.0
    return tags_dict.get("VERB", 0) / total

# Morph analyzer init

In [17]:
# Module-level analyzer used by tagger().
morph = pymorphy2.MorphAnalyzer()


def tagger(tokenized):
    """Return ``(token, POS)`` pairs, POS taken from the token's first parse."""
    # TODO: use spaCy to find all proper nouns
    return [(word, morph.parse(word)[0].tag.POS) for word in tokenized]

# Parts of speech counting function

In [18]:
# counter to count speech parts
def count_speech_parts(tagged):
    """Count how many times each POS tag occurs in ``tagged``.

    ``tagged`` is an iterable of ``(token, POS)`` pairs; the result maps each
    POS to its number of occurrences.
    """
    totals = {}
    for pair in tagged:
        pos = pair[1]
        totals[pos] = totals.get(pos, 0) + 1
    return totals

def count_unique_words(tagged):
    """Count occurrences of each word in ``tagged``, ignoring function words.

    Pairs whose POS is "CONJ", "PREP" or "PRCL" are skipped; every other
    ``(token, POS)`` pair adds one to that token's count.
    """
    skipped = ("CONJ", "PREP", "PRCL")
    totals = {}
    for pair in tagged:
        if pair[1] in skipped:
            continue
        word = pair[0]
        totals[word] = totals.get(word, 0) + 1
    return totals

# Syntax analyzing functions

In [19]:
def count_root_speech_parts(doc):
    """Count the ``pos_`` value of every token in ``doc``.

    Tokens whose ``pos_`` is "SPACE" or "PUNCT" are left out; the result maps
    each remaining ``pos_`` value to its number of occurrences.
    """
    ignored = ("SPACE", "PUNCT")
    totals = {}
    for tok in doc:
        pos = tok.pos_
        if pos in ignored:
            continue
        totals[pos] = totals.get(pos, 0) + 1
    return totals

### Now onto the fun part
# Putting everything together

In [30]:
# Books to analyse; each entry is the name of a text file.
list_of_books = ["Похождения Чичикова (1921)","Белая гвардия (1922)","Роковые яйца (1924)","Собачье сердце (1925)","Мастер и маргарита (1929)","Театральный роман (1936)"]
for name in list_of_books:
    preproc = Preproc(name)

    # Run the whole (slow) analysis before opening the report, so an error or
    # an interrupted run does not leave an empty "<name>_out" file behind.
    preproc.count_unique_words()
    preproc.count_speech_parts()
    preproc.count_root_speech_parts()
    dynamics = preproc.text_dynamics_dict()

    # word_count_sorted is ascending by count: take the tail and reverse it.
    # Slicing tolerates books with fewer than three distinct words.
    top_words = list(preproc.word_count_sorted)[-3:][::-1]
    top_root = list(preproc.root_count_sorted)[-1]

    # Explicit UTF-8 so the Cyrillic words are written the same way everywhere.
    with open(name+"_out", "w", encoding="utf-8") as file:
        file.write(f"Lexical diversity: {preproc.lex_diversity_coeff}\n")

        file.write("Top-3 words: \n")
        for word in top_words:
            file.write(f"\t{word}\n")

        file.write(f"Text dynamics: {dynamics}\n")

        file.write(f"Top root speech part: {top_root}\n")

KeyboardInterrupt: 