Todo list:
- ~~book reading function~~
- morphologic analysis:
    - ~~lexical diversity calculation function~~
    - ~~parts of speech counting function (by word's initial form)~~
    - ~~parts of speech counting function (by tags, part of speech types)~~
    - ~~text dynamics calculation functiion~~
    - *top-1 for every speech part
- syntax analysis
    - reduce the amount of text
    - ~~most popular root word~~
- semantic analysis
    - ~~stop-word cleanup~~
    - *automatic tag generation
- graphic demonstration
    - tag cloud

# Imports and installs

In [111]:
%pip install -q nltk pymorphy2 gensim
import nltk
from nltk.corpus import stopwords
import pymorphy2
import spacy
import numpy as np
import gensim.downloader

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nlp = spacy.load('ru_core_news_sm')
# word2vec_rus = gensim.downloader.load('word2vec-ruscorpora-300')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /home/andrew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andrew/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/andrew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Defining a few constants

In [112]:
PUNCT = ('.',',',':',';','\'','"','-','(',')','!','?','...','$','№')

# Preprocess class definition

In [113]:
class Preproc:
    def __init__(self, book_name):
        self.text = self.get_book_text(book_name)
        self.tokens = nltk.word_tokenize(self.text)
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokens_no_punct = nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(self.text)
        self.nlp = spacy.load('ru_core_news_sm')
        self.word2vec_rus = gensim.downloader.load('word2vec-ruscorpora-300')
        self.doc = nlp(self.text)
        self.lemm = nltk.WordNetLemmatizer()
        self.lex_diversity()
        self.make_tags()
    
    def process_stopwords(self, overwrite=False):
        stopw = stopwords.words("russian")
        if overwrite:
            self.tokens = [token for token in self.tokens if token not in stopw]
        else:
            self.tokens_no_stopwords = [token for token in self.tokens if token not in stopw]

    def get_book_text(self, name):
        with open(name, "r") as raw:
            text = ""
            for t in raw.readlines():
                text += t+"\n"
            return text
    
    def lex_diversity(self):
        unique_words_set = set(self.tokens)
        self.lex_diversity_coeff = len(unique_words_set) / len(self.tokens)

    def make_tags(self):
        self.tagged = []
        self.tokens_and_tags = []
        for token in self.tokens_no_punct:
            tag = self.morph.parse(token)[0].tag.POS
            self.tagged.append((token, tag))
            # self.tokens_and_tags.append(self.lemm.lemmatize(token).lower()+'_'+tag)
            # spacy find all собственные имена
        return self.tagged

    def text_dynamics(total_tags: list):
        verbs = total_tags.count("VERB")
        return verbs / len(total_tags)

    def text_dynamics_dict(self):
        self.count_speech_parts()
        verbs = self.tags_count["VERB"]
        total = 0
        for i in self.tags_count.keys():
            total += self.tags_count[i]
        return verbs/total
    
    def count_speech_parts(self):
        self.make_tags()
        res = dict()
        for token in self.tagged:
            if (token[1] in res.keys()) :
                res[token[1]] += 1
            else:
                res[token[1]] = 1
        self.tags_count = res

    def count_unique_words(self):
        self.make_tags()
        res = dict()
        for token in self.tagged:
            if not (token[1] in ("CONJ", "PREP", "PRCL")):
                if token[0] in res.keys():
                    res[token[0]] += 1
                else:
                    res[token[0]] = 1
        self.word_count = res
        self.word_count_sorted = dict(sorted(self.word_count.items(), key=lambda item: item[1]))
    
    @staticmethod
    def compare_books(self, book1, book2):
        text1 = self.get_book_text(book1)
        text2 = self.get_book_text(book2)

        doc1 = self.nlp(text1)
        doc2 = self.nlp(text2)
        return (doc1.similarity(doc2), np.dot(doc1.vector, doc2.vector) / (np.linalg.norm(doc1.vector) * (np.linalg.norm(doc2.vector))))

    def count_root_speech_parts(self):
        res = dict()
        for token in self.doc:
            if token.pos_ not in ("SPACE", "PUNCT"):
                if token.pos_ not in res.keys():
                    res[token.pos_] = 1
                else:
                    res[token.pos_] += 1
        self.root_count = res
        self.root_count_sorted = dict(sorted(self.root_count.items(), key=lambda item: item[1]))
    
    def fetch_tokens(self, amount=100, index=0):
        return [self.tokens[token] for token in range(index, amount)]

    def get_tokens_and_tags(self, amount=100):
        res = []
        i = 0
        while len(res) < amount and i < len(self.doc):
        # for i in range(min(amount, len(self.doc))):
            if self.doc[i].tag_ not in ("PUNCT", "SPACE", "PROPN", "ADP", "NUM", "CCONJ", "PART", "PRON") and self.doc[i].lemma_ != "-":
                res.append((self.doc[i].lemma_, self.doc[i].tag_))
            i += 1
        
        return res

    def check_vectors(self, first_n_elements=100, logging=False):
        res = []
        i = 0
        while len(res) < first_n_elements:
            try:
                res.append(self.word2vec_rus[self.doc[i].lemma_+'_'+self.tagged[i][1]]) # word2vec_rus["тест_NOUN"]
            except:
                if logging:
                    print(f"error parsing {self.doc[i].lemma_+'_'+self.tagged[i][1]}")
                # res.append(self.doc[i].lemma_)
            i += 1
        return res
    
    def metamorph(self, first_n_elements=100):
        res = ""
        for i in self.check_vectors(first_n_elements):
            res += self.word2vec_rus.most_similar(positive=["женщина_NOUN", i], negative="мужчина_NOUN")[-1][0].split("_")[0] + " "
        return res
    
    # def antonimize(word_list):
    #     antonyms = []
    #     for word in word_list:
    #         synsets = wordnet.synsets(word)
    #         for synset in synsets:
    #             for lemma in synset.lemmas():
    #                 if lemma.antonyms():
    #                     antonyms.append(lemma.antonyms()[0].name())
    #                     break
    #     return antonyms

### Now onto the fun part
# Putting everything together

In [114]:
list_of_books = ["Похождения Чичикова (1921)","Белая гвардия (1922)","Роковые яйца (1924)","Собачье сердце (1925)","Мастер и маргарита (1929)","Театральный роман (1936)"]
for name in list_of_books:
    preproc = Preproc(name)
    with open(name+"_out", "w") as file:
        preproc.process_stopwords()
        preproc.count_unique_words()
        preproc.count_speech_parts()
        preproc.count_root_speech_parts()
        # if name == list_of_books[0]:
        print(f"Metamorphed \"{name}\":\n",preproc.metamorph(), "\n")
        
        file.write(f"Lexical diversity: {str(preproc.lex_diversity_coeff)}\n")

        file.write("Top-3 words: \n")
        file.write(f"\t{list(preproc.word_count_sorted.keys())[-1]}\n")
        file.write(f"\t{list(preproc.word_count_sorted.keys())[-2]}\n")
        file.write(f"\t{list(preproc.word_count_sorted.keys())[-3]}\n")

        file.write(f"Text dynamics: {preproc.text_dynamics_dict()}\n")

        file.write(f"Top root speech part: {list(preproc.root_count_sorted.keys())[-1]}\n")

KeyboardInterrupt: 

# Manual word2vec words processing

In [107]:
preproc = Preproc(list_of_books[4])
preproc.process_stopwords(True)

In [108]:
raw_tokens_and_tags = preproc.get_tokens_and_tags(200)

In [109]:
tokens_and_tags = [token+"_"+tag for token, tag in raw_tokens_and_tags]
print(len(tokens_and_tags))
# tokens_and_tags
raw_tokens_and_tags

200


[('стравинского', 'NOUN'),
 ('тот', 'DET'),
 ('время', 'NOUN'),
 ('как', 'SCONJ'),
 ('раз', 'NOUN'),
 ('как', 'ADV'),
 ('вести', 'VERB'),
 ('бездомный', 'ADJ'),
 ('долгий', 'ADJ'),
 ('сон', 'NOUN'),
 ('открыть', 'VERB'),
 ('глаз', 'NOUN'),
 ('некоторый', 'DET'),
 ('время', 'NOUN'),
 ('соображать', 'VERB'),
 ('как', 'SCONJ'),
 ('попасть', 'VERB'),
 ('этот', 'DET'),
 ('необыкновенный', 'ADJ'),
 ('комната', 'NOUN'),
 ('чистый', 'ADJ'),
 ('белый', 'ADJ'),
 ('стена', 'NOUN'),
 ('удивительный', 'ADJ'),
 ('ночной', 'ADJ'),
 ('столик', 'NOUN'),
 ('сделать', 'VERB'),
 ('какой', 'DET'),
 ('то', 'DET'),
 ('неизвестный', 'ADJ'),
 ('светлый', 'ADJ'),
 ('металл', 'NOUN'),
 ('величественной', 'VERB'),
 ('белый', 'ADJ'),
 ('штора', 'NOUN'),
 ('весь', 'DET'),
 ('стена', 'NOUN'),
 ('тряхнуть', 'VERB'),
 ('голова', 'NOUN'),
 ('убедиться', 'VERB'),
 ('что', 'SCONJ'),
 ('болеть', 'VERB'),
 ('очень', 'ADV'),
 ('отчетливо', 'ADV'),
 ('припомнить', 'VERB'),
 ('страшный', 'ADJ'),
 ('смерть', 'NOUN'),
 ('вызват

In [110]:
for i in range(len(tokens_and_tags)):
    print(f"{raw_tokens_and_tags[i]}: {preproc.word2vec_rus.most_similar(tokens_and_tags[i], topn=3)}")

KeyError: "Key 'стравинского_NOUN' not present in vocabulary"

### Every
## Single
# Word

# Processed manually

In [141]:
word2vec_rus = gensim.downloader.load('word2vec-ruscorpora-300')


In [142]:

def negative_coloration_remover(posit):
    posit_full = posit[0]+"_"+posit[1]
    return word2vec_rus.most_similar([posit_full, "хороший_ADJ"], negative=["плохой_ADJ"])[0][0].split("_")[0]

In [143]:
res = [] 
for i in raw_tokens_and_tags:
    try:
        res.append([i, negative_coloration_remover(i)])
    except:
        pass
res = res[:100]
len(res)

100

# An attempt in removing negative coloration from words 
## (not all words were found in the model, so some of them were removed on preproc stage)

In [145]:
with open("un-negatified words", "w") as file:
    for i in res:
        file.write(f"\"{i[0][0]}\" was automatically un-negatified into \"{i[1]}\"\n")