In [4]:
import pandas as pd

In [103]:
RUSSIAN_LETTERS = [chr(i) for i in range(ord('а'), ord('я')+1)] + [' ']

In [6]:
import re

In [7]:
def prepare_word(word):
    word = word.lower()
    word = word.replace('ё', 'е')
    word = re.sub(r'[^а-я ]', '', word)

    return word

In [8]:
from collections import defaultdict

In [185]:
def get_freq_dict(corpus):
    a = dict.fromkeys(RUSSIAN_LETTERS, 0)

    for letter in RUSSIAN_LETTERS:
         a[letter] = dict.fromkeys(RUSSIAN_LETTERS, 0)
            
    for word in corpus:
        for i in range(1, len(word)):
            a[word[i]][word[i-1]] += 1
    
    letter_freq = dict()
    
    for key1 in a:
        c = 0
        
        for key2 in a:
            c += a[key2][key1]
            
        letter_freq[key1] = c
        
    for key1 in a:
        c = 0

        for key2 in a[key1]:
            a[key1][key2] /= letter_freq[key2]
            
    return a

In [186]:
import math

In [187]:
class WordEntropyCounter:
    def __init__(self, letter_freq_dict):
        self.letter_freq_dict = letter_freq_dict
        
    def _entropy(self, p):
        return -p * math.log(p)
    
    def word_entropy(self, word):
        entr = 0

        for i in range(1, len(word)):
            entr += self._entropy(self.letter_freq_dict[word[i]][word[i-1]])
        
        mean_entr = entr / (len(word) - 1)
        
        return (entr + mean_entr) / 2
    
    def to_file(self, path):
        with open(path, 'w') as f:
            f.write(str(self.letter_freq_dict))
            
    def from_file(path):
        
        with open(path, 'r') as f:
            letter_freq_dict = eval(f.read())
            
        return WordEntropyCounter(letter_freq_dict)

### Просто русский словарь

In [188]:
def prepare_words_to_calc(words):
    return list(map(lambda x: ' ' + x + ' ', words))

In [189]:
russian_words = open('russian_words.txt').read().split()
russian_words = list(map(prepare_word, russian_words))
russian_words = prepare_words_to_calc(russian_words)

In [190]:
a = get_freq_dict(russian_words)

In [191]:
wec = WordEntropyCounter(a)

In [192]:
wec.word_entropy('оим')

0.10126201279289634

In [193]:
wec.word_entropy('жизнь')

0.46072482907091783

In [194]:
wec.to_file('russian_words.wec')

In [195]:
len_words = list(filter(lambda x: len(x) > 1, russian_words))

In [196]:
entropies = list(map(wec.word_entropy, len_words))

In [197]:
import numpy as np

In [198]:
np.min(entropies)

0.05975289749991332

In [199]:
np.mean(entropies)

1.3228357089428977

### Война и мир

In [200]:
war_and_piece = open('war_and_piece.txt').read()

In [201]:
war_and_piece = prepare_word(war_and_piece)

In [202]:
war_and_piece_words = list(filter(lambda x: len(x) > 2, war_and_piece.split()))

In [203]:
war_and_piece_words = list(map(lambda x: ' ' + x + ' ', war_and_piece_words))

In [204]:
len(war_and_piece_words)

338168

In [205]:
len(set(war_and_piece_words))

54294

In [206]:
b = get_freq_dict(war_and_piece_words)

In [207]:
wec_wap = WordEntropyCounter(b)

In [213]:
wec_wap.word_entropy(' оим ')

0.46247409451677657

In [214]:
wec_wap.word_entropy(' жизнь ')

0.6388005465108424

In [210]:
wec_wap.to_file('war_and_piece.wec')