In [4]:
import pandas as pd

In [103]:
RUSSIAN_LETTERS = [chr(i) for i in range(ord('а'), ord('я')+1)] + [' ']

In [6]:
import re

In [7]:
def prepare_word(word):
    word = word.lower()
    word = word.replace('ё', 'е')
    word = re.sub(r'[^а-я ]', '', word)

    return word

In [8]:
from collections import defaultdict

In [124]:
def get_freq_dict(corpus):
    a = dict.fromkeys(RUSSIAN_LETTERS, 0)

    for letter in RUSSIAN_LETTERS:
         a[letter] = dict.fromkeys(RUSSIAN_LETTERS, 0)
            
    for word in corpus:
        for i in range(1, len(word)):
            a[word[i]][word[i-1]] += 1
    
    letter_freq = dict()
    
    for key1 in a:
        c = 0
        
        for key2 in a:
            c += a[key2][key1]
            
        letter_freq[key1] = c
        
    for key1 in a:
        c = 0

        for key2 in a[key1]:
            a[key1][key2] /= letter_freq[key2]
            
    return a

In [125]:
import math

In [126]:
class WordEntropyCounter:
    def __init__(self, letter_freq_dict):
        self.letter_freq_dict = letter_freq_dict
        
    def _entropy(self, p):
        return -p * math.log(p)
    
    def word_entropy(self, word):
        entr = 0

        for i in range(1, len(word)):
            entr += self._entropy(self.letter_freq_dict[word[i]][word[i-1]])
        
        mean_entr = entr / (len(word) - 1)
        
        return (entr + mean_entr) / 2
    
    def to_file(self, path):
        with open(path, 'w') as f:
            f.write(str(self.letter_freq_dict))
            
    def from_file(path):
        
        with open(path, 'r') as f:
            letter_freq_dict = eval(f.read())
            
        return WordEntropyCounter(letter_freq_dict)

### Просто русский словарь

In [127]:
def prepare_words_to_calc(words):
    return list(map(lambda x: ' ' + x + ' ', words))

In [128]:
russian_words = open('russian_words.txt').read().split()
russian_words = list(map(prepare_word, russian_words))
russian_words = prepare_words_to_calc(russian_words)

In [129]:
a = get_freq_dict(russian_words)

In [130]:
wec = WordEntropyCounter(a)

In [131]:
wec.word_entropy('оим')

0.10126201279289634

In [132]:
wec.word_entropy('жизнь')

0.46072482907091783

In [133]:
wec.to_file('russian_words.wec')

In [134]:
len_words = list(filter(lambda x: len(x) > 1, russian_words))

In [135]:
entropies = list(map(wec.word_entropy, len_words))

In [136]:
import numpy as np

In [137]:
np.min(entropies)

0.05975289749991332

In [138]:
np.mean(entropies)

1.3228357089428977

### Война и мир

In [139]:
war_and_piece = open('war_and_piece.txt').read()

In [140]:
war_and_piece = prepare_word(war_and_piece)

In [141]:
war_and_piece_words = list(filter(lambda x: len(x) > 2, war_and_piece.split()))

In [142]:
war_and_piece_words = list(map(lambda x: ' ' + x + ' ', war_and_piece_words))

In [143]:
len(war_and_piece_words)

338168

In [144]:
len(set(war_and_piece_words))

54294

In [145]:
b = get_freq_dict(war_and_piece_words)

In [146]:
wec_wap = WordEntropyCounter(b)

In [147]:
wec_wap.word_entropy(' оим ')

0.46247409451677657

In [148]:
wec_wap.word_entropy(' жизнь ')

0.6388005465108424

In [149]:
wec_wap.to_file('war_and_piece.wec')

In [150]:
with open('../../data/all_anamnesises.txt') as f:
    anam = f.read()

In [81]:
def is_abbreviation(word):
    """
    Будем считать аббревиатурами слова, половина букв которого - заглавные
    
    Уберем случаи, когда это может инициалы сотрудника
    """
    
    # убрать инициалы
    if re.fullmatch(r'[A-ZА-Я]\.[A-ZА-Я]\.?', word):
        return False
    
    upper_count = sum([l.isupper() for l in word])
    
    return upper_count > len(word) // 2 and len(word) > 1

def clean_abbr(word):
    """
    Убираем все символы кроме буквенных, точки и тире
    """
    word = re.sub(r'[^A-ZА-Яа-я-a-z-\.]', '', word)
    
    
    while word[-1] in ['-', '.']:
        word = word[:-1]
        
    while word[0] in ['-', '.']:
        word = word[1:]
        
    return word.upper()

# Первый этап
def naive_extract_abbreviation(text):
    text = text.replace(',', ' ')
    abbrs = filter(lambda word: is_abbreviation(word), text.split())
    abbrs = map(clean_abbr, abbrs)
    
    return list(abbrs)

def split_abbreviations(abbr):
    # Если без знаков препинания является аббревиатурой
    # ЭХО-КГ -> ЭХОКГ
    
    if re.sub(r'[\.|-]', '', abbr) in abbrs_counter:
        return [re.sub(r'[\.|-]', '', abbr)]
    
    items = re.split(r'[-\.]', abbr)
    
    new_abbrs = []
    found = False
    
    for item in items:
        if item in abbrs_counter:
            new_abbrs.append(item)

    if len(new_abbrs) > 0:
        return new_abbrs

    return [abbr]


In [89]:
abbrs = set(naive_extract_abbreviation(anam))

In [90]:
entropies = []

In [91]:
for abbr in abbrs:
    abbr = re.sub(r'[^А-Я]', '', abbr).lower()
    
    entropies.append(
        (wec.word_entropy(abbr),
        abbr)
    )

In [92]:
entropies.sort()

In [93]:
entropies[-30:]

[(1.3806080607591324, 'кагот'),
 (1.3843956477024948, 'вцэирм'),
 (1.4201252288057489, 'жизни'),
 (1.4380952136811707, 'осмотр'),
 (1.442134568499776, 'спбран'),
 (1.451947090574083, 'кдокос'),
 (1.4805690694807754, 'змжвапка'),
 (1.4912165120582024, 'жалобы'),
 (1.4976861468672469, 'фжэкг'),
 (1.5025636503975957, 'пкапада'),
 (1.5171416173882815, 'каглка'),
 (1.547979642818265, 'анамнез'),
 (1.5760489311192634, 'фцскэим'),
 (1.5776802444421065, 'правут'),
 (1.7030766594111744, 'стресс'),
 (1.7131173425065152, 'гкириши'),
 (1.773831568513959, 'змжвпкарввпка'),
 (1.8194531261578248, 'фжел'),
 (1.8318846646749416, 'эхокгкдоксо'),
 (1.8636122658482412, 'мочевой'),
 (1.9396470329665356, 'железа'),
 (2.060063484119252, 'акшзмжвпка'),
 (2.1461770767896975, 'операции'),
 (2.464135755371667, 'стрессэхокг'),
 (2.545891130919506, 'протокол'),
 (2.8871018825891923, 'заключение'),
 (3.436548711687352, 'поступления'),
 (3.555264978996522, 'заболевания'),
 (4.398877418841387, 'предстательная'),
 (4.