In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from typing import (
    List,
    Dict,
    Optional,
)
import re

import matplotlib.pyplot as plt
%matplotlib inline


Constants

In [2]:
# Reproducibility: seed NumPy's legacy global RNG once, right where the seed is defined.
RANDOM_SEED = 5
np.random.seed(RANDOM_SEED)

# Corpus location: every text lives at f"{DATA_PATH}/{<NAME>_FILE}{TXT}".
DATA_PATH = "raw_data"
TXT = ".txt"

# Base file names (without extension) of the individual corpora.
WNP_RU_FILE = "WarAndPeace"
WNP_EN_FILE = "WarAndPeaceEng"
AK_FILE = "AnnaKarenina"


Data loading

In [3]:
def load_text(file_name: str) -> str:
    """Read one corpus file and return its whole content as a string.

    Args:
        file_name: Base name of the file, without directory or extension;
            the path is built as ``f"{DATA_PATH}/{file_name}{TXT}"``.

    Returns:
        The complete file content decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(f"{DATA_PATH}/{file_name}{TXT}", "r", encoding="utf-8") as f:
        return f.read()


# f.read() always returns a str, so no "is not None" guard is needed:
# a missing or unreadable file already raises inside load_text().
wnp_ru = load_text(WNP_RU_FILE)
wnp_eng = load_text(WNP_EN_FILE)
ak = load_text(AK_FILE)


In [4]:
class TextPreproccesing:
    """Normalise a Russian text and count how often each character occurs.

    The text is lower-cased, every run of characters that are not Russian
    letters is replaced by a single space, and leading/trailing spaces are
    removed. The result is stored in ``original_text``.

    Attributes:
        original_text: The normalised text (lowercase Russian letters
            separated by single spaces).
        counter: Character frequencies of ``original_text``; ``None`` until
            ``count_letters_freq`` has been called.
        letter_num: Number of distinct characters in ``original_text``
            (``0`` until ``count_letters_freq`` has been called).
    """

    # Matches any run of characters that are not lowercase Russian letters.
    # "ё" (U+0451) lies outside the contiguous а-я range, so it is listed
    # explicitly. A single character class (instead of an alternation of two)
    # guarantees that adjacent junk such as ", word" collapses into ONE space.
    _NON_LETTERS = re.compile(r"[^а-яё]+")

    def __init__(self, text: str) -> None:
        """Normalise ``text`` and store it in ``original_text``.

        Args:
            text: Raw text to normalise.

        Raises:
            TypeError: If ``text`` is not a ``str``. Failing here avoids a
                half-initialised object that would only break later with an
                ``AttributeError`` on ``original_text``.
        """
        if not isinstance(text, str):
            raise TypeError(
                f"text must be a str, got {type(text).__name__}"
            )
        # Lower-case first so the pattern only has to know lowercase letters.
        self.original_text = self._NON_LETTERS.sub(" ", text.lower()).strip()
        # Defined up front so the attributes exist before counting is requested.
        self.counter: Optional[Counter] = None
        self.letter_num = 0

    def count_letters_freq(self) -> None:
        """Count every character of ``original_text``.

        Stores the frequencies in ``counter`` and the number of distinct
        characters in ``letter_num``. The separating space is a character of
        ``original_text`` and is therefore counted as well.
        """
        self.counter = Counter(self.original_text)
        self.letter_num = len(self.counter)


In [5]:
# Normalise the text held in `wnp_ru`, then tally its characters.
# count_letters_freq() returns None; the results are stored on the instance
# as `.counter` and `.letter_num`.
wnp_ru_counter = TextPreproccesing(wnp_ru)
wnp_ru_counter.count_letters_freq()


In [13]:
# (character, count) pairs ordered from the most to the least frequent.
wnp_ru_counter.counter.most_common()


[('a', 1887),
 ('r', 1853),
 ('u', 1732),
 ('o', 1724),
 ('t', 1640),
 ('э', 1629),
 ('щ', 1514),
 ('l', 1312),
 ('ф', 1209),
 ('m', 1131),
 ('c', 921),
 ('d', 870),
 ('p', 726),
 ('v', 616),
 ('ё', 431),
 ('h', 416),
 ('é', 367),
 ('b', 317),
 ('q', 292),
 ('f', 290),
 ('ъ', 283),
 ('g', 221),
 ('j', 217),
 ('z', 182),
 ('x', 148),
 ('è', 143),
 ('à', 110),
 ('ê', 61),
 ('y', 60),
 ('k', 41),
 ('w', 37),
 ('ç', 23),
 ('â', 18),
 ('ô', 11),
 ('î', 8),
 ('ö', 6),
 ('ü', 4),
 ('û', 2),
 ('í', 1),
 ('ä', 1),
 ('å', 1)]