### Libs

In [20]:
import re
from collections import Counter

### Algorithm

In [33]:
class TextStatistics:
    """Collect simple statistics about a text loaded from a file.

    Intended call order: ``get_text`` -> ``get_words`` -> ``get_letters``;
    the counting methods then work on the data cached on the instance.
    Every step stores its result in an attribute and also returns it.
    """

    # Compiled once; both are used for every word in bilingual_word_amount().
    _LATIN = re.compile(r'[a-zA-Z]')
    # 'ё' / 'Ё' lie outside the contiguous а-я / А-Я ranges, so list them explicitly.
    _CYRILLIC = re.compile(r'[а-яА-ЯёЁ]')

    def __init__(self):
        self.text = ""              # raw file contents
        self.words = []             # tokens extracted from the text
        self.letters = ()           # every character of every word, in order
        self.frequency = {}         # character -> (count, count / number of words)
        self.bilingual_words = []   # words mixing Latin and Cyrillic letters

    def get_text(self, file_path: str) -> str:
        """Read the UTF-8 file at *file_path*, store it in ``self.text`` and return it."""
        with open(file_path, 'r', encoding='UTF-8') as file:
            self.text = file.read()
        return self.text

    def get_words(self) -> list:
        """Split ``self.text`` into words and return them.

        A word is a maximal run of word characters, so digits and
        underscores are part of words as well as letters.
        """
        self.words = re.findall(r'\w+', self.text)
        return self.words

    def words_amount(self) -> int:
        """Return the number of words found by the last ``get_words`` call."""
        return len(self.words)

    def get_letters(self) -> tuple:
        """Return every character of every word as a flat tuple (duplicates kept)."""
        self.letters = tuple("".join(self.words))
        return self.letters

    def letters_frequency(self) -> dict:
        """Return ``{character: (count, ratio)}`` for each character in ``self.letters``.

        ``count`` is the number of occurrences of the character across all
        words; ``ratio`` is ``count`` divided by the number of words.
        Keys appear in order of first occurrence.
        """
        # NOTE(review): the ratio is per *word*, not per letter, so it can
        # exceed 1. Kept as-is because existing results depend on it; confirm
        # whether a share of all letters was intended.
        counts = Counter("".join(self.words))  # one pass instead of str.count per character
        total_words = len(self.words)

        # Rebuild from scratch so entries from a previously analysed text do not linger.
        self.frequency = {}
        for letter in dict.fromkeys(self.letters):  # distinct characters, first-seen order
            occurrences = counts[letter]
            # Guard against stale ``letters`` combined with an empty word list.
            ratio = occurrences / total_words if total_words else 0.0
            self.frequency[letter] = (occurrences, ratio)

        return self.frequency

    def paragraph_amount(self) -> int:
        """Return the number of non-overlapping blank-line breaks in the text.

        NOTE(review): this counts separators, so a text made of N paragraphs
        divided by single blank lines yields N - 1 — confirm this is intended.
        """
        return self.text.count("\n\n")

    def bilingual_word_amount(self) -> list:
        """Return the words that contain both Latin and Cyrillic letters.

        Despite the name, the result is the list of such words (callers take
        ``len()`` of it). The list is rebuilt on every call, so repeated
        calls do not accumulate duplicates.
        """
        self.bilingual_words = [
            word for word in self.words
            if self._LATIN.search(word) and self._CYRILLIC.search(word)
        ]
        return self.bilingual_words



### Test

In [34]:
def text_statistics(file_path: str) -> dict:
    """Compute the statistics of the text stored in *file_path*.

    Returns a dict holding one ``character -> (count, ratio)`` entry per
    character reported by ``TextStatistics.letters_frequency`` followed by
    three summary entries: ``"word_amount"``, ``"paragraph_amount"`` and
    ``"bilingual_word_amount"``.
    """
    ts = TextStatistics()
    # The counting methods work on cached data, so load and tokenise first.
    ts.get_text(file_path)
    ts.get_words()
    ts.get_letters()

    # Copy the per-character table so the summary keys are not written into
    # the dict owned by the TextStatistics instance.
    stats = dict(ts.letters_frequency())
    stats["word_amount"] = ts.words_amount()
    stats["paragraph_amount"] = ts.paragraph_amount()
    stats["bilingual_word_amount"] = len(ts.bilingual_word_amount())

    return stats

### Output

In [35]:
# Run the statistics on the sample text file; the value shown below is the returned dict.
text_statistics("../test_data/statistics.txt")

{'П': (2, 0.011695906432748537),
 'р': (50, 0.29239766081871343),
 'о': (125, 0.7309941520467836),
 'с': (55, 0.3216374269005848),
 'т': (72, 0.42105263157894735),
 'ы': (21, 0.12280701754385964),
 'е': (70, 0.4093567251461988),
 'ч': (13, 0.07602339181286549),
 'и': (89, 0.52046783625731),
 'л': (46, 0.26900584795321636),
 'а': (74, 0.4327485380116959),
 'в': (44, 0.2573099415204678),
 'з': (25, 0.14619883040935672),
 'д': (35, 0.2046783625730994),
 'н': (65, 0.38011695906432746),
 'м': (31, 0.18128654970760233),
 'п': (31, 0.18128654970760233),
 'Н': (2, 0.011695906432748537),
 'б': (16, 0.0935672514619883),
 'х': (10, 0.05847953216374269),
 'm': (7, 0.04093567251461988),
 'b': (2, 0.011695906432748537),
 'ь': (18, 0.10526315789473684),
 'ф': (7, 0.04093567251461988),
 'у': (23, 0.13450292397660818),
 'к': (36, 0.21052631578947367),
 'ц': (15, 0.08771929824561403),
 'ю': (10, 0.05847953216374269),
 'p': (3, 0.017543859649122806),
 'r': (6, 0.03508771929824561),
 'i': (6, 0.0350877192