## 4.5 Представление текста

In [67]:
# from string import punctuation

import torch

### 4.5.1 Преобразование текста в числа

In [68]:
with open("../dlwpt-code/data/p1ch4/jane-austen/1342-0.txt", encoding='utf-8') as f:
    text = f.read()

In [69]:
text

"\ufeffThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: Pride and Prejudice\n\nAuthor: Jane Austen\n\nPosting Date: August 26, 2008 [EBook #1342]\nRelease Date: June, 1998\nLast Updated: March 10, 2018\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\n*** START OF THIS PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***\n\n\n\n\nProduced by Anonymous Volunteers\n\n\n\n\n\nPRIDE AND PREJUDICE\n\nBy Jane Austen\n\n\n\nChapter 1\n\n\nIt is a truth universally acknowledged, that a single man in possession\nof a good fortune, must be in want of a wife.\n\nHowever little known the feelings or views of such a man may be on his\nfirst entering a neighbourhood, this truth is so well fixed in the minds\nof the su

### 4.5.2 One-hot-coding символов

In [70]:
# разбиваем текст на список строк
lines = text.split('\n')
len(lines)


13428

In [71]:
# Выбираем одну из строк
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

Создаем тензор размера, достаточного для хранения всех уникальных кодированных символов для всей строки

In [72]:
letter_t = torch.zeros(len(line), 128)
letter_t.shape

torch.Size([70, 128])

Индекс устанваливаемой единицы соответсовует индексу в кодировке:

In [73]:
for i, letter in enumerate(line.lower().strip()):
    letter_index = ord(letter) if ord(letter) < 128 else 0 # отфильтровали двойные кавычки
    letter_t[i][letter_index] = 1

In [74]:
letter_t

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

### 4.5.3 Унитарное кодирование целых слов

Функция `clean_words()` принимает на вход текст, а возвращает словарь слов в нижнем регистре и без знаков препинания

In [97]:
def clean_words(input_str):
    # punctuation = '.,";:""!?_-'
    word_list = input_str.lower().replace('\n', ' ').replace('“', '"').split()
    word_list = [word.strip(punctuation) for word in word_list]
    return word_list

words_in_line = clean_words(line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [98]:
words_in_text = clean_words(text)
print(len(words_in_text))
print(words_in_text)

124592
['\ufeffthe', 'project', 'gutenberg', 'ebook', 'of', 'pride', 'and', 'prejudice', 'by', 'jane', 'austen', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www.gutenberg.org', 'title', 'pride', 'and', 'prejudice', 'author', 'jane', 'austen', 'posting', 'date', 'august', '26', '2008', 'ebook', '1342', 'release', 'date', 'june', '1998', 'last', 'updated', 'march', '10', '2018', 'language', 'english', 'character', 'set', 'encoding', 'utf-8', '', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'pride', 'and', 'prejudice', '', 'produced', 'by', 'anonymous', 'volunteers', 'pride', 'and', 'prejudice', 'by', 'jane', 'austen', 'chapter', '1', 'it', 'is', 'a', 'truth', 'universa

Формируем соответствие слов индексам в нашей кодировке:

In [99]:
word_list = sorted(set(clean_words(text))) # сортировка и удаление дублей
word2index_dict = {word: i for (i, word) in enumerate(word_list)} # создаем словарь {word:index}

len(word2index_dict), word2index_dict['impossible']

(8172, 3780)

Кодируем предложение:

In [95]:
word_t = torch.zeros(len(words_in_line), len(word2index_dict))
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i][word_index] = 1
    print(f'{i} {word_index} {word}')

0 3848 impossible
1 4928 mr
2 899 bennet
3 3848 impossible
4 8049 when
5 3759 i
6 449 am
7 5081 not
8 249 acquainted
9 8126 with
10 3637 him


### 4.5.4 Вложения текста