# read text file

In [27]:
import numpy as np
import torch
# Tensor display settings for this notebook: when a tensor is summarized, show
# 2 items at each edge of a dimension, and wrap printed lines at 75 characters.
torch.set_printoptions(edgeitems=2, linewidth=75)

In [28]:
# Read the entire text file into one string, decoding it as UTF-8.
with open('../data/p1ch4/jane-austen/1342-0.txt', encoding='utf8') as f:
    text = f.read()

In [29]:
# Split the text on newline characters; at this point `line` is the list of all lines.
line = text.split('\n')

In [30]:
# Pick the line at index 200 of the text as the working example.
# Index a fresh split of `text` rather than rebinding `line = line[200]`:
# that form is not idempotent, because after one run `line` is a short string
# and a second run would try to index position 200 of that string.
line = text.split('\n')[200]
line, len(line)

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him', 70)

In [31]:
# One row per character of `line` and 128 columns, all zeros to start with.
# 128 matches the `ord(letter) < 128` bound used as the column index when the
# rows are filled in by the encoding loop.
letter_t = torch.zeros(len(line), 128)
letter_t.shape

torch.Size([70, 128])

In [32]:
# One-hot encode the lowercased, stripped line: row i gets a 1 in the column
# given by the character's code point; code points of 128 and above all share
# column 0.
for i, letter in enumerate(line.lower().strip()):
    code_point = ord(letter)
    letter_index = code_point if code_point < 128 else 0
    letter_t[i, letter_index] = 1

这段代码是在对字符串`line`中的每个字符进行操作，将每个字符转换为一个one-hot编码的向量。

- `enumerate(line.lower().strip())`：这会对`line`字符串进行小写转换和去除首尾空格，然后遍历每个字符及其索引。
- `letter_index = ord(letter) if ord(letter) < 128 else 0`：这会获取每个字符的Unicode码点作为索引（对ASCII字符来说，码点就是它的ASCII码）。如果码点大于或等于128（即非ASCII字符，例如这一行开头的“），则索引为0。
- `letter_t[i][letter_index] = 1`：这会在`letter_t`张量的第`i`行和第`letter_index`列处设置值为1，其他位置为0。这就创建了一个one-hot编码的向量，表示`line`中的第`i`个字符。

总的来说，这段代码的作用是将字符串`line`转换为一个one-hot编码的张量，其中每一行对应`line`中的一个字符。

---

对于英语来说, 对于字母的编码没有什么意义(同👆), 所以接下来是对词进行编码

👇

In [33]:
# Text clean-up helper -- useful, worth coming back to.

def clean_words(input_str, punctuation='.,;:"!?”“_-'):
    """Split text into lowercase words with surrounding punctuation removed.

    Args:
        input_str: The text to tokenize. It is lowercased and split on
            whitespace.
        punctuation: Characters stripped from both ends of every token.
            Characters inside a token are left untouched. The default is the
            set this notebook has always used.

    Returns:
        A list of strings, one per whitespace-separated token, in their
        original order. A token made up only of punctuation characters is
        returned as the empty string ''.
    """
    # str.split() with no separator splits on any run of whitespace, newlines
    # included, so there is no need to replace '\n' with ' ' beforehand.
    return [word.strip(punctuation) for word in input_str.lower().split()]

In [34]:
# Tokenize the example line and display it next to its cleaned words.
words_in_line = clean_words(line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [35]:
# Process the full text: clean every word, drop duplicates with set(), and
# sort the result to obtain the vocabulary list.

word_list = sorted(set(clean_words(text)))

In [36]:
# Display the vocabulary.
# NOTE(review): this shows the whole list; a slice such as word_list[:20]
# would keep the cell output short.
word_list

['',
 '#1342]',
 '$5,000)',
 "'_she",
 "'after",
 "'ah",
 "'as-is'",
 "'bingley",
 "'had",
 "'having",
 "'i",
 "'keep",
 "'lady",
 "'lately",
 "'lydia",
 "'mr",
 "'my",
 "'oh",
 "'s",
 "'this",
 "'tis",
 "'violently",
 "'yes,'",
 "'you",
 '($1',
 '(801)',
 '(a)',
 '(an',
 '(and',
 '(any',
 '(available',
 '(b)',
 '(by',
 '(c)',
 '(comparatively',
 '(does',
 '(for',
 '(glancing',
 '(if',
 '(lady',
 '(like',
 '(most',
 '(my',
 '(or',
 '(trademark/copyright)',
 '(unasked',
 '(what',
 '(who',
 '(www.gutenberg.org)',
 '(“the',
 '*',
 '***',
 '*****',
 '1',
 '1.a',
 '1.b',
 '1.c',
 '1.d',
 '1.e',
 '1.e.1',
 '1.e.2',
 '1.e.3',
 '1.e.4',
 '1.e.5',
 '1.e.6',
 '1.e.7',
 '1.e.8',
 '1.e.9',
 '1.f',
 '1.f.1',
 '1.f.2',
 '1.f.3',
 '1.f.4',
 '1.f.5',
 '1.f.6',
 '10',
 '11',
 '12',
 '13',
 '1342-0.txt',
 '1342-0.zip',
 '14',
 '15',
 '1500',
 '15th',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1998',
 '2',
 '20',
 '20%',
 '2001',
 '2008',
 '2018',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '26th',
 '27',
 '2

In [37]:
# Map every vocabulary word to its position in word_list.
word2index_dict = dict(zip(word_list, range(len(word_list))))

就是建立一个`word: index`的字典: 遍历`enumerate(word_list)`, 把每个词映射到它在`word_list`中的下标, 得到对应的`word: index`字典.

In [39]:
# Vocabulary size, and the index assigned to the word 'impossible'.
len(word2index_dict), word2index_dict['impossible']

(7261, 3394)

In [40]:
# One-hot encoding: allocate one row per word of the example line and one
# column per vocabulary entry, all zeros to start with.
word_t = torch.zeros(len(words_in_line), len(word2index_dict))
word_t.shape

torch.Size([11, 7261])

In [42]:
# For each word of the line, set its vocabulary column to 1 in that word's row
# and print the row number, the vocabulary index and the word itself.
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i, word_index] = 1
    print(f'{i:2} {word_index:4} {word}')

print(word_t.shape)

 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
torch.Size([11, 7261])


In [43]:
# Insert a size-1 dimension at position 1 of word_t.
# NOTE(review): this rebinds word_t to its own unsqueezed result, so the cell
# is not idempotent -- every re-run adds one more size-1 dimension.
word_t = word_t.unsqueeze(1)
word_t.shape

torch.Size([11, 1, 7261])

In [44]:
# Every distinct character of the text, in sorted order, paired with its code point.
unique_chars = sorted(set(text))
list(zip(unique_chars, map(ord, unique_chars)))

[('\n', 10),
 (' ', 32),
 ('!', 33),
 ('#', 35),
 ('$', 36),
 ('%', 37),
 ("'", 39),
 ('(', 40),
 (')', 41),
 ('*', 42),
 (',', 44),
 ('-', 45),
 ('.', 46),
 ('/', 47),
 ('0', 48),
 ('1', 49),
 ('2', 50),
 ('3', 51),
 ('4', 52),
 ('5', 53),
 ('6', 54),
 ('7', 55),
 ('8', 56),
 ('9', 57),
 (':', 58),
 (';', 59),
 ('?', 63),
 ('@', 64),
 ('A', 65),
 ('B', 66),
 ('C', 67),
 ('D', 68),
 ('E', 69),
 ('F', 70),
 ('G', 71),
 ('H', 72),
 ('I', 73),
 ('J', 74),
 ('K', 75),
 ('L', 76),
 ('M', 77),
 ('N', 78),
 ('O', 79),
 ('P', 80),
 ('Q', 81),
 ('R', 82),
 ('S', 83),
 ('T', 84),
 ('U', 85),
 ('V', 86),
 ('W', 87),
 ('X', 88),
 ('Y', 89),
 ('Z', 90),
 ('[', 91),
 (']', 93),
 ('_', 95),
 ('a', 97),
 ('b', 98),
 ('c', 99),
 ('d', 100),
 ('e', 101),
 ('f', 102),
 ('g', 103),
 ('h', 104),
 ('i', 105),
 ('j', 106),
 ('k', 107),
 ('l', 108),
 ('m', 109),
 ('n', 110),
 ('o', 111),
 ('p', 112),
 ('q', 113),
 ('r', 114),
 ('s', 115),
 ('t', 116),
 ('u', 117),
 ('v', 118),
 ('w', 119),
 ('x', 120),
 ('y',

In [45]:
# Code point of the lowercase letter 'l'.
ord('l')

108