In [1]:
import urllib.request

In [2]:
import re

### 2.2 Tokenizing text

In [9]:
# prepare text
url = ("https://raw.githubusercontent.com/rasbt/"
 "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
 "the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x28e92ceaaf0>)

In [3]:
# look content
with open("the-verdict.txt", "r") as f:
    raw_text = f.read()
print("length: ", len(raw_text))
print("content part: ", raw_text[:99])

length:  20479
content part:  I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [11]:
# split
text = "Hello, world. This, is a test."
text_split = re.split("(\s)", text)
print(text_split)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [12]:
# 区分标点
text_split1 = re.split("([.,]|\s)", text)
print(text_split1)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [13]:
# 去除空格
text_split2 = [item for item in text_split1 if item.strip()]
print(text_split2)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [14]:
# 增加token种类
text = "Hello, world. Is this-- a test?"
text_split3 = re.split("([,.?]|--|\s)", text)
print(text_split3)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', '--', '', ' ', 'a', ' ', 'test', '?', '']


In [15]:
text_split4 = [item for item in text_split3 if item.strip()]
print(text_split4)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [16]:
# 整个文本
raw_text[:20]

'I HAD always thought'

In [6]:
text_split5 = re.split(r'([,.?"\';:_!()]|--|\s)', raw_text)
text_split5 = [item for item in text_split5 if item.strip()]

In [18]:
print(text_split5[:100])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--']


In [20]:
print(len(text_split5))

4690


### 2.3 Converting tokens into token IDs

In [7]:
all_words = sorted(set(text_split5))
len_words = len(all_words)
print(len_words)

1130


In [21]:
print(type(all_words))

<class 'list'>


In [8]:
# 对token生成id
vocab = {token:token_id for token_id, token in enumerate(all_words)}

In [25]:
type(vocab.items())

dict_items

In [26]:
count = 0
for item in vocab.items():
    if count > 50:
        break
    print(item)
    count += 1

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [27]:
vocab['For']

35

tokenizer类

In [9]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_id = vocab
        self.id_to_str = {i:s for s, i in vocab.items()}
    
    def encode(self, text):
        processed = re.split(r'([,.?"\';:_!()]|--|\s)', text)  # 将传入的文本分词
        processed_new = [item.strip() for item in processed if item.strip()]  # 去除空格
        ids = [self.str_to_id[item] for item in processed_new]  # 将字符变为token_id
        return ids
    
    def decode(self, ids):
        text = " ".join(self.id_to_str[i] for i in ids)  # 拼接所有字符
        text_new = re.sub(r'\s+([,.?"\';:_!()])', r'\1', text)  # 去除符号前空格
        return text_new
        

In [34]:
# 查看编码效果
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
 Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [35]:
# 查看解码效果
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [36]:
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

**Hello**无法被识别因为之前的词表里没有

### 2.4 Adding special context tokens

加入"<|unk|>"和"<|endoftext|>"表示未知字符和文本结尾

In [10]:
all_tokens = sorted(set(text_split5))
all_tokens.extend(['<|unk|>', '<|endoftext|>'])
print(len(all_tokens))

1132


In [11]:
vocab_new = {item:token_id for token_id, item in enumerate(all_tokens)}

In [42]:
count = 0
for i in list(vocab_new.items())[-5:]:
#     if count > 50:
#         break
    print(i)
#     count += 1

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|unk|>', 1130)
('<|endoftext|>', 1131)


建立新版tokenizer类

In [12]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_id = vocab
        self.id_to_str = {i:s for s, i in vocab.items()}
    
    def encode(self, text):
        processed = re.split(r'([,.?"\';:_!()]|--|\s)', text)
        processed_new = [item.strip() for item in processed if item.strip()]
        processed_with_spec_char = [item if item in self.str_to_id else '<|unk|>' for item in processed_new]  # 将不在词表中的字符设为未知字符
        ids = [self.str_to_id[item] for item in processed_with_spec_char]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.id_to_str[i] for i in ids])
        text_new = re.sub(r'\s+([,.?"\';:_!()])', r"\1", text)
        return text_new

In [50]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [58]:
tokenizer2 = SimpleTokenizerV2(vocab_new)
ids2 = tokenizer2.encode(text)
print(ids2)

[1130, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]


In [60]:
print(tokenizer2.decode(ids2))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [61]:
text1 = """2020 Missouri Amendment 2, also called the Medicaid Expansion Initiative, was a ballot measure to amend the Constitution of Missouri to expand Medicaid under the Affordable Care Act. The initiative was on the August 4, 2020, primary ballot and passed with 53.27% of the vote. Following Medicaid expansion initiatives in other states, Republican lawmakers in Nebraska and Utah added work requirements to their states' expansions; supporters aimed to prevent this by proposing state constitutional amendments for future Medicaid expansion initiatives. The measure was supported most in urban areas and opposed in rural areas. After a delay due to a lack of funding from the Missouri General Assembly and resulting litigation, the initiative was slowly implemented in October 2021. Republican lawmakers attempted to roll back the program and add a work requirement through a state constitutional amendment, which failed after the United States Supreme Court prevented its implementation."""

In [62]:
print(tokenizer2.encode(text1))

[1130, 1130, 1130, 1130, 5, 1130, 242, 988, 1130, 1130, 1130, 5, 1077, 115, 1130, 1130, 1016, 1130, 988, 1130, 722, 1130, 1016, 1130, 1130, 1044, 988, 1130, 1130, 1130, 7, 93, 1130, 1077, 727, 988, 1130, 1130, 5, 1130, 5, 1130, 1130, 157, 1130, 1108, 1130, 7, 1130, 722, 988, 1130, 7, 1130, 1130, 1130, 1130, 568, 735, 1130, 5, 1130, 1130, 568, 1130, 157, 1130, 130, 1117, 1130, 1016, 989, 1130, 2, 1130, 9, 1130, 1130, 1016, 1130, 999, 241, 1130, 1130, 1130, 1130, 456, 1130, 1130, 1130, 1130, 7, 93, 1130, 1077, 1130, 686, 568, 1130, 1130, 157, 1130, 568, 1130, 1130, 7, 1130, 115, 1130, 1130, 1016, 115, 1130, 722, 1130, 477, 988, 1130, 1130, 1130, 157, 1130, 1130, 5, 988, 1130, 1077, 903, 1130, 568, 1130, 1130, 7, 1130, 1130, 1130, 1016, 1130, 191, 988, 1130, 157, 129, 115, 1117, 1130, 1007, 115, 1130, 1130, 1130, 5, 1093, 422, 138, 988, 1130, 1130, 1130, 1130, 1130, 586, 1130, 7]


In [63]:
ids_try = tokenizer2.encode(text1)

In [64]:
print(tokenizer2.decode(ids_try))

<|unk|> <|unk|> <|unk|> <|unk|>, <|unk|> called the <|unk|> <|unk|> <|unk|>, was a <|unk|> <|unk|> to <|unk|> the <|unk|> of <|unk|> to <|unk|> <|unk|> under the <|unk|> <|unk|> <|unk|>. The <|unk|> was on the <|unk|> <|unk|>, <|unk|>, <|unk|> <|unk|> and <|unk|> with <|unk|>. <|unk|> of the <|unk|>. <|unk|> <|unk|> <|unk|> <|unk|> in other <|unk|>, <|unk|> <|unk|> in <|unk|> and <|unk|> added work <|unk|> to their <|unk|>' <|unk|>; <|unk|> <|unk|> to <|unk|> this by <|unk|> <|unk|> <|unk|> <|unk|> for <|unk|> <|unk|> <|unk|> <|unk|>. The <|unk|> was <|unk|> most in <|unk|> <|unk|> and <|unk|> in <|unk|> <|unk|>. <|unk|> a <|unk|> <|unk|> to a <|unk|> of <|unk|> from the <|unk|> <|unk|> <|unk|> and <|unk|> <|unk|>, the <|unk|> was slowly <|unk|> in <|unk|> <|unk|>. <|unk|> <|unk|> <|unk|> to <|unk|> back the <|unk|> and add a work <|unk|> through a <|unk|> <|unk|> <|unk|>, which failed after the <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> its <|unk|>.


### 2.5 Byte pair encoding

In [65]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp39-cp39-win_amd64.whl (894 kB)
     -------------------------------------- 894.2/894.2 kB 2.8 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [16]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [17]:
tokenizer_bpe = tiktoken.encoding_for_model("gpt-4")

In [77]:
text = (
 "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
 " of someunknownPlace."
)
ids = tokenizer_bpe.encode(text, allowed_special={"<|endoftext|>"})
print(ids)

[9906, 11, 656, 499, 1093, 15600, 30, 220, 100257, 763, 279, 7160, 32735, 7317, 2492, 315, 1063, 16476, 17826, 13]


In [78]:
print(tokenizer_bpe.decode(ids))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


In [79]:
ids1 = tokenizer_bpe.encode(text1)
print(ids1)

[2366, 15, 25378, 22454, 220, 17, 11, 1101, 2663, 279, 34129, 55654, 38756, 11, 574, 264, 26938, 6767, 311, 30569, 279, 18039, 315, 25378, 311, 9407, 34129, 1234, 279, 43606, 10852, 3298, 13, 578, 20770, 574, 389, 279, 6287, 220, 19, 11, 220, 2366, 15, 11, 6156, 26938, 323, 5946, 449, 220, 4331, 13, 1544, 4, 315, 279, 7055, 13, 23548, 34129, 14800, 28271, 304, 1023, 5415, 11, 9540, 26137, 304, 38379, 323, 23195, 3779, 990, 8670, 311, 872, 5415, 6, 78588, 26, 15879, 20034, 311, 5471, 420, 555, 57515, 1614, 25543, 41693, 369, 3938, 34129, 14800, 28271, 13, 578, 6767, 574, 7396, 1455, 304, 16036, 5789, 323, 16475, 304, 19624, 5789, 13, 4740, 264, 7781, 4245, 311, 264, 6996, 315, 11006, 505, 279, 25378, 3331, 12000, 323, 13239, 39725, 11, 279, 20770, 574, 14297, 11798, 304, 6664, 220, 2366, 16, 13, 9540, 26137, 17644, 311, 6638, 1203, 279, 2068, 323, 923, 264, 990, 16686, 1555, 264, 1614, 25543, 28238, 11, 902, 4745, 1306, 279, 3723, 4273, 13814, 7301, 32098, 1202, 8292, 13]


In [81]:
print(tokenizer_bpe.decode(ids1))

2020 Missouri Amendment 2, also called the Medicaid Expansion Initiative, was a ballot measure to amend the Constitution of Missouri to expand Medicaid under the Affordable Care Act. The initiative was on the August 4, 2020, primary ballot and passed with 53.27% of the vote. Following Medicaid expansion initiatives in other states, Republican lawmakers in Nebraska and Utah added work requirements to their states' expansions; supporters aimed to prevent this by proposing state constitutional amendments for future Medicaid expansion initiatives. The measure was supported most in urban areas and opposed in rural areas. After a delay due to a lack of funding from the Missouri General Assembly and resulting litigation, the initiative was slowly implemented in October 2021. Republican lawmakers attempted to roll back the program and add a work requirement through a state constitutional amendment, which failed after the United States Supreme Court prevented its implementation.


In [82]:
text_wiki = """After the 30 September Movement, many members of the Indonesian Communist Party (PKI), as well as several ABRI personnel from East Java, fled to Mbah Suro's hermitage in Nginggil to avoid being arrested by the government.[1] Mbah Suro's followers continued to grow in number. In 1966, the Blora Attorney estimated that the number of Mbah Suro's followers had reached 500,000 people.[2]

Mbah Suro often gave speeches to his followers about the prophecy of the coming of the ratu adil (just king) and instructed them to chant slogans such as “Long live Mbah Suro” and “Long live Sukarno.” His speeches led the New Order authorities to begin monitoring his activities starting in 1966.[3] To prepare for an attack from the New Order government, Mbah Suro formed a hermitage armed forces consisting of two battalions: Banteng Wulung and Banteng Sarinah. Banteng Wulung had 200 personnel, while Banteng Sarinah was composed of 35 women.[4]

The Commander of the IV Military Regional Command/Diponegoro (Kodam Diponegoro) had requested Mbah Suro to shut down his hermitage and also sent an envoy, Srinardi, to persuade him. The government made this effort four times. However, Mbah Suro rejected the request, and his followers assaulted the envoy from the Kodam Diponegoro.[5][6][7]

In November 1966, the ABRI team for Operation Kalong successfully arrested an Islamic studies lecturer in Jakarta, Djaelani, who was attempting to gather remaining BTI cadres in Jakarta to launch a rebellion against the "feudal class." In his statement, he revealed that he had been instructed by a shadow PKI member from Ngawi named Ngabdu, who was living in Mbah Suro's hermitage, to gather the BTI cadres. From Djaelani's confession, it was uncovered that Mbah Suro's hermitage was harboring PKI members. Based on his testimony—along with Mbah Suro's refusal to comply with the orders of the Diponegoro Division Commander—ABRI began planning an attack on Nginggil, which they codenamed Operation Kamtib.[5][8][9]

In planning Operation Kamtib in Nginggil, ABRI deployed troops from Battalions 408, 409, and 410, as well as RPKAD special forces. In addition, ABRI also mobilized support troops from Military Regional Command/Brawijaya which were the Military Area Command 0805/Ngawi and District Military Area Command 0813/Bojonegoro. The operation was led by Commander of District Military Command 0721, Major Srinardi. Meanwhile, Feisal Tanjung led the RPKAD forces.[10][1]
"""

In [18]:
ids_wiki = tokenizer_bpe.encode(text_wiki)
print(ids_wiki[:50])

NameError: name 'text_wiki' is not defined

In [85]:
print(tokenizer_bpe.decode(ids_wiki))

After the 30 September Movement, many members of the Indonesian Communist Party (PKI), as well as several ABRI personnel from East Java, fled to Mbah Suro's hermitage in Nginggil to avoid being arrested by the government.[1] Mbah Suro's followers continued to grow in number. In 1966, the Blora Attorney estimated that the number of Mbah Suro's followers had reached 500,000 people.[2]

Mbah Suro often gave speeches to his followers about the prophecy of the coming of the ratu adil (just king) and instructed them to chant slogans such as “Long live Mbah Suro” and “Long live Sukarno.” His speeches led the New Order authorities to begin monitoring his activities starting in 1966.[3] To prepare for an attack from the New Order government, Mbah Suro formed a hermitage armed forces consisting of two battalions: Banteng Wulung and Banteng Sarinah. Banteng Wulung had 200 personnel, while Banteng Sarinah was composed of 35 women.[4]

The Commander of the IV Military Regional Command/Diponegoro (K

### 2.6 Data sampling with a sliding window

In [86]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [87]:
raw_text[:50]

'I HAD always thought Jack Gisburn rather a cheap g'

In [19]:
enc_text = tokenizer_bpe.encode(raw_text)
print(len(enc_text))

4943


In [20]:
enc_sample = enc_text[:50]

In [21]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]
print(f"x:{x}")
print(f"y:    {y}")

x:[40, 473, 1846, 2744]
y:    [473, 1846, 2744, 3463]


In [22]:
context_size = 10

In [23]:
for i in range(1, context_size + 1):
    content = enc_sample[:i]
    desire = enc_sample[i]
    print(content, "---->", desire)

[40] ----> 473
[40, 473] ----> 1846
[40, 473, 1846] ----> 2744
[40, 473, 1846, 2744] ----> 3463
[40, 473, 1846, 2744, 3463] ----> 7762
[40, 473, 1846, 2744, 3463, 7762] ----> 480
[40, 473, 1846, 2744, 3463, 7762, 480] ----> 285
[40, 473, 1846, 2744, 3463, 7762, 480, 285] ----> 22464
[40, 473, 1846, 2744, 3463, 7762, 480, 285, 22464] ----> 4856
[40, 473, 1846, 2744, 3463, 7762, 480, 285, 22464, 4856] ----> 264


In [24]:
for i in range(1, context_size + 1):
    content = enc_sample[:i]
    desire = enc_sample[i]
    print(tokenizer_bpe.decode(content), "---->", tokenizer_bpe.decode([desire]))

I ---->  H
I H ----> AD
I HAD ---->  always
I HAD always ---->  thought
I HAD always thought ---->  Jack
I HAD always thought Jack ---->  G
I HAD always thought Jack G ----> is
I HAD always thought Jack Gis ----> burn
I HAD always thought Jack Gisburn ---->  rather
I HAD always thought Jack Gisburn rather ---->  a


In [102]:
import sys
def install_pytorch():
    cuda_version = None
    if 'linux' in sys.platform:
        !nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null
        cuda_version = !nvcc --version 2>/dev/null | grep "release" | awk '{print $6}'
    print(f"Python: {sys.version}\nCUDA: {cuda_version if cuda_version else 'Not Found'}")

    # 生成安装命令
    pytorch_command = "pip install torch torchvision torchaudio"
    if cuda_version:
        pytorch_command += " --index-url https://download.pytorch.org/whl/cu118"  # 根据CUDA版本调整
    print(f"\n运行以下命令安装:\n!{pytorch_command}")

install_pytorch()

Python: 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]
CUDA: Not Found

运行以下命令安装:
!pip install torch torchvision torchaudio


In [103]:
!pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.7.1-cp39-cp39-win_amd64.whl (216.0 MB)
     -------------------------------------- 216.0/216.0 MB 9.1 MB/s eta 0:00:00
Collecting torchvision
  Downloading torchvision-0.22.1-cp39-cp39-win_amd64.whl (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 15.6 MB/s eta 0:00:00
Collecting torchaudio
  Downloading torchaudio-2.7.1-cp39-cp39-win_amd64.whl (2.5 MB)
     ---------------------------------------- 2.5/2.5 MB 14.3 MB/s eta 0:00:00
Collecting typing-extensions>=4.10.0
  Downloading typing_extensions-4.14.1-py3-none-any.whl (43 kB)
     ---------------------------------------- 43.9/43.9 kB ? eta 0:00:00
Collecting sympy>=1.13.3
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ---------------------------------------- 6.3/6.3 MB 14.9 MB/s eta 0:00:00
Installing collected packages: typing-extensions, sympy, torch, torchvision, torchaudio
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensio

In [1]:
pip uninstall -y torch torchvision torchaudio

Found existing installation: torch 2.7.1
Uninstalling torch-2.7.1:
  Successfully uninstalled torch-2.7.1
Note: you may need to restart the kernel to use updated packages.


ERROR: Exception:
Traceback (most recent call last):
  File "D:\anaconda\anaconda1\lib\site-packages\pip\_internal\cli\base_command.py", line 167, in exc_logging_wrapper
    status = run_func(*args)
  File "D:\anaconda\anaconda1\lib\site-packages\pip\_internal\commands\uninstall.py", line 103, in run
    uninstall_pathset.commit()
  File "D:\anaconda\anaconda1\lib\site-packages\pip\_internal\req\req_uninstall.py", line 424, in commit
    self._moved_paths.commit()
  File "D:\anaconda\anaconda1\lib\site-packages\pip\_internal\req\req_uninstall.py", line 277, in commit
    save_dir.cleanup()
  File "D:\anaconda\anaconda1\lib\site-packages\pip\_internal\utils\temp_dir.py", line 173, in cleanup
    rmtree(self._path)
  File "D:\anaconda\anaconda1\lib\site-packages\pip\_vendor\tenacity\__init__.py", line 326, in wrapped_f
    return self(f, *args, **kw)
  File "D:\anaconda\anaconda1\lib\site-packages\pip\_vendor\tenacity\__init__.py", line 406, in __call__
    do = self.iter(retry_state=ret

In [2]:
pip uninstall -y torch torchvision torchaudio

Found existing installation: torchvision 0.22.1
Uninstalling torchvision-0.22.1:
  Successfully uninstalled torchvision-0.22.1
Found existing installation: torchaudio 2.7.1
Uninstalling torchaudio-2.7.1:
  Successfully uninstalled torchaudio-2.7.1
Note: you may need to restart the kernel to use updated packages.




In [4]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch
  Downloading https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp39-cp39-win_amd64.whl (2532.2 MB)
     ---------------------------------------- 2.5/2.5 GB 1.2 MB/s eta 0:00:00
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp39-cp39-win_amd64.whl (6.1 MB)
     ---------------------------------------- 6.1/6.1 MB 6.7 MB/s eta 0:00:00
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp39-cp39-win_amd64.whl (4.2 MB)
     ---------------------------------------- 4.2/4.2 MB 6.0 MB/s eta 0:00:00
Collecting sympy==1.13.1
  Downloading https://download.pytorch.org/whl/sympy-1.13.1-py3-none-any.whl (6.2 MB)
     ---------------------------------------- 6.2/6.2 MB 5.8 MB/s eta 0:00:00
Installing collected packages: sympy, torch, torchvision, torchaudio
  Attempting uninstall: sympy
    Found existing insta



In [6]:
import torch

In [7]:
print(torch.cuda.is_available()) 

True


In [8]:
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 4060 Laptop GPU


In [25]:
import torch

In [26]:
from torch.utils.data import Dataset, DataLoader

In [27]:
# 将文本转化为token_id并以tensor形式存储
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)
        
        for i in range(0, len(token_ids) - max_length, stride):  # 建立对应的输入和输出toekn_id tensor
            input_chunk = token_ids[i: i+ max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
            
    def __len__(self):
        return len(self.input_ids)  # 判断长度
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]  # 返回输入和目标tensor中某一行
    

In [28]:
datasetv1 = GPTDatasetV1(raw_text, tokenizer_bpe, 4, 1)

In [29]:
datasetv1.input_ids

[tensor([  40,  473, 1846, 2744]),
 tensor([ 473, 1846, 2744, 3463]),
 tensor([1846, 2744, 3463, 7762]),
 tensor([2744, 3463, 7762,  480]),
 tensor([3463, 7762,  480,  285]),
 tensor([ 7762,   480,   285, 22464]),
 tensor([  480,   285, 22464,  4856]),
 tensor([  285, 22464,  4856,   264]),
 tensor([22464,  4856,   264, 12136]),
 tensor([ 4856,   264, 12136, 35201]),
 tensor([  264, 12136, 35201,   313]),
 tensor([12136, 35201,   313,  4636]),
 tensor([35201,   313,  4636,   264]),
 tensor([ 313, 4636,  264, 1695]),
 tensor([ 4636,   264,  1695, 12637]),
 tensor([  264,  1695, 12637,  3403]),
 tensor([ 1695, 12637,  3403,   313]),
 tensor([12637,  3403,   313,   708]),
 tensor([3403,  313,  708,  433]),
 tensor([313, 708, 433, 574]),
 tensor([708, 433, 574, 912]),
 tensor([ 433,  574,  912, 2294]),
 tensor([  574,   912,  2294, 13051]),
 tensor([  912,  2294, 13051,   311]),
 tensor([ 2294, 13051,   311,   757]),
 tensor([13051,   311,   757,   311]),
 tensor([ 311,  757,  311, 6865]),

In [30]:
datasetv1.__getitem__(3)

(tensor([2744, 3463, 7762,  480]), tensor([3463, 7762,  480,  285]))

In [31]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.encoding_for_model("gpt-4")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) 
    dataloader = DataLoader(
                            dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            drop_last=drop_last, 
                            num_workers=num_workers 
    )
    return dataloader

In [32]:
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

In [128]:
type(dataloader)

torch.utils.data.dataloader.DataLoader

In [33]:
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  473, 1846, 2744]]), tensor([[ 473, 1846, 2744, 3463]])]


In [34]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 473, 1846, 2744, 3463]]), tensor([[1846, 2744, 3463, 7762]])]


In [35]:
dataloader2 = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

In [36]:
data_iter2 = iter(dataloader2)
inputs, targets = next(data_iter2)
print("input: ", inputs)
print("target: ", targets)

input:  tensor([[   40,   473,  1846,  2744],
        [ 3463,  7762,   480,   285],
        [22464,  4856,   264, 12136],
        [35201,   313,  4636,   264],
        [ 1695, 12637,  3403,   313],
        [  708,   433,   574,   912],
        [ 2294, 13051,   311,   757],
        [  311,  6865,   430,    11]])
target:  tensor([[  473,  1846,  2744,  3463],
        [ 7762,   480,   285, 22464],
        [ 4856,   264, 12136, 35201],
        [  313,  4636,   264,  1695],
        [12637,  3403,   313,   708],
        [  433,   574,   912,  2294],
        [13051,   311,   757,   311],
        [ 6865,   430,    11,   304]])


In [37]:
inputs2, targets2 = next(data_iter2)
print("input: ", inputs2)
print("target: ", targets2)

input:  tensor([[  304,   279,  2673,   315],
        [  813, 27025,    11,   568],
        [ 1047, 12504,   813, 19354],
        [   11, 12502,   264,  9257],
        [57896,    11,   323,  9749],
        [ 5678,   304,   264, 47625],
        [  389,   279, 51768, 26919],
        [   13,   320, 27831,   358]])
target:  tensor([[  279,  2673,   315,   813],
        [27025,    11,   568,  1047],
        [12504,   813, 19354,    11],
        [12502,   264,  9257, 57896],
        [   11,   323,  9749,  5678],
        [  304,   264, 47625,   389],
        [  279, 51768, 26919,    13],
        [  320, 27831,   358,  4856]])


### 2.7 Creating token embeddings

将token id进行向量嵌入

In [38]:
input_ids = torch.tensor([2, 3, 5, 1])

In [39]:
vocab_size = 6
output_dim = 3

In [40]:
torch.manual_seed(123)

<torch._C.Generator at 0x1ebf9608d70>

In [41]:
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [44]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [45]:
# 对单个token id进行向量嵌入
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [46]:
# 对多个token id进行向量嵌入
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


### 2.8 Encoding word positions

In [47]:
vocab_size = 50257
output_dim = 256

In [48]:
# 创建嵌入层，vocab_size表示一次处理几个值，output_dim表示每个值变为几维向量
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [49]:
# 将文字数据变为token id
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

In [52]:
dataiter = iter(dataloader)
input_ids, target_ids = next(dataiter)
print("input: ", input_ids)
print("target: ", target_ids)

input:  tensor([[   40,   473,  1846,  2744],
        [ 3463,  7762,   480,   285],
        [22464,  4856,   264, 12136],
        [35201,   313,  4636,   264],
        [ 1695, 12637,  3403,   313],
        [  708,   433,   574,   912],
        [ 2294, 13051,   311,   757],
        [  311,  6865,   430,    11]])
target:  tensor([[  473,  1846,  2744,  3463],
        [ 7762,   480,   285, 22464],
        [ 4856,   264, 12136, 35201],
        [  313,  4636,   264,  1695],
        [12637,  3403,   313,   708],
        [  433,   574,   912,  2294],
        [13051,   311,   757,   311],
        [ 6865,   430,    11,   304]])


In [53]:
# 对每个token id进行向量嵌入，此时每个token id变为256维
token_vector = token_embedding_layer(input_ids)
print(token_vector)

tensor([[[ 0.4913,  1.1239,  1.4588,  ..., -0.3995, -1.8735, -0.1445],
         [-0.8191,  0.2605,  0.5637,  ..., -1.4546,  1.7735, -0.5172],
         [-0.1778, -1.1731, -1.8769,  ...,  0.1934, -0.7914, -1.6395],
         [ 1.0257, -0.8298, -0.8972,  ..., -0.0812, -0.5273,  0.0268]],

        [[ 0.3395, -0.2356,  1.8618,  ...,  0.0056, -0.8215, -1.5837],
         [-1.4556,  0.4396,  1.2542,  ...,  0.4209,  1.2177,  0.3914],
         [ 1.1792, -1.0156, -0.1078,  ..., -0.2914, -0.7526, -0.8195],
         [ 0.7219,  0.5423, -0.7337,  ...,  0.1182, -0.4828, -0.2856]],

        [[-0.6609,  1.0887, -0.0345,  ...,  2.1770, -0.0629, -1.1514],
         [ 1.2575, -0.9567, -1.2624,  ..., -0.5898, -0.7390,  0.6355],
         [-0.2603,  0.8579,  0.9012,  ..., -1.5838,  1.0285,  0.8128],
         [-0.7562, -1.2479,  0.2260,  ...,  1.3723,  0.3643,  0.6230]],

        ...,

        [[ 0.4008,  0.4796, -0.1928,  ..., -1.0423,  0.7868,  0.0060],
         [-0.4998, -0.3685, -0.3767,  ..., -1.2173,  2.56

In [54]:
token_vector.shape

torch.Size([8, 4, 256])

In [56]:
# 考虑相对位置，使用递增序列建立嵌入向量，并分与token id变为的嵌入向量进行加和，由于广播机制，位置向量会分别和每一行token id变为的向量加和
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embedding = pos_embedding_layer(torch.arange(context_length))
print(pos_embedding)

tensor([[ 0.1265, -0.0513,  1.3903,  ...,  0.7732, -0.0831,  0.0133],
        [ 1.8435, -0.4803, -0.6885,  ..., -1.9100, -0.1373,  1.5089],
        [-0.8424,  0.2939, -0.5960,  ...,  1.3296,  0.1291,  0.5257],
        [ 0.0491, -1.4016,  0.1980,  ..., -0.4796, -1.4831, -0.8476]],
       grad_fn=<EmbeddingBackward0>)


In [57]:
print(pos_embedding.shape)

torch.Size([4, 256])


In [58]:
input_embeddings = token_vector + pos_embedding
print(input_embeddings)

tensor([[[ 6.1783e-01,  1.0725e+00,  2.8492e+00,  ...,  3.7365e-01,
          -1.9567e+00, -1.3124e-01],
         [ 1.0244e+00, -2.1971e-01, -1.2481e-01,  ..., -3.3645e+00,
           1.6362e+00,  9.9172e-01],
         [-1.0203e+00, -8.7917e-01, -2.4728e+00,  ...,  1.5230e+00,
          -6.6232e-01, -1.1139e+00],
         [ 1.0748e+00, -2.2315e+00, -6.9911e-01,  ..., -5.6082e-01,
          -2.0104e+00, -8.2086e-01]],

        [[ 4.6606e-01, -2.8699e-01,  3.2521e+00,  ...,  7.7877e-01,
          -9.0464e-01, -1.5704e+00],
         [ 3.8791e-01, -4.0674e-02,  5.6573e-01,  ..., -1.4891e+00,
           1.0804e+00,  1.9002e+00],
         [ 3.3682e-01, -7.2166e-01, -7.0378e-01,  ...,  1.0382e+00,
          -6.2352e-01, -2.9384e-01],
         [ 7.7102e-01, -8.5930e-01, -5.3568e-01,  ..., -3.6143e-01,
          -1.9659e+00, -1.1332e+00]],

        [[-5.3433e-01,  1.0373e+00,  1.3558e+00,  ...,  2.9502e+00,
          -1.4602e-01, -1.1380e+00],
         [ 3.1010e+00, -1.4369e+00, -1.9509e+00,  .

In [59]:
print(input_embeddings.shape)

torch.Size([8, 4, 256])
