In [7]:
! pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.5.15-cp311-cp311-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 42.0/42.0 kB 2.1 MB/s eta 0:00:00
Downloading tiktoken-0.7.0-cp311-cp311-win_amd64.whl (799 kB)
   ---------------------------------------- 0.0/799.0 kB ? eta -:--:--
    --------------------------------------- 10.2/799.0 kB ? eta -:--:--
   -- ------------------------------------ 61.4/799.0 kB 825.8 kB/s eta 0:00:01
   ------- -------------------------------- 143.4/799.0 kB 1.2 MB/s eta 0:00:01
   ------------- -------------------------- 276.5/799.0 kB 1.7 MB/s eta 0:00:01
   -------------------------- ------------- 532.5/799.0 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 799.0/799.0 kB 3.2 MB/s eta 0:00:00
Downloading regex-2024.5.15-cp311

In [72]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.nn.functional as F
import os
import requests

## Get the dataset

In [84]:
if not os.path.exists("dataset/sales_textbook.txt"):
    url="https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/resolve/main/sales_textbook.txt?download=true"
    r=requests.get(url)
    with open("dataset/sales_textbook.txt", "wb") as f:
        f.write(r.content)
with open("dataset/sales_textbook.txt", "r") as f:
    text=f.read()

## Tokenizer化 

In [85]:
import tiktoken
encoding=tiktoken.get_encoding("cl100k_base")

In [86]:
tokenized_text=encoding.encode(text)
tokenized_text=torch.tensor(encoding.encode(text))
max_token_value=tokenized_text.max().item()

In [134]:
for i in range(len(tokenized_text)):
    print(encoding.decode([tokenized_text[i]]))

Chapter
 
1
:
 Building
 Rap
port
 and
 Capt
uring
 Attention


Sub
point
:
 Understanding
 the
 Importance
 of
 Building
 Rap
port


Building
 rapport
 is
 a
 fundamental
 skill
 in
 sales
 that
 cannot
 be
 underestimated
.
 It
 lays
 the
 foundation
 for
 establishing
 a
 connection
 with
 your
 potential
 customers
,
 gaining
 their
 trust
,
 and
 ultimately
 convincing
 them
 to
 make
 a
 purchase
.
 Rap
port
 can
 be
 defined
 as
 a
 harmon
ious
 relationship
 based
 on
 mutual
 understanding
 and
 empathy
.
 When
 you
 build
 rapport
 with
 someone
,
 you
 create
 a
 sense
 of
 familiarity
,
 comfort
,
 and
 shared
 interests
,
 making
 it
 easier
 to
 communicate
 and
 influence
 their
 decision
-making
 process
.

One
 of
 the
 main
 reasons
 why
 building
 rapport
 is
 crucial
 in
 sales
 is
 that
 people
 are
 more
 likely
 to
 buy
 from
 someone
 they
 like
 and
 trust
.
 By
 establishing
 a
 positive
 and
 genuine
 connection
 with
 your
 customers
,
 you
 increase
 their


In [133]:

encoding.decode(tokenized_text.numpy())

'Chapter 1: Building Rapport and Capturing Attention\nSubpoint: Understanding the Importance of Building Rapport\nBuilding rapport is a fundamental skill in sales that cannot be underestimated. It lays the foundation for establishing a connection with your potential customers, gaining their trust, and ultimately convincing them to make a purchase. Rapport can be defined as a harmonious relationship based on mutual understanding and empathy. When you build rapport with someone, you create a sense of familiarity, comfort, and shared interests, making it easier to communicate and influence their decision-making process.\nOne of the main reasons why building rapport is crucial in sales is that people are more likely to buy from someone they like and trust. By establishing a positive and genuine connection with your customers, you increase their confidence in you and your product or service. People want to do business with individuals they feel comfortable with, those who understand their n

In [87]:
max_token_value

100069

In [88]:
print("tokenized text length:",len(tokenized_text))
print("vocabulary length:",len(set(tokenized_text)))

tokenized text length: 77919
vocabulary length: 77919


## Hyperparameters 

In [89]:
batch_size=4
context_length=16
d_model=64
num_head=4

## Dataset split

In [90]:
train_idx=int(len(tokenized_text)*0.9)
train_data=tokenized_text[:train_idx]
validate_data=tokenized_text[train_idx:]

In [91]:
data=train_data
idxs=torch.randint(low=0,high=len(data)-context_length,size=(bacth_size,)) # tensor([42803,  5625, 38462, 45086])
x_batch=torch.stack([data[i:i+context_length] for i in idxs])# 训练x
y_batch=torch.stack([data[i+1:i+context_length+1] for i in idxs])# 训练y x序列整体向后平移一位

In [92]:
print(encoding.decode(x_batch[0].numpy()))

 subpoint aims to equip salespeople with effective strategies to address these objections and position


## Word embedding

In [93]:
input_embedding_lookup_table=nn.Embedding(max_token_value+1,d_model)
print(input_embedding_lookup_table.weight.data)

tensor([[-0.7132,  1.5857,  0.1256,  ...,  2.2428, -0.8335,  0.6856],
        [-1.4438, -0.3636, -0.1870,  ...,  0.2389,  1.1670, -0.6784],
        [ 0.4596,  0.0213,  0.3284,  ..., -0.6699, -0.3219,  0.0105],
        ...,
        [-1.7391,  3.0796,  0.2057,  ..., -2.7255, -0.1962,  1.5509],
        [ 0.4148,  1.1203,  0.8043,  ..., -0.8352,  0.6435,  0.3555],
        [-0.4843,  1.3959, -0.9108,  ..., -0.0525, -1.9090,  0.3182]])


In [94]:
x_batch_embedding=input_embedding_lookup_table(x_batch)
y_batch_embedding=input_embedding_lookup_table(y_batch)

In [95]:
x_batch_embedding.shape

torch.Size([4, 16, 64])

## Position embedding

In [96]:
import math

In [97]:
position_encoding_lookup_table = torch.zeros(context_length, d_model)
position=torch.arange(0,context_length,dtype=torch.float).unsqueeze(1)#扩充一个维度

In [98]:
print(position_encoding_lookup_table.shape)
print(position.shape)

torch.Size([16, 64])
torch.Size([16, 1])


![position embedding formula](./img/position_embedding.png)

In [99]:
div_term=torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))# 分母
position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)# 奇数位置
position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)# 偶数位置
position_encoding_lookup_table = position_encoding_lookup_table.unsqueeze(0).expand(batch_size, -1, -1)

In [100]:
print(position_encoding_lookup_table.shape)

torch.Size([4, 16, 64])


In [101]:
x=x_batch_embedding+position_encoding_lookup_table #（4，16,64）
y=y_batch_embedding+position_encoding_lookup_table
print(x.shape,y.shape)

torch.Size([4, 16, 64]) torch.Size([4, 16, 64])


## Masked Multi-head Attention

![multi-head attention](./img/multi-head%20attention.png)

![attention formula](./img/attention_formula.png)

#### Get Q、K、V

In [102]:
Wq=nn.Linear(d_model,d_model)
Wk=nn.Linear(d_model,d_model)
Wv=nn.Linear(d_model,d_model)

Q=Wq(x)# （4,16,64）
K=Wk(x)
V=Wv(x)

In [103]:
print(Q.shape,K.shape,V.shape) 

torch.Size([4, 16, 64]) torch.Size([4, 16, 64]) torch.Size([4, 16, 64])


#### 多头转换

In [104]:
Q=Q.reshape((batch_size,context_length,num_head,d_model//num_head)).permute(0,2,1,3)
K=K.reshape((batch_size,context_length,num_head,d_model//num_head)).permute(0,2,1,3)
V=V.reshape((batch_size,context_length,num_head,d_model//num_head)).permute(0,2,1,3)

In [105]:
print(Q.shape,K.shape,V.shape)

torch.Size([4, 4, 16, 16]) torch.Size([4, 4, 16, 16]) torch.Size([4, 4, 16, 16])


#### 注意力计算

In [106]:
output=Q@K.transpose(-2,-1)/math.sqrt(d_model//num_head)

#### Mask

In [107]:
mask=torch.triu(torch.ones(context_length,context_length),diagonal=1).bool()
output=output.masked_fill(mask,float('-inf'))
pd.DataFrame(output[0,0].detach().numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.256827,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
1,-0.342454,-1.520618,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
2,-0.434096,0.78084,0.920728,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
3,0.179458,-0.704933,0.274545,0.237548,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
4,-0.206155,0.680781,-0.617827,0.381046,0.017696,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
5,0.325471,-0.075816,0.860106,0.550442,0.518375,0.012939,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
6,0.386611,-1.110908,0.115225,-0.055114,0.024327,-0.186498,-1.089248,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
7,0.26749,-0.650551,0.01558,-0.000103,0.271184,-0.040387,-0.875186,-0.56909,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf
8,0.097901,0.001225,-0.021065,0.411283,0.219961,0.616137,0.25834,0.027239,-0.01249,-inf,-inf,-inf,-inf,-inf,-inf,-inf
9,0.57652,0.69255,0.577323,0.882363,-0.199935,0.473689,0.256325,-0.997746,0.384736,0.086219,-inf,-inf,-inf,-inf,-inf,-inf


#### Softmax

In [108]:
attention_score=F.softmax(output,dim=-1)
print(attention_score.shape)

torch.Size([4, 4, 16, 16])


### Matmul

In [109]:
A=attention_score @ V
print(A.shape)

torch.Size([4, 4, 16, 16])


#### 多头合并

In [110]:
A=A.permute(0,2,1,3).reshape(batch_size,context_length,d_model)
print(A.shape)

torch.Size([4, 16, 64])


#### 线性变换

In [111]:
Wo=nn.Linear(d_model,d_model)
output=Wo(A)
print(output.shape)

torch.Size([4, 16, 64])


#### 残差连接

In [112]:
output=output+x

## Layer Norm

In [113]:
layer_norm=nn.LayerNorm(d_model)
layer_norm_output=layer_norm(output)


## Feed Foward Network

In [114]:
output=nn.Linear(d_model,d_model*4)(layer_norm_output)
output=nn.ReLU()(output)
output=nn.Linear(d_model*4,d_model)(output)
output=output+layer_norm_output

In [116]:
print(output.shape)

torch.Size([4, 16, 64])


## Layer Norm

In [117]:
output=layer_norm(output)

## 线性变换层

In [118]:
output=nn.Linear(d_model,max_token_value+1)(output)
print(output.shape)

torch.Size([4, 16, 100070])


## Softmax

In [119]:
logits=F.softmax(output,dim=-1)
print(logits.shape)

torch.Size([4, 16, 100070])


In [135]:
predict_idx=torch.argmax(logits[0,3]).item()

In [136]:
encoding.decode([predict_idx])

' prized'

![out put](./img/output.png)