In [105]:
import numpy as np

def attention(Q, K, V, d_k):
    """Scaled dot-product attention: softmax(Q . K^T / sqrt(d_k)) . V.

    Parameters
    ----------
    Q : np.ndarray
        Query matrix; multiplied with ``K.T`` via ``np.dot``.
    K : np.ndarray
        Key matrix; its transpose must be dot-compatible with ``Q``.
    V : np.ndarray
        Value matrix; must be dot-compatible with the attention weights.
    d_k : int or float
        Scaling factor; the scores are divided by ``sqrt(d_k)``.

    Returns
    -------
    output : np.ndarray
        Attention weights multiplied with ``V``.
    attention_weights : np.ndarray
        Softmax of the scaled scores along the last axis (each row sums to 1).
    """
    # step1: calculate similarity dot product between Q and K
    scores = np.dot(Q, K.T) / np.sqrt(d_k)

    # step2: softmax normalization along the last axis.
    # Subtracting the per-row maximum leaves the softmax result unchanged
    # mathematically but keeps np.exp from overflowing to inf (and the
    # division from producing nan) when the scores are large.
    shifted_scores = scores - np.max(scores, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted_scores)
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # step3: alpha dot product V
    output = np.dot(attention_weights, V)

    return output, attention_weights

In [106]:
# Toy inputs for the attention demo: queries and keys are both the 2x2
# identity matrix, so each query matches exactly one key.
Q = np.eye(2, dtype=int)  # 2x2 Query matrix
K = np.eye(2, dtype=int)  # 2x2 Key matrix
V = np.array([
    [1, 2],
    [3, 4],
])  # 2x2 Value matrix
d_k = K.shape[-1]  # Dimension of the Key (for scaling)

In [107]:
# Run scaled dot-product attention on the toy matrices.
output, attention_weights = attention(Q=Q, K=K, V=V, d_k=d_k)

# Report the softmax weights first, then the weighted combination of V.
for _heading, _matrix in (
    ("Attention Weights:\n", attention_weights),
    ("Output after applying attention:\n", output),
):
    print(_heading, _matrix)

Attention Weights:
 [[0.66976155 0.33023845]
 [0.33023845 0.66976155]]
Output after applying attention:
 [[1.6604769 2.6604769]
 [2.3395231 3.3395231]]


In [108]:
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import jieba
import re


In [109]:
# Whitespace and punctuation (ASCII and full-width) to strip from every token.
# Written as a raw string so that "\s", "\.", "\!" and "\/" reach the regex
# engine verbatim instead of relying on Python preserving unknown string
# escapes (which emits a warning on current interpreters). Compiled once here
# rather than being re-parsed for every token inside the loop.
punctuation_pattern = re.compile(
    r"[\s+\.\!\/_,$%^*(+\"\'””《》]+|[+——！，。？、~@#￥%……&*（）：；‘]+"
)

lines = []
# The context manager closes the corpus file even if tokenisation raises.
with open("../dataset/sanguo.txt", 'r', encoding='utf-8') as f:  # read the text
    for line in f:  # tokenise each paragraph separately
        # jieba segmentation (accurate mode), then remove all punctuation
        # from each token.
        stripped_tokens = (
            punctuation_pattern.sub("", token) for token in jieba.lcut(line)
        )
        # Tokens that consisted only of punctuation/whitespace are now empty.
        words = [token for token in stripped_tokens if len(token) > 0]
        if len(words) > 0:
            lines.append(words)
print(lines[0:5])  # preview the tokenisation of the first 5 lines


[['三国演义', '上卷'], ['罗贯中'], ['滚滚', '长江', '东', '逝水', '浪花', '淘尽', '英雄', '是非成败', '转头', '空', '青山', '依旧', '在', '几度', '夕阳红'], ['白发', '渔樵', '江渚上', '惯看', '秋月春风', '一壶', '浊酒', '喜相逢', '古今', '多少', '事', '都', '付笑谈', '中'], ['--', '调寄', '临江仙']]


In [110]:
# Train Word2Vec on the tokenised corpus.
# Parameters: vector_size is the dimensionality of the word vectors; window is
# the width of the context; min_count is the minimum word frequency for a word
# to be taken into account. sg=1 selects the skip-gram architecture and
# negative=10 sets the number of negative samples (per the gensim API).
# NOTE(review): outputs shown below may differ between runs, since training is
# presumably not bit-reproducible with these settings -- confirm.
model = Word2Vec(
    sentences=lines,
    vector_size=20,
    window=2,
    min_count=3,
    epochs=7,
    negative=10,
    sg=1,
)


In [111]:
# Top 20 vocabulary words most related to 孔明.
model.wv.most_similar(
    '孔明',
    topn=20,
)

[('先主', 0.9160001873970032),
 ('玄德', 0.8978016376495361),
 ('使者', 0.8872831463813782),
 ('周瑜', 0.8821434378623962),
 ('关公', 0.8793385028839111),
 ('陆逊', 0.8752183318138123),
 ('心中', 0.8747538924217224),
 ('孙权', 0.8602688312530518),
 ('庞统', 0.8565940856933594),
 ('门吏', 0.8525434136390686),
 ('司马昭', 0.8510890007019043),
 ('袁术', 0.8493541479110718),
 ('二嫂', 0.8472496867179871),
 ('密书', 0.8466708660125732),
 ('鲁肃', 0.8466097116470337),
 ('维', 0.8464145660400391),
 ('魏主', 0.846045732498169),
 ('孙夫人', 0.8451597094535828),
 ('孙策', 0.8408129215240479),
 ('后主', 0.8386663198471069)]

In [112]:
# Look up the trained (un-normalised) word vector for 孔明, then print it.
kongming_vector = model.wv.get_vector('孔明')
print("孔明的词向量：\n", kongming_vector)

孔明的词向量：
 [ 0.03546751 -0.23976095  0.42774808 -0.12599747  0.4775886  -0.49785823
  0.62812436  1.4118224  -0.26054758  0.9620116   0.4333564  -0.18006714
  0.11103736 -0.8641684   0.8370761   0.60063756  0.31567514  0.09167805
 -0.674838   -0.6923421 ]


In [113]:
def _iter_unit_vectors(keyed_vectors):
    """Yield (word, vector / L2-norm) for every word whose vector norm is non-zero."""
    for token in keyed_vectors.index_to_key:
        embedding = keyed_vectors[token]
        length = np.linalg.norm(embedding)
        # A zero-norm vector cannot be normalised; such words are left out.
        if length != 0:
            yield token, embedding / length


# Word vectors of the trained model (only read here, never modified).
word_vectors = model.wv

# New dictionary holding the normalised copy of each word vector.
normalized_vectors = dict(_iter_unit_vectors(word_vectors))

In [114]:
# Show the normalised vector stored for 孔明.
kongming_unit_vector = normalized_vectors['孔明']
print("孔明的归一化词向量：\n", kongming_unit_vector)

孔明的归一化词向量：
 [ 0.01321955 -0.08936434  0.1594314  -0.04696211  0.1780081  -0.18556304
  0.23411618  0.52621824 -0.09711199  0.3585635   0.16152175 -0.06711511
  0.04138614 -0.32209516  0.31199723  0.22387123  0.11765928  0.03417049
 -0.25152743 -0.2580516 ]


In [115]:
# Show the normalised vector stored for 周瑜.
zhouyu_unit_vector = normalized_vectors['周瑜']
print("周瑜的归一化词向量：\n", zhouyu_unit_vector)

周瑜的归一化词向量：
 [ 0.15286243 -0.10232241  0.07909361  0.05580199  0.01053218 -0.04018738
  0.14878486  0.32779312 -0.11989477  0.32229838  0.03918505 -0.04621497
  0.08911306 -0.5541      0.3462584   0.24091484  0.17802759 -0.07364184
 -0.20492955 -0.3626902 ]


In [116]:
# Similarity function used to compare two word vectors.
def calculate_attention(query, key):
    """Return the scaled dot-product score between ``query`` and ``key``.

    The dot product of the two inputs is divided by the square root of the
    length of ``query``'s last axis. No softmax is applied, so the result is
    a raw score rather than a normalised attention weight.
    """
    # np.sqrt (rather than ** 0.5) keeps the result's NumPy dtype promotion
    # the same for float32 inputs.
    scale = np.sqrt(query.shape[-1])
    return np.dot(query, key) / scale

In [117]:
# Scaled dot-product score between the normalised vectors of 孔明 and 周瑜.
calculate_attention(
    query=normalized_vectors['孔明'],
    key=normalized_vectors['周瑜'],
)

0.19725328262457173

In [118]:
# Scaled dot-product score between the normalised vectors of 孔明 and 张飞.
calculate_attention(
    query=normalized_vectors['孔明'],
    key=normalized_vectors['张飞'],
)

0.16912493055189184

In [120]:
# Scaled dot-product score between the normalised vectors of 刘备 and 刘禅.
calculate_attention(
    query=normalized_vectors['刘备'],
    key=normalized_vectors['刘禅'],
)

0.15584850617715132

In [121]:
# Scaled dot-product score between the normalised vectors of 刘备 and 曹操.
calculate_attention(
    query=normalized_vectors['刘备'],
    key=normalized_vectors['曹操'],
)

0.17991281670659112

In [122]:
# Scaled dot-product score between the normalised vectors of 曹操 and 关羽.
calculate_attention(
    query=normalized_vectors['曹操'],
    key=normalized_vectors['关羽'],
)

0.14587914604470847