## 数据预处理
· 数据集采用多语言数据集中的cmn（英文/繁体中文）

· 采用数据集中的英文部分进行实验

· 使用numpy实现了skip gram，CBOW，glove和fastText（作业要求中提及的四种算法）

· 实现了词向量的保存、加载和词/句子相似度计算，并对每个算法进词和句子的相似度计算

· 由于电脑算力一般，为了不浪费时间，只采用了前1000条数据，这样算出的word embadding效果较差，也在情理之中，勉强能用

In [46]:
import numpy as np
import re
import tqdm
# 加载数据集，保留标点符号
def load_dataset(filename, max_lines=1000):
    pairs = []
    with open(filename, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i >= max_lines:
                break
            parts = line.split('\t')
            if len(parts) >= 2:
                en_sentence = parts[0].strip().lower()  # 保留英文标点符号并转为小写
                cn_sentence = parts[1].strip() 
                pairs.append((en_sentence, cn_sentence))
    return pairs
# 创建vocab
def create_vocab(pairs):
    vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
    for en, _ in pairs:
        for word in re.findall(r'\w+|[^\w\s]', en, re.UNICODE):  # 分词时保留标点符号
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

# 数据处理
filename = 'cmn.txt' 
data_pairs = load_dataset(filename)
vocab = create_vocab(data_pairs)

print("Example data pairs:", data_pairs[:5])
print("Vocabulary:", list(vocab.items())[:50])

Example data pairs: [('hi.', '嗨。'), ('hi.', '你好。'), ('run.', '你用跑的。'), ('wait!', '等等！'), ('wait!', '等一下！')]
Vocabulary: [('<PAD>', 0), ('<SOS>', 1), ('<EOS>', 2), ('<UNK>', 3), ('hi', 4), ('.', 5), ('run', 6), ('wait', 7), ('!', 8), ('begin', 9), ('hello', 10), ('i', 11), ('won', 12), ('oh', 13), ('no', 14), ('cheers', 15), ('got', 16), ('it', 17), ('?', 18), ('he', 19), ('ran', 20), ('hop', 21), ('in', 22), ('quit', 23), ("'", 24), ('m', 25), ('ok', 26), ('up', 27), ('listen', 28), ('way', 29), ('really', 30), ('try', 31), ('we', 32), ('why', 33), ('me', 34), ('ask', 35), ('tom', 36), ('awesome', 37), ('be', 38), ('calm', 39), ('fair', 40), ('kind', 41), ('nice', 42), ('call', 43), ('us', 44), ('come', 45), ('get', 46), ('out', 47), ('go', 48), ('away', 49)]


In [47]:
def generate_skipgram_data(pairs, vocab, window_size=6):
    training_data = []
    for en, _ in pairs:
        words = re.findall(r'\w+|[^\w\s]', en, re.UNICODE)  # 分词时保留标点符号
        
        indices = [vocab[word] for word in words]
        for center_i, center_word in enumerate(indices):
            for offset in range(-window_size, window_size + 1):
                context_i = center_i + offset
                if context_i < 0 or context_i >= len(indices) or center_i == context_i:
                    continue
                context_word = indices[context_i]
                training_data.append((center_word, context_word))
    return training_data

# 生成 Skip-gram 训练数据
window_size = 2
training_data = generate_skipgram_data(data_pairs, vocab, window_size)

print("Training data example:", training_data[:5])


Training data example: [(4, 5), (5, 4), (4, 5), (5, 4), (6, 5)]


In [48]:
# Sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Skip-gram 训练函数
def train_skipgram(training_data, vocab_size, embedding_dim=100, iterations=100, learning_rate=0.01, negative_samples=5):
    W_input = np.random.randn(vocab_size, embedding_dim)
    W_output = np.random.randn(vocab_size, embedding_dim)

    for iteration in range(iterations):
        total_loss = 0
        for center_word, context_word in training_data:
            # 正样本
            z = np.dot(W_input[center_word], W_output[context_word])
            p = sigmoid(z)
            loss = -np.log(p)
            total_loss += loss

            # 反向传播
            grad_output = p - 1
            grad_input = grad_output * W_output[context_word]

            W_output[context_word] -= learning_rate * grad_output * W_input[center_word]
            W_input[center_word] -= learning_rate * grad_input

            # 负采样
            for _ in range(negative_samples):
                negative_word = np.random.choice(vocab_size)
                z = np.dot(W_input[center_word], W_output[negative_word])
                p = sigmoid(z)
                loss = -np.log(1 - p)
                total_loss += loss

                grad_output = p
                grad_input = grad_output * W_output[negative_word]

                W_output[negative_word] -= learning_rate * grad_output * W_input[center_word]
                W_input[center_word] -= learning_rate * grad_input

        print(f"Iteration {iteration + 1}, Loss: {total_loss}")

    return W_input

In [49]:
def build_cooccurrence_matrix(pairs, vocab, window_size=5):
    vocab_size = len(vocab)
    cooccurrence_matrix = np.zeros((vocab_size, vocab_size))
    
    for en,_ in pairs:
        
        words = re.findall(r'\w+|[^\w\s]', en, re.UNICODE)
        indices = [vocab[word] for word in words]
        for center_i, center_word in enumerate(indices):
            for offset in range(-window_size, window_size + 1):
                context_i = center_i + offset
                if context_i < 0 or context_i >= len(indices) or center_i == context_i:
                    continue
                context_word = indices[context_i]
                cooccurrence_matrix[center_word, context_word] += 1.0 / abs(offset)
                
    return cooccurrence_matrix

def train_glove(cooccurrence_matrix, embedding_dim=100, iterations=100, learning_rate=0.05, x_max=100, alpha=0.75):
    vocab_size = cooccurrence_matrix.shape[0]
    W = np.random.randn(vocab_size, embedding_dim)
    biases = np.random.randn(vocab_size)
    
    for iteration in range(iterations):
        total_cost = 0
        for i in range(vocab_size):
            for j in range(vocab_size):
                if cooccurrence_matrix[i, j] > 0:
                    X_ij = cooccurrence_matrix[i, j]
                    weight = (X_ij / x_max) ** alpha if X_ij < x_max else 1
                    cost = weight * (np.dot(W[i], W[j]) + biases[i] + biases[j] - np.log(X_ij)) ** 2
                    total_cost += 0.5 * cost

                    grad_Wi = weight * (np.dot(W[i], W[j]) + biases[i] + biases[j] - np.log(X_ij)) * W[j]
                    grad_Wj = weight * (np.dot(W[i], W[j]) + biases[i] + biases[j] - np.log(X_ij)) * W[i]
                    grad_bi = weight * (np.dot(W[i], W[j]) + biases[i] + biases[j] - np.log(X_ij))
                    grad_bj = weight * (np.dot(W[i], W[j]) + biases[i] + biases[j] - np.log(X_ij))

                    W[i] -= learning_rate * grad_Wi
                    W[j] -= learning_rate * grad_Wj
                    biases[i] -= learning_rate * grad_bi
                    biases[j] -= learning_rate * grad_bj

        print(f"Iteration {iteration + 1}, Cost: {total_cost}")
    
    return W

In [50]:
# 保存词嵌入矩阵
def save_embeddings(embeddings, filename):
    np.save(filename, embeddings)

# 加载词嵌入矩阵
def load_embeddings(filename):
    return np.load(filename)

In [51]:
cooccurrence_matrix = build_cooccurrence_matrix(data_pairs, vocab)
glove_embeddings = train_glove(cooccurrence_matrix)

print("GloVe embedding matrix shape:", glove_embeddings.shape)
# 保存 GloVe 词嵌入矩阵
save_embeddings(glove_embeddings, 'glove_embeddings.npy')

Iteration 1, Cost: 6529.397103992768
Iteration 2, Cost: 2549.6716167086265
Iteration 3, Cost: 440.9463573979372
Iteration 4, Cost: 227.76217958417624
Iteration 5, Cost: 158.44262582461403
Iteration 6, Cost: 122.79575797723774
Iteration 7, Cost: 101.99254971048211
Iteration 8, Cost: 88.65929269629595
Iteration 9, Cost: 79.44465450510253
Iteration 10, Cost: 72.68268258996738
Iteration 11, Cost: 67.47738815651215
Iteration 12, Cost: 63.31153527635931
Iteration 13, Cost: 59.86885351216667
Iteration 14, Cost: 56.94715719695931
Iteration 15, Cost: 54.41254557009251
Iteration 16, Cost: 52.173664723042734
Iteration 17, Cost: 50.166527428027514
Iteration 18, Cost: 48.34522611515558
Iteration 19, Cost: 46.67608215656137
Iteration 20, Cost: 45.13386744625902
Iteration 21, Cost: 43.69931044371266
Iteration 22, Cost: 42.35741714008023
Iteration 23, Cost: 41.09631975601388
Iteration 24, Cost: 39.90647360341632
Iteration 25, Cost: 38.78008764756712
Iteration 26, Cost: 37.71071451365065
Iteration 27, 

In [52]:
# 训练 Skip-gram 模型
embedding_dim = 100
iterations = 50
learning_rate = 0.02
negative_samples = 4

skipgram_embeddings = train_skipgram(training_data, len(vocab), embedding_dim, iterations, learning_rate, negative_samples)

# 打印 Skip-gram 嵌入矩阵的形状
print("Skip-gram embedding matrix shape:", skipgram_embeddings.shape)


Iteration 1, Loss: 129184.80876976343


  loss = -np.log(1 - p)


Iteration 2, Loss: inf
Iteration 3, Loss: 60928.8337092693
Iteration 4, Loss: 49125.20822948345
Iteration 5, Loss: 41675.053734827394
Iteration 6, Loss: 36203.97347805432
Iteration 7, Loss: 31375.213737049296
Iteration 8, Loss: 27953.33845185403
Iteration 9, Loss: 25465.832723627118
Iteration 10, Loss: 22369.154036273983
Iteration 11, Loss: 20268.545966687037
Iteration 12, Loss: 18755.66236088865
Iteration 13, Loss: 17077.218710261808
Iteration 14, Loss: 16043.09091301601
Iteration 15, Loss: 15213.019004726613
Iteration 16, Loss: 14553.044094374533
Iteration 17, Loss: 13771.482131854142
Iteration 18, Loss: 13163.274641838121
Iteration 19, Loss: 12627.254945445935
Iteration 20, Loss: 12114.16190893994
Iteration 21, Loss: 11786.40374887398
Iteration 22, Loss: 11330.424397664845
Iteration 23, Loss: 11154.457431184816
Iteration 24, Loss: 10752.69734976888
Iteration 25, Loss: 10638.478248673335
Iteration 26, Loss: 10383.978334345036
Iteration 27, Loss: 10326.861779207149
Iteration 28, Loss:

In [53]:
save_embeddings(skipgram_embeddings, 'skipgram_embeddings_small.npy')

In [57]:
from tqdm.notebook import tqdm  

tokenized_sentences = [re.findall(r'\w+|[^\w\s]', en, re.UNICODE) for en, _ in data_pairs]


def generate_cbow_training_data(tokenized_sentences, window_size=2):
    training_data = []
    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):
            target_word_idx = vocab[word]
            context_start = max(0, i - window_size)
            context_end = min(len(sentence), i + window_size + 1)
            context_words = [vocab[sentence[j]] for j in range(context_start, context_end) if j != i]
            if len(context_words) > 0:
                training_data.append((context_words, target_word_idx))
    return np.array(training_data)


In [58]:
class CBOW:
    def __init__(self, vocab_size, embedding_dim):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.w1 = np.random.rand(vocab_size, embedding_dim)
        self.w2 = np.random.rand(embedding_dim, vocab_size)

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def train(self, training_data, epochs, learning_rate):
        for epoch in range(epochs):
            loss = 0
            for context_words_idx, target_word_idx in tqdm(training_data, desc=f"Epoch {epoch+1}/{epochs}", leave=False):
                h = np.mean(self.w1[context_words_idx], axis=0)
                u = np.dot(self.w2.T, h)
                y_pred = self.softmax(u)
                e = y_pred
                e[target_word_idx] -= 1
                self.w1[context_words_idx] -= learning_rate * np.dot(self.w2, e).reshape(1, -1) / len(context_words_idx)
                self.w2 -= learning_rate * np.outer(h, e)
                loss += -np.log(y_pred[target_word_idx])
            if epoch % 10 == 0:
                print(f'Epoch: {epoch}, Loss: {loss}')


In [60]:
cbow_training_data = generate_cbow_training_data(tokenized_sentences)

embedding_dim = 50
cbow_model = CBOW(len(vocab), embedding_dim)
cbow_model.train(cbow_training_data, epochs=100, learning_rate=0.01)

cbow_word_embeddings = cbow_model.w1
print(cbow_word_embeddings)

  return np.array(training_data)


Epoch 1/100:   0%|          | 0/4078 [00:00<?, ?it/s]

  loss += -np.log(y_pred[target_word_idx])


Epoch: 0, Loss: nan


Epoch 2/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 3/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 4/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 5/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 6/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 7/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 8/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 9/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 10/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 11/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 10, Loss: nan


Epoch 12/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 13/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 14/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 15/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 16/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 17/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 18/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 19/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 20/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 21/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 20, Loss: nan


Epoch 22/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 23/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 24/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 25/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 26/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 27/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 28/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 29/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 30/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 31/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 30, Loss: nan


Epoch 32/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 33/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 34/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 35/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 36/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 37/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 38/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 39/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 40/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 41/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 40, Loss: nan


Epoch 42/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 43/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 44/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 45/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 46/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 47/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 48/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 49/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 50/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 51/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 50, Loss: nan


Epoch 52/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 53/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 54/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 55/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 56/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 57/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 58/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 59/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 60/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 61/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 60, Loss: nan


Epoch 62/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 63/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 64/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 65/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 66/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 67/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 68/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 69/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 70/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 71/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 70, Loss: nan


Epoch 72/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 73/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 74/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 75/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 76/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 77/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 78/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 79/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 80/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 81/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 80, Loss: nan


Epoch 82/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 83/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 84/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 85/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 86/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 87/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 88/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 89/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 90/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 91/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch: 90, Loss: nan


Epoch 92/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 93/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 94/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 95/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 96/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 97/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 98/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 99/100:   0%|          | 0/4078 [00:00<?, ?it/s]

Epoch 100/100:   0%|          | 0/4078 [00:00<?, ?it/s]

[[ 0.34204738  0.50871633  0.95809491 ...  0.03879069  0.96232111
   0.54860251]
 [ 0.35353445  0.29523667  0.64176126 ...  0.7314443   0.96386474
   0.95476344]
 [ 0.58760124  0.98951303  0.23486261 ...  0.57312616  0.28429103
   0.49596012]
 ...
 [ 0.87961555  0.40379252  0.16475303 ...  0.21165309  0.53835406
   0.65874318]
 [ 0.28871213  0.76753156  1.05534213 ...  0.13192158  0.27054625
   0.52804656]
 [ 0.82142559  0.65891681 -0.20109087 ...  0.18597281  0.27298787
   0.65136454]]


In [27]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)
# 获取词向量
def get_word_vector(word, vocab, embeddings):
    index = vocab.get(word, vocab['<UNK>'])
    return embeddings[index]
# 去除标点符号的函数
def preprocess_sentence(sentence):
    return re.sub(r'[^\w\s]', '', sentence)

def get_similarity(word1, word2, vocab, embeddings):
    vector1 = get_word_vector(word1, vocab, embeddings)
    vector2 = get_word_vector(word2, vocab, embeddings)

    return cosine_similarity(vector1, vector2)

In [28]:
# loaded_embeddings = load_embeddings('glove_embeddings.npy')
loaded_embeddings = load_embeddings('skipgram_embeddings.npy')


# 读取预训练embdding
print("Loaded embedding matrix shape:", loaded_embeddings.shape)

Loaded embedding matrix shape: (663, 100)


In [63]:
reverse_vocabulary = {i: word for word, i in vocab.items()}
reverse_vocabulary

{0: '<PAD>',
 1: '<SOS>',
 2: '<EOS>',
 3: '<UNK>',
 4: 'hi',
 5: '.',
 6: 'run',
 7: 'wait',
 8: '!',
 9: 'begin',
 10: 'hello',
 11: 'i',
 12: 'won',
 13: 'oh',
 14: 'no',
 15: 'cheers',
 16: 'got',
 17: 'it',
 18: '?',
 19: 'he',
 20: 'ran',
 21: 'hop',
 22: 'in',
 23: 'quit',
 24: "'",
 25: 'm',
 26: 'ok',
 27: 'up',
 28: 'listen',
 29: 'way',
 30: 'really',
 31: 'try',
 32: 'we',
 33: 'why',
 34: 'me',
 35: 'ask',
 36: 'tom',
 37: 'awesome',
 38: 'be',
 39: 'calm',
 40: 'fair',
 41: 'kind',
 42: 'nice',
 43: 'call',
 44: 'us',
 45: 'come',
 46: 'get',
 47: 'out',
 48: 'go',
 49: 'away',
 50: 'home',
 51: 'goodbye',
 52: 'hang',
 53: 'on',
 54: 'came',
 55: 'runs',
 56: 'help',
 57: 'hit',
 58: 'hold',
 59: 'hug',
 60: 'agree',
 61: 'ill',
 62: 'sad',
 63: 'wet',
 64: 's',
 65: 'join',
 66: 'keep',
 67: 'kiss',
 68: 'perfect',
 69: 'see',
 70: 'you',
 71: 'shut',
 72: 'skip',
 73: 'take',
 74: 'wake',
 75: 'wash',
 76: 'know',
 77: 'welcome',
 78: 'who',
 79: 'not',
 80: 'win',
 81

In [68]:
def generate_fasttext_training_data(tokenized_sentences, window_size=2):
    training_data = []
    for sentence in tokenized_sentences:
        for i, word in enumerate(sentence):
            target_word_idx = vocab[word]
            context_start = max(0, i - window_size)
            context_end = min(len(sentence), i + window_size + 1)
            context_words = [vocab[sentence[j]] for j in range(context_start, context_end) if j != i]
            for context_word_idx in context_words:
                training_data.append((target_word_idx, context_word_idx))
    return np.array(training_data)


class FastText:
    def __init__(self, vocab_size, embedding_dim, n_grams=2):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_grams = n_grams
        self.word_embeddings = np.random.rand(vocab_size, embedding_dim)
        self.ngram_embeddings = np.random.rand(vocab_size * n_grams, embedding_dim)

    def get_ngrams(self, word):
        ngrams = [word[i:i+self.n_grams] for i in range(len(word)-self.n_grams+1)]
        return ngrams

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def train(self, training_data, epochs, learning_rate):
        for epoch in range(epochs):
            loss = 0
            for target_word_idx, context_word_idx in tqdm(training_data, desc=f"Epoch {epoch+1}/{epochs}", leave=False):
                ngrams = self.get_ngrams(reverse_vocabulary[target_word_idx])
                ngram_idxs = [vocab.get(ngram, -1) for ngram in ngrams]
                ngram_idxs = [idx for idx in ngram_idxs if idx != -1]

                if len(ngram_idxs) == 0:
                    continue

                h = np.mean(np.vstack([self.word_embeddings[target_word_idx]] + [self.ngram_embeddings[idx] for idx in ngram_idxs]), axis=0)
                u = np.dot(h, self.word_embeddings.T)
                y_pred = self.softmax(u)
                e = y_pred
                e[context_word_idx] -= 1

                self.word_embeddings[target_word_idx] -= learning_rate * np.dot(e, self.word_embeddings)
                for idx in ngram_idxs:
                    self.ngram_embeddings[idx] -= learning_rate * np.dot(e, self.word_embeddings)

                loss += -np.log(y_pred[context_word_idx])

            if epoch % 10 == 0:
                print(f'Epoch: {epoch}, Loss: {loss}')

fasttext_training_data = generate_fasttext_training_data(tokenized_sentences)

# Initialize and train the fastText model
fasttext_model = FastText(len(vocab), embedding_dim)
fasttext_model.train(fasttext_training_data, epochs=100, learning_rate=0.01)

# Get the word embeddings
fasttext_word_embeddings = fasttext_model.word_embeddings
print(fasttext_word_embeddings)

Epoch 1/100:   0%|          | 0/10312 [00:00<?, ?it/s]

  loss += -np.log(y_pred[context_word_idx])


Epoch: 0, Loss: nan


Epoch 2/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 3/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 4/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 5/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 6/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 7/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 8/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 9/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 10/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 11/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 10, Loss: nan


Epoch 12/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 13/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 14/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 15/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 16/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 17/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 18/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 19/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 20/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 21/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 20, Loss: nan


Epoch 22/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 23/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 24/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 25/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 26/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 27/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 28/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 29/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 30/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 31/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 30, Loss: nan


Epoch 32/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 33/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 34/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 35/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 36/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 37/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 38/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 39/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 40/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 41/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 40, Loss: nan


Epoch 42/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 43/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 44/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 45/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 46/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 47/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 48/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 49/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 50/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 51/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 50, Loss: nan


Epoch 52/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 53/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 54/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 55/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 56/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 57/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 58/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 59/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 60/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 61/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 60, Loss: nan


Epoch 62/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 63/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 64/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 65/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 66/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 67/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 68/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 69/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 70/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 71/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 70, Loss: nan


Epoch 72/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 73/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 74/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 75/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 76/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 77/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 78/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 79/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 80/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 81/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 80, Loss: nan


Epoch 82/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 83/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 84/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 85/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 86/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 87/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 88/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 89/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 90/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 91/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch: 90, Loss: nan


Epoch 92/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 93/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 94/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 95/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 96/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 97/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 98/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 99/100:   0%|          | 0/10312 [00:00<?, ?it/s]

Epoch 100/100:   0%|          | 0/10312 [00:00<?, ?it/s]

[[ 0.16314414  0.00705658  0.26132478 ...  0.59983461  0.29876284
   0.11342668]
 [ 0.12116813  0.53261968  0.67174751 ...  0.12554508  0.77882146
   0.02151494]
 [ 0.48978431  0.3457182   0.29117078 ...  0.07718073  0.60555994
   0.96811756]
 ...
 [ 0.5064201   0.845945    0.31978981 ...  0.48814942  0.40642158
  -0.23922189]
 [ 0.3736361   0.58903552  0.54320634 ...  0.32565591  0.03653038
   0.066884  ]
 [ 0.15028321  1.24396811  0.51601879 ...  0.61647634  0.78923081
  -0.2176151 ]]


评估 skipgram_embeddings

In [29]:
# 计算词向量相似度
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# 获取词向量
def get_word_vector(word, vocab, embeddings):
    index = vocab.get(word, vocab['<UNK>'])
    return embeddings[index]

# 计算句子的嵌入表示
def get_sentence_embedding(sentence, vocab, embeddings):
    words = sentence.split()
    vectors = [get_word_vector(word, vocab, embeddings) for word in words if word in vocab]
    if len(vectors) == 0:
        return np.zeros(embeddings.shape[1])  # 如果句子中没有已知词，则返回零向量
    return np.mean(vectors, axis=0)  # 使用向量平均值表示句子

# 示例句子
sentence1 = 'What are they saying ?'  
sentence2 = 'How old are you ?'  
sentence3 = 'Your dog is very big .'  

# 获取句子的嵌入表示
sentence_embedding1 = get_sentence_embedding(sentence1, vocab, skipgram_embeddings)
sentence_embedding2 = get_sentence_embedding(sentence2, vocab, skipgram_embeddings)
sentence_embedding3 = get_sentence_embedding(sentence3, vocab, skipgram_embeddings)

# 计算句子之间的相似度
similarity12 = cosine_similarity(sentence_embedding1, sentence_embedding2)
similarity13 = cosine_similarity(sentence_embedding1, sentence_embedding3)
print(f"Similarity between '{sentence1}' and '{sentence2}':", similarity12)
print(f"Similarity between '{sentence1}' and '{sentence3}':", similarity13)

Similarity between 'What are they saying ?' and 'How old are you ?': 0.7147349897445243
Similarity between 'What are they saying ?' and 'Your dog is very big .': 0.36901023775191305


In [31]:
word1 = 'them'  
word2 = 'they'  

similarity = get_similarity(word1,word2, vocab, skipgram_embeddings)
print(f"Similarity between '{word1}' and '{word2}':", similarity)


Similarity between 'them' and 'they': 0.12963637191820188


评估 cbow_word_embeddings

In [61]:
sentence1 = 'What are they saying ?' 
sentence2 = 'How old are you ?'  
sentence3 = 'Your dog is very big .'  

# 获取句子的嵌入表示
sentence_embedding1 = get_sentence_embedding(sentence1, vocab, cbow_word_embeddings)
sentence_embedding2 = get_sentence_embedding(sentence2, vocab, cbow_word_embeddings)
sentence_embedding3 = get_sentence_embedding(sentence3, vocab, cbow_word_embeddings)

# 计算句子之间的相似度
similarity12 = cosine_similarity(sentence_embedding1, sentence_embedding2)
similarity13 = cosine_similarity(sentence_embedding1, sentence_embedding3)
print(f"Similarity between '{sentence1}' and '{sentence2}':", similarity12)
print(f"Similarity between '{sentence1}' and '{sentence3}':", similarity13)

word1 = 'hi'  
word2 = 'hello'  

similarity = get_similarity(word1,word2, vocab, cbow_word_embeddings)
print(f"Similarity between '{word1}' and '{word2}':", similarity)

Similarity between 'What are they saying ?' and 'How old are you ?': 0.7930726730937743
Similarity between 'What are they saying ?' and 'Your dog is very big .': 0.25281509968431753
Similarity between 'hi' and 'hello': 0.6586488366950162


评估 glove_embeddings

In [42]:
sentence1 = 'What are they saying ?'  
sentence2 = 'How old are you ?'  
sentence3 = 'Your dog is very big .' 

# 获取句子的嵌入表示
sentence_embedding1 = get_sentence_embedding(sentence1, vocab, glove_embeddings)
sentence_embedding2 = get_sentence_embedding(sentence2, vocab, glove_embeddings)
sentence_embedding3 = get_sentence_embedding(sentence3, vocab, glove_embeddings)

# 计算句子之间的相似度
similarity12 = cosine_similarity(sentence_embedding1, sentence_embedding2)
similarity13 = cosine_similarity(sentence_embedding1, sentence_embedding3)
print(f"Similarity between '{sentence1}' and '{sentence2}':", similarity12)
print(f"Similarity between '{sentence1}' and '{sentence3}':", similarity13)

word1 = 'hi'  
word2 = 'hello'  

similarity = get_similarity(word1,word2, vocab, glove_embeddings)
print(f"Similarity between '{word1}' and '{word2}':", similarity)

Similarity between 'What are they saying ?' and 'How old are you ?': 0.3448355337818463
Similarity between 'What are they saying ?' and 'Your dog is very big .': 0.0007248940616008282
Similarity between 'hi' and 'hello': 0.10990319975967086


评估 fasttext_word_embeddings

In [69]:
sentence1 = 'What are they saying ?'  
sentence2 = 'How old are you ?'  
sentence3 = 'Your dog is very big .'  
# 获取句子的嵌入表示
sentence_embedding1 = get_sentence_embedding(sentence1, vocab, fasttext_word_embeddings)
sentence_embedding2 = get_sentence_embedding(sentence2, vocab, fasttext_word_embeddings)
sentence_embedding3 = get_sentence_embedding(sentence3, vocab, fasttext_word_embeddings)

# 计算句子之间的相似度
similarity12 = cosine_similarity(sentence_embedding1, sentence_embedding2)
similarity13 = cosine_similarity(sentence_embedding1, sentence_embedding3)
print(f"Similarity between '{sentence1}' and '{sentence2}':", similarity12)
print(f"Similarity between '{sentence1}' and '{sentence3}':", similarity13)

word1 = 'hi'  
word2 = 'hello'  

similarity = get_similarity(word1,word2, vocab, fasttext_word_embeddings)
print(f"Similarity between '{word1}' and '{word2}':", similarity)

Similarity between 'What are they saying ?' and 'How old are you ?': 0.9483458430131255
Similarity between 'What are they saying ?' and 'Your dog is very big .': 0.830820885289421
Similarity between 'hi' and 'hello': 0.4343950532804011
