In [1]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')

In [2]:
def clean_text(text):
    """Turn raw text into a list of lowercase tokens with English stop words removed.

    Steps, in order: lowercase the text, delete every character that is not an
    ASCII letter or whitespace (so punctuation and digits vanish without
    leaving a space behind), tokenize with NLTK's ``word_tokenize``, and drop
    tokens found in NLTK's English stop-word list.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings in their original order.
    """
    # Lowercase first, then keep only letters and whitespace
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Split into word tokens
    candidates = word_tokenize(letters_only)
    # Filter against a set for O(1) membership tests
    english_stop_words = set(stopwords.words('english'))
    return [tok for tok in candidates if tok not in english_stop_words]
# Read the corpus file into memory and clean it into a flat token list.
# NOTE(review): corpus_path is an absolute, machine-specific path; it has to be
# edited before this cell can run anywhere else.
corpus_path = "/home/jia/PycharmProjects/CS224n/GloVe/text8"
with open(corpus_path, 'r', encoding='utf-8') as f:
    # The whole file is read as one string before cleaning
    raw_text = f.read()
cleaned_tokens = clean_text(raw_text)

In [3]:
# Sanity check: show the first five cleaned tokens
print(cleaned_tokens[:5])

['anarchism', 'originated', 'term', 'abuse', 'first']


In [4]:
from collections import Counter, defaultdict  # defaultdict is used again by the co-occurrence cell

# Count how many times each cleaned token occurs in the corpus.
# Counter does the tallying in one call and, like the previous
# defaultdict(int), yields 0 for a word it has never seen.
word_freq = Counter(cleaned_tokens)

# Keep only occurrences of words that appear at least `min_freq` times.
# Corpus order is preserved because the co-occurrence windows depend on it.
min_freq = 5
filtered_words = [word for word in cleaned_tokens if word_freq[word] >= min_freq]

In [5]:
# Build the vocabulary and the word <-> id lookup tables.
# The unique words are sorted so that every word gets the same id on every
# run: the iteration order of a set of strings depends on per-process hash
# randomization, which would otherwise reshuffle the ids (and therefore the
# embedding rows) each time the kernel restarts.
vocab = sorted(set(filtered_words))
word2id = {word: idx for idx, word in enumerate(vocab)}
id2word = dict(enumerate(vocab))

In [6]:
import numpy as np
from tqdm import tqdm
window_size = 5  # context window radius: words considered on each side of the center
cooccur = defaultdict(lambda: defaultdict(float))
num_tokens = len(filtered_words)

# Treat every token position in turn as the center word
for center_pos in tqdm(range(num_tokens)):
    center_id = word2id[filtered_words[center_pos]]

    # Clip the window to the corpus boundaries
    start = max(0, center_pos - window_size)
    end = min(num_tokens, center_pos + window_size + 1)

    for context_pos in range(start, end):
        if context_pos != center_pos:  # the center word is not its own context
            context_id = word2id[filtered_words[context_pos]]
            # Nearer context words contribute more: weight is 1 / distance
            cooccur[center_id][context_id] += 1.0 / abs(context_pos - center_pos)

# Flatten the nested dict into parallel (i, j, X_ij) lists for a sparse matrix
rows, cols, values = [], [], []
for i, neighbours in cooccur.items():
    for j, x_ij in neighbours.items():
        rows.append(i)
        cols.append(j)
        values.append(x_ij)


100%|███████████████████████████| 10602003/10602003 [01:25<00:00, 124541.85it/s]


In [7]:
# pytorch 版本训练
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import coo_matrix

class GloveDataset(Dataset):
    """Dataset view over the stored entries of a COO-format co-occurrence matrix.

    Each item is one ``(row index, column index, value)`` triple.
    """

    def __init__(self, coo_matrix):
        """Copy the matrix's ``row``, ``col`` and ``data`` arrays into tensors.

        Args:
            coo_matrix: Object exposing ``row``, ``col`` and ``data`` arrays,
                e.g. a ``scipy.sparse.coo_matrix``.
        """
        row_idx, col_idx, counts = coo_matrix.row, coo_matrix.col, coo_matrix.data
        self.rows = torch.LongTensor(row_idx)      # int64 indices
        self.cols = torch.LongTensor(col_idx)      # int64 indices
        self.values = torch.FloatTensor(counts)    # float32 co-occurrence values

    def __len__(self):
        """Return the number of stored entries."""
        return self.values.size(0)

    def __getitem__(self, idx):
        """Return the ``(row, col, value)`` triple at position ``idx``."""
        triple = (self.rows[idx], self.cols[idx], self.values[idx])
        return triple

In [8]:
class GloVePyTorch(nn.Module):
    """GloVe model whose forward pass returns the weighted least-squares loss.

    Holds two embedding tables (center words ``w`` and context words
    ``w_tilde``) and one scalar bias per word for each role.
    """

    def __init__(self, vocab_size, embedding_dim=100, x_max=100, alpha=0.75):
        """Create and initialise the embedding and bias tables.

        Args:
            vocab_size: Number of rows in every table.
            embedding_dim: Width of the word vectors.
            x_max: Co-occurrence value at which the weighting saturates at 1.
            alpha: Exponent of the weighting function.
        """
        super().__init__()
        self.x_max = x_max
        self.alpha = alpha

        # Separate vectors and biases for the center and the context role
        self.w = nn.Embedding(vocab_size, embedding_dim)
        self.w_tilde = nn.Embedding(vocab_size, embedding_dim)
        self.b = nn.Embedding(vocab_size, 1)
        self.b_tilde = nn.Embedding(vocab_size, 1)

        self._init_weights()

    def _init_weights(self):
        """Draw vectors uniformly from [-0.1, 0.1] and set all biases to zero."""
        init_range = 0.1
        for table in (self.w, self.w_tilde):
            table.weight.data.uniform_(-init_range, init_range)
        for bias in (self.b, self.b_tilde):
            bias.weight.data.zero_()

    def forward(self, i, j, X_ij):
        """Compute the mean weighted squared error for a batch of matrix entries.

        Args:
            i: 1-D tensor of center-word indices.
            j: 1-D tensor of context-word indices, same length as ``i``.
            X_ij: Co-occurrence values for each ``(i, j)`` pair; ``log`` is
                taken of them, so they are expected to be positive.

        Returns:
            Scalar tensor: mean over the batch of
            ``f(X_ij) * (w_i . w~_j + b_i + b~_j - log X_ij) ** 2``
            with ``f(x) = min((x / x_max) ** alpha, 1)``.
        """
        # Weighting f(X_ij), capped at 1
        weights = torch.clamp((X_ij / self.x_max) ** self.alpha, max=1.0)

        # Row-wise dot product of the center and context vectors
        dot = (self.w(i) * self.w_tilde(j)).sum(dim=1)
        preds = dot + self.b(i).squeeze() + self.b_tilde(j).squeeze()

        residual = preds - torch.log(X_ij)
        return torch.mean(weights * residual ** 2)

In [9]:
from torch.amp import autocast
def train_glove(coo_mat, vocab_size, device='cuda', batch_size=163840, epochs=50,
                num_workers=16, lr=0.001):
    """Train a GloVePyTorch model on a sparse co-occurrence matrix.

    Mixed-precision training (autocast + gradient scaling) and pinned host
    memory are switched on only when ``device`` is a CUDA device; on any other
    device the loop runs in plain full precision instead of relying on
    CUDA-only components.

    Args:
        coo_mat: Co-occurrence matrix handed to ``GloveDataset``.
        vocab_size: Vocabulary size handed to ``GloVePyTorch``.
        device: Target device, as a string or ``torch.device``.
        batch_size: Number of matrix entries per optimisation step.
        epochs: Number of passes over all entries.
        num_workers: Number of ``DataLoader`` worker processes.
        lr: Adam learning rate.

    Returns:
        The trained ``GloVePyTorch`` model, located on ``device``.
    """
    device = torch.device(device)
    use_cuda = device.type == 'cuda'

    # Data pipeline; pinning host memory only helps host-to-GPU copies
    dataset = GloveDataset(coo_mat)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            num_workers=num_workers, pin_memory=use_cuda)
    num_batches = len(dataloader)

    # Model and optimiser
    model = GloVePyTorch(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # With enabled=False the scaler is a transparent pass-through:
    # scale() returns the loss unchanged and step() just calls optimizer.step()
    scaler = torch.amp.GradScaler('cuda', enabled=use_cuda)

    for epoch in range(epochs):
        total_loss = 0.0
        progress_bar = tqdm(dataloader,
                            desc=f"Epoch {epoch+1}/{epochs}",
                            leave=True)
        for i, j, X_ij in progress_bar:
            i = i.to(device, non_blocking=True)
            j = j.to(device, non_blocking=True)
            X_ij = X_ij.to(device, non_blocking=True)

            # Start every step from cleared gradients
            optimizer.zero_grad(set_to_none=True)

            # The autocast region is a no-op unless training on CUDA
            with autocast(device_type='cuda', dtype=torch.float16, enabled=use_cuda):
                loss = model(i, j, X_ij)

            # Scaled backward pass, optimiser step, then scale-factor update
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        # max(..., 1) keeps the report from dividing by zero on an empty matrix
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss/max(num_batches, 1):.4f}")

    return model

In [10]:
# Assemble the (i, j, X_ij) triples into a square sparse co-occurrence matrix
vocab_size = len(vocab)
coo = coo_matrix((values, (rows, cols)), shape=(vocab_size, vocab_size))
# Train on the first GPU when CUDA is available, otherwise on the CPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = train_glove(coo, vocab_size, device=device)
# Final word vectors: element-wise average of the center and context embeddings
final_embeddings = (model.w.weight.data + model.w_tilde.weight.data) / 2

Epoch 1/50: 100%|█████████████████████████████| 213/213 [07:32<00:00,  2.13s/it]


Epoch 1/50 | Loss: 0.0753


Epoch 2/50: 100%|█████████████████████████████| 213/213 [07:39<00:00,  2.16s/it]


Epoch 2/50 | Loss: 0.0445


Epoch 3/50: 100%|█████████████████████████████| 213/213 [07:34<00:00,  2.13s/it]


Epoch 3/50 | Loss: 0.0306


Epoch 4/50: 100%|█████████████████████████████| 213/213 [07:40<00:00,  2.16s/it]


Epoch 4/50 | Loss: 0.0282


Epoch 5/50: 100%|█████████████████████████████| 213/213 [07:28<00:00,  2.10s/it]


Epoch 5/50 | Loss: 0.0265


Epoch 6/50: 100%|█████████████████████████████| 213/213 [07:42<00:00,  2.17s/it]


Epoch 6/50 | Loss: 0.0247


Epoch 7/50: 100%|█████████████████████████████| 213/213 [07:35<00:00,  2.14s/it]


Epoch 7/50 | Loss: 0.0229


Epoch 8/50: 100%|█████████████████████████████| 213/213 [07:40<00:00,  2.16s/it]


Epoch 8/50 | Loss: 0.0213


Epoch 9/50: 100%|█████████████████████████████| 213/213 [07:39<00:00,  2.16s/it]


Epoch 9/50 | Loss: 0.0199


Epoch 10/50: 100%|████████████████████████████| 213/213 [07:42<00:00,  2.17s/it]


Epoch 10/50 | Loss: 0.0188


Epoch 11/50: 100%|████████████████████████████| 213/213 [07:48<00:00,  2.20s/it]


Epoch 11/50 | Loss: 0.0178


Epoch 12/50: 100%|████████████████████████████| 213/213 [07:54<00:00,  2.23s/it]


Epoch 12/50 | Loss: 0.0170


Epoch 13/50: 100%|████████████████████████████| 213/213 [07:58<00:00,  2.25s/it]


Epoch 13/50 | Loss: 0.0163


Epoch 14/50: 100%|████████████████████████████| 213/213 [07:49<00:00,  2.20s/it]


Epoch 14/50 | Loss: 0.0157


Epoch 15/50: 100%|████████████████████████████| 213/213 [07:53<00:00,  2.22s/it]


Epoch 15/50 | Loss: 0.0152


Epoch 16/50: 100%|████████████████████████████| 213/213 [07:56<00:00,  2.24s/it]


Epoch 16/50 | Loss: 0.0148


Epoch 17/50: 100%|████████████████████████████| 213/213 [07:55<00:00,  2.23s/it]


Epoch 17/50 | Loss: 0.0144


Epoch 18/50: 100%|████████████████████████████| 213/213 [07:59<00:00,  2.25s/it]


Epoch 18/50 | Loss: 0.0141


Epoch 19/50: 100%|████████████████████████████| 213/213 [07:57<00:00,  2.24s/it]


Epoch 19/50 | Loss: 0.0138


Epoch 20/50: 100%|████████████████████████████| 213/213 [07:40<00:00,  2.16s/it]


Epoch 20/50 | Loss: 0.0135


Epoch 21/50: 100%|████████████████████████████| 213/213 [07:40<00:00,  2.16s/it]


Epoch 21/50 | Loss: 0.0133


Epoch 22/50: 100%|████████████████████████████| 213/213 [07:49<00:00,  2.20s/it]


Epoch 22/50 | Loss: 0.0131


Epoch 23/50: 100%|████████████████████████████| 213/213 [07:44<00:00,  2.18s/it]


Epoch 23/50 | Loss: 0.0129


Epoch 24/50: 100%|████████████████████████████| 213/213 [07:42<00:00,  2.17s/it]


Epoch 24/50 | Loss: 0.0128


Epoch 25/50: 100%|████████████████████████████| 213/213 [07:44<00:00,  2.18s/it]


Epoch 25/50 | Loss: 0.0126


Epoch 26/50: 100%|████████████████████████████| 213/213 [07:40<00:00,  2.16s/it]


Epoch 26/50 | Loss: 0.0125


Epoch 27/50: 100%|████████████████████████████| 213/213 [07:34<00:00,  2.14s/it]


Epoch 27/50 | Loss: 0.0124


Epoch 28/50: 100%|████████████████████████████| 213/213 [07:49<00:00,  2.20s/it]


Epoch 28/50 | Loss: 0.0123


Epoch 29/50: 100%|████████████████████████████| 213/213 [07:51<00:00,  2.22s/it]


Epoch 29/50 | Loss: 0.0121


Epoch 30/50: 100%|████████████████████████████| 213/213 [07:44<00:00,  2.18s/it]


Epoch 30/50 | Loss: 0.0120


Epoch 31/50: 100%|████████████████████████████| 213/213 [07:55<00:00,  2.23s/it]


Epoch 31/50 | Loss: 0.0120


Epoch 32/50: 100%|████████████████████████████| 213/213 [07:43<00:00,  2.18s/it]


Epoch 32/50 | Loss: 0.0119


Epoch 33/50: 100%|████████████████████████████| 213/213 [07:38<00:00,  2.15s/it]


Epoch 33/50 | Loss: 0.0118


Epoch 34/50: 100%|████████████████████████████| 213/213 [07:52<00:00,  2.22s/it]


Epoch 34/50 | Loss: 0.0117


Epoch 35/50: 100%|████████████████████████████| 213/213 [07:43<00:00,  2.18s/it]


Epoch 35/50 | Loss: 0.0116


Epoch 36/50: 100%|████████████████████████████| 213/213 [07:45<00:00,  2.18s/it]


Epoch 36/50 | Loss: 0.0116


Epoch 37/50: 100%|████████████████████████████| 213/213 [07:49<00:00,  2.20s/it]


Epoch 37/50 | Loss: 0.0115


Epoch 38/50: 100%|████████████████████████████| 213/213 [07:36<00:00,  2.14s/it]


Epoch 38/50 | Loss: 0.0115


Epoch 39/50: 100%|████████████████████████████| 213/213 [07:40<00:00,  2.16s/it]


Epoch 39/50 | Loss: 0.0114


Epoch 40/50: 100%|████████████████████████████| 213/213 [07:58<00:00,  2.25s/it]


Epoch 40/50 | Loss: 0.0114


Epoch 41/50: 100%|████████████████████████████| 213/213 [07:53<00:00,  2.22s/it]


Epoch 41/50 | Loss: 0.0113


Epoch 42/50: 100%|████████████████████████████| 213/213 [07:46<00:00,  2.19s/it]


Epoch 42/50 | Loss: 0.0113


Epoch 43/50: 100%|████████████████████████████| 213/213 [07:46<00:00,  2.19s/it]


Epoch 43/50 | Loss: 0.0112


Epoch 44/50: 100%|████████████████████████████| 213/213 [07:46<00:00,  2.19s/it]


Epoch 44/50 | Loss: 0.0112


Epoch 45/50: 100%|████████████████████████████| 213/213 [07:43<00:00,  2.18s/it]


Epoch 45/50 | Loss: 0.0111


Epoch 46/50: 100%|████████████████████████████| 213/213 [07:51<00:00,  2.21s/it]


Epoch 46/50 | Loss: 0.0111


Epoch 47/50: 100%|████████████████████████████| 213/213 [07:59<00:00,  2.25s/it]


Epoch 47/50 | Loss: 0.0111


Epoch 48/50: 100%|████████████████████████████| 213/213 [07:45<00:00,  2.19s/it]


Epoch 48/50 | Loss: 0.0110


Epoch 49/50: 100%|████████████████████████████| 213/213 [07:50<00:00,  2.21s/it]


Epoch 49/50 | Loss: 0.0110


Epoch 50/50: 100%|████████████████████████████| 213/213 [07:44<00:00,  2.18s/it]

Epoch 50/50 | Loss: 0.0110





In [11]:
# Write the embeddings to a text file, one word per line: "<word> <v1> <v2> ...".
# The whole matrix is moved to host memory once up front; converting row by
# row inside the loop would issue one device-to-host copy per vocabulary word.
embedding_matrix = final_embeddings.cpu().numpy()
with open("glove_pytorch.txt", "w", encoding="utf-8") as f:
    for word, idx in word2id.items():
        vec = embedding_matrix[idx]
        f.write(f"{word} " + " ".join(map(str, vec)) + "\n")

In [12]:
# import numpy as np
# from collections import defaultdict
# from scipy.sparse import coo_matrix

# class GloVeManual:
#     def __init__(self, vector_dim=100, x_max=100, alpha=0.75, lr=0.05):
#         self.vector_dim = vector_dim  # 词向量维度
#         self.x_max = x_max            # 权重函数截断值
#         self.alpha = alpha            # 权重函数指数
#         self.lr = lr                  # 学习率
        
#     def _init_params(self, vocab_size):
#         # 初始化中心词和上下文词向量
#         self.W = np.random.randn(vocab_size, self.vector_dim) * 0.1
#         self.W_tilde = np.random.randn(vocab_size, self.vector_dim) * 0.1
        
#         # 初始化偏置项
#         self.b = np.zeros(vocab_size)
#         self.b_tilde = np.zeros(vocab_size)
        
#     def _weight_func(self, x):
#         # 定义权重函数 f(x)
#         return np.minimum((x / self.x_max) ** self.alpha, 1.0)
    
#     def fit(self, coo_matrix, epochs=50):
#         rows = coo_matrix.row
#         cols = coo_matrix.col
#         values = coo_matrix.data
        
#         vocab_size = coo_matrix.shape[0]
#         self._init_params(vocab_size)
        
#         for epoch in range(epochs):
#             total_loss = 0.0
#             for idx in np.random.permutation(len(values)):  # 随机顺序遍历
#                 i = rows[idx]
#                 j = cols[idx]
#                 X_ij = values[idx]
                
#                 # 计算预测值
#                 pred = np.dot(self.W[i], self.W_tilde[j]) + self.b[i] + self.b_tilde[j]
#                 log_X = np.log(X_ij)
                
#                 # 计算损失项
#                 loss = pred - log_X
#                 weighted_loss = self._weight_func(X_ij) * loss
#                 total_loss += 0.5 * (weighted_loss * loss)
                
#                 # 计算梯度
#                 grad_W_i = weighted_loss * self.W_tilde[j]
#                 grad_W_tilde_j = weighted_loss * self.W[i]
#                 grad_b_i = weighted_loss
#                 grad_b_tilde_j = weighted_loss
                
#                 # 更新参数
#                 self.W[i] -= self.lr * grad_W_i
#                 self.W_tilde[j] -= self.lr * grad_W_tilde_j
#                 self.b[i] -= self.lr * grad_b_i
#                 self.b_tilde[j] -= self.lr * grad_b_tilde_j
                
#             print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.2f}")
            
#     def get_vectors(self):
#         # 合并中心词和上下文词向量（常见做法）
#         return (self.W + self.W_tilde) / 2
# coo = coo_matrix((values, (rows, cols)), shape=(len(vocab), len(vocab)))
# # 初始化模型
# glove = GloVeManual(vector_dim=100, x_max=100, alpha=0.75, lr=0.05)  # 100 维向量
# glove.fit(coo)
# # 获取最终词向量
# word_vectors = glove.get_vectors()

# # 保存到文件
# with open("glove_manual_vectors.txt", "w", encoding="utf-8") as f:
#     for word in word2id:
#         idx = word2id[word]
#         vec = word_vectors[idx]
#         f.write(f"{word} " + " ".join(map(str, vec)) + "\n")

In [13]:
# Load word vectors
def load_vectors(file_path):
    """Read word vectors from a whitespace-separated text file.

    Every non-blank line is expected to look like ``<word> <v1> <v2> ...``.
    Blank or whitespace-only lines are skipped rather than raising an
    ``IndexError``.

    Args:
        file_path: Path of the UTF-8 encoded vector file.

    Returns:
        Dict mapping each word to a 1-D NumPy float array of its components.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    word_vecs = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # blank line: nothing to parse
            word, *components = parts
            word_vecs[word] = np.array([float(x) for x in components])
    return word_vecs
# Read the exported embeddings back from disk as a {word: vector} dict
vectors = load_vectors("glove_pytorch.txt")
# Compute cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First 1-D vector.
        vec2: Second 1-D vector of the same length.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has zero
        norm the angle is undefined; 0.0 is returned in that case instead of
        performing a 0/0 division that would yield NaN.
    """
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        return 0.0
    return np.dot(vec1, vec2) / denom
# Spot check: similarity between the learned vectors for "king" and "queen"
print(cosine_similarity(vectors['king'], vectors['queen']))

0.6044898999818478
