In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

import ast
import random
from tqdm import tqdm

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# A plain assignment is enough here: a `global` statement at module scope
# has no effect, so it has been dropped.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device,'\n')

# Load the corpus and parse every entry of the `Doc` column from its string
# literal form back into a Python object. `ast.literal_eval` is passed
# directly; wrapping it in a lambda adds nothing.
Docs = pd.read_csv('../dataset/地理语料库_taz.csv')
Docs['Doc'] = Docs['Doc'].apply(ast.literal_eval)
Docs

cuda 



Unnamed: 0,eID,Doc
0,254.0,"[公司, 公司企业, 公司, 住宅区, 住宅区, 休闲场所, 科教文化场所, 政府及社会团体..."
1,258.0,"[体育休闲服务场所, 科教文化场所, 运动场馆, 综合市场, 专卖店, 宾馆酒店, 服装鞋帽..."
2,239.0,"[公共厕所, 旅行社, 旅行社, 公司, 生活服务场所, 便民商店/便利店, 糕饼店, 公司..."
3,255.0,"[汽车维修, 服装鞋帽皮具店, 购物相关场所, 家居建材市场, 住宅区, 科教文化场所, 专..."
4,253.0,"[公检法机构, 公检法机构, 公检法机构, 工商税务机构, 公检法机构, 商务住宅相关, 住..."
...,...,...
1553,977.0,"[公园广场, 公园广场]"
1554,1515.0,"[公共厕所, 购物相关场所]"
1555,1439.0,"[产业园区, 公司, 公司, 公司企业, 汽车维修, 汽车养护/装饰, 公司企业, 金融保险..."
1556,1628.0,"[公共厕所, 公共厕所]"


## GloVe

### 模型架构

In [23]:
class GloVe(nn.Module):
    """GloVe embedding model whose forward pass returns the training loss.

    Each word has a target vector, a context vector and one scalar bias for
    each role. For a batch of (target, context, co-occurrence) triples the
    loss is the mean of ``f(x) * (w_t . w_c + b_t + b_c - log(x)) ** 2`` with
    ``f(x) = min(x / x_max, 1) ** alpha``.

    Args:
        vocab_size: Number of rows in every embedding table.
        embedding_dim: Size of each word vector.
        x_max: Co-occurrence value at which the weighting function saturates
            at 1. Defaults to 100.0, the value previously fixed in `forward`.
        alpha: Exponent of the weighting function. Defaults to 0.75, the
            value previously fixed in `forward`.
    """

    # Class-level fallbacks so that instances created before these became
    # constructor arguments (e.g. restored from a whole-model pickle) still
    # find the attributes.
    x_max = 100.0
    alpha = 0.75

    def __init__(self, vocab_size, embedding_dim, x_max=100.0, alpha=0.75):
        super().__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.target_bias = nn.Embedding(vocab_size, 1)
        self.context_bias = nn.Embedding(vocab_size, 1)
        self.x_max = float(x_max)
        self.alpha = float(alpha)

        nn.init.xavier_uniform_(self.target_embeddings.weight)
        nn.init.xavier_uniform_(self.context_embeddings.weight)
        nn.init.zeros_(self.target_bias.weight)
        nn.init.zeros_(self.context_bias.weight)

    def forward(self, target_idx, context_idx, cooc_val):
        """Return the mean weighted squared error for one batch.

        Args:
            target_idx: Integer tensor of target word indices.
            context_idx: Integer tensor of context word indices, same shape
                as `target_idx`.
            cooc_val: Tensor of co-occurrence values, same shape as the
                index tensors. Any floating dtype is accepted.

        Returns:
            A scalar tensor in the dtype of the embedding weights.
        """
        target_vec = self.target_embeddings(target_idx)
        context_vec = self.context_embeddings(context_idx)
        # Remove only the trailing size-1 axis. A bare squeeze() would also
        # drop the batch axis when the batch holds a single pair.
        target_bias = self.target_bias(target_idx).squeeze(-1)
        context_bias = self.context_bias(context_idx).squeeze(-1)

        # Batches collated from Python floats arrive as float64; cast to the
        # parameter dtype so the loss is not silently promoted.
        cooc_val = cooc_val.to(dtype=target_vec.dtype)

        weight = torch.pow(torch.clamp(cooc_val / self.x_max, max=1.0), self.alpha)

        prediction = torch.sum(target_vec * context_vec, dim=-1) + target_bias + context_bias
        # The small epsilon keeps log() finite should a zero value slip in.
        loss = weight * (prediction - torch.log(cooc_val + 1e-8)) ** 2

        return loss.mean()

### 构建数据集

In [24]:
def BuildVocab(docs, min_count=1):
    """Map every sufficiently frequent word to a contiguous integer index.

    Args:
        docs: Iterable of documents, each an iterable of words.
        min_count: Minimum number of occurrences (over all documents) a word
            needs in order to be kept.

    Returns:
        dict mapping word -> index. Indices run from 0 to len(vocab) - 1 in
        order of first appearance of the kept words.
    """
    word_counts = Counter(word for doc in docs for word in doc)
    # Filter BEFORE numbering: numbering first leaves gaps whenever a word is
    # dropped, and those indices then exceed an embedding table sized by
    # len(vocab).
    kept_words = (word for word, count in word_counts.items() if count >= min_count)
    return {word: index for index, word in enumerate(kept_words)}

def BuildCoMatrix(docs, vocab, window_size=5):
    """Count how often each ordered pair of words shares a context window.

    Args:
        docs: Iterable of documents, each an iterable of words.
        vocab: dict mapping word -> index; words missing from it are skipped
            before windows are formed.
        window_size: Number of positions inspected on each side of a word.

    Returns:
        defaultdict(float) keyed by (word_index, neighbour_index) holding the
        number of times the neighbour appeared within the window of the word.
        Every co-occurrence adds 1.0 regardless of distance.
    """
    pair_counts = defaultdict(float)
    for tokens in docs:
        ids = [vocab[token] for token in tokens if token in vocab]
        length = len(ids)
        for centre, centre_id in enumerate(ids):
            first = max(0, centre - window_size)
            stop = min(length, centre + window_size + 1)
            for position in range(first, stop):
                if position == centre:
                    # A word is not its own context.
                    continue
                pair_counts[(centre_id, ids[position])] += 1.0
    return pair_counts

### 训练模型

In [25]:
def TrainGlove(docs, embedding_dim=70, epochs=30, lr=0.005, batch_size=128,
               min_count=1, window_size=5):
    """Build the vocabulary and co-occurrence counts, then train a GloVe model.

    The model and every batch are moved to the module-level `device`.

    Args:
        docs: Iterable of documents, each an iterable of words.
        embedding_dim: Size of each word vector.
        epochs: Number of passes over the co-occurrence pairs.
        lr: Learning rate handed to the Adam optimizer.
        batch_size: Number of co-occurrence pairs per optimisation step.
        min_count: Forwarded to `BuildVocab`.
        window_size: Forwarded to `BuildCoMatrix`.

    Returns:
        (model, vocab, losses) where `losses` holds, for each epoch, the sum
        of the per-batch mean losses.
    """
    print('正在构建训练数据集...')
    vocab = BuildVocab(docs, min_count)
    print("词汇表大小:", len(vocab))

    cooc_matrix = BuildCoMatrix(docs, vocab, window_size)

    model = GloVe(len(vocab), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # One (target index, context index, count) triple per observed pair.
    triples = [(pair[0], pair[1], count) for pair, count in cooc_matrix.items()]
    loader = DataLoader(triples, batch_size=batch_size, shuffle=True)

    def describe(epoch_number, running_loss):
        # Text shown in front of the progress bar.
        return f"Epoch {epoch_number:03d}/{epochs}, Loss: {running_loss:06.2f}"

    print('开始训练 GloVe！')
    model.train()
    losses = []
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        batches = tqdm(loader, desc=describe(epoch_number, running_loss))

        for targets, contexts, counts in batches:
            targets = targets.to(device)
            contexts = contexts.to(device)
            counts = counts.to(device)

            optimizer.zero_grad()
            batch_loss = model(targets, contexts, counts)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            batches.set_description(describe(epoch_number, running_loss))
        losses.append(running_loss)
    print('训练完成！')
    return model, vocab, losses

In [26]:
# Training configuration; these values are passed to TrainGlove in the next cell.
corpus = Docs['Doc']  # the parsed `Doc` column of the corpus table
embedding_dim = 100   # size of each word vector
epochs = 100          # passes over the co-occurrence pairs
lr = 0.001            # learning rate for the optimizer
batch_size = 512      # co-occurrence pairs per optimisation step
min_count = 1         # minimum word frequency kept in the vocabulary
window_size = 5       # context positions considered on each side of a word

In [27]:
# Train GloVe on the corpus; keeps the trained model, the word -> index
# vocabulary and the list of per-epoch losses.
glove_model, vocab, losses = TrainGlove(
    corpus,
    embedding_dim=embedding_dim,
    epochs=epochs,
    lr=lr,
    batch_size=batch_size,
    min_count=min_count,
    window_size=window_size
)

正在构建训练数据集...
词汇表大小: 187
开始训练 GloVe！


Epoch 001/100, Loss: 371.90: 100%|██████████| 32/32 [00:00<00:00, 158.95it/s]
Epoch 002/100, Loss: 362.64: 100%|██████████| 32/32 [00:00<00:00, 223.39it/s]
Epoch 003/100, Loss: 351.41: 100%|██████████| 32/32 [00:00<00:00, 255.73it/s]
Epoch 004/100, Loss: 335.05: 100%|██████████| 32/32 [00:00<00:00, 264.03it/s]
Epoch 005/100, Loss: 309.75: 100%|██████████| 32/32 [00:00<00:00, 264.01it/s]
Epoch 006/100, Loss: 269.96: 100%|██████████| 32/32 [00:00<00:00, 265.67it/s]
Epoch 007/100, Loss: 214.99: 100%|██████████| 32/32 [00:00<00:00, 265.07it/s]
Epoch 008/100, Loss: 150.87: 100%|██████████| 32/32 [00:00<00:00, 253.33it/s]
Epoch 009/100, Loss: 090.83: 100%|██████████| 32/32 [00:00<00:00, 285.18it/s]
Epoch 010/100, Loss: 047.27: 100%|██████████| 32/32 [00:00<00:00, 298.31it/s]
Epoch 011/100, Loss: 023.19: 100%|██████████| 32/32 [00:00<00:00, 294.22it/s]
Epoch 012/100, Loss: 012.73: 100%|██████████| 32/32 [00:00<00:00, 290.34it/s]
Epoch 013/100, Loss: 008.52: 100%|██████████| 32/32 [00:00<00:00

训练完成！





In [28]:
# torch.save(glove_model, 'model/glove.pth')

### 验证词嵌入

In [29]:
def WordVectors(model, vocab):
    """Return the learned word vectors as a DataFrame, one row per word.

    The vector of a word is the average of its target and context embeddings.

    Args:
        model: Object exposing `target_embeddings.weight` and
            `context_embeddings.weight` tensors of identical shape.
        vocab: dict mapping word -> row index in those weight matrices.

    Returns:
        DataFrame with a leading 'word' column followed by one column per
        embedding dimension (labelled 0, 1, ...), rows ordered by word index.

    Raises:
        IndexError: if `vocab` holds an index outside the embedding table.
    """
    target_vectors = model.target_embeddings.weight.detach().cpu().numpy()
    context_vectors = model.context_embeddings.weight.detach().cpu().numpy()
    word_vectors = (target_vectors + context_vectors) / 2  # average of both roles

    # Label each row through the index stored in the vocabulary rather than
    # through dict iteration order, which silently mislabels rows as soon as
    # the two orders disagree.
    words = sorted(vocab, key=vocab.get)
    row_ids = np.array([vocab[word] for word in words], dtype=int)

    df = pd.DataFrame(word_vectors[row_ids])
    df.insert(0, 'word', words)
    return df

# # Load a previously trained model: initialise the model beforehand and build the vocabulary
# Docs = pd.read_csv('data/地理语料库_taz.csv')
# Docs['Doc'] = Docs['Doc'].apply(lambda x: ast.literal_eval(x))
# vocab = BuildVocab(Docs["Doc"], min_count=1)
# glove_model = torch.load("model/glove.pth", weights_only=False)

# Extract the word vectors and convert them to a DataFrame
glove_model.eval()
df_glove = WordVectors(glove_model, vocab)
df_glove

Unnamed: 0,word,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,公司,0.385676,0.177176,-0.533454,-0.104351,0.045786,0.150309,0.474138,-0.443896,0.204602,...,0.313616,-0.461174,0.397950,-0.526870,0.004920,0.454670,-0.278821,0.332411,0.509298,-0.460606
1,公司企业,0.395900,0.142559,-0.462978,-0.000544,-0.089451,0.090054,0.398962,-0.338764,0.136257,...,0.379780,-0.313333,0.304469,-0.228833,-0.014996,0.233627,-0.184909,0.148499,0.455292,-0.277208
2,住宅区,0.390419,0.319433,-0.303736,0.179057,0.258316,0.245668,0.339413,-0.278839,0.003106,...,0.160676,-0.421593,0.338034,-0.333554,0.166239,0.313421,-0.210750,0.259113,0.303507,-0.176700
3,休闲场所,0.374113,-0.044550,-0.222829,0.217110,0.378088,-0.124316,0.195264,-0.316652,0.158832,...,0.175261,-0.104057,0.289075,-0.177160,0.507687,0.351197,-0.124514,0.139084,0.294106,-0.257657
4,科教文化场所,0.335801,0.115763,-0.216251,0.088354,0.362807,0.052072,0.403911,-0.355441,0.206874,...,0.362471,-0.340785,0.213737,-0.428871,0.082980,0.311348,-0.217055,0.102709,0.359701,-0.327771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,克莱斯勒特约销售,-0.100320,0.034260,-0.167038,-0.048179,-0.042370,-0.112223,0.007506,0.037250,-0.104663,...,0.020050,-0.092914,0.038319,-0.121669,0.074895,0.024270,0.120417,-0.118670,-0.055339,0.012141
183,捷豹特约维修,0.011475,-0.047206,0.062051,-0.141760,0.057270,-0.136282,-0.290929,0.156879,-0.183731,...,0.007205,0.020578,-0.153679,0.174969,-0.033189,-0.023634,0.014573,0.003675,0.092749,-0.025018
184,长安汽车维修,0.131450,-0.055899,-0.124131,-0.060639,-0.022035,0.082018,0.084652,0.075467,-0.024777,...,-0.093495,0.028447,0.076431,0.071002,0.066535,-0.205608,0.049422,0.174820,0.149434,-0.019416
185,纳智捷销售,0.071648,0.030525,0.093708,0.016903,-0.061123,-0.125474,-0.040182,-0.038824,-0.032002,...,-0.097257,-0.046420,0.053926,-0.072398,-0.127124,0.059622,0.048677,-0.005645,0.078620,0.016459


In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as `vec1`.

    Returns:
        The dot product of the two unit-normalised vectors, or 0.0 when
        either vector has zero norm (the similarity is undefined there, and
        dividing by the norm would otherwise yield NaN with a warning).
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vec1 / norm1, vec2 / norm2)


# Words whose embeddings are compared below.
word1 = '金融保险服务机构'
word2 = '公司'
word3 = '风景名胜'
word4 = '风景名胜相关'
word5 = '长安汽车维修'
word6 = '现代特约销售'
df = df_glove.copy()


def _embedding_of(table, word):
    """Return the row(s) of `table` matching `word` as one flat array, without the word column."""
    return table.query(f'word=="{word}"').iloc[:, 1:].values.flatten()


vec1, vec2, vec3, vec4, vec5, vec6 = (
    _embedding_of(df, word)
    for word in (word1, word2, word3, word4, word5, word6)
)

# Each entry: (first label, second label, first vector, second vector).
comparisons = [
    (word2, word1, vec1, vec2),
    (word2, word3, vec3, vec2),
    (word4, word3, vec3, vec4),
    (word5, word2, vec2, vec5),
    (word5, word1, vec1, vec5),
    (word5, word4, vec4, vec5),
    (word5, word6, vec5, vec6),
]
for label_a, label_b, vec_a, vec_b in comparisons:
    print(f"'{label_a}' 与 '{label_b}' 的余弦相似度: {cosine_similarity(vec_a, vec_b):.4f} (GloVe)")

'公司' 与 '金融保险服务机构' 的余弦相似度: 0.8998 (GloVe)
'公司' 与 '风景名胜' 的余弦相似度: 0.6146 (GloVe)
'风景名胜相关' 与 '风景名胜' 的余弦相似度: 0.8300 (GloVe)
'长安汽车维修' 与 '公司' 的余弦相似度: 0.1046 (GloVe)
'长安汽车维修' 与 '金融保险服务机构' 的余弦相似度: 0.0408 (GloVe)
'长安汽车维修' 与 '风景名胜相关' 的余弦相似度: -0.0234 (GloVe)
'长安汽车维修' 与 '现代特约销售' 的余弦相似度: 0.1462 (GloVe)


In [48]:
# Write the word-vector table to CSV without the DataFrame's row index.
df_glove.to_csv('glove_vectors.csv', index=False)