In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

import ast
import random
from tqdm import tqdm

# Pick the compute device once; later cells read this module-level name.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device,'\n')

# Load the corpus table. The 'Doc' column is read as text and parsed back
# into Python objects with ast.literal_eval (safe literal parsing, no eval()).
Docs = pd.read_csv('../dataset/地理语料库_taz.csv')
Docs['Doc'] = Docs['Doc'].apply(ast.literal_eval)
Docs

cuda 



Unnamed: 0,eID,Doc
0,254.0,"[公司, 公司企业, 公司, 住宅区, 住宅区, 休闲场所, 科教文化场所, 政府及社会团体..."
1,258.0,"[体育休闲服务场所, 科教文化场所, 运动场馆, 综合市场, 专卖店, 宾馆酒店, 服装鞋帽..."
2,239.0,"[公共厕所, 旅行社, 旅行社, 公司, 生活服务场所, 便民商店/便利店, 糕饼店, 公司..."
3,255.0,"[汽车维修, 服装鞋帽皮具店, 购物相关场所, 家居建材市场, 住宅区, 科教文化场所, 专..."
4,253.0,"[公检法机构, 公检法机构, 公检法机构, 工商税务机构, 公检法机构, 商务住宅相关, 住..."
...,...,...
1553,977.0,"[公园广场, 公园广场]"
1554,1515.0,"[公共厕所, 购物相关场所]"
1555,1439.0,"[产业园区, 公司, 公司, 公司企业, 汽车维修, 汽车养护/装饰, 公司企业, 金融保险..."
1556,1628.0,"[公共厕所, 公共厕所]"


## Doc2vec

### 模型架构

In [13]:
class DM(nn.Module):
    """Distributed-memory style Doc2Vec model.

    The averaged context-word embedding is added to the document embedding
    and projected to vocabulary logits.

    Args:
        vocab_size: number of rows in the word embedding table and number of
            output logits.
        doc_size: number of rows in the document embedding table.
        embedding_dim: width of both embedding tables.
    """

    def __init__(self, vocab_size, doc_size, embedding_dim):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, embedding_dim)
        self.doc_emb = nn.Embedding(doc_size, embedding_dim)
        self.out = nn.Linear(embedding_dim, vocab_size)
        nn.init.xavier_uniform_(self.word_emb.weight)
        nn.init.xavier_uniform_(self.doc_emb.weight)

    def forward(self, doc_id, context_ids, context_mask=None):
        """Compute vocabulary logits for the target word.

        Args:
            doc_id: LongTensor of document indices, one per sample.
            context_ids: LongTensor of context word indices; dimension 1 is
                the context axis that gets averaged (expected shape
                ``(batch, context_len)``).
            context_mask: optional tensor with the same shape as
                ``context_ids``; non-zero / True marks real context words and
                zero / False marks padding. When omitted, every position is
                averaged (the original behaviour), which means padded
                positions contribute the embedding of whatever index was used
                for padding.

        Returns:
            Tensor of logits with one row per sample and ``vocab_size`` columns.
        """
        context_emb = self.word_emb(context_ids)
        if context_mask is None:
            context_vec = context_emb.mean(dim=1)
        else:
            # Masked mean: padded slots are zeroed out and excluded from the
            # denominator, so short contexts are not diluted by padding.
            weights = context_mask.to(context_emb.dtype).unsqueeze(-1)
            # clamp avoids 0/0 for a row whose context is entirely padding.
            counts = weights.sum(dim=1).clamp(min=1.0)
            context_vec = (context_emb * weights).sum(dim=1) / counts
        doc_vec = self.doc_emb(doc_id)
        hidden = context_vec + doc_vec
        return self.out(hidden)


class DBOW(nn.Module):
    """Distributed bag-of-words style Doc2Vec model.

    A document embedding is projected straight to vocabulary logits; there is
    no word embedding table in this variant.

    Args:
        vocab_size: number of output logits.
        doc_size: number of rows in the document embedding table.
        embedding_dim: width of the document embedding.
    """

    def __init__(self, vocab_size, doc_size, embedding_dim):
        super().__init__()
        # Creation order (embedding, linear, then Xavier re-init) is kept so
        # that seeded initialisation yields the same weights.
        doc_table = nn.Embedding(doc_size, embedding_dim)
        projection = nn.Linear(embedding_dim, vocab_size)
        nn.init.xavier_uniform_(doc_table.weight)
        self.doc_emb = doc_table
        self.out = projection

    def forward(self, doc_id):
        """Return vocabulary logits for each document index in ``doc_id``."""
        return self.out(self.doc_emb(doc_id))

### 构建数据集

In [14]:
class Doc2VecDataset(Dataset):
    """Training samples for the DM / DBOW models.

    Args:
        docs: iterable of documents, each an iterable of tokens.
        vocab: iterable of known tokens (iterating a dict yields its keys);
            a token's id is its position in that iteration order.
        window_size: context radius. In "dm" mode it is used as-is; in any
            other mode a radius is drawn per position from
            ``random.randint(1, window_size)``.
        model_type: "dm" yields ``(doc_id, context_ids, target_id)`` samples;
            any other value yields ``(doc_id, context_word_id)`` samples.
    """

    def __init__(self, docs, vocab, window_size=5, model_type="dm"):
        self.data = []
        self.model_type = model_type
        self.vocab = vocab
        self.word2id = dict((token, idx) for idx, token in enumerate(vocab))

        use_dm = model_type == "dm"
        for doc_id, doc in enumerate(docs):
            # Tokens missing from the vocabulary are dropped before windowing.
            ids = [self.word2id[token] for token in doc if token in self.word2id]
            n_ids = len(ids)
            for pos in range(n_ids):
                radius = window_size if use_dm else random.randint(1, window_size)
                lo = max(0, pos - radius)
                # Clamp at 0 so a negative bound cannot turn into a
                # from-the-end slice; this keeps the range()-based semantics.
                hi = max(0, min(n_ids, pos + radius + 1))
                neighbours = ids[lo:pos] + ids[pos + 1:hi]
                if use_dm:
                    if neighbours:
                        self.data.append((doc_id, neighbours, ids[pos]))
                else:
                    self.data.extend((doc_id, other) for other in neighbours)

    def __len__(self):
        """Number of training samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample tuple stored at position ``idx``."""
        return self.data[idx]
    
def BuildVocab(docs, min_count=1):
    """Build a word -> id vocabulary from tokenised documents.

    Args:
        docs: iterable of documents, each an iterable of hashable tokens.
        min_count: minimum corpus frequency a word needs to be kept.

    Returns:
        dict mapping each kept word to a contiguous id ``0..len(vocab)-1``,
        in order of first appearance in the corpus.
    """
    word_counts = Counter(word for doc in docs for word in doc)
    # Filter first, then number: enumerating before filtering leaves gaps in
    # the ids, so ids could be >= len(vocab) and would not line up with the
    # rows of an embedding table sized by len(vocab).
    kept_words = (word for word, count in word_counts.items() if count >= min_count)
    return {word: idx for idx, word in enumerate(kept_words)}

### 训练模型

In [15]:
def TrainDoc2Vec(docs, embedding_dim=100, epochs=10, lr=0.01, batch_size=64,
                 window_size=5, min_count=1, model_type="dm"):
    """Train a Doc2Vec model (DM or DBOW) on ``docs``.

    Args:
        docs: sized collection of tokenised documents; ``len(docs)`` sets the
            number of document embeddings.
        embedding_dim: embedding width passed to the model.
        epochs: number of passes over the dataset.
        lr: Adam learning rate.
        batch_size: DataLoader batch size.
        window_size: context radius passed to ``Doc2VecDataset``.
        min_count: minimum word frequency passed to ``BuildVocab``.
        model_type: "dm" or "dbow".

    Returns:
        ``(model, vocab, losses)`` where ``losses`` holds, per epoch, the SUM
        of the batch-mean cross-entropy losses (not an average).

    Raises:
        ValueError: if ``model_type`` is neither "dm" nor "dbow".
    """
    # Fail fast: reject a bad model_type before the potentially slow dataset
    # construction instead of after it.
    if model_type not in ("dm", "dbow"):
        raise ValueError("model_type 必须是 'dm' 或 'dbow'")

    print("正在构建词汇表...")
    vocab = BuildVocab(docs, min_count)
    print("词汇表大小:", len(vocab))

    dataset = Doc2VecDataset(docs, vocab, window_size, model_type=model_type)

    if model_type == "dm":
        def collate_fn(batch):
            # Samples are (doc_id, context_ids, target_id); contexts vary in
            # length near document edges, so right-pad them to a rectangle.
            doc_ids, contexts, targets = zip(*batch)
            max_len = max(len(c) for c in contexts)
            # NOTE(review): the pad value 0 is also a valid word id (ids start
            # at 0), and DM.forward averages over every position, so padded
            # slots pull short contexts toward word 0's embedding.
            padded = [c + [0] * (max_len - len(c)) for c in contexts]
            return (
                torch.tensor(doc_ids, dtype=torch.long),
                torch.tensor(padded, dtype=torch.long),
                torch.tensor(targets, dtype=torch.long),
            )
        model_cls = DM
    else:
        def collate_fn(batch):
            # Samples are (doc_id, target_word_id).
            doc_ids, targets = zip(*batch)
            return (
                torch.tensor(doc_ids, dtype=torch.long),
                torch.tensor(targets, dtype=torch.long),
            )
        model_cls = DBOW

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    model = model_cls(len(vocab), len(docs), embedding_dim).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    print(f"开始训练 Doc2Vec ({model_type.upper()})！")
    model.train()
    losses = []
    for epoch in range(epochs):
        epoch_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}, Loss: {0:.4f}")
        for batch in progress_bar:
            optimizer.zero_grad()
            # In both modes the last tensor is the target and everything
            # before it is the model input, so a single code path suffices.
            *inputs, targets = [b.to(device) for b in batch]
            logits = model(*inputs)
            loss = criterion(logits, targets)

            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            progress_bar.set_description(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")
        losses.append(epoch_loss)

    print("训练完成！")
    return model, vocab, losses

### DM

#### 训练

In [53]:
# --- DM run configuration (consumed by the training cell below) ---

# Data: the parsed 'Doc' column of the corpus table.
corpus = Docs['Doc']

# Model / dataset
model_type = "dm"
embedding_dim = 100
window_size = 5
min_count = 1

# Optimisation
epochs = 50
lr = 0.001
batch_size = 1024

In [54]:
# Train the DM model with the hyper-parameters from the configuration cell.
# Binds the trained model, the vocabulary used, and the per-epoch loss list.
d2v_dm, vocab, losses = TrainDoc2Vec(
    corpus,
    embedding_dim=embedding_dim,
    epochs=epochs,
    lr=lr,
    batch_size=batch_size,
    min_count=min_count,
    window_size=window_size,
    model_type=model_type,
)

正在构建词汇表...
词汇表大小: 187
开始训练 Doc2Vec (DM)！


Epoch 1/50, Loss: 1866.1509: 100%|██████████| 518/518 [00:03<00:00, 142.77it/s]
Epoch 2/50, Loss: 1654.5427: 100%|██████████| 518/518 [00:03<00:00, 147.77it/s]
Epoch 3/50, Loss: 1632.6624: 100%|██████████| 518/518 [00:03<00:00, 142.59it/s]
Epoch 4/50, Loss: 1621.9231: 100%|██████████| 518/518 [00:04<00:00, 124.20it/s]
Epoch 5/50, Loss: 1614.6249: 100%|██████████| 518/518 [00:04<00:00, 116.96it/s]
Epoch 6/50, Loss: 1609.1372: 100%|██████████| 518/518 [00:04<00:00, 111.20it/s]
Epoch 7/50, Loss: 1604.5960: 100%|██████████| 518/518 [00:04<00:00, 117.44it/s]
Epoch 8/50, Loss: 1600.7464: 100%|██████████| 518/518 [00:04<00:00, 123.75it/s]
Epoch 9/50, Loss: 1597.5787: 100%|██████████| 518/518 [00:04<00:00, 123.18it/s]
Epoch 10/50, Loss: 1594.7399: 100%|██████████| 518/518 [00:04<00:00, 115.29it/s]
Epoch 11/50, Loss: 1592.2753: 100%|██████████| 518/518 [00:04<00:00, 117.73it/s]
Epoch 12/50, Loss: 1590.0579: 100%|██████████| 518/518 [00:04<00:00, 119.06it/s]
Epoch 13/50, Loss: 1588.0282: 100%|██

训练完成！





#### 验证词嵌入

In [55]:
def DocVectors(model, vocab):
    """Export a trained model's embedding tables as DataFrames.

    Args:
        model: object exposing ``word_emb`` and ``doc_emb`` embedding layers.
        vocab: dict mapping word -> id; its insertion order labels the rows of
            the word table.

    Returns:
        ``(word_df, doc_df)``: each has a leading label column ('word' or
        'doc_id') followed by one integer-named column per embedding dimension.
    """
    def _labelled_frame(weights, labels, label_col):
        # Attach labels as the index, then move them into a named column.
        frame = pd.DataFrame(weights, index=labels)
        return frame.reset_index().rename(columns={'index': label_col})

    word_weights = model.word_emb.weight.detach().cpu().numpy()
    doc_weights = model.doc_emb.weight.detach().cpu().numpy()

    # Invert word -> id, then read the words back in insertion order.
    idx2word = dict(zip(vocab.values(), vocab.keys()))
    word_labels = list(idx2word.values())
    doc_labels = [f"doc_{i}" for i in range(len(doc_weights))]

    word_df = _labelled_frame(word_weights, word_labels, 'word')
    doc_df = _labelled_frame(doc_weights, doc_labels, 'doc_id')
    return word_df, doc_df

# Export the trained DM model's word and document embedding tables, then
# display the word table as the cell output.
df_dm, doc = DocVectors(d2v_dm, vocab)
df_dm

Unnamed: 0,word,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,公司,0.161242,0.146947,-0.616929,0.267296,-0.156528,-0.180625,0.095109,-0.193770,0.250860,...,0.421479,-0.162709,0.371792,0.132796,0.218590,-0.333795,-0.221186,-0.164314,-0.252152,-0.163040
1,公司企业,0.017931,-0.043765,-0.534149,0.223290,-0.412654,-0.309337,0.356519,0.493942,0.048963,...,-0.091025,-0.383354,0.097700,0.022276,0.441127,-0.038244,-0.106900,-0.024578,-0.335361,0.474102
2,住宅区,0.067630,-0.209869,0.338789,-0.110295,-0.032067,-0.277681,0.168707,0.388600,0.359285,...,-0.288170,-0.192300,0.368216,0.089970,-0.233046,-0.206181,-0.005277,0.394007,-0.106067,-0.084792
3,休闲场所,0.265322,-0.336436,0.472667,-0.200067,-0.576660,-0.370440,0.059744,-0.668797,-0.064109,...,-0.183523,-0.299129,0.214588,-0.101270,-0.269924,0.069169,-0.029540,0.363624,0.018451,-0.588450
4,科教文化场所,0.207398,0.013610,-0.427475,-0.360990,-0.465114,-0.566535,-0.023282,-0.125233,0.031636,...,0.012262,-0.165493,0.176255,-0.325471,0.175339,0.080430,-0.070426,0.252794,-0.046115,0.042309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,克莱斯勒特约销售,-0.223813,0.172364,-0.142995,1.098559,-2.165963,-1.767278,-0.191169,-0.027862,0.746040,...,0.935176,-0.207598,-0.654879,1.019911,1.206388,-0.142004,0.588579,0.263430,0.600932,-1.151366
183,捷豹特约维修,-0.245266,0.014190,-0.373023,-0.012793,0.203143,0.656365,0.460930,0.584109,-0.219827,...,-0.233519,0.472548,-0.333524,-0.147441,0.037295,-0.042088,0.239830,-0.039488,0.684430,1.203868
184,长安汽车维修,-0.377518,-0.419426,0.776349,1.211121,0.690218,-0.234275,-0.806498,1.776496,1.093254,...,0.950000,-0.852852,1.333009,1.134161,0.110125,-1.300484,-1.540665,0.677219,0.281579,0.061091
185,纳智捷销售,1.478608,-0.023404,-0.307673,0.852838,-0.393744,-0.869791,0.853265,0.397219,-0.568213,...,0.917175,0.962643,0.048540,2.510937,-0.314439,0.044191,0.126799,0.018639,-1.899505,-0.878100


In [None]:
def cosine_similarity(vec1, vec2):
    """Cosine similarity between two 1-D vectors.

    Args:
        vec1, vec2: array-likes of equal length.

    Returns:
        The dot product of the two L2-normalised vectors. If either vector has
        zero norm (including an empty vector) the similarity is undefined and
        0.0 is returned instead of dividing by zero and producing NaN.
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(a / norm_a, b / norm_b)

word1 = '公司企业'
word2 = '公司'
word3 = '风景名胜'
word4 = '风景名胜相关'
word5 = '长安汽车维修'
word6 = '现代特约销售'
df = df_dm.copy()


def _word_vector(frame, word):
    """Return the embedding of ``word`` from ``frame`` as a flat array.

    ``frame`` is expected to have a 'word' column first and the embedding
    dimensions in the remaining columns. Raises KeyError when the word is
    absent; otherwise the lookup would yield an empty vector and the report
    would silently print a similarity of 0.0000.
    """
    rows = frame.loc[frame['word'] == word]
    if rows.empty:
        raise KeyError(f"词 '{word}' 不在词向量表中")
    return rows.iloc[:, 1:].values.flatten()


vec1, vec2, vec3, vec4, vec5, vec6 = (
    _word_vector(df, w) for w in (word1, word2, word3, word4, word5, word6)
)

# (left label, right label, vector a, vector b), in report order.
comparisons = [
    (word2, word1, vec1, vec2),
    (word2, word3, vec3, vec2),
    (word4, word3, vec3, vec4),
    (word5, word2, vec2, vec5),
    (word5, word1, vec1, vec5),
    (word5, word4, vec4, vec5),
    (word5, word6, vec5, vec6),
]
for left, right, vec_a, vec_b in comparisons:
    # The vectors come from the Doc2Vec DM word table, so label them DM
    # (the previous "(CBOW)" tag was a mislabel).
    print(f"'{left}' 与 '{right}' 的余弦相似度: {cosine_similarity(vec_a, vec_b):.4f} (DM)")

'公司' 与 '公司企业' 的余弦相似度: 0.5125 (CBOW)
'公司' 与 '风景名胜' 的余弦相似度: -0.0493 (CBOW)
'风景名胜相关' 与 '风景名胜' 的余弦相似度: 0.2305 (CBOW)
'长安汽车维修' 与 '公司' 的余弦相似度: 0.4322 (CBOW)
'长安汽车维修' 与 '公司企业' 的余弦相似度: 0.2415 (CBOW)
'长安汽车维修' 与 '风景名胜相关' 的余弦相似度: 0.0562 (CBOW)
'长安汽车维修' 与 '现代特约销售' 的余弦相似度: 0.2213 (CBOW)


In [60]:
# Persist the DM word-embedding table to CSV in the working directory,
# without writing the DataFrame's integer index.
df_dm.to_csv('d2vdm_vectors.csv', index=False)

### DBOW

#### 训练

In [20]:
# --- DBOW run configuration (consumed by the training cell below) ---

# Data: the parsed 'Doc' column of the corpus table.
corpus = Docs['Doc']

# Model / dataset
model_type = "dbow"
embedding_dim = 100
window_size = 5
min_count = 1

# Optimisation
epochs = 30
lr = 0.001
batch_size = 512

In [21]:
# Train the DBOW model with the hyper-parameters from the configuration cell.
# NOTE: this rebinds the module-level names `vocab` and `losses`.
d2v_dbow, vocab, losses = TrainDoc2Vec(
    corpus,
    embedding_dim=embedding_dim,
    epochs=epochs,
    lr=lr,
    batch_size=batch_size,
    min_count=min_count,
    window_size=window_size,
    model_type=model_type,
)

正在构建词汇表...
词汇表大小: 187
开始训练 Doc2Vec (DBOW)！


Epoch 1/30, Loss: 21046.6167: 100%|██████████| 6165/6165 [00:20<00:00, 295.65it/s]
Epoch 2/30, Loss: 20305.0940: 100%|██████████| 6165/6165 [00:25<00:00, 239.33it/s]
Epoch 3/30, Loss: 20161.1166: 100%|██████████| 6165/6165 [00:28<00:00, 218.70it/s]
Epoch 4/30, Loss: 20083.9675: 100%|██████████| 6165/6165 [00:31<00:00, 197.51it/s]
Epoch 5/30, Loss: 20036.7094: 100%|██████████| 6165/6165 [00:34<00:00, 178.80it/s]
Epoch 6/30, Loss: 20005.4359: 100%|██████████| 6165/6165 [00:36<00:00, 169.10it/s]
Epoch 7/30, Loss: 19984.8579: 100%|██████████| 6165/6165 [00:38<00:00, 160.81it/s]
Epoch 8/30, Loss: 19970.4050: 100%|██████████| 6165/6165 [00:40<00:00, 152.07it/s]
Epoch 9/30, Loss: 19959.6425: 100%|██████████| 6165/6165 [00:42<00:00, 144.91it/s]
Epoch 10/30, Loss: 19952.2209: 100%|██████████| 6165/6165 [00:44<00:00, 138.76it/s]
Epoch 11/30, Loss: 19946.1372: 100%|██████████| 6165/6165 [00:46<00:00, 133.53it/s]
Epoch 12/30, Loss: 19941.3752: 100%|██████████| 6165/6165 [00:47<00:00, 129.21it/s]
E

训练完成！





#### 提取文档嵌入

In [22]:
def DocVectors(model, vocab=None):
    """Export a trained model's embedding table(s) as DataFrames.

    This definition replaces any earlier ``DocVectors`` in the kernel, so it
    accepts both call forms; otherwise re-running a cell that calls
    ``DocVectors(model, vocab)`` after this cell would raise TypeError.

    Args:
        model: object exposing a ``doc_emb`` embedding layer, and a
            ``word_emb`` layer when ``vocab`` is given.
        vocab: optional dict mapping word -> id; its insertion order labels
            the rows of the word table.

    Returns:
        ``doc_df`` when ``vocab`` is None; otherwise ``(word_df, doc_df)``.
        Each frame has a leading label column ('doc_id' or 'word') followed
        by one integer-named column per embedding dimension.

    Raises:
        AttributeError: if ``vocab`` is given but the model has no ``word_emb``.
    """
    def _labelled_frame(weights, labels, label_col):
        # Attach labels as the index, then move them into a named column.
        frame = pd.DataFrame(weights, index=labels)
        return frame.reset_index().rename(columns={'index': label_col})

    doc_embeddings = model.doc_emb.weight.detach().cpu().numpy()
    doc_labels = [f"doc_{i}" for i in range(len(doc_embeddings))]
    doc_df = _labelled_frame(doc_embeddings, doc_labels, 'doc_id')

    if vocab is None:
        return doc_df

    word_embeddings = model.word_emb.weight.detach().cpu().numpy()
    idx2word = {i: w for w, i in vocab.items()}
    word_df = _labelled_frame(word_embeddings, list(idx2word.values()), 'word')
    return word_df, doc_df

# Export the trained DBOW document embeddings and display them.
# NOTE: this rebinds `doc`, which the DM export cell also assigns.
doc = DocVectors(d2v_dbow)
doc

Unnamed: 0,doc_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,doc_0,-0.204730,-0.016608,0.320887,-0.121759,0.059550,0.491978,0.567346,0.489863,0.319136,...,0.479489,-0.545780,-0.368210,-0.459224,-0.128575,-0.265785,-0.877520,-0.185485,-0.557282,0.146579
1,doc_1,0.617989,-0.449954,-0.515998,-0.266710,0.105638,0.081391,0.260528,0.699719,0.307924,...,0.642855,-0.165636,-0.223824,-0.017118,-0.302723,-0.520786,0.180342,0.146220,-1.067864,0.072823
2,doc_2,-0.383419,0.166875,-0.109428,-0.037259,-0.311376,0.560463,0.471630,0.218481,0.784298,...,0.313562,0.203106,0.120520,-0.614101,-0.481883,-0.146913,-0.712851,0.416549,-0.636571,0.192558
3,doc_3,-0.083842,-0.092397,0.086169,-0.473876,-0.057751,-0.166024,0.127776,-0.023565,0.771755,...,0.586669,0.388482,-0.082095,-0.677345,-0.522252,-0.832545,-0.634794,0.666749,-0.448270,0.570973
4,doc_4,0.326987,-0.185660,0.380252,-0.253666,-0.808441,-0.420179,0.491168,0.005097,1.100598,...,-0.316628,0.234225,-0.144095,-0.163381,-0.593664,0.114228,0.171992,0.238935,-0.401307,0.456501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553,doc_1553,-0.315598,0.361818,-0.126505,0.076087,0.187393,-0.403591,0.407625,-0.355209,0.325893,...,0.329219,-0.360040,0.153793,-0.370102,-0.329496,-0.471353,-0.450108,0.387790,0.239355,-0.261078
1554,doc_1554,0.105555,0.115189,0.202127,-0.253316,1.408416,0.210350,0.046095,0.213391,-0.011713,...,0.166139,-1.042281,0.028824,-0.149401,-0.020609,-0.186031,-0.696592,0.655580,-1.170119,-0.036990
1555,doc_1555,1.194669,-0.013587,0.958797,-0.382345,0.568839,0.348785,1.307027,0.509271,-0.016300,...,0.110829,0.129989,-0.591123,-0.941567,-0.292753,0.377096,-0.975020,-0.167606,-0.253513,0.523053
1556,doc_1556,-0.606211,0.577999,0.339669,-0.535893,0.655007,0.639667,0.639528,0.565849,-0.668912,...,0.668625,-0.434924,0.651536,-0.100529,0.646665,-0.267406,-0.589809,0.728324,-0.493338,-0.482818
