In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

import ast
import random
from tqdm import tqdm

# Seed the Python, NumPy and PyTorch generators so that a fresh
# "Restart & Run All" reproduces the same sampling and initial weights.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Module-level device handle read by the training function further down.
# (A `global` statement is meaningless at module scope, so none is used.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device,'\n')

# The 'Doc' column holds the text form of a Python literal; parse each cell
# back into a Python object (the displayed output shows lists of tokens).
Docs = pd.read_csv('../dataset/地理语料库_taz.csv')
Docs['Doc'] = Docs['Doc'].apply(ast.literal_eval)
Docs

cuda 



Unnamed: 0,eID,Doc
0,254.0,"[公司, 公司企业, 公司, 住宅区, 住宅区, 休闲场所, 科教文化场所, 政府及社会团体..."
1,258.0,"[体育休闲服务场所, 科教文化场所, 运动场馆, 综合市场, 专卖店, 宾馆酒店, 服装鞋帽..."
2,239.0,"[公共厕所, 旅行社, 旅行社, 公司, 生活服务场所, 便民商店/便利店, 糕饼店, 公司..."
3,255.0,"[汽车维修, 服装鞋帽皮具店, 购物相关场所, 家居建材市场, 住宅区, 科教文化场所, 专..."
4,253.0,"[公检法机构, 公检法机构, 公检法机构, 工商税务机构, 公检法机构, 商务住宅相关, 住..."
...,...,...
1553,977.0,"[公园广场, 公园广场]"
1554,1515.0,"[公共厕所, 购物相关场所]"
1555,1439.0,"[产业园区, 公司, 公司, 公司企业, 汽车维修, 汽车养护/装饰, 公司企业, 金融保险..."
1556,1628.0,"[公共厕所, 公共厕所]"


## Word2vec

#### 模型架构

In [2]:
class CBOW(nn.Module):
    """Continuous bag-of-words network.

    An embedding table maps context word ids to vectors, the vectors are
    averaged along dimension 1, and a linear layer projects the average to
    one logit per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Input side: one embedding_dim-sized row per vocabulary entry.
        self.in_embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # Output side: projects the averaged context back onto the vocabulary.
        self.out_embed = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, context_ids):
        """Return vocabulary logits for each row of context word ids.

        Args:
            context_ids: integer tensor of word ids; dimension 1 is averaged
                away (assumed layout is (batch, context_size) - confirm with caller).

        Returns:
            Tensor whose last dimension has ``vocab_size`` entries.
        """
        averaged_context = torch.mean(self.in_embed(context_ids), dim=1)
        return self.out_embed(averaged_context)


class SkipGram(nn.Module):
    """Skip-gram model trained with negative sampling.

    Two embedding tables are kept: ``in_embed`` for centre (target) words and
    ``out_embed`` for context words. ``forward`` returns the scalar
    negative-sampling loss directly rather than logits.
    """

    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, target_ids, pos_ids, neg_ids):
        """Compute the mean negative-sampling loss of a batch.

        Args:
            target_ids: 1-D tensor of centre word ids, shape (batch,).
            pos_ids: 1-D tensor of true context word ids, shape (batch,).
            neg_ids: 2-D tensor of sampled noise word ids, shape (batch, num_neg).

        Returns:
            Scalar tensor: ``-mean(log sigma(u_pos . v) + sum_k log sigma(-u_neg_k . v))``.
        """
        v = self.in_embed(target_ids)        # (batch, dim)
        u_pos = self.out_embed(pos_ids)      # (batch, dim)
        u_neg = self.out_embed(neg_ids)      # (batch, num_neg, dim)

        pos_score = torch.mul(v, u_pos).sum(dim=1)                 # (batch,)
        neg_score = torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)    # (batch, num_neg)

        # logsigmoid is the numerically stable form of log(sigmoid(x)).
        # The previous `log(sigmoid(x) + 1e-9)` capped each term at about
        # -20.7 and produced a vanishing gradient once a score saturated.
        pos_loss = nn.functional.logsigmoid(pos_score)
        neg_loss = nn.functional.logsigmoid(-neg_score).sum(dim=1)

        loss = -(pos_loss + neg_loss).mean()
        return loss

#### 构建数据集

In [3]:
class Word2VecDataset(Dataset):
    """Training samples for CBOW or skip-gram, built from tokenised documents.

    All documents are concatenated into one token stream (tokens missing from
    ``vocab`` are dropped) and a symmetric window of ``window_size`` tokens on
    each side is slid over that stream.

    * ``model_type="cbow"``: each item is ``(context_ids, target_id)``.
    * ``model_type="skipgram"``: each item is ``(target_id, pos_id, neg_ids)``
      where ``neg_ids`` holds ``neg_sample_num`` ids drawn from the unigram
      distribution raised to the power 0.75.

    NOTE(review): because documents are flattened into a single stream,
    windows close to a document boundary contain tokens of the neighbouring
    document, and the first/last ``window_size`` tokens of the stream are
    never used as centre words. Negative samples are drawn once here, so
    they stay fixed across epochs.

    Args:
        docs: iterable of token lists.
        vocab: mapping word -> index; indices are expected to be
            ``0 .. len(vocab) - 1``.
        window_size: number of context tokens taken on each side.
        neg_sample_num: negatives per (target, context) pair (skip-gram only).
        model_type: ``"cbow"`` or ``"skipgram"`` (case-insensitive).

    Raises:
        ValueError: if ``model_type`` is neither ``"cbow"`` nor ``"skipgram"``.
    """

    def __init__(self, docs, vocab, window_size=2, neg_sample_num=5, model_type="cbow"):
        self.model_type = model_type.lower()
        # Fail fast: an unknown type used to produce a silently empty dataset
        # whose items would later be interpreted as skip-gram samples.
        if self.model_type not in ("cbow", "skipgram"):
            raise ValueError("model_type 必须是 'cbow' 或 'skipgram'")

        self.vocab = vocab
        self.word2idx = vocab
        self.idx2word = {i: w for w, i in vocab.items()}
        self.vocab_size = len(vocab)
        self.window_size = window_size
        self.neg_sample_num = neg_sample_num

        # Flatten the corpus, keep in-vocabulary tokens, and resolve ids once.
        tokens = [w for doc in docs for w in doc if w in vocab]
        token_ids = [vocab[w] for w in tokens]

        # Noise distribution for negative sampling: unigram ** 0.75, renormalised.
        word_counts = Counter(tokens)
        freqs = np.array([word_counts[self.idx2word[i]] for i in range(len(vocab))])
        unigram_dist = freqs / freqs.sum()
        self.noise_dist = (unigram_dist ** 0.75)
        self.noise_dist /= self.noise_dist.sum()

        # Only positions with a full window on both sides act as centre words.
        centres = range(window_size, len(token_ids) - window_size)

        self.data = []
        if self.model_type == "cbow":
            for i in centres:
                context_ids = token_ids[i - window_size:i] + token_ids[i + 1:i + window_size + 1]
                self.data.append((context_ids, token_ids[i]))
        else:  # skipgram
            pairs = [
                (token_ids[i], context_id)
                for i in centres
                for context_id in token_ids[i - window_size:i] + token_ids[i + 1:i + window_size + 1]
            ]
            if pairs:
                # One vectorised draw for all pairs instead of one
                # np.random.choice call per pair (millions of calls on a
                # realistic corpus).
                neg_matrix = np.random.choice(
                    len(vocab),
                    size=(len(pairs), self.neg_sample_num),
                    p=self.noise_dist,
                )
                self.data = [
                    (target_id, pos_id, neg_row)
                    for (target_id, pos_id), neg_row in zip(pairs, neg_matrix)
                ]

    def __len__(self):
        """Number of training samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a tuple of ``torch.long`` tensors."""
        if self.model_type == "cbow":
            context_ids, target_id = self.data[idx]
            return torch.tensor(context_ids, dtype=torch.long), torch.tensor(target_id, dtype=torch.long)
        else:  # skipgram
            target_id, pos_id, neg_ids = self.data[idx]
            return (
                torch.tensor(target_id, dtype=torch.long),
                torch.tensor(pos_id, dtype=torch.long),
                torch.tensor(neg_ids, dtype=torch.long)
            )
        
def BuildVocab(docs, min_count=1):
    """Map every sufficiently frequent word to a contiguous integer index.

    Args:
        docs: iterable of token lists.
        min_count: minimum corpus frequency a word needs to be kept.

    Returns:
        dict word -> index, with indices ``0 .. len(vocab) - 1`` assigned in
        order of first appearance among the kept words.
    """
    word_counts = Counter(word for doc in docs for word in doc)
    # Filter first, then number: enumerating before filtering left gaps in the
    # indices whenever min_count > 1, so ids could exceed len(vocab) and break
    # embedding lookups sized by len(vocab).
    kept_words = (word for word, count in word_counts.items() if count >= min_count)
    vocab = {word: i for i, word in enumerate(kept_words)}
    return vocab

#### 训练模型

In [4]:
def TrainWord2Vec(docs, embedding_dim=100, epochs=50, lr=0.001,
                  batch_size=128, min_count=1, window_size=2,
                  neg_sample_num=5, model_type="cbow"):
    """Build the vocabulary and dataset, then train a CBOW or skip-gram model.

    Args:
        docs: iterable of token lists.
        embedding_dim: size of the word vectors.
        epochs: number of passes over the dataset.
        lr: Adam learning rate.
        batch_size: samples per optimisation step.
        min_count: minimum word frequency kept in the vocabulary.
        window_size: context tokens taken on each side of the centre word.
        neg_sample_num: negatives per pair (skip-gram only).
        model_type: ``"cbow"`` or ``"skipgram"`` (case-insensitive).

    Returns:
        ``(model, vocab, losses)`` where ``losses[e]`` is the SUM of the
        per-batch losses of epoch ``e`` (the value shown in the progress bar
        is that running sum, not a mean).

    Raises:
        ValueError: if ``model_type`` is neither ``"cbow"`` nor ``"skipgram"``.
    """
    # Normalise once and validate before any expensive work. The dataset
    # lowercases model_type itself, so a mixed-case value used to build the
    # whole dataset and only then be rejected here.
    model_type = model_type.lower()
    if model_type not in ("cbow", "skipgram"):
        raise ValueError("model_type 必须是 'cbow' 或 'skipgram'")
    use_cbow = model_type == "cbow"

    print("正在构建词表...")
    vocab = BuildVocab(docs, min_count)
    print("词汇表大小:", len(vocab))

    dataset = Word2VecDataset(docs, vocab, window_size, neg_sample_num, model_type)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    if use_cbow:
        model = CBOW(len(vocab), embedding_dim).to(device)
        # CBOW emits logits; skip-gram's forward() returns its loss directly.
        loss_fn = nn.CrossEntropyLoss()
    else:
        model = SkipGram(len(vocab), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print(f"开始训练 Word2Vec ({model_type.upper()})！")
    losses = []
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}, Loss: {0:.4f}")

        for batch in progress_bar:
            optimizer.zero_grad()

            if use_cbow:
                context_ids, target_id = (t.to(device) for t in batch)
                output = model(context_ids)
                loss = loss_fn(output, target_id)
            else:
                target_id, pos_id, neg_ids = (t.to(device) for t in batch)
                loss = model(target_id, pos_id, neg_ids)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_description(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

        losses.append(epoch_loss)
    print("训练完成！")
    return model, vocab, losses

#### CBOW

##### 训练

In [44]:
# --- CBOW run configuration -------------------------------------------------
corpus = Docs['Doc']
model_type = "cbow"

# Model / data construction
embedding_dim = 100
window_size = 5
min_count = 1
neg_sample_num = 5   # passed through; the dataset only samples negatives for skip-gram

# Optimisation
epochs = 50
lr = 0.001
batch_size = 1024

In [45]:
# Train CBOW with the module-level hyper-parameters; `losses` holds one
# summed loss value per epoch.
w2v_cbow, vocab, losses = TrainWord2Vec(
    corpus,
    embedding_dim=embedding_dim,
    epochs=epochs,
    lr=lr,
    batch_size=batch_size,
    min_count=min_count,
    window_size=window_size,
    neg_sample_num=neg_sample_num,
    model_type=model_type,
)

正在构建词表...
词汇表大小: 187
开始训练 Word2Vec (CBOW)！


Epoch 1/50, Loss: 1798.5879: 100%|██████████| 518/518 [00:09<00:00, 57.22it/s]
Epoch 2/50, Loss: 1661.7434: 100%|██████████| 518/518 [00:08<00:00, 60.44it/s]
Epoch 3/50, Loss: 1652.8887: 100%|██████████| 518/518 [00:09<00:00, 52.93it/s]
Epoch 4/50, Loss: 1649.3139: 100%|██████████| 518/518 [00:09<00:00, 53.50it/s]
Epoch 5/50, Loss: 1647.3163: 100%|██████████| 518/518 [00:09<00:00, 53.07it/s]
Epoch 6/50, Loss: 1645.9496: 100%|██████████| 518/518 [00:09<00:00, 52.18it/s]
Epoch 7/50, Loss: 1645.0345: 100%|██████████| 518/518 [00:10<00:00, 48.85it/s]
Epoch 8/50, Loss: 1644.2232: 100%|██████████| 518/518 [00:10<00:00, 49.69it/s]
Epoch 9/50, Loss: 1643.7236: 100%|██████████| 518/518 [00:12<00:00, 42.90it/s]
Epoch 10/50, Loss: 1643.2110: 100%|██████████| 518/518 [00:11<00:00, 43.84it/s]
Epoch 11/50, Loss: 1642.7975: 100%|██████████| 518/518 [00:10<00:00, 48.15it/s]
Epoch 12/50, Loss: 1642.5088: 100%|██████████| 518/518 [00:10<00:00, 50.60it/s]
Epoch 13/50, Loss: 1642.2061: 100%|██████████| 51

训练完成！


##### 验证词嵌入

In [46]:
def WordVectors(model, vocab, layer="out_embed"):
    """Export one weight matrix of a trained model as a word-vector table.

    Args:
        model: object exposing the requested layer as an attribute whose
            ``.weight`` has one row per vocabulary index.
        vocab: mapping word -> row index (``0 .. len(vocab) - 1``).
        layer: attribute name of the layer to export. Defaults to
            ``"out_embed"`` (the previous hard-coded choice); pass
            ``"in_embed"`` to export the input-side embeddings instead.

    Returns:
        DataFrame with a ``'word'`` column followed by one integer-labelled
        column per embedding dimension, rows ordered by vocabulary index.
    """
    # detach() yields a gradient-free view without going through `.data`.
    embeddings = getattr(model, layer).weight.detach().cpu().numpy()
    idx2word = {i: w for w, i in vocab.items()}
    words = [idx2word[i] for i in range(len(vocab))]
    df = pd.DataFrame(embeddings, index=words)
    # Move the word labels out of the index into a regular 'word' column.
    df = df.reset_index().rename(columns={'index': 'word'})
    return df
  
# Switch the trained CBOW model to evaluation mode, then tabulate its vectors.
w2v_cbow.eval()
df_cbow = WordVectors(model=w2v_cbow, vocab=vocab)
df_cbow

Unnamed: 0,word,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,公司,-0.279509,0.148413,-0.148520,-0.040362,0.136207,0.256115,0.112568,0.115207,-0.105398,...,-0.183571,0.161929,0.067406,0.159727,0.128865,-0.263784,0.148649,-0.136276,-0.125299,-0.030952
1,公司企业,-0.179601,0.016475,-0.023884,-0.049179,0.070100,0.164037,-0.057913,0.183208,0.046234,...,-0.198208,-0.033318,-0.069524,0.231410,-0.137613,-0.135393,0.057125,-0.132225,0.253146,-0.071424
2,住宅区,-0.038463,0.068162,-0.130778,0.089859,0.018379,0.108775,0.112300,0.137928,-0.010357,...,-0.120698,0.143166,0.098756,0.075964,-0.318558,-0.208704,0.226024,0.025179,-0.047435,0.253623
3,休闲场所,0.098193,-0.188153,-0.120709,0.088592,-0.056763,-0.205827,-0.152195,-0.051602,-0.123493,...,0.012683,-0.283921,0.101491,-0.139063,0.176835,-0.145072,0.134991,-0.139272,-0.203136,-0.066831
4,科教文化场所,0.087423,-0.005690,-0.250879,0.019574,-0.200784,0.107417,0.069054,0.189936,-0.148560,...,-0.223048,-0.033794,0.011197,0.130367,-0.156020,-0.196868,0.287938,-0.043297,-0.130437,0.185355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,克莱斯勒特约销售,0.829857,0.115268,0.143057,-0.861797,-0.164994,-1.825776,-0.055982,-0.198219,0.414072,...,0.664088,-0.345565,-0.044601,-0.176123,1.337720,0.692508,-0.102213,2.437115,0.499854,-1.097759
183,捷豹特约维修,-0.130256,-0.593764,0.668737,-1.595378,-0.796948,-0.131135,-0.149264,-0.179162,0.439490,...,0.383894,-0.180346,-1.078104,0.058756,-0.197528,-0.146114,-0.115932,0.465406,0.357431,-1.323402
184,长安汽车维修,1.606986,0.039800,0.021160,-0.409483,-0.030517,-0.042318,-2.080971,-1.055073,0.340076,...,1.457224,-0.002390,0.083928,-0.428982,0.516890,1.637653,-0.030898,2.201164,1.231079,-0.045138
185,纳智捷销售,0.220137,-0.141411,0.400136,-0.653684,0.040237,-1.343261,-0.093219,0.051445,1.038135,...,0.379755,-0.006474,0.048457,-0.337707,-0.026465,1.840076,-0.191200,0.420006,0.246212,-0.433429


In [58]:
def cosine_similarity(vec1, vec2):
    """Cosine of the angle between two vectors.

    Each vector is scaled to unit L2 norm first; the result is the dot
    product of the two unit vectors.
    """
    unit_a, unit_b = (v / np.linalg.norm(v) for v in (vec1, vec2))
    return np.dot(unit_a, unit_b)


# Probe words whose pairwise similarities are printed below.
word1, word2, word3 = '公司企业', '公司', '风景名胜'
word4, word5, word6 = '风景名胜相关', '长安汽车维修', '现代特约销售'
df = df_cbow.copy()

# Column 0 is 'word'; the remaining columns are the embedding components.
vec1, vec2, vec3, vec4, vec5, vec6 = (
    df.query(f'word=="{probe}"').iloc[:, 1:].values.flatten()
    for probe in (word1, word2, word3, word4, word5, word6)
)

# (first label, second label, vector a, vector b) in print order.
comparisons = [
    (word2, word1, vec1, vec2),
    (word2, word3, vec3, vec2),
    (word4, word3, vec3, vec4),
    (word5, word2, vec2, vec5),
    (word5, word1, vec1, vec5),
    (word5, word4, vec4, vec5),
    (word5, word6, vec5, vec6),
]
for label_a, label_b, vec_a, vec_b in comparisons:
    print(f"'{label_a}' 与 '{label_b}' 的余弦相似度: {cosine_similarity(vec_a, vec_b):.4f} (CBOW)")

'公司' 与 '公司企业' 的余弦相似度: 0.4823 (CBOW)
'公司' 与 '风景名胜' 的余弦相似度: -0.2244 (CBOW)
'风景名胜相关' 与 '风景名胜' 的余弦相似度: 0.4945 (CBOW)
'长安汽车维修' 与 '公司' 的余弦相似度: -0.5405 (CBOW)
'长安汽车维修' 与 '公司企业' 的余弦相似度: -0.2218 (CBOW)
'长安汽车维修' 与 '风景名胜相关' 的余弦相似度: 0.2572 (CBOW)
'长安汽车维修' 与 '现代特约销售' 的余弦相似度: 0.5163 (CBOW)


In [61]:
# Persist the CBOW word-vector table to the working directory; index=False
# leaves out the pandas row index so the first CSV column is 'word'.
df_cbow.to_csv('w2vcbow_vectors.csv', index=False)

#### Skip-Gram

##### 训练

In [9]:
# --- Skip-Gram run configuration --------------------------------------------
corpus = Docs['Doc']
model_type = "skipgram"

# Model / data construction
embedding_dim = 100
window_size = 5
min_count = 1
neg_sample_num = 5   # negatives drawn per (target, context) pair

# Optimisation
epochs = 30
lr = 0.01
batch_size = 4096

In [10]:
# Train Skip-Gram with the module-level hyper-parameters. Note that `vocab`
# and `losses` are rebound here, replacing the values of any earlier run.
w2v_sg, vocab, losses = TrainWord2Vec(
    corpus,
    embedding_dim=embedding_dim,
    epochs=epochs,
    lr=lr,
    batch_size=batch_size,
    min_count=min_count,
    window_size=window_size,
    neg_sample_num=neg_sample_num,
    model_type=model_type,
)

正在构建词表...
词汇表大小: 187


开始训练 Word2Vec (SKIPGRAM)！


Epoch 1/30, Loss: 3952.9747: 100%|██████████| 1295/1295 [02:36<00:00,  8.28it/s]
Epoch 2/30, Loss: 3317.0641: 100%|██████████| 1295/1295 [02:36<00:00,  8.30it/s]
Epoch 3/30, Loss: 3310.1868: 100%|██████████| 1295/1295 [02:32<00:00,  8.48it/s]
Epoch 4/30, Loss: 3303.5834: 100%|██████████| 1295/1295 [02:17<00:00,  9.42it/s]
Epoch 5/30, Loss: 3296.8909: 100%|██████████| 1295/1295 [02:21<00:00,  9.16it/s]
Epoch 6/30, Loss: 3290.3829: 100%|██████████| 1295/1295 [02:21<00:00,  9.18it/s]
Epoch 7/30, Loss: 3284.4195: 100%|██████████| 1295/1295 [02:21<00:00,  9.17it/s]
Epoch 8/30, Loss: 3279.5626: 100%|██████████| 1295/1295 [02:20<00:00,  9.19it/s]
Epoch 9/30, Loss: 3276.6377: 100%|██████████| 1295/1295 [02:23<00:00,  9.02it/s]
Epoch 10/30, Loss: 3275.7884: 100%|██████████| 1295/1295 [02:21<00:00,  9.13it/s]
Epoch 11/30, Loss: 3275.4503: 100%|██████████| 1295/1295 [02:22<00:00,  9.09it/s]
Epoch 12/30, Loss: 3275.0885: 100%|██████████| 1295/1295 [02:21<00:00,  9.15it/s]
Epoch 13/30, Loss: 3274.8

训练完成！


##### 验证词嵌入

In [11]:
# Switch the trained Skip-Gram model to evaluation mode, then tabulate its vectors.
w2v_sg.eval()
df_sg = WordVectors(model=w2v_sg, vocab=vocab)
df_sg

Unnamed: 0,word,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,公司,-0.001325,0.040039,-0.026745,0.121017,-0.039544,-0.024047,0.001079,-0.079589,-0.018382,...,0.199860,-0.061716,-0.032787,-0.025552,0.678270,-0.064450,-0.026476,0.017096,-0.052508,-0.022093
1,公司企业,-0.028285,-0.099142,-0.075033,0.065468,0.014031,0.048582,-0.038037,-0.035482,-0.040273,...,0.564137,-0.097680,-0.017768,-0.065792,1.121048,0.098104,0.026211,-0.041838,0.027003,0.023285
2,住宅区,0.020069,-0.054516,0.075837,0.273152,0.053578,0.000820,0.015977,-0.107502,0.022082,...,0.369662,0.007954,-0.016511,0.006288,0.706786,0.042079,0.040600,-0.012976,-0.122183,-0.002046
3,休闲场所,0.011115,0.048633,0.153728,0.245351,-0.006593,0.018020,0.111292,-0.045935,-0.067851,...,1.113364,-0.291727,0.204924,0.208741,0.980460,-0.076936,-0.009837,-0.208042,-0.096157,0.138435
4,科教文化场所,0.030341,0.041658,-0.058225,0.084882,-0.028031,-0.016587,-0.074708,0.024136,-0.043120,...,0.479631,-0.162792,-0.016199,0.076915,0.806678,0.065611,-0.063472,0.063472,0.072339,-0.039055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,克莱斯勒特约销售,-1.143318,0.786647,0.344542,-0.948799,-1.071158,0.362677,-0.554823,-0.687403,0.574649,...,2.884382,-0.730146,0.925535,-0.202544,2.923240,0.285014,-1.308512,0.630275,-0.242824,0.708880
183,捷豹特约维修,0.013656,-0.078976,0.839021,0.059081,-0.039091,0.054780,-1.633214,-0.823990,0.059538,...,2.160773,1.261611,-0.665312,0.429939,8.006378,-2.375823,0.672182,-0.724369,0.079452,-0.762299
184,长安汽车维修,0.066607,-0.841656,-0.697063,0.096211,-0.193956,-0.773621,-1.346876,-0.304568,-0.768560,...,0.582410,-2.877853,-0.046075,-0.135878,2.974216,1.890828,-0.974349,-2.078564,0.676376,0.640105
185,纳智捷销售,1.326383,0.214787,0.115277,2.571142,-1.316218,0.476660,0.050894,-0.540050,-1.702756,...,1.632042,0.070642,1.437786,0.149784,4.484808,-1.720721,0.049101,-0.492348,0.081908,0.932253


In [12]:
# Probe words whose pairwise similarities are printed below.
word1 = '金融保险服务机构'
word2 = '公司'
word3 = '风景名胜'
word4 = '风景名胜相关'
word5 = '长安汽车维修'
word6 = '现代特约销售'
df = df_sg.copy()

# Column 0 is 'word'; the remaining columns are the embedding components.
vec1, vec2, vec3, vec4, vec5, vec6 = (
    df.query(f'word=="{probe}"').iloc[:, 1:].values.flatten()
    for probe in (word1, word2, word3, word4, word5, word6)
)

# (first label, second label, vector a, vector b) in print order.
comparisons = [
    (word2, word1, vec1, vec2),
    (word2, word3, vec3, vec2),
    (word4, word3, vec3, vec4),
    (word5, word2, vec2, vec5),
    (word5, word1, vec1, vec5),
    (word5, word4, vec4, vec5),
    (word5, word6, vec5, vec6),
]
# These vectors come from df_sg, so the lines are tagged Skip-Gram; the
# copy-pasted cell used to label them "(CBOW)".
for label_a, label_b, vec_a, vec_b in comparisons:
    print(f"'{label_a}' 与 '{label_b}' 的余弦相似度: {cosine_similarity(vec_a, vec_b):.4f} (Skip-Gram)")

'公司' 与 '金融保险服务机构' 的余弦相似度: 0.8196 (CBOW)
'公司' 与 '风景名胜' 的余弦相似度: 0.5993 (CBOW)
'风景名胜相关' 与 '风景名胜' 的余弦相似度: 0.8773 (CBOW)
'长安汽车维修' 与 '公司' 的余弦相似度: 0.4504 (CBOW)
'长安汽车维修' 与 '金融保险服务机构' 的余弦相似度: 0.3689 (CBOW)
'长安汽车维修' 与 '风景名胜相关' 的余弦相似度: 0.4154 (CBOW)
'长安汽车维修' 与 '现代特约销售' 的余弦相似度: 0.2547 (CBOW)


In [57]:
# Persist the Skip-Gram word-vector table to the working directory; index=False
# leaves out the pandas row index so the first CSV column is 'word'.
df_sg.to_csv('w2vsg_vectors.csv', index=False)