### SVD构建词向量
把路径参数调整好之后直接运行即可，完整运行一遍大约需要 20 分钟。

In [8]:
from typing import Optional
from scipy.sparse import csr_matrix,coo_matrix,load_npz,save_npz
from scipy.sparse.linalg import svds

import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import wordnet

# First run only: uncomment to fetch the NLTK data (this also needs `import nltk`).
# nltk.download('punkt')
# nltk.download('stopwords')
class SVDEmbedding:
    """Word embeddings obtained from a truncated SVD of a co-occurrence matrix.

    The class covers the whole pipeline: reading and preprocessing the corpus,
    building the word/word co-occurrence matrix, and factorizing it with
    ``scipy.sparse.linalg.svds``.
    """

    def __init__(self,
                 train_path: Optional[str]='../data/training.txt',
                 k: Optional[int]=5,
                 co_appear_matrix_path: Optional[str]=None,
                 svd_vector_path: Optional[str]=None,
                 num_dimensions: Optional[int]=200):
        """Read the corpus and build (or load) the embedding.

        Args:
            train_path: Path of the training corpus.
                Defaults to '../data/training.txt'.
            k: Context window size, i.e. how many tokens on each side of the
                center token count as co-occurring. Defaults to 5.
            co_appear_matrix_path: Path of a previously saved co-occurrence
                matrix (``.npz``). When None the matrix is rebuilt from the
                corpus and written to 'co_appear_matrix.npz'.
            svd_vector_path: Path of previously saved word vectors (``.npy``).
                When given, neither the matrix nor the SVD is computed.
            num_dimensions: Number of singular triplets requested from
                ``svds``. Defaults to 200.
        """
        self.k = k  # context window size
        self.word2id = {}  # token -> row index
        self.id2word = {}  # row index -> token
        self.words_frequence = {}
        self.num_dimensions = num_dimensions

        # The corpus is always read because it also fills word2id / id2word,
        # which get_cos_sim needs even when vectors are loaded from disk.
        self.words_list = self._read_data(train_path)
        self.words_set = set(self.words_list)
        print(len(self.words_set))

        if svd_vector_path is None:
            if co_appear_matrix_path is None:
                self.co_appear_matrix = None
                self._build_co_appear_matrix()
                save_npz('co_appear_matrix.npz', self.co_appear_matrix)
            else:
                print('加载共现矩阵')
                self.co_appear_matrix = load_npz(co_appear_matrix_path)
            self.word_vectors = None
            self._svd_embedding()
        else:
            print('加载svd embedding')
            self.word_vectors = np.load(svd_vector_path)

    def _read_data(self,
                   train_path: Optional[str]='../data/training.txt'):
        """Read the corpus, drop stop words, stem, and assign word ids.

        Tokens are produced by whitespace splitting. Every token that is not
        an English stop word is stemmed with ``PorterStemmer``; each new stem
        receives the next free id in ``self.word2id`` / ``self.id2word``.

        Args:
            train_path: Path of the training corpus.
                Defaults to '../data/training.txt'.

        Returns:
            list[str]: The stemmed, stop-word-free tokens in corpus order.
        """
        print('去除停用词和恢复词语原型')
        stemmer = PorterStemmer()
        stop_words = set(stopwords.words('english'))
        words_list = []
        # The file is opened exactly once (it used to be opened twice, nested).
        with open(train_path, 'r') as f:
            for line in f:
                for word in line.split():
                    if word in stop_words:
                        continue
                    word = stemmer.stem(word)
                    words_list.append(word)
                    if word not in self.word2id:
                        word_id = len(self.word2id)
                        self.word2id[word] = word_id
                        self.id2word[word_id] = word
        return words_list

    def _build_co_appear_matrix(self):
        """Build the co-occurrence matrix and store it in ``self.co_appear_matrix``.

        For every position in ``self.words_list`` each different word within
        ``self.k`` positions on either side contributes a count of 1 to the
        entry (center id, context id). Occurrences of the center word itself
        inside the window are not counted. The result is a float64
        ``coo_matrix``; repeated (row, col) pairs are stored as separate
        entries.
        """
        print('构建共现矩阵')
        # Honour the configured window; it used to be overwritten with 5.
        window_size = 5 if self.k is None else self.k
        token_ids = [self.word2id[word] for word in self.words_list]
        num_tokens = len(token_ids)
        num_words = len(self.words_set)  # vocabulary size after deduplication
        print(f'num_words:{num_words}')

        row = []
        col = []
        for i, center_word in enumerate(token_ids):
            start = max(0, i - window_size)
            stop = min(i + 1 + window_size, num_tokens)
            for j in range(start, stop):
                context_word = token_ids[j]
                # Also skips j == i, because a token always equals itself.
                if context_word != center_word:
                    row.append(center_word)
                    col.append(context_word)
        count = np.ones(len(row), dtype=np.float64)
        self.co_appear_matrix = coo_matrix((count, (row, col)),
                                           shape=(num_words, num_words),
                                           dtype=np.float64)

    def _svd_embedding(self,
                       save_path: Optional[str]='./svd_vector.npy'):
        """Factorize the co-occurrence matrix and keep the strongest components.

        ``self.num_dimensions`` singular triplets are computed. Of those,
        ``min(100, int(0.75 * number_of_positive_singular_values))`` with the
        largest singular values are kept; the matching columns of ``U``
        become ``self.word_vectors`` and are saved with ``np.save``.

        Args:
            save_path: Destination passed to ``np.save``.
                Defaults to './svd_vector.npy'.
        """
        print('svd分解')
        U, S, _ = svds(self.co_appear_matrix, k=self.num_dimensions)
        # Sort ascending explicitly so the tail always holds the largest
        # singular values, whatever order svds returned them in.
        order = np.argsort(S, kind='stable')
        S = S[order]
        U = U[:, order]
        self.S = S
        print(f'S{S.shape},{S}')
        print(f'计算了{len(S)}个奇异值')
        print(f'U.shape{U.shape}')

        non_zero_singular_values = int(np.count_nonzero(S > 0))
        print(f'总共有{non_zero_singular_values}个非零奇异值')

        select_demisions = min(100, int(non_zero_singular_values * 0.75))
        print(f'选取了{select_demisions}个奇异值')

        # Keep exactly `select_demisions` components. Slicing from
        # len(S) - n (instead of -n) stays correct when n == 0.
        first_kept = len(S) - select_demisions
        selected_sum = S[first_kept:].sum()
        print(f'选取奇异值之和{selected_sum}')

        all_sum = S.sum()
        print(f'全部奇异值之和: {all_sum}')

        # Guard against an all-zero spectrum.
        ratio = selected_sum / all_sum if all_sum > 0 else 0.0
        print(f'选取的奇异值之和与全部奇异值之和的比例: {ratio:.2f}')
        self.word_vectors = U[:, first_kept:]
        np.save(save_path, np.array(self.word_vectors))

    def load_vector(self):
        """Return the word-vector matrix (row ``i`` belongs to word id ``i``)."""
        return self.word_vectors

    def get_cos_sim(self, word1, word2):
        """Return the cosine similarity of two words.

        Both words are lower-cased before lookup. The result is 0 when either
        word is missing from the vocabulary or has a zero-length vector.

        Args:
            word1: First word (already stemmed by the caller if needed).
            word2: Second word (already stemmed by the caller if needed).

        Returns:
            The cosine similarity, or 0 when it cannot be computed.
        """
        word1 = word1.lower()
        word2 = word2.lower()

        if word1 not in self.word2id or word2 not in self.word2id:
            return 0

        word1_vec = self.word_vectors[self.word2id[word1]]
        word2_vec = self.word_vectors[self.word2id[word2]]
        norm_product = np.linalg.norm(word1_vec) * np.linalg.norm(word2_vec)
        if norm_product == 0:
            # A zero vector has no direction; avoid a division that yields NaN.
            return 0.0
        return np.dot(word1_vec, word2_vec) / norm_product
  
        
# Reuse the co-occurrence matrix cached by an earlier run; only the SVD is recomputed.
cached_matrix_path = '/home/wangtuo/workspace/Homework/embedding/src/co_appear_matrix.npz'
svd = SVDEmbedding(co_appear_matrix_path=cached_matrix_path)

        

去除停用词和恢复词语原型
201626
加载共现矩阵
svd分解
S(200,),[  1098.04519633   1100.45801202   1109.25252177   1115.81023497
   1124.2493256    1128.53775306   1131.48622134   1139.68639495
   1142.25493929   1142.61110648   1147.37858439   1152.17372785
   1153.31908651   1159.04543802   1159.19100021   1168.30065258
   1168.74869896   1175.62459389   1181.06811785   1185.67339774
   1188.50931537   1193.90747844   1197.90667399   1205.9779365
   1206.42233614   1210.81906955   1216.08462039   1223.77871782
   1229.85136215   1236.07631286   1243.92818657   1248.64232099
   1254.02277213   1258.75726801   1267.45374673   1278.10360228
   1278.91278103   1280.84186995   1293.08385585   1302.07520185
   1305.71604183   1311.94784112   1323.43885175   1325.82319105
   1329.76523891   1335.40607773   1338.70593561   1349.11053843
   1350.16395907   1351.79095115   1362.80055161   1364.19580763
   1378.76993376   1381.85990783   1391.36590329   1404.10255849
   1417.11904892   1421.38461249   1422.01462217  

In [5]:
# Sanity check: the vocabulary size should equal the number of rows of the vector matrix.
vocabulary_size = len(svd.words_set)
vectors = svd.load_vector()
print(vocabulary_size)
print(vectors.shape)

201626
(201626, 100)


### 推理

In [3]:
def get_txt_sim(test_txt_path='/home/wangtuo/workspace/Homework/embedding/data/test.txt', output_path='../data/output.txt'):
    """Append a cosine similarity to every line of a word-pair file.

    Each input line is split on whitespace; the second and third fields are
    stemmed, lower-cased and scored with the module-level ``svd`` model. The
    stripped line followed by a tab and the similarity is written to
    ``output_path``. Lines with fewer than three fields (for example a
    trailing blank line) carry no word pair and are copied through without a
    score instead of raising ``IndexError``.

    Args:
        test_txt_path: Path of the input file.
            NOTE(review): presumably "<id> <word1> <word2>" per line - confirm.
        output_path: Path of the file to write.
    """
    stemmer = PorterStemmer()
    with open(test_txt_path, 'r') as f_in, open(output_path, 'w') as f_out:
        for line in f_in:
            line = line.strip()
            words = line.split()
            if len(words) < 3:
                # No word pair on this line; keep it so line numbers still match.
                f_out.write(line + '\n')
                continue
            # The pair lives in fields 1 and 2; field 0 is not used here.
            word1 = stemmer.stem(words[1])
            word2 = stemmer.stem(words[2])
            sim = svd.get_cos_sim(word1.lower(), word2.lower())
            f_out.write(line + '\t' + str(sim) + '\n')

# Score every word pair of the test set and write the result to output1.txt.
get_txt_sim(
    test_txt_path='/home/wangtuo/workspace/Homework/embedding/data/test.txt',
    output_path='output1.txt',
)

