In [1]:
import jieba
from gensim import corpora,models,similarities

In [2]:
documents = ['工业互联网平台的核心技术是什么',
            '工业现场生产过程优化场景有哪些',
            '互联网泡沫即将过去']
text3 = "大厂场景固定，泡沫比较少"

In [3]:
def word_cut(doc):
    seg = [jieba.lcut(w) for w in doc]
    return seg

texts= word_cut(documents)
texts

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/dm/n5w6s3vs0yv6nm2h3smdvs380000gn/T/jieba.cache
Loading model cost 0.803 seconds.
Prefix dict has been built successfully.


[['工业', '互联网', '平台', '的', '核心技术', '是', '什么'],
 ['工业', '现场', '生产', '过程', '优化', '场景', '有', '哪些'],
 ['互联网', '泡沫', '即将', '过去']]

In [4]:
# 为语料库中出现的所有单词分配了一个唯一的整数id
dictionary = corpora.Dictionary(texts)
dictionary.keys(), dictionary.token2id

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
 {'互联网': 0,
  '什么': 1,
  '工业': 2,
  '平台': 3,
  '是': 4,
  '核心技术': 5,
  '的': 6,
  '优化': 7,
  '哪些': 8,
  '场景': 9,
  '有': 10,
  '现场': 11,
  '生产': 12,
  '过程': 13,
  '即将': 14,
  '泡沫': 15,
  '过去': 16})

In [5]:
# 通过doc2bow转化为稀疏向量
corpus=[dictionary.doc2bow(text)for text in texts]
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(0, 1), (14, 1), (15, 1), (16, 1)]]

In [6]:
test_corpus = dictionary.doc2bow(jieba.lcut(text3))

In [7]:
tfidf = models.TfidfModel(corpus)

In [8]:
index = similarities.SparseMatrixSimilarity(tfidf[corpus],len(dictionary.keys()))

In [9]:
index[tfidf[test_corpus]]

array([0.        , 0.26469827, 0.3992843 ], dtype=float32)

In [10]:
texts

[['工业', '互联网', '平台', '的', '核心技术', '是', '什么'],
 ['工业', '现场', '生产', '过程', '优化', '场景', '有', '哪些'],
 ['互联网', '泡沫', '即将', '过去']]

In [11]:
jieba.lcut(text3)

['大厂', '场景', '固定', '，', '泡沫', '比较', '少']

In [12]:
#。word2vec
context = [
    "The recently introduced continuous Skip-gram model is an efficient method for learning high-quality distributed vector representations that capture a large num- ber of precise syntactic and semantic word relationships. In this paper we present several extensions that improve both the quality of the vectors and the training speed. By subsampling of the frequent words we obtain significant speedup and also learn more regular word representations. We also describe a simple alterna- tive to the hierarchical softmax called negative sampling.",
    "We propose two novel model architectures for computing continuous vector repre- sentations of words from very large data sets. The quality of these representations is measured in a word similarity task, and the results are compared to the previ- ously best performing techniques based on different types of neural networks. We observe large improvements in accuracy at much lower computational cost, i.e. it takes less than a day to learn high quality word vectors from a 1.6 billion words data set. Furthermore, we show that these vectors provide state-of-the-art perfor- mance on our test set for measuring syntactic and semantic word similarities."
]

In [13]:
dict_corpus = [jieba.lcut(word) for word in context]
dict_corpus

[['The',
  ' ',
  'recently',
  ' ',
  'introduced',
  ' ',
  'continuous',
  ' ',
  'Skip',
  '-',
  'gram',
  ' ',
  'model',
  ' ',
  'is',
  ' ',
  'an',
  ' ',
  'efficient',
  ' ',
  'method',
  ' ',
  'for',
  ' ',
  'learning',
  ' ',
  'high',
  '-',
  'quality',
  ' ',
  'distributed',
  ' ',
  'vector',
  ' ',
  'representations',
  ' ',
  'that',
  ' ',
  'capture',
  ' ',
  'a',
  ' ',
  'large',
  ' ',
  'num',
  '-',
  ' ',
  'ber',
  ' ',
  'of',
  ' ',
  'precise',
  ' ',
  'syntactic',
  ' ',
  'and',
  ' ',
  'semantic',
  ' ',
  'word',
  ' ',
  'relationships',
  '.',
  ' ',
  'In',
  ' ',
  'this',
  ' ',
  'paper',
  ' ',
  'we',
  ' ',
  'present',
  ' ',
  'several',
  ' ',
  'extensions',
  ' ',
  'that',
  ' ',
  'improve',
  ' ',
  'both',
  ' ',
  'the',
  ' ',
  'quality',
  ' ',
  'of',
  ' ',
  'the',
  ' ',
  'vectors',
  ' ',
  'and',
  ' ',
  'the',
  ' ',
  'training',
  ' ',
  'speed',
  '.',
  ' ',
  'By',
  ' ',
  'subsampling',
  ' ',
  'of',
  '

In [14]:
w2c_model = models.Word2Vec(dict_corpus,window=50,min_count=1)

In [15]:
w2c_model.wv.most_similar("vectors", 15)

[('simple', 0.14790154993534088),
 ('Skip', 0.13093295693397522),
 ('called', 0.12758195400238037),
 ('data', 0.12099821120500565),
 ('several', 0.10152021795511246),
 ('vector', 0.09153813868761063),
 ('lower', 0.08061347156763077),
 ('very', 0.08015013486146927),
 ('subsampling', 0.07660150527954102),
 ('billion', 0.07590745389461517)]

In [16]:
w2c_model.wv["vectors"]

array([-0.00899808,  0.02343618, -0.00635184, -0.0005505 , -0.01821155,
       -0.04638669,  0.01477248,  0.05894374, -0.02766863, -0.03499538,
       -0.00179536, -0.03759007, -0.01116655,  0.02498997,  0.0157638 ,
       -0.00670933,  0.01024574, -0.00571091, -0.02050687, -0.05088061,
        0.02079174,  0.01903727,  0.01672825,  0.00754327,  0.0146605 ,
        0.00693148, -0.01916442, -0.01884183, -0.02243941,  0.00109882,
        0.0171978 , -0.00439178,  0.0317363 , -0.04200909, -0.00687773,
        0.03272486,  0.01748733, -0.00990182, -0.00228411, -0.03727836,
       -0.01279959, -0.00306173, -0.01402469, -0.00011032,  0.01454152,
       -0.00794416, -0.03582023,  0.00314365,  0.01354054,  0.01356626,
       -0.00491282, -0.01408146, -0.01526404, -0.01050801,  0.01041296,
        0.00497275,  0.01763285, -0.01106232, -0.02070753,  0.02627723,
        0.01662021,  0.00347114,  0.0165413 ,  0.0059672 , -0.0120734 ,
        0.02825534,  0.0172846 ,  0.03739831, -0.02354981,  0.03

In [None]:
import jieba
from gensim import corpora,models,similarities
from collections import defaultdict   #用于创建一个空的字典，在后续统计词频可清理频率少的词语
#1、读取文档
doc1="./d1.txt"
doc2="./d2.txt"
d1=open(doc1,encoding='GBK').read()
d2=open(doc2,encoding='GBK').read()
#2、对要计算的文档进行分词
data1=jieba.cut(d1)
data2=jieba.cut(d2)
#3、对分词完的数据进行整理为指定格式
data11=""
for i in data1:
    data11+=i+" "
data21=""
for i in data2:
    data21+=i+" "
documents=[data11,data21]
texts=[[word for word in document.split()] for document in documents]
#4、 计算词语的频率
frequency=defaultdict(int)
for text in texts:
    for word in text:
        frequency[word]+=1
'''
#5、对频率低的词语进行过滤（可选）
texts=[[word for word in text if frequency[word]>10] for text in texts]
'''
#6、通过语料库将文档的词语进行建立词典
dictionary=corpora.Dictionary(texts)
dictionary.save("./dict.txt")    #可以将生成的词典进行保存
#7、加载要对比的文档
doc3="./d3.txt"
d3=open(doc3,encoding='GBK').read()
data3=jieba.cut(d3)
data31=""
for i in data3:
    data31+=i+" "
#8、将要对比的文档通过doc2bow转化为稀疏向量
new_xs=dictionary.doc2bow(data31.split())
#9、对语料库进一步处理，得到新语料库
corpus=[dictionary.doc2bow(text)for text in texts]
#10、将新语料库通过tf-idf model 进行处理，得到tfidf
tfidf=models.TfidfModel(corpus)
#11、通过token2id得到特征数
featurenum=len(dictionary.token2id.keys())
#12、稀疏矩阵相似度，从而建立索引
index=similarities.SparseMatrixSimilarity(tfidf[corpus],num_features=featurenum)
#13、得到最终相似结果
sim=index[tfidf[new_xs]]
print(sim)