# Gensim学习

使用list迭代的方式，将一段corpus转换称对应的dictionary。单词与其序列的映射关系。

In [10]:
import pprint


text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# 创建一个集合 用来存储一些介词
stoplist = set('for a of the and to in'.split(' '))

# print(type(stoplist))

# .lower() 将文本转换成小写字母 .split() 对document进行分词
# 最后通过if来排除常见的介词
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in text_corpus
]

# 计算单词出现的频率
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# 输出不止出现一次的单词
# 通过双重循环 将二维数组转换为一维数组
processed_corpus = [
    [ token for token in text if frequency[token]> 1 ] for text in texts
]

pprint.pprint(processed_corpus)
print('预处理语料库长度：',len(processed_corpus))

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]
预处理语料库长度： 9


使用gensim内的ditionary生成字典

In [11]:
from gensim import corpora

dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)
pprint.pprint(dictionary.token2id)

Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>
{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


In [3]:
new_doc = "Human computer interation"
# new_doc = "Human use computer, Human eps computer"

new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

NameError: name 'dictionary' is not defined

在每个元组中，第一个数值表示单词的ID，第二个单词表示单词出现的次数

In [4]:
bow_corpus = [ dictionary.doc2bow(text.lower().split()) for text in text_corpus]
pprint.pprint(bow_corpus)

NameError: name 'dictionary' is not defined

## Model

In [32]:
from gensim import models

tfidf = models.TfidfModel(bow_corpus)

words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


每个元组中，第一个数值表示单词的ID，第二个表示tf-idf权重

In [36]:
from gensim import similarities

index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)

query_document = 'system enginnering'.split()
query_bow = dictionary.doc2bow(query_document)
print('query_bow\n', query_bow)

sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))

query_bow
 [(5, 1)]
[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [37]:
for document_number, score in sorted(enumerate(sims), key=lambda x:x[1], reverse=True):
    print(document_number, score)

3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


In [40]:
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### From Strings to Vectors

In [16]:
import pprint
from collections import defaultdict

# 导入9句话
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

stoplist = set('for a of the and to in'.split(' '))
texts = [
    [word for word in document.lower().split() if word not in stoplist ] 
    for document in documents
]

frequency = defaultdict(int)
for text in texts:
    for word in text:
        # 使用方法相当于字典
        frequency[word] += 1
        
# 去除只出现一token次的单词
texts = [[token for token in text if frequency[token]>1 ] for text in texts]

from gensim import corpora
dictionary = corpora.Dictionary(texts)
dictionary.save('./tmp/deerwester.dict')
print(dictionary)

Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


In [17]:
# 得到单词的ID
print(dictionary.token2id)

new_doc = "human computer iteraction"
new_vect = dictionary.doc2bow(new_doc.lower().split())
print('new_vect: ', '\n', new_vect)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}
new_vect:  
 [(0, 1), (1, 1)]


In [18]:
corpus = [dictionary.doc2bow(text) for text in texts]
# 保存上述数据
corpora.MmCorpus.serialize('./tmp/deerwester.mm', corpus)
pprint.pprint(corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [27]:
from smart_open import open

class MyCorpus:
    def __iter__(self):
        # 这里也可以通过http协议来访问网络上的文件
        for line in open('./tmp/mycorpus.txt'):
            # 通过迭代返回内容
            yield dictionary.doc2bow(line.lower().split())

In [28]:
corpus_memory_friendly = MyCorpus()
print(corpus_memory_friendly)

# corpus现在是个对象 
# 不能直接打印输出 要通过循环的方式遍历
for vector in corpus_memory_friendly:
    print(vector)

<__main__.MyCorpus object at 0x00000182CB070FD0>
[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


虽然输出结果与上述相同，但是通过这样的方式，哦对内存是更加友好的。

In [40]:
# 将corpus中的所有单词构成一个字典
dictionary = corpora.Dictionary(line.lower().split() for line in open('./tmp/mycorpus.txt'))
# 获得常用符号的id
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
# 获得只出现一次的id
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
          if docfreq == 1]

# 将上述id进行过滤
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
print(dictionary)


Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


In [44]:
corpus = [[(1, 0.5)], []]
# 保存一个Matrix Market文件
corpora.MmCorpus.serialize('./tmp/corpus.mm', corpus)

corpus = corpora.MmCorpus('./tmp/corpus.mm')
print(corpus)
print(list(corpus))

MmCorpus(2 documents, 2 features, 1 non-zero entries)
[[(1, 0.5)], []]


In [54]:
import gensim
import numpy as np
numpy_matrix = np.random.randint(10, size=[5, 2])
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
print(numpy_matrix)

for doc in corpus:
    print(doc)

[[7 0]
 [6 3]
 [2 9]
 [1 8]
 [1 7]]
[(0, 7.0), (1, 6.0), (2, 2.0), (3, 1.0), (4, 1.0)]
[(1, 3.0), (2, 9.0), (3, 8.0), (4, 7.0)]


## Topics and Transformations

In [63]:
import pprint
from collections import defaultdict
from gensim import corpora

# 导入9句话
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

stoplist = set('for a of the and to in'.split(' '))
texts = [
    [word for word in document.lower().split() if word not in stoplist ] 
    for document in documents
]

frequency = defaultdict(int)
for text in texts:
    for word in text:
        # 使用方法相当于字典
        frequency[word] += 1
        
# 去除只出现一token次的单词
texts = [[token for token in text if frequency[token]>1 ] for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

from gensim import models

tfidf = models.TfidfModel(corpus)

In [64]:
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


`tfidf`可以用于将任何矩阵从旧的表示方法（单词的出现次数-整形）转换为新的表示方法（tfidf权重-浮点型）

In [66]:
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [67]:
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi_model[corpus_tfidf]

通过Laten Semantic Indexing将Tf-Idf转换成一个latent 2-D空间。

In [68]:
lsi_model.print_topics(2)

[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

## Word2Vec 模型使用
https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

传统方式：使用Bag of words
* 从一些句子之中得到一些单词，对他们分别标号
* 对于一条新的句子，按照单词序号记录单词出现的频率，生成一个表格

这种方式虽然能够很容易进行，但是有不足之处
1. 这种方式失去了句子的顺序，John likes Mary 和 Mary likes John
2. 向量之间的距离并不反映语义上的相似度


### Word2Vec demo

In [1]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')



In [4]:
for index, word in enumerate(wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said


In [6]:
vec_king = wv['king']

try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon does not appear in this model'")

The word 'cameroon does not appear in this model'


In [7]:
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [8]:
print(wv.most_similar(positive=['car', 'minivan'], topn=5))

[('SUV', 0.853219211101532), ('vehicle', 0.8175785541534424), ('pickup_truck', 0.7763689160346985), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.756571888923645)]


In [9]:
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

car


In [12]:
print(wv.similarity('king', 'man'))
print(wv.similarity('queen', 'woman'))

print(wv.most_similar(positive=['king', 'man', 'queen'], topn=5))

0.22942671
0.31618136
[('princess', 0.6530032753944397), ('monarch', 0.6512453556060791), ('prince', 0.6426263451576233), ('kings', 0.6259569525718689), ('queens', 0.5816447138786316)]


## Training My Own Model

In [28]:
from gensim.test.utils import datapath
from gensim import utils
import pprint

class MyCorpus:
    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        for line in open(corpus_path):
            yield utils.simple_preprocess(line)
def test():
    sentences = MyCorpus()
    count = 0
    num = 2
    for sentence in sentences:
        if count == 2:
            break
        pprint.pprint(sentence)
        count += 1

通过MyCorpus来训练模型。目前不要太担心这些训练参数

In [19]:
import gensim.models

sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

通过我们训练的模型 我们也能够实现上述demo的效果

In [23]:
vec_king = model.wv['king']

for index, word in enumerate(wv.index_to_key):
    if index == 10:
        break;
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said


## 存储和加载模型

In [29]:
import tempfile

with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
    temporary_filepath = tmp.name
    # 保存模型
    model.save(temporary_filepath)
    # 删除模型
    new_model = gensim.models.Word2Vec.load(temporary_filepath)

##  训练你参数

min_count: 有些单词出现的次数不足以了解他的含义

vector_size: 词向量的维度大小