### 切词，并把切好的词保存到文件中

In [1]:
def cut_words(origin_path, seg_path):
    '''
    Segment a text file with jieba and save the words, separated by spaces.

    :origin_path path of the raw input text
    :seg_path path the segmented text is written to (overwritten if present)
    '''
    import jieba

    # Read the raw bytes and release the input file before doing any other
    # work; the bytes are handed to jieba undecoded, as in the original.
    with open(origin_path, 'rb') as src:
        document = src.read()

    # cut_all=False selects jieba's accurate mode rather than full mode.
    segmented = ' '.join(jieba.cut(document, cut_all=False))

    # The result is encoded to UTF-8 bytes, so the output file is opened in
    # binary mode. The with-blocks close both files; no explicit close()
    # calls are needed.
    with open(seg_path, 'wb') as dst:
        dst.write(segmented.encode('utf-8'))

### 使用word2vec训练得出词向量，并返回

In [2]:
def w2v(seg_path, model_path):
    '''
    Train a word2vec model on segmented text, save it, and return it.

    :seg_path path of the segmented text file
    :model_path path the trained model is saved to
    :return the trained model
    '''
    from gensim.models.word2vec import Text8Corpus, Word2Vec

    # Load the segmented text through the Text8Corpus class.
    corpus = Text8Corpus(seg_path)
    # Train with 100-dimensional vectors, a window of 3, hs=1, and
    # min_count=1 so that no word is dropped for being too rare.
    trained = Word2Vec(corpus, size=100, window=3, min_count=1, hs=1)
    # Persist the model before handing it back to the caller.
    trained.save(model_path)
    return trained

### 加载训练好的模型

In [3]:
def load_model(model_path):
    '''
    Load a previously saved word2vec model.

    :model_path path the model was saved to
    :return the loaded word2vec model
    '''
    from gensim.models.word2vec import Word2Vec

    loaded = Word2Vec.load(model_path)
    return loaded

In [4]:
%%time
# 原始文本路径
origin_path = '/home/beanyon/Desktop/word2vec/Sample/C01/0.txt'
# 切好词后的文本保存路径
seg_path = '/home/beanyon/Desktop/word2vec/Seg/0.txt'
# 保存训练好的模型的路径
model_path = '/home/beanyon/Desktop/word2vec/Seg/0.model'
# 切词并保存
cut_words(origin_path, seg_path)
# 训练word2vec模型
model = w2v(seg_path, model_path)
# 加载训练好的模型
# model = load_model(model_path)
# 输出词向量
print model.wv[u'训练']

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.168 seconds.
Prefix dict has been built succesfully.


[-1.6505637e-03 -3.3998720e-03  1.0265629e-03 -1.7260675e-03
  3.4075826e-03  2.3544950e-03 -3.3103686e-03 -4.2839167e-03
 -1.4623058e-03  2.1523188e-03  7.5366150e-04  4.8724576e-03
 -4.2742132e-03 -1.0996771e-03  2.5720089e-03 -5.3104747e-04
 -3.3513962e-03 -4.3089269e-03 -7.5204991e-04 -3.6322135e-03
  2.4498779e-05  2.4105241e-03  1.3389625e-03 -6.4274948e-04
  1.1462485e-03 -1.0254171e-03  2.3259264e-03 -1.1944470e-03
 -1.7984301e-03  1.6001158e-03  4.2629689e-03  3.8752411e-04
  3.2683592e-03 -3.9898162e-03  2.7465296e-03 -1.7919563e-03
 -7.7371503e-04 -1.8028897e-03 -3.5846534e-03 -2.8045571e-03
 -2.8334069e-03 -6.5411645e-04  2.1093716e-03 -1.6120247e-03
  7.1445259e-04  6.9027237e-04  3.8261211e-03  3.1428295e-03
 -4.7569186e-03 -1.7900468e-03  3.2942255e-05 -1.6126178e-03
 -3.2078482e-03  3.6376638e-03 -2.9149654e-03  1.1147796e-03
  3.9742733e-03 -4.3940148e-03 -3.4566971e-03  4.6155234e-03
 -5.0224629e-03  2.9614149e-03 -3.7506961e-03 -4.3587768e-03
  1.7738793e-03  2.26112