In [20]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
sent1 = "The cat is walking in the bedroom."
sent2 = "A dog was running across the kitchen."
sents = [sent1, sent2]

In [3]:
count_vec = CountVectorizer()
print(count_vec.fit_transform(sents).toarray())
print(count_vec.get_feature_names())

[[0 1 1 0 1 1 0 0 2 1 0]
 [1 0 0 1 0 0 1 1 1 0 1]]
['across', 'bedroom', 'cat', 'dog', 'in', 'is', 'kitchen', 'running', 'the', 'walking', 'was']


In [4]:
import nltk
# 对句子进行分割正规化
tokens1 = nltk.word_tokenize(sent1)
tokens2 = nltk.word_tokenize(sent2)
# 整理词汇表
vocab1 = sorted(set(tokens1))
vocab2 = sorted(set(tokens2))
# 寻找各词汇的原始词根
stemmer = nltk.stem.PorterStemmer()
stem1 = [stemmer.stem(t) for t in tokens1]
stem2 = [stemmer.stem(t) for t in tokens2]
print(stem1)
print(stem2)
# 初始化词性标注器，对每个词汇进行标注
pos_tag1 = nltk.tag.pos_tag(tokens1)
pos_tag2 = nltk.tag.pos_tag(tokens2)
print(pos_tag1)
print(pos_tag2)

['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']
['A', 'dog', 'wa', 'run', 'across', 'the', 'kitchen', '.']
[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN'), ('.', '.')]
[('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN'), ('.', '.')]


In [5]:
from sklearn.datasets import fetch_20newsgroups
from bs4 import BeautifulSoup
import nltk
import re

In [6]:
news = fetch_20newsgroups(subset="all")

In [7]:
x, y = news.data, news.target

In [8]:
# 将每条新闻中的句子逐一剥离， 并返回一个句子列表
def news_to_sent(news):
    news_text = BeautifulSoup(news).get_text()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sent = tokenizer.tokenize(news_text)
    result = [re.sub('[^a-zA-Z]',' ',sent.lower().strip()).split() for sent in raw_sent]
    return result

In [61]:
sentences = []
for i in x:
    sentences += news_to_sent(i)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [10]:
from gensim.models import word2vec



In [62]:
# 词向量模型训练
model = word2vec.Word2Vec(sentences=sentences, 
                          workers=2, 
                          size=300, 
                          min_count=20, 
                          window=5, 
                          sample=1e-3)

In [67]:
# 表示当前训练好的词向量为最终版，可加快训练速度
model.init_sims(replace=True)

In [69]:
# 利用训练好的模型，寻找文本中与‘morning’最相关的 20个词汇
model.most_similar('morning', topn=20)

[('afternoon', 0.795380711555481),
 ('weekend', 0.7631502151489258),
 ('evening', 0.7528274059295654),
 ('saturday', 0.7292416095733643),
 ('night', 0.705287754535675),
 ('friday', 0.6900104880332947),
 ('newspaper', 0.6784282326698303),
 ('summer', 0.6652509570121765),
 ('sunday', 0.6510100364685059),
 ('week', 0.6430712938308716),
 ('monday', 0.6334890723228455),
 ('month', 0.6218483448028564),
 ('thursday', 0.6154978275299072),
 ('tuesday', 0.614425778388977),
 ('yesterday', 0.6131061911582947),
 ('season', 0.6128036975860596),
 ('november', 0.6029937863349915),
 ('july', 0.5953641533851624),
 ('february', 0.5948885679244995),
 ('century', 0.5914508104324341)]

In [70]:
model.most_similar('email', topn=20)

[('mail', 0.7357809543609619),
 ('contact', 0.7029544115066528),
 ('mailed', 0.6643069982528687),
 ('replies', 0.6533781290054321),
 ('address', 0.6261845827102661),
 ('send', 0.6236612796783447),
 ('request', 0.623630166053772),
 ('archie', 0.62105393409729),
 ('listserv', 0.6111855506896973),
 ('internet', 0.6090856790542603),
 ('sas', 0.6070916652679443),
 ('compuserve', 0.6016649007797241),
 ('fax', 0.6003612279891968),
 ('snail', 0.579025387763977),
 ('rend', 0.5787627696990967),
 ('ftp', 0.573432207107544),
 ('subscription', 0.5726449489593506),
 ('finger', 0.5684045553207397),
 ('amin', 0.567340612411499),
 ('kusmierczak', 0.5647874474525452)]