In [2]:
import pandas as pd

In [3]:
train_data=pd.read_csv('data/labeledTrainData.tsv',header=0, delimiter='\t', quoting=3)
test_data=pd.read_csv('data/testData.tsv',header=0, delimiter='\t', quoting=3)
unlabeled_data=pd.read_csv('data/unlabeledTrainData.tsv',header=0, delimiter='\t', quoting=3)

In [4]:
print("train:",train_data.shape, train_data[:20])
print("test:", test_data.shape, test_data[:20])
print("unlabeled train:",unlabeled_data.shape, unlabeled_data[:20])

train: (25000, 3)            id  sentiment                                             review
0    "5814_8"          1  "With all this stuff going down at the moment ...
1    "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2    "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3    "3630_4"          0  "It must be assumed that those who praised thi...
4    "9495_8"          1  "Superbly trashy and wondrously unpretentious ...
5    "8196_8"          1  "I dont know why people think this is such a b...
6    "7166_2"          0  "This movie could have been very good, but com...
7   "10633_1"          0  "I watched this video at a friend's house. I'm...
8     "319_1"          0  "A friend of mine bought this film for £1, and...
9   "8713_10"          1  "<br /><br />This movie is full of references....
10   "2486_3"          0  "What happens when an army of wetbacks, towelh...
11  "6811_10"          1  "Although I generally do not like remakes be

### 文本预处理
1. 去除HTML标签：BeautifulSoup
2. 将所有词转为小写
2. 去除数字和标点，用空格置换，（后续可以保留类似于"!!!" or ":-(" 这样的带有情感的符号）
3. 去除停用词(可选，因为在word2vec中更多的词可以产生更高质量的词向量）：nltk.stopwords
4. 词干提取和词性还原：https://www.cnblogs.com/cwp-bg/p/9510513.html

In [5]:
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer#词干提取
from nltk.stem import WordNetLemmatizer#词形还原

In [6]:
def txt2wordlist(raw_txt, remove_stop):
    l_stem=LancasterStemmer()#不能用PorterStemmer，对于一些未登录词会报错，比如OED
    w_lem=WordNetLemmatizer()
    txt=BeautifulSoup(raw_txt,'lxml')
    txt=txt.get_text().lower()
    txt=re.sub('[^a-zA-Z]',' ',txt)
    word=word_tokenize(txt)
    if remove_stop:
        stops=set(stopwords.words('english'))
        word=[w for w in word if w not in stops]
    word=[l_stem.stem(w) for w in word]
    word=[w_lem.lemmatize(w) for w in word]
    return word

输入Word2vec的是单个句子，一个句子是一个list，由词组成。
所以需要将文本先分割成句子。
使用NLTK中的**punkt**标记生成器来进行句子分割

In [7]:
from nltk import sent_tokenize

In [8]:
train_data['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [9]:
sent_list=[]
for review in train_data['review']:
    sent_list+=[txt2wordlist(sent,False) for sent in sent_tokenize(review)]
for review in unlabeled_data['review']:
    sent_list+=[txt2wordlist(sent,False) for sent in sent_tokenize(review)]
len(sent_list)

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


795538

In [10]:
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)#打印日志的时间、级别名称、信息
from gensim.models import word2vec
num_worker=4#并行的线程数
num_feature=300#词向量的维度
num_min_count=40#最小词频，小于这个词频的就不纳入分析
num_window=10#上下文窗口大小
num_downsample=1e-3#高频词汇的随机降采样的配置阈值，默认为1e-3，范围是(0,1e-5)
model=word2vec.Word2Vec(sent_list, workers=num_worker, size=num_feature, min_count=num_min_count, window=num_window, sample=num_downsample)



In [11]:
#model.init_sims(replace=True)#确定模型训练好之后，固定化模型，然后保存
model_name = "model/300features_40minwords_10context"
model.save(model_name)

[《How to Generate a Good Word Embedding?》导读](http://licstar.net/archives/620) 给出了一个word2vec模型训练套路：

首先根据具体任务，选一个领域相似的语料，在这个条件下，语料越大越好。然后下载一个 word2vec 的新版（14年9月更新），语料小（小于一亿词，约 500MB 的文本文件）的时候用 Skip-gram 模型，语料大的时候用 CBOW 模型。最后记得设置迭代次数为三五十次，维度至少选 50，就可以了。

### 探索模型结果

In [17]:
model.wv.doesnt_match("man woman child kitchen".split())#推断在一个集合里，哪一个单词与其他单词最不相似

  if np.issubdtype(vec.dtype, np.int):


'child'

In [18]:
model.wv.doesnt_match('france england germany berlin'.split())

  if np.issubdtype(vec.dtype, np.int):


'berlin'

In [19]:
model.wv.doesnt_match('paris berlin london austria'.split())

  if np.issubdtype(vec.dtype, np.int):


'berlin'

In [20]:
model.wv.most_similar('man')

  if np.issubdtype(vec.dtype, np.int):


[('millionair', 0.4716331362724304),
 ('wom', 0.4708542227745056),
 ('businessm', 0.442435622215271),
 ('widow', 0.4354481101036072),
 ('men', 0.43412673473358154),
 ('lady', 0.43173521757125854),
 ('chap', 0.41588473320007324),
 ('lawy', 0.4075814485549927),
 ('doct', 0.4056174159049988),
 ('convict', 0.40403640270233154)]

In [16]:
model.wv.most_similar('queen')

  if np.issubdtype(vec.dtype, np.int):


[('princess', 0.6119894981384277),
 ('countess', 0.5805540680885315),
 ('victor', 0.5745164155960083),
 ('mistress', 0.550065815448761),
 ('antoinet', 0.5489809513092041),
 ('brid', 0.5339058637619019),
 ('goddess', 0.5314623117446899),
 ('maid', 0.5292067527770996),
 ('bombshel', 0.5221601724624634),
 ('stepmoth', 0.5083791017532349)]

AttributeError: 'Word2VecKeyedVectors' object has no attribute 'doesnt_mtach'