# word2vec训练词向量

In [12]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import nltk.data
from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

In [13]:
def load_dataset(name, nrows=None):
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('..', 'data', datasets[name])
    df = pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df

## 读入无标签数据
用于训练生成word2vec词向量

In [14]:
df = load_dataset('unlabeled_train')
df.head()

Number of reviews: 50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


## 和第一个笔记本一样做数据的预处理
稍稍有一点不一样的是，我们留了个候选，可以去除停用词，也可以不去除停用词

In [15]:
eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words


In [16]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    n = 0
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
            return f(*args, **kwargs)
        return wrapped

@print_call_counts
def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.str.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences

In [24]:
%time
sentences = sum(split_sentences(df.review), [])
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

CPU times: total: 0 ns
Wall time: 0 ns


TypeError: 'NoneType' object is not callable

In [28]:
tokenizer.tokenize(df.review.str.strip())

TypeError: expected string or bytes-like object

## 用gensim训练词嵌入模型

In [None]:
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s :%(message)s', level=logging.INFO)

In [None]:
# 设定词向量训练的参数
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)

In [None]:
print('Training model...')
model = Word2Vec(sentences)