In [1]:
import pandas as pd
import numpy as np
import nltk.data
import logging
import csv
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier

from bs4 import MarkupResemblesLocatorWarning
import warnings
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

import sys
sys.path.append('/kaggle/input/kaggleword2vecutility/')
from KaggleWord2VecUtility import KaggleWord2VecUtility


def makeFeatureVec(words, model, num_features):
    """Average the vectors of all in-vocabulary words of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    model : object exposing ``model.wv``
        ``model.wv.key_to_index`` is used for the vocabulary membership
        test and ``model.wv[word]`` for the vector lookup.
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of the words found in the
        vocabulary, or an all-zero ``float32`` vector of length
        ``num_features`` when none of the words is known.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    # Test membership against the vocabulary mapping itself. Copying it into
    # a fresh set on every call costs O(vocabulary size) per review, which
    # dominates the runtime when this function is called once per review.
    vocab = model.wv.key_to_index
    for word in words:
        if word in vocab:
            nwords += 1
            featureVec = np.add(featureVec, model.wv[word])
    # Guard against division by zero for reviews with no known words.
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    """Build one averaged feature vector per review.

    Parameters
    ----------
    reviews : sequence
        Sized collection of tokenized reviews; each item is forwarded to
        ``makeFeatureVec``.
    model : object
        Passed through unchanged to ``makeFeatureVec``.
    num_features : int
        Number of columns of the result.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(reviews), num_features)`` whose
        row ``i`` is the vector computed for ``reviews[i]``.
    """
    total = len(reviews)
    matrix = np.zeros((total, num_features), dtype="float32")
    for row, tokens in enumerate(reviews):
        # Progress message every 1000 reviews (including the very first).
        if not row % 1000:
            print("Review {} of {}".format(row, total))
        matrix[row] = makeFeatureVec(tokens, model, num_features)
    return matrix


def getCleanReviews(reviews):
    """Convert every entry of ``reviews["review"]`` into a word list.

    Each review is handed to ``KaggleWord2VecUtility.review_to_wordlist``
    with ``remove_stopwords=True``; the results are returned as a list in
    the same order as the input.
    """
    return [
        KaggleWord2VecUtility.review_to_wordlist(text, remove_stopwords=True)
        for text in reviews["review"]
    ]

In [2]:
if __name__ == '__main__':
    # End-to-end pipeline: read the three TSV files, train Word2Vec on the
    # sentences of the labeled + unlabeled reviews, represent each review as
    # the average of its word vectors, fit a random forest on the labeled
    # reviews and write predictions for the test reviews to a CSV file.
    train_path = '/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv/labeledTrainData.tsv'
    test_path = '/kaggle/input/word2vec-nlp-tutorial/testData.tsv/testData.tsv'
    unlabeled_path = '/kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv/unlabeledTrainData.tsv'

    # Read the data. QUOTE_NONE makes pandas treat quote characters as
    # ordinary text instead of field delimiters.
    train = pd.read_csv(train_path, header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
    test = pd.read_csv(test_path, header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
    unlabeled_train = pd.read_csv(unlabeled_path, header=0, delimiter="\t", quoting=csv.QUOTE_NONE)

    print("Read {} labeled train reviews, {} labeled test reviews, and {} unlabeled reviews\n".format(
        train["review"].size, test["review"].size, unlabeled_train["review"].size))

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Split reviews into sentences; the labeled and the unlabeled reviews
    # both contribute training sentences for Word2Vec.
    sentences = []
    print("Parsing sentences from training set")
    for review in train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    print("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    # Word2Vec parameters
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    num_features = 300      # passed as vector_size
    min_word_count = 40     # passed as min_count
    num_workers = 4         # passed as workers
    context = 10            # passed as window
    downsampling = 1e-3     # passed as sample

    print("Training Word2Vec model...")
    # NOTE(review): per the gensim documentation, `seed` alone does not give
    # bit-identical vectors when workers > 1 -- confirm whether exact
    # reproducibility of the embeddings is required.
    model = Word2Vec(sentences, workers=num_workers, vector_size=num_features,
                     min_count=min_word_count, window=context, sample=downsampling, seed=1)

    model_name = "/kaggle/working/300features_40minwords_10context"
    model.save(model_name)

    # Examples: quick sanity checks of the trained embeddings
    print(model.wv.doesnt_match("man woman child kitchen".split()))
    print(model.wv.doesnt_match("france england germany berlin".split()))
    print(model.wv.doesnt_match("paris berlin london austria".split()))
    print(model.wv.most_similar("man"))
    print(model.wv.most_similar("queen"))
    print(model.wv.most_similar("awful"))

    # Create the averaged feature vectors
    print("Creating average feature vecs for training reviews")
    trainDataVecs = getAvgFeatureVecs(getCleanReviews(train), model, num_features)

    print("Creating average feature vecs for test reviews")
    testDataVecs = getAvgFeatureVecs(getCleanReviews(test), model, num_features)

    # Random forest training and prediction.
    # random_state is pinned so that, for a given feature matrix, the fitted
    # forest and its predictions are the same on every run (the Word2Vec
    # call above is already seeded with 1).
    forest = RandomForestClassifier(n_estimators=100, random_state=1)
    print("Fitting a random forest to labeled training data...")
    forest = forest.fit(trainDataVecs, train["sentiment"])

    result = forest.predict(testDataVecs)

    # Write the predictions to /kaggle/working/
    output_path = "/kaggle/working/Word2Vec_AverageVectors.csv"
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv(output_path, index=False, quoting=csv.QUOTE_NONE)
    print("Wrote {}".format(output_path))

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews

Parsing sentences from training set
Parsing sentences from unlabeled set
Training Word2Vec model...
kitchen
berlin
paris
[('woman', 0.6191080212593079), ('lady', 0.6072725057601929), ('lad', 0.5812886357307434), ('farmer', 0.5178955793380737), ('soldier', 0.5147017240524292), ('millionaire', 0.5104910135269165), ('guy', 0.5102142095565796), ('person', 0.5032175183296204), ('men', 0.5015764236450195), ('chap', 0.5003091096878052)]
[('princess', 0.6570090055465698), ('bride', 0.6166528463363647), ('goddess', 0.6054405570030212), ('prince', 0.5884653329849243), ('eva', 0.5839771628379822), ('maid', 0.5802943706512451), ('victoria', 0.5779140591621399), ('mistress', 0.5757982730865479), ('belle', 0.5754151344299316), ('stepmother', 0.5751047134399414)]
[('terrible', 0.7544547915458679), ('horrible', 0.7245296835899353), ('dreadful', 0.7058584094047546), ('atrocious', 0.7013605237007141), ('abysmal', 