In [1]:
import pandas as pd

# All three TSV files share the same layout, so the reader options are
# declared once. quoting=3 is csv.QUOTE_NONE: double quotes inside the review
# text are treated as ordinary characters instead of field delimiters.
tsv_options = {"header": 0, "delimiter": "\t", "quoting": 3}

train = pd.read_csv("./data/labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("./data/testData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("./data/unlabeledTrainData.tsv", **tsv_options)

In [2]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False, parser="html.parser"):
    """Convert a raw review string into a list of lowercase words.

    HTML markup is stripped with Beautiful Soup, every character outside
    a-z / A-Z is replaced by a space, and the remainder is lowercased and
    split on whitespace.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, words found in NLTK's English stop-word
            list are dropped from the result.
        parser: Name of the Beautiful Soup parser to use. It is passed
            explicitly so Beautiful Soup does not emit its "no parser was
            explicitly specified" warning and so the result does not depend
            on which optional parsers happen to be installed. Pass "lxml" to
            use that parser instead of the built-in one.

    Returns:
        A list of lowercase alphabetic tokens, in their original order.
    """
    review_text = BeautifulSoup(review, parser).get_text()
    # Digits and punctuation become separators rather than being deleted, so
    # "10/10" vanishes and "i've" is split into "i" and "ve".
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    if remove_stopwords:
        # A set gives constant-time membership checks while filtering.
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words

In [3]:
import nltk

# Load the English Punkt sentence tokenizer from the NLTK data directory; it is
# used to split each review into sentences.
# NOTE(review): presumably requires the 'punkt' resource to be installed
# locally (e.g. via nltk.download) - confirm for the target environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each represented as a list of words.

    The review is stripped of surrounding whitespace and handed to
    ``tokenizer.tokenize``. Every non-empty sentence it returns is converted
    with ``review_to_wordlist``.

    Args:
        review: Raw review text.
        tokenizer: Object exposing ``tokenize(text)`` that returns the
            sentences of ``text``.
        remove_stopwords: Forwarded unchanged to ``review_to_wordlist``.

    Returns:
        A list with one word list per non-empty sentence.
    """
    stripped = review.strip()
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(stripped)
        if len(chunk) > 0
    ]

In [4]:
# Build the Word2Vec training corpus: every review from the labeled and the
# unlabeled training sets is split into sentences (lists of words). Labels are
# not needed here, which is why the unlabeled reviews can be included.
sentences = []
for corpus in (train["review"], unlabeled_train["review"]):
    for review in corpus:
        sentences.extend(review_to_sentences(review, tokenizer))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [5]:
# Sanity check: total number of sentences, followed by the first two of them.
print(len(sentences))
for idx in (0, 1):
    print(sentences[idx])

795538
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']
['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [6]:
## Train & save the model

In [7]:
import logging

# Configure logging before gensim is imported so that its INFO-level messages
# (including those emitted at import time) show up in the cell output.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

# Word2Vec hyperparameters; each is forwarded to the constructor below.
num_features = 300     # passed as `size`
min_word_count = 40    # passed as `min_count`
num_workers = 4        # passed as `workers`
context = 10           # passed as `window`
downsampling = 1e-3    # passed as `sample`

from gensim.models import word2vec

# Initialize and train the model on the sentence corpus.
# NOTE(review): gensim >= 4 renames `size` to `vector_size` and deprecates
# `init_sims` - confirm the installed gensim version before upgrading.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Calling init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# Persist the trained model under this name.
model_name = "myWord2VecModel"
model.save(model_name)

2020-10-25 18:47:24,279: INFO: 'pattern' package not found; tag filters are not available for English
2020-10-25 18:47:24,289: INFO: collecting all words and their counts
2020-10-25 18:47:24,290: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-10-25 18:47:24,328: INFO: PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2020-10-25 18:47:24,367: INFO: PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2020-10-25 18:47:24,405: INFO: PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2020-10-25 18:47:24,443: INFO: PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2020-10-25 18:47:24,482: INFO: PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2020-10-25 18:47:24,521: INFO: PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2020-10-25 18:47:24,559: INFO: PROGRESS: at sentence #70000, processed

2020-10-25 18:47:27,235: INFO: PROGRESS: at sentence #700000, processed 15657213 words, keeping 116943 word types
2020-10-25 18:47:27,278: INFO: PROGRESS: at sentence #710000, processed 15880202 words, keeping 117596 word types
2020-10-25 18:47:27,318: INFO: PROGRESS: at sentence #720000, processed 16105489 words, keeping 118221 word types
2020-10-25 18:47:27,358: INFO: PROGRESS: at sentence #730000, processed 16331870 words, keeping 118954 word types
2020-10-25 18:47:27,402: INFO: PROGRESS: at sentence #740000, processed 16552903 words, keeping 119668 word types
2020-10-25 18:47:27,442: INFO: PROGRESS: at sentence #750000, processed 16771230 words, keeping 120295 word types
2020-10-25 18:47:27,481: INFO: PROGRESS: at sentence #760000, processed 16990622 words, keeping 120930 word types
2020-10-25 18:47:27,525: INFO: PROGRESS: at sentence #770000, processed 17217759 words, keeping 121703 word types
2020-10-25 18:47:27,566: INFO: PROGRESS: at sentence #780000, processed 17447905 words, 

2020-10-25 18:48:14,265: INFO: worker thread finished; awaiting finish of 3 more threads
2020-10-25 18:48:14,271: INFO: worker thread finished; awaiting finish of 2 more threads
2020-10-25 18:48:14,272: INFO: worker thread finished; awaiting finish of 1 more threads
2020-10-25 18:48:14,279: INFO: worker thread finished; awaiting finish of 0 more threads
2020-10-25 18:48:14,280: INFO: EPOCH - 4 : training on 17798082 raw words (12748380 effective words) took 10.7s, 1186920 effective words/s
2020-10-25 18:48:15,286: INFO: EPOCH 5 - PROGRESS: at 9.40% examples, 1190577 words/s, in_qsize 7, out_qsize 0
2020-10-25 18:48:16,287: INFO: EPOCH 5 - PROGRESS: at 18.98% examples, 1199443 words/s, in_qsize 7, out_qsize 0
2020-10-25 18:48:17,291: INFO: EPOCH 5 - PROGRESS: at 28.03% examples, 1181757 words/s, in_qsize 7, out_qsize 0
2020-10-25 18:48:18,295: INFO: EPOCH 5 - PROGRESS: at 36.95% examples, 1167535 words/s, in_qsize 7, out_qsize 0
2020-10-25 18:48:19,295: INFO: EPOCH 5 - PROGRESS: at 46.2

In [8]:
# Query through `model.wv`: calling `doesnt_match` directly on the Word2Vec
# model is deprecated and raises a DeprecationWarning.
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

In [17]:
# Query through `model.wv`: calling `doesnt_match` directly on the Word2Vec
# model is deprecated and raises a DeprecationWarning.
model.wv.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'berlin'

In [18]:
# Query through `model.wv`: calling `most_similar` directly on the Word2Vec
# model is deprecated and raises a DeprecationWarning.
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6223161220550537),
 ('lady', 0.595390796661377),
 ('lad', 0.5683529376983643),
 ('monk', 0.5371238589286804),
 ('men', 0.5296732783317566),
 ('millionaire', 0.5188815593719482),
 ('guy', 0.5160331726074219),
 ('priest', 0.5098307728767395),
 ('businessman', 0.5035345554351807),
 ('person', 0.49785810708999634)]