In [3]:
import gzip
import gensim
import logging
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors



logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [4]:

def show_file_contents(input_file):
    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            print(line)
            break

In [5]:


def read_input(input_file):
    """This method reads the input file which is in gzip format"""

    logging.info("reading file {0}...this may take a while".format(input_file))

    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            if (i % 10000 == 0):
                logging.info("read {0} reviews".format(i))

            # do some pre-processing and return list of words for each review text

            yield gensim.utils.simple_preprocess(line)



In [6]:
if __name__ == '__main__':

    
    input_file = 'dataset.gz'

    # read first line of the dataset
    # show_file_contents(input_file)

    # documents is a list of lists
    documents = list(read_input(input_file))
    logging.info("Done reading dataset")

    # build vocabulary and train model
    model = gensim.models.Word2Vec(
            documents,
            size=150,
            window=10,
            min_count=2,
            workers=10)


2019-05-27 15:40:17,615 : INFO : reading file dataset.gz...this may take a while
2019-05-27 15:40:17,618 : INFO : read 0 reviews
2019-05-27 15:40:20,273 : INFO : read 10000 reviews
2019-05-27 15:40:22,892 : INFO : read 20000 reviews
2019-05-27 15:40:25,908 : INFO : read 30000 reviews
2019-05-27 15:40:28,813 : INFO : read 40000 reviews
2019-05-27 15:40:31,949 : INFO : read 50000 reviews
2019-05-27 15:40:35,004 : INFO : read 60000 reviews
2019-05-27 15:40:37,584 : INFO : read 70000 reviews
2019-05-27 15:40:40,155 : INFO : read 80000 reviews
2019-05-27 15:40:42,662 : INFO : read 90000 reviews
2019-05-27 15:40:45,121 : INFO : read 100000 reviews
2019-05-27 15:40:47,497 : INFO : read 110000 reviews
2019-05-27 15:40:49,901 : INFO : read 120000 reviews
2019-05-27 15:40:52,360 : INFO : read 130000 reviews
2019-05-27 15:40:54,982 : INFO : read 140000 reviews
2019-05-27 15:40:57,387 : INFO : read 150000 reviews
2019-05-27 15:40:59,860 : INFO : read 160000 reviews
2019-05-27 15:41:02,695 : INFO :

2019-05-27 15:42:00,424 : INFO : EPOCH 1 - PROGRESS: at 51.91% examples, 663701 words/s, in_qsize 18, out_qsize 1
2019-05-27 15:42:01,426 : INFO : EPOCH 1 - PROGRESS: at 54.08% examples, 664186 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:42:02,448 : INFO : EPOCH 1 - PROGRESS: at 56.49% examples, 664207 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:42:03,453 : INFO : EPOCH 1 - PROGRESS: at 58.73% examples, 663574 words/s, in_qsize 20, out_qsize 0
2019-05-27 15:42:04,458 : INFO : EPOCH 1 - PROGRESS: at 61.05% examples, 663606 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:42:05,477 : INFO : EPOCH 1 - PROGRESS: at 63.45% examples, 663761 words/s, in_qsize 18, out_qsize 4
2019-05-27 15:42:06,496 : INFO : EPOCH 1 - PROGRESS: at 65.83% examples, 664205 words/s, in_qsize 18, out_qsize 1
2019-05-27 15:42:07,516 : INFO : EPOCH 1 - PROGRESS: at 68.24% examples, 665033 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:42:08,520 : INFO : EPOCH 1 - PROGRESS: at 70.36% examples, 664828 words/s,

2019-05-27 15:43:04,198 : INFO : EPOCH 2 - PROGRESS: at 92.59% examples, 662716 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:43:05,213 : INFO : EPOCH 2 - PROGRESS: at 94.90% examples, 662871 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:43:06,233 : INFO : EPOCH 2 - PROGRESS: at 97.29% examples, 663228 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:43:07,239 : INFO : EPOCH 2 - PROGRESS: at 99.59% examples, 663301 words/s, in_qsize 15, out_qsize 1
2019-05-27 15:43:07,303 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-05-27 15:43:07,304 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-05-27 15:43:07,325 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-05-27 15:43:07,327 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-05-27 15:43:07,329 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-05-27 15:43:07,340 : INFO : worker thread finished; awaiting finish of 4 more thread

2019-05-27 15:43:59,332 : INFO : EPOCH 4 - PROGRESS: at 14.32% examples, 670955 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:44:00,352 : INFO : EPOCH 4 - PROGRESS: at 16.33% examples, 672025 words/s, in_qsize 18, out_qsize 1
2019-05-27 15:44:01,378 : INFO : EPOCH 4 - PROGRESS: at 18.25% examples, 673668 words/s, in_qsize 18, out_qsize 1
2019-05-27 15:44:02,384 : INFO : EPOCH 4 - PROGRESS: at 20.01% examples, 673812 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:44:03,399 : INFO : EPOCH 4 - PROGRESS: at 22.07% examples, 673517 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:44:04,415 : INFO : EPOCH 4 - PROGRESS: at 23.84% examples, 674652 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:44:05,452 : INFO : EPOCH 4 - PROGRESS: at 26.08% examples, 673524 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:44:06,477 : INFO : EPOCH 4 - PROGRESS: at 28.68% examples, 674672 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:44:07,479 : INFO : EPOCH 4 - PROGRESS: at 31.06% examples, 673943 words/s,

2019-05-27 15:45:03,340 : INFO : EPOCH 5 - PROGRESS: at 57.42% examples, 676473 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:45:04,368 : INFO : EPOCH 5 - PROGRESS: at 59.72% examples, 675573 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:45:05,371 : INFO : EPOCH 5 - PROGRESS: at 61.99% examples, 675499 words/s, in_qsize 18, out_qsize 1
2019-05-27 15:45:06,382 : INFO : EPOCH 5 - PROGRESS: at 64.69% examples, 676319 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:45:07,396 : INFO : EPOCH 5 - PROGRESS: at 66.83% examples, 675873 words/s, in_qsize 18, out_qsize 1
2019-05-27 15:45:08,404 : INFO : EPOCH 5 - PROGRESS: at 69.24% examples, 676640 words/s, in_qsize 17, out_qsize 2
2019-05-27 15:45:09,406 : INFO : EPOCH 5 - PROGRESS: at 71.35% examples, 675971 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:45:10,421 : INFO : EPOCH 5 - PROGRESS: at 73.92% examples, 677184 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:45:11,435 : INFO : EPOCH 5 - PROGRESS: at 76.09% examples, 677308 words/s,

In [7]:
    model.train(documents, total_examples=len(documents), epochs=10)

2019-05-27 15:58:18,248 : INFO : training model with 10 workers on 70538 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2019-05-27 15:58:19,293 : INFO : EPOCH 1 - PROGRESS: at 1.99% examples, 609057 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:58:20,295 : INFO : EPOCH 1 - PROGRESS: at 4.07% examples, 617655 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:58:21,296 : INFO : EPOCH 1 - PROGRESS: at 6.30% examples, 641805 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:58:22,317 : INFO : EPOCH 1 - PROGRESS: at 8.51% examples, 648690 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:58:23,343 : INFO : EPOCH 1 - PROGRESS: at 10.04% examples, 629318 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:58:24,365 : INFO : EPOCH 1 - PROGRESS: at 11.57% examples, 616936 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:58:25,418 : INFO : EPOCH 1 - PROGRESS: at 13.55% examples, 623136 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:58:26,420 : INFO : EPOCH 1 - PROGRESS: a

2019-05-27 15:59:21,368 : INFO : EPOCH 2 - PROGRESS: at 32.01% examples, 648514 words/s, in_qsize 20, out_qsize 1
2019-05-27 15:59:22,387 : INFO : EPOCH 2 - PROGRESS: at 34.27% examples, 648723 words/s, in_qsize 18, out_qsize 1
2019-05-27 15:59:23,411 : INFO : EPOCH 2 - PROGRESS: at 36.67% examples, 649888 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:59:24,411 : INFO : EPOCH 2 - PROGRESS: at 39.03% examples, 650508 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:59:25,460 : INFO : EPOCH 2 - PROGRESS: at 41.43% examples, 649622 words/s, in_qsize 16, out_qsize 3
2019-05-27 15:59:26,515 : INFO : EPOCH 2 - PROGRESS: at 43.91% examples, 649665 words/s, in_qsize 20, out_qsize 2
2019-05-27 15:59:27,518 : INFO : EPOCH 2 - PROGRESS: at 46.38% examples, 651142 words/s, in_qsize 19, out_qsize 0
2019-05-27 15:59:28,548 : INFO : EPOCH 2 - PROGRESS: at 48.70% examples, 651342 words/s, in_qsize 18, out_qsize 1
2019-05-27 15:59:29,574 : INFO : EPOCH 2 - PROGRESS: at 51.01% examples, 651075 words/s,

2019-05-27 16:00:26,119 : INFO : EPOCH 3 - PROGRESS: at 74.38% examples, 660735 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:00:27,148 : INFO : EPOCH 3 - PROGRESS: at 76.44% examples, 660441 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:00:28,162 : INFO : EPOCH 3 - PROGRESS: at 78.67% examples, 661028 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:00:29,200 : INFO : EPOCH 3 - PROGRESS: at 80.90% examples, 661167 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:00:30,222 : INFO : EPOCH 3 - PROGRESS: at 83.20% examples, 661227 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:00:31,232 : INFO : EPOCH 3 - PROGRESS: at 85.38% examples, 661568 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:00:32,241 : INFO : EPOCH 3 - PROGRESS: at 87.79% examples, 661837 words/s, in_qsize 20, out_qsize 0
2019-05-27 16:00:33,262 : INFO : EPOCH 3 - PROGRESS: at 90.11% examples, 661569 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:00:34,278 : INFO : EPOCH 3 - PROGRESS: at 92.55% examples, 662231 words/s,

2019-05-27 16:01:23,805 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-05-27 16:01:23,814 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-05-27 16:01:23,815 : INFO : EPOCH - 4 : training on 41519355 raw words (30350638 effective words) took 46.3s, 655954 effective words/s
2019-05-27 16:01:24,842 : INFO : EPOCH 5 - PROGRESS: at 1.92% examples, 595755 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:01:25,848 : INFO : EPOCH 5 - PROGRESS: at 4.17% examples, 634198 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:01:26,860 : INFO : EPOCH 5 - PROGRESS: at 6.32% examples, 643103 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:01:27,882 : INFO : EPOCH 5 - PROGRESS: at 8.51% examples, 647762 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:01:28,920 : INFO : EPOCH 5 - PROGRESS: at 10.40% examples, 652699 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:01:29,929 : INFO : EPOCH 5 - PROGRESS: at 12.19% examples, 657243 words/s, in_qsize 19, out_qsize 

2019-05-27 16:02:25,725 : INFO : EPOCH 6 - PROGRESS: at 33.02% examples, 662544 words/s, in_qsize 15, out_qsize 4
2019-05-27 16:02:26,741 : INFO : EPOCH 6 - PROGRESS: at 35.25% examples, 662495 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:02:27,751 : INFO : EPOCH 6 - PROGRESS: at 37.68% examples, 663743 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:02:28,759 : INFO : EPOCH 6 - PROGRESS: at 40.16% examples, 664548 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:02:29,775 : INFO : EPOCH 6 - PROGRESS: at 42.55% examples, 664057 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:02:30,789 : INFO : EPOCH 6 - PROGRESS: at 45.04% examples, 663940 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:02:31,808 : INFO : EPOCH 6 - PROGRESS: at 47.33% examples, 663549 words/s, in_qsize 16, out_qsize 3
2019-05-27 16:02:32,846 : INFO : EPOCH 6 - PROGRESS: at 49.75% examples, 663708 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:02:33,851 : INFO : EPOCH 6 - PROGRESS: at 52.08% examples, 664352 words/s,

2019-05-27 16:03:30,518 : INFO : EPOCH 7 - PROGRESS: at 76.80% examples, 662771 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:03:31,534 : INFO : EPOCH 7 - PROGRESS: at 78.86% examples, 662256 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:03:32,548 : INFO : EPOCH 7 - PROGRESS: at 81.03% examples, 662223 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:03:33,581 : INFO : EPOCH 7 - PROGRESS: at 83.14% examples, 660566 words/s, in_qsize 15, out_qsize 4
2019-05-27 16:03:34,620 : INFO : EPOCH 7 - PROGRESS: at 85.33% examples, 660469 words/s, in_qsize 19, out_qsize 4
2019-05-27 16:03:35,652 : INFO : EPOCH 7 - PROGRESS: at 87.83% examples, 661079 words/s, in_qsize 17, out_qsize 2
2019-05-27 16:03:36,663 : INFO : EPOCH 7 - PROGRESS: at 90.14% examples, 660895 words/s, in_qsize 20, out_qsize 3
2019-05-27 16:03:37,683 : INFO : EPOCH 7 - PROGRESS: at 92.55% examples, 661237 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:03:38,684 : INFO : EPOCH 7 - PROGRESS: at 94.79% examples, 661301 words/s,

2019-05-27 16:04:26,564 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-05-27 16:04:26,565 : INFO : EPOCH - 8 : training on 41519355 raw words (30344940 effective words) took 45.6s, 665153 effective words/s
2019-05-27 16:04:27,584 : INFO : EPOCH 9 - PROGRESS: at 2.02% examples, 630951 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:04:28,633 : INFO : EPOCH 9 - PROGRESS: at 4.33% examples, 646002 words/s, in_qsize 20, out_qsize 0
2019-05-27 16:04:29,649 : INFO : EPOCH 9 - PROGRESS: at 6.45% examples, 647845 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:04:30,656 : INFO : EPOCH 9 - PROGRESS: at 8.60% examples, 651869 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:04:31,684 : INFO : EPOCH 9 - PROGRESS: at 10.48% examples, 655680 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:04:32,686 : INFO : EPOCH 9 - PROGRESS: at 12.17% examples, 655845 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:04:33,689 : INFO : EPOCH 9 - PROGRESS: at 14.03% examples, 652104 words/s, 

2019-05-27 16:05:29,618 : INFO : EPOCH 10 - PROGRESS: at 34.96% examples, 662104 words/s, in_qsize 16, out_qsize 3
2019-05-27 16:05:30,656 : INFO : EPOCH 10 - PROGRESS: at 37.35% examples, 661528 words/s, in_qsize 18, out_qsize 4
2019-05-27 16:05:31,658 : INFO : EPOCH 10 - PROGRESS: at 39.77% examples, 662674 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:05:32,681 : INFO : EPOCH 10 - PROGRESS: at 42.26% examples, 661974 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:05:33,710 : INFO : EPOCH 10 - PROGRESS: at 44.73% examples, 661872 words/s, in_qsize 16, out_qsize 3
2019-05-27 16:05:34,732 : INFO : EPOCH 10 - PROGRESS: at 47.16% examples, 663138 words/s, in_qsize 20, out_qsize 0
2019-05-27 16:05:35,747 : INFO : EPOCH 10 - PROGRESS: at 49.57% examples, 663639 words/s, in_qsize 19, out_qsize 0
2019-05-27 16:05:36,779 : INFO : EPOCH 10 - PROGRESS: at 51.82% examples, 662702 words/s, in_qsize 18, out_qsize 1
2019-05-27 16:05:37,780 : INFO : EPOCH 10 - PROGRESS: at 54.02% examples, 663492

(303489286, 415193550)

In [17]:
model.wv.similarity(w1="vrle",w2="vrle")

KeyError: "word 'vrle' not in vocabulary"

In [17]:
fname = get_tmpfile("vectors.kv")
word_vectors = model.wv
word_vectors.save(fname)
word_vectors = KeyedVectors.load(fname, mmap='r')

2019-05-27 12:56:28,056 : INFO : saving Word2VecKeyedVectors object under /tmp/vectors.kv, separately None
2019-05-27 12:56:28,058 : INFO : storing np array 'vectors' to /tmp/vectors.kv.vectors.npy
2019-05-27 12:56:28,101 : INFO : not storing attribute vectors_norm
2019-05-27 12:56:28,650 : INFO : saved /tmp/vectors.kv
2019-05-27 12:56:28,651 : INFO : loading Word2VecKeyedVectors object from /tmp/vectors.kv
2019-05-27 12:56:29,458 : INFO : loading vectors from /tmp/vectors.kv.vectors.npy with mmap=r
2019-05-27 12:56:29,482 : INFO : setting ignored attribute vectors_norm to None
2019-05-27 12:56:29,484 : INFO : loaded /tmp/vectors.kv
