In [2]:
# imports needed and set up logging
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [8]:
data_file="reviews.txt.gz"

with open ('reviews.txt', 'rb') as f:
    for i,line in enumerate (f):
        print(line)
        break

b'For bus is good\r\n'


In [10]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""
    
    logging.info("reading file {0}...this may take a while".format(input_file))
    
    with gzip.open (input_file, 'rb') as f:
        for i, line in enumerate (f): 

            if (i%10000==0):
                logging.info ("read {0} reviews".format (i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess (line)

# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list (read_input (data_file))
logging.info ("Done reading data file")

2019-10-18 11:35:12,962 : INFO : reading file reviews.txt.gz...this may take a while
2019-10-18 11:35:12,965 : INFO : read 0 reviews
2019-10-18 11:35:13,884 : INFO : read 10000 reviews
2019-10-18 11:35:14,744 : INFO : read 20000 reviews
2019-10-18 11:35:15,488 : INFO : Done reading data file


In [11]:
model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2019-10-18 11:35:32,323 : INFO : collecting all words and their counts
2019-10-18 11:35:32,324 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-10-18 11:35:32,472 : INFO : PROGRESS: at sentence #10000, processed 625665 words, keeping 17674 word types
2019-10-18 11:35:32,587 : INFO : PROGRESS: at sentence #20000, processed 1224271 words, keeping 23596 word types
2019-10-18 11:35:32,697 : INFO : collected 27902 word types from a corpus of 1741281 raw words and 29441 sentences
2019-10-18 11:35:32,698 : INFO : Loading a fresh vocabulary
2019-10-18 11:35:32,815 : INFO : effective_min_count=2 retains 17168 unique words (61% of original 27902, drops 10734)
2019-10-18 11:35:32,816 : INFO : effective_min_count=2 leaves 1730547 word corpus (99% of original 1741281, drops 10734)
2019-10-18 11:35:32,883 : INFO : deleting the raw counts dictionary of 27902 items
2019-10-18 11:35:32,885 : INFO : sample=0.001 downsamples 51 most-common words
2019-10-18 11:35:32,885 : IN

2019-10-18 11:35:40,009 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-10-18 11:35:40,011 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-10-18 11:35:40,015 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-10-18 11:35:40,021 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-10-18 11:35:40,027 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-10-18 11:35:40,028 : INFO : EPOCH - 1 : training on 1741281 raw words (1310673 effective words) took 1.2s, 1104589 effective words/s
2019-10-18 11:35:41,046 : INFO : EPOCH 2 - PROGRESS: at 90.15% examples, 1181685 words/s, in_qsize 17, out_qsize 0
2019-10-18 11:35:41,088 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-10-18 11:35:41,105 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-10-18 11:35:41,111 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-10-18 11:35:41

2019-10-18 11:35:48,267 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-10-18 11:35:48,270 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-10-18 11:35:48,272 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-10-18 11:35:48,273 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-10-18 11:35:48,273 : INFO : EPOCH - 8 : training on 1741281 raw words (1310365 effective words) took 1.1s, 1187248 effective words/s
2019-10-18 11:35:49,287 : INFO : EPOCH 9 - PROGRESS: at 91.53% examples, 1199815 words/s, in_qsize 15, out_qsize 0
2019-10-18 11:35:49,311 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-10-18 11:35:49,326 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-10-18 11:35:49,327 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-10-18 11:35:49,328 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-10-18 11:35:49

(13106143, 17412810)

In [14]:
w1 = "child"
model.wv.most_similar (positive=w1)

[('children', 0.6616529226303101),
 ('kid', 0.6371763944625854),
 ('fur', 0.5584065914154053),
 ('dad', 0.5566017031669617),
 ('babies', 0.5524616241455078),
 ('son', 0.5387352705001831),
 ('younger', 0.5365771055221558),
 ('teenager', 0.514395534992218),
 ('boys', 0.509743332862854),
 ('kids', 0.5097034573554993)]