In [19]:
from __future__ import print_function
import json, os, re, sys
from datetime import datetime


def print_log(*args):
    """Print ``args`` prefixed with the current local timestamp.

    The timestamp is formatted as ``YYYY-MM-DD HH:MM:SS`` and stdout is
    flushed straight away so the line shows up while a long-running cell
    is still executing.

    Args:
        *args: Values to print after the timestamp, separated by spaces
            exactly as ``print`` would separate them.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(*((stamp,) + args))
    sys.stdout.flush()


# Getting a corpus

In [3]:
from sklearn.datasets import fetch_20newsgroups
# Load the complete 20-newsgroups corpus (subset='all').
newsgroups = fetch_20newsgroups(subset='all')

# Show which fields the dataset exposes, then its size.
print(newsgroups.keys())
for caption, collection in (("Number of categories:", newsgroups.target_names),
                            ("Number of samples:", newsgroups.data)):
    print(caption, len(collection))


['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']
('Number of categories:', 20)
('Number of samples:', 18846)


# Distribution of data

In [7]:
import math

def print_bar_chart(data, maxbarlen=30, plotnull=True):
    """Print a horizontal text bar chart, one row per ``(label, value)`` pair.

    Each row consists of the value (right-aligned to the width of the
    largest value), a bar of block characters right-aligned in a column
    ``maxbarlen`` characters wide, and the label. Bars are scaled so that
    the largest value fills the whole column; fractional lengths are
    rounded up.

    Args:
        data: Iterable of ``(label, value)`` pairs. Any iterable works,
            including one-shot generators.
        maxbarlen: Width in characters of the longest bar.
        plotnull: When false, rows whose value is 0 are skipped.

    Returns:
        None. The chart is written to stdout. Nothing is printed for
        empty ``data``.
    """
    # Materialise once: the rows are scanned twice (max, then printing),
    # which would silently exhaust a generator.
    rows = list(data)
    if not rows:
        return
    maxvalue = max(v for _, v in rows)
    # str() instead of unicode() keeps this working on Python 2 and 3.
    maxvlen = len(str(maxvalue))
    for k, v in rows:
        if v == 0 and not plotnull:
            continue
        # Guard against a zero maximum, which would divide by zero.
        if maxvalue:
            cells = int(math.ceil((v / float(maxvalue)) * maxbarlen))
        else:
            cells = 0
        bar = cells * u"█"
        # "%*s" takes the field width from the argument tuple.
        print(u"%*s %*s %s" % (maxvlen, v, maxbarlen, bar, k))
        
        
from collections import Counter
# Count how many documents carry each target id.
c = Counter(newsgroups.target)
# Translate ids to category names and order by descending count, breaking
# ties alphabetically. .items() is used instead of the Python-2-only
# .iteritems() so the cell also runs on Python 3.
stats = sorted([(newsgroups.target_names[target], count) for (target, count) in c.items()],
               key=lambda x: (-x[1], x[0]))

print_bar_chart(stats)

999 ██████████████████████████████ rec.sport.hockey
997 ██████████████████████████████ soc.religion.christian
996 ██████████████████████████████ rec.motorcycles
994 ██████████████████████████████ rec.sport.baseball
991 ██████████████████████████████ sci.crypt
990 ██████████████████████████████ rec.autos
990 ██████████████████████████████ sci.med
988 ██████████████████████████████ comp.windows.x
987 ██████████████████████████████ sci.space
985 ██████████████████████████████ comp.os.ms-windows.misc
984 ██████████████████████████████ sci.electronics
982 ██████████████████████████████ comp.sys.ibm.pc.hardware
975 ██████████████████████████████ misc.forsale
973 ██████████████████████████████ comp.graphics
963  █████████████████████████████ comp.sys.mac.hardware
940  █████████████████████████████ talk.politics.mideast
910   ████████████████████████████ talk.politics.guns
799       ████████████████████████ alt.atheism
775       ████████████████████████ talk.politics.misc
628            ██████

# Gensim Word2Vec

https://radimrehurek.com/gensim/models/word2vec.html


## Make a document iterator

Gensim will use this multiple times:
* to build the vocabulary
* to do the training

Gensim expects a restartable iterable (not a one-shot generator, because it is consumed several times) that yields one list of tokens per document.

In [63]:
# A token is a maximal run of word characters excluding the underscore.
_token_pattern = re.compile(r"[^\W_]+", flags=re.UNICODE)
tokenize = _token_pattern.findall

class NewsgroupIterator():
    """Restartable iterable over the tokenised newsgroup posts.

    Every new iteration makes a fresh pass over ``newsgroups.data`` and
    yields, per post, the list of tokens of its lower-cased text.

    Attributes:
        i: 1-based number of the next pass. It is logged when a pass
           starts and incremented only after a pass has run to completion.
    """

    def __init__(self):
        self.i = 1

    def __iter__(self):
        print_log("start iteration", self.i)
        token_lists = (tokenize(post.lower()) for post in newsgroups.data)
        for tokens in token_lists:
            yield tokens
        self.i += 1


## Train a Word2Vec model

[Word2Vec class documentation](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec)


In [64]:
from gensim.models.word2vec import Word2Vec, FAST_VERSION

# Make sure we can use all cores: `taskset -p <mask> <pid>` sets the CPU
# affinity mask of this process (0xff covers CPUs 0-7; Linux-only command).
current_pid = os.getpid()
os.system("taskset -p 0xff %d" % current_pid)
print("word2vec.FAST_VERSION", FAST_VERSION)

# Train the model; hyper-parameters are collected in one place.
w2v_params = dict(workers=4, size=256, window=5, min_count=2, iter=10)
m = Word2Vec(NewsgroupIterator(), **w2v_params)

word2vec.FAST_VERSION 1
2016-03-15 12:59:08 start iteration 1
2016-03-15 12:59:14 start iteration 2
2016-03-15 12:59:25 start iteration 3
2016-03-15 12:59:37 start iteration 4
2016-03-15 12:59:48 start iteration 5
2016-03-15 13:00:00 start iteration 6
2016-03-15 13:00:11 start iteration 7
2016-03-15 13:00:23 start iteration 8
2016-03-15 13:00:34 start iteration 9
2016-03-15 13:00:45 start iteration 10
2016-03-15 13:00:55 start iteration 11


## Saving the model

In [66]:
# Destination file for the exported vectors.
model_path = "newsgroups_w2v.model"

# Log before and after so the duration of the export is visible.
print_log("Saving model...")
m.save_word2vec_format(model_path)
print_log("done")

2016-03-15 13:01:10 Saving model...
2016-03-15 13:01:22 done


## Using the model

In [67]:
# Word vectors: indexing the trained model with a word returns that word's
# embedding as a NumPy array (the model was trained with size=256).

m["computer"]

array([-0.16550805, -0.32571301, -0.95591217, -2.14804506,  2.25657916,
       -0.20089439, -0.74087816,  1.01988864, -0.24207969, -2.35705972,
       -0.12641022,  0.06372685,  0.65881741,  0.47903129,  0.05196929,
        0.33446911, -2.20655894,  0.0183251 , -0.42534435,  1.41933274,
        2.02105522,  0.28198084,  0.08037232, -0.21583037, -3.49729443,
        1.47610402, -0.96690106, -2.5360465 ,  0.97365487, -2.02067065,
       -0.59269911, -1.90242946, -2.1068418 , -1.02947664,  0.04239455,
       -0.82397866,  0.39041638,  0.15510099, -0.04847286,  0.92273235,
       -1.00195968, -0.63456023,  0.9105376 ,  0.67231762, -0.47721529,
       -1.57983494,  1.04773879, -0.27772939, -0.88323915, -0.88461703,
        0.3014482 ,  1.68624055,  2.91848946,  2.85907602, -0.96259266,
        2.09554362,  2.08418679,  0.22845188, -0.55620462, -2.71090293,
       -1.61427462, -0.40508148,  0.94375992, -0.70776701,  0.1709096 ,
        0.48541191, -2.20617008, -0.57962573, -1.12773049, -1.35

In [75]:
# Similarity score between the vectors of two vocabulary words (cosine
# similarity according to the gensim documentation).
m.similarity("computer", "laptop")

0.39825422751305117

In [69]:
# Which words are most similar to "computer"? topn=15 asks for the 15
# nearest neighbours; the result is a list of (word, similarity) pairs.

m.most_similar("computer", topn=15)

[(u'computing', 0.495266854763031),
 (u'computers', 0.4299651086330414),
 (u'electronics', 0.4242360293865204),
 (u'elsevier', 0.41329771280288696),
 (u'consulting', 0.4081593155860901),
 (u'silicon', 0.3996121287345886),
 (u'laptop', 0.39825403690338135),
 (u'accucorp', 0.39347150921821594),
 (u'sciplot3', 0.3866022229194641),
 (u'engineering', 0.38475874066352844),
 (u'library', 0.3831080198287964),
 (u'kubota', 0.378673791885376),
 (u'jurisprudence', 0.37765148282051086),
 (u'macintosh', 0.3754265010356903),
 (u'equipment', 0.36907872557640076)]

In [73]:
# Analogy query: man is to husband as woman is to ...?
# The 'positive' words are added to the query and the 'negative' words
# subtracted from it (husband + woman - man).
m.most_similar(positive=['husband', 'woman'], negative=['man'], topn=5)

[(u'wife', 0.7510499954223633),
 (u'sister', 0.7154536843299866),
 (u'daughter', 0.7080667018890381),
 (u'girlfriend', 0.665849506855011),
 (u'parents', 0.651865541934967)]

In [76]:
# Odd one out: ask which word of the list fits least with the others.
m.doesnt_match("computer laptop keyboard god mouse".split())

'god'

# Python Glove

https://github.com/maciejkula/glove-python

In [84]:
from glove import Corpus, Glove

# Construct the co-occurrence matrix from the same tokenised documents.
# A fresh NewsgroupIterator is used, so its pass counter restarts at 1;
# window=5 matches the window used for Word2Vec above.
corpus = Corpus()
corpus.fit(NewsgroupIterator(), window=5)
print_log("done")

2016-03-15 13:14:32 start iteration 1
2016-03-15 13:14:57 done


In [85]:
# Estimate word vectors from the co-occurrence matrix.
# no_components=256, epochs=10 and no_threads=4 mirror the size=256,
# iter=10 and workers=4 settings used for Word2Vec, keeping the two
# models comparable.
glove = Glove(no_components=256, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=10, no_threads=4, verbose=True)

Performing 10 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


In [92]:
# Supply the word -> id dictionary built by Corpus so that similarity
# queries can be made by word.
# NOTE(review): the recorded output lists 14 neighbours for number=15 --
# presumably the query word itself counts towards `number`; confirm against
# glove-python before comparing with the topn=15 Word2Vec result.
glove.add_dictionary(corpus.dictionary)
glove.most_similar('computer', number=15)

[(u'science', 0.89918296733840353),
 (u'network', 0.88060740692864747),
 (u'engineering', 0.85806903858788452),
 (u'2889', 0.84464236575769325),
 (u'department', 0.81272189190230171),
 (u'itti', 0.80586921298082848),
 (u'17000', 0.79886706219844672),
 (u'shopper', 0.78064593552730444),
 (u'rootstown', 0.77613259021377812),
 (u'electrical', 0.76821284886960883),
 (u'justice', 0.76297296789446867),
 (u'tech', 0.76148038752750691),
 (u'inc', 0.75542088607981894),
 (u'systems', 0.7543458068772888)]

In [91]:
# Raw vector lookup: map the word to its id via the corpus dictionary,
# then index the matrix of learned word vectors with that id.
word_idx = corpus.dictionary["computer"]
glove.word_vectors[word_idx]

array([-0.15294618, -0.16229331,  0.22968242, -0.06935435, -0.204179  ,
        0.15189339, -0.05116937, -0.04714758,  0.0928095 , -0.15227175,
       -0.13257226,  0.13242168,  0.22263267, -0.08370977, -0.09147217,
        0.05804088, -0.11254168,  0.15175754,  0.02864283,  0.09036119,
        0.19627194, -0.17712955,  0.11455667,  0.1828292 ,  0.16317328,
       -0.1469712 ,  0.07998185, -0.17448186,  0.15504458, -0.17607112,
       -0.21379806,  0.22414729, -0.11081201,  0.26255771, -0.23617736,
        0.0021399 ,  0.03964508, -0.27294068, -0.14710344, -0.25980023,
        0.17566646, -0.29988554,  0.07406638, -0.1086188 , -0.22955877,
        0.30700018,  0.075993  , -0.17794391,  0.19875335,  0.09574942,
        0.20711149,  0.1396056 ,  0.1804984 ,  0.14014868,  0.11824339,
       -0.16659382,  0.17253023, -0.20879712, -0.16514559, -0.06563502,
        0.23446966,  0.17092721, -0.15885723,  0.08113689,  0.02683762,
       -0.06619163,  0.05324607,  0.0826638 , -0.12250659, -0.13