# Word2Vec

This notebook explores word2vec embeddings trained on Google News articles, pared down from the original 3-million-word vocabulary to about 300K words.

## Data loading

In [1]:
from gensim.models import KeyedVectors

In [2]:
from pathlib import Path

import requests

# Release asset holding the slimmed-down Google News word2vec vectors.
binary_url = "https://github.com/BrownDSI/word2vec-slim/releases/" + \
             "download/v0.1/GoogleNews-vectors-negative300-slim.bin.gz"
binary_path = Path('google-word2vec-slim.bin.gz')

# Skip the download when the archive is already on disk, so that
# "Restart Kernel -> Run All" does not re-fetch the file every time.
if not binary_path.exists():
    # Download to a temporary name first: an interrupted transfer must not
    # leave a truncated file that a later run would mistake for a valid cache.
    partial_path = binary_path.with_name(binary_path.name + '.part')
    # stream=True writes the archive chunk by chunk instead of holding the
    # whole response body in memory.
    with requests.get(binary_url, stream=True, timeout=60) as r:
        # Fail loudly on HTTP errors rather than saving an error page as the model file.
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    partial_path.replace(binary_path)

In [4]:
# Read the downloaded vectors (binary word2vec format) into a KeyedVectors object.
model = KeyedVectors.load_word2vec_format(
    'google-word2vec-slim.bin.gz',
    binary=True,
)

## word2vec analogies

In [21]:
def complete_analogy(a, b, c, topn=10, exclude_inputs=False):
    """Complete the analogy "a is to b as c is to ?" with vector arithmetic.

    Builds the query vector ``-vec(a) + vec(b) + vec(c)`` from the global
    ``model`` and returns the entries ``model.similar_by_vector`` ranks
    closest to it.

    Parameters
    ----------
    a, b, c : str
        Words of the analogy; each is looked up with ``model.get_vector``.
    topn : int, default 10
        Number of results to return.
    exclude_inputs : bool, default False
        When True, drop ``a``, ``b`` and ``c`` themselves from the results.
        The query words tend to rank near the top of the neighbour list and
        can crowd out the actual answer. Requires an integer ``topn``.

    Returns
    -------
    The ``(word, similarity)`` pairs produced by ``model.similar_by_vector``;
    with ``exclude_inputs=True``, a list of at most ``topn`` such pairs with
    the query words removed.
    """
    target = (- model.get_vector(a)
              + model.get_vector(b)
              + model.get_vector(c))
    if not exclude_inputs:
        return model.similar_by_vector(target, topn=topn)

    query_words = {a, b, c}
    # Over-fetch by the number of query words so that topn results are still
    # available after the query words have been filtered out.
    candidates = model.similar_by_vector(target, topn=topn + len(query_words))
    kept = [(word, score) for word, score in candidates
            if word not in query_words]
    return kept[:topn]

In [22]:
# Paris : France :: Tokyo : ?
complete_analogy("Paris","France","Tokyo")

[('Japan', 0.8330698013305664),
 ('Tokyo', 0.7668523192405701),
 ('Japanese', 0.6793394088745117),
 ('Japans', 0.63188636302948),
 ('Shizuoka', 0.6009104251861572),
 ('Kyushu', 0.5709357857704163),
 ('Maebashi', 0.5679153800010681),
 ('Hokkaido', 0.5624659657478333),
 ('Nagoya', 0.5609986186027527),
 ('Yokohama', 0.5578266382217407)]

In [23]:
# Woman : Man :: Queen : ?
complete_analogy("Woman","Man","Queen")

[('Queen', 0.70897376537323),
 ('King', 0.5065371990203857),
 ('Royal', 0.4004102349281311),
 ('Princes', 0.396414577960968),
 ('Scu', 0.388332724571228),
 ('king', 0.37896907329559326),
 ('Conqueror', 0.37822434306144714),
 ('Crown', 0.3727468252182007),
 ('Rameses', 0.3666337728500366),
 ('Conquerer', 0.3661169409751892)]

In [34]:
# swam : swim :: flipped : ?
complete_analogy("swam","swim","flipped")

[('flipped', 0.6986249089241028),
 ('flip', 0.5979549884796143),
 ('flipping', 0.5788305997848511),
 ('flips', 0.575873613357544),
 ('Flipping', 0.433332622051239),
 ('jackknifing', 0.40923744440078735),
 ('switch', 0.40313392877578735),
 ('Flicking', 0.36930978298187256),
 ('overturned', 0.3692159652709961),
 ('careen', 0.36834967136383057)]

In [31]:
# Flipping the sign of a word vector does *not* surface the word's antonyms.
model.similar_by_vector(
    -model.get_vector('quick')
)

[('ABHA', 0.21067236363887787),
 ('KEZ', 0.20679625868797302),
 ('Directorships', 0.1989758014678955),
 ('Democratie', 0.19581405818462372),
 ('Constituting', 0.19313615560531616),
 ('Amelioration', 0.19119952619075775),
 ('TANGIER', 0.19055309891700745),
 ('Namangan', 0.19042852520942688),
 ('PRIDES', 0.18617694079875946),
 ('BHARATIYA', 0.18478044867515564)]

## Bias
word2vec picks up and reproduces real-world biases present in its training data:

In [54]:
# model.similarity score between 'criminal' and each group term.
{
    term: model.similarity('criminal', term)
    for term in ['black', 'latino', 'white', 'asian']
}

{'black': 0.08380793,
 'latino': 0.059527025,
 'white': 0.04107807,
 'asian': -0.0511574}

In [55]:
# model.similarity score between 'leader' and each gender term.
{
    term: model.similarity('leader', term)
    for term in ['man', 'woman']
}

{'man': 0.19424875, 'woman': 0.122108325}