### Install steps

#### Download pip:
- https://pip.pypa.io/en/stable/installing/
    - To install pip, securely download get-pip.py.
    - python3 get-pip.py (Run in sudo or admin mode - based on the OS)

#### Install Gensim (Word2Vec) and NLTK
- pip3 install --upgrade gensim (run with sudo or as administrator, depending on the OS)
- pip3 install nltk
- sudo python3 -m nltk.downloader -d /usr/local/share/nltk_data all (downloads every NLTK dataset; this demo only uses the `brown` corpus)


In [1]:
# Import gensim and nltk:

import gensim

# The Brown corpus (see http://www.nltk.org/book/ch02.html) contains 500 samples
# of English-language text, totaling roughly one million words, compiled from
# works published in the United States in 1961.
from nltk.corpus import brown

In [2]:
# Preview the corpus as a list of sentences, each one a list of word tokens.
brown.sents()

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [3]:
# List the categories the Brown corpus texts are grouped into.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
# Train the model:

# The training data is every tokenised sentence of the Brown corpus.
sentences = brown.sents()
# min_count=1 keeps every word in the vocabulary, even words that occur only
# once (gensim drops words whose total frequency is below min_count).
model = gensim.models.Word2Vec(sentences, min_count=1)
# Persist the trained model to 'brown_model' so it can be reloaded without retraining.
model.save('brown_model')
print("Brown corpus model saved.")

Brown corpus model saved.


In [5]:
# Load and test the model:
# Reload the Word2Vec model previously saved under 'brown_model'.
model = gensim.models.Word2Vec.load('brown_model')
# Words most similar to "mother".
# The query goes through model.wv (the KeyedVectors holding the trained
# vectors): calling most_similar directly on the Word2Vec object is deprecated
# and no longer available in gensim 4.
print(model.wv.most_similar('mother'))

[('father', 0.9846181869506836), ('husband', 0.9669593572616577), ('wife', 0.9486243724822998), ('friend', 0.9322052597999573), ('son', 0.9282881617546082), ('nickname', 0.9173303246498108), ('eagle', 0.9157270193099976), ('addiction', 0.906704843044281), ('voice', 0.9057698845863342), ('patient', 0.899213433265686)]


  after removing the cwd from sys.path.


In [6]:
# Find the odd one out in each list of words.
# doesnt_match is called on model.wv (the KeyedVectors): the same method on the
# Word2Vec object itself is deprecated and no longer available in gensim 4.
print(model.wv.doesnt_match("breakfast cereal dinner lunch".split()))
print(model.wv.doesnt_match("cat dog table".split()))

cereal
table


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Vector representation of the word "human".
# Index model.wv (the KeyedVectors) rather than the Word2Vec object: direct
# indexing of the model is deprecated and no longer available in gensim 4.
print(model.wv["human"])

[-0.35715294 -0.19201484  0.06453199 -0.31812358  0.34995535  0.37117353
  0.60374796  0.13953583  0.6168337   0.84024435 -0.4647691   0.28594983
  0.5884796   1.4222544   1.0639894   0.36073363 -0.5001473   0.76575464
  0.67404443  0.49063948 -0.0942038  -0.37926126  0.86331284  0.24414942
  0.22542927  0.6314688   0.06413365 -0.3089375  -1.1719805   0.442727
 -0.06680264 -0.645871   -0.43071994 -0.87876546  0.1020586   0.06740871
  0.51632214  0.87089676  0.12997636  0.34464735  0.00155788 -0.6318405
 -0.30814582 -0.85897815 -0.42125264 -0.22521545  0.09884993 -0.06979845
 -0.5507496  -0.48977223  1.0746925  -0.58177453  0.07736731 -0.26322338
  0.5677904  -0.29204562  0.18040363 -1.2391508   0.36376444  0.08585867
 -0.7801938  -0.5638774  -0.09588034 -0.872432   -0.26630586  0.09761101
 -0.361726    0.15841632  0.54823333  0.46109918 -0.07567     0.69156873
  0.68775296  0.77148163  0.6996729   1.0877684  -0.08485799  0.9228495
  0.4553159  -0.80853957  0.74754167 -0.2104344  -0.687

  


## Demo using pre-trained word vectors

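Download the Google News pre-trained vectors: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

The next cell expects the file `GoogleNews-vectors-negative300.bin` in the current working directory (unpack the download first if it arrives compressed).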

In [8]:
# Showcase
# Pre-trained vectors trained on part of the Google News dataset (about 100 billion words).
# The model contains 300-dimensional vectors for 3 million words and phrases.
import gensim

# Load Google's pre-trained Word2Vec vectors from the binary word2vec-format file.
# Memory needed: 3 million words * 300 features * 4 bytes/feature = 3.6e9 bytes (~3.35 GiB).
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)  

In [9]:
# Show the shape of the vector for 'computer' and its first 10 components.
model['computer'].shape, model['computer'][:10]

((300,),
 array([ 0.10742188, -0.20117188,  0.12304688,  0.21191406, -0.09130859,
         0.21679688, -0.13183594,  0.08300781,  0.20214844,  0.04785156],
       dtype=float32))

In [10]:
# A simple way to investigate the learned representations is to find the closest words for a user-specified word.
# most_similar serves that purpose: it returns the nearest words together with their
# similarity scores (10 results by default, ranked by cosine similarity).
model.most_similar('france')

[('spain', 0.6375303864479065),
 ('french', 0.6326056122779846),
 ('germany', 0.6314355134963989),
 ('europe', 0.626425564289093),
 ('italy', 0.6257959008216858),
 ('england', 0.6120775938034058),
 ('european', 0.6074904799461365),
 ('belgium', 0.5972345471382141),
 ('usa', 0.5948355197906494),
 ('serbia', 0.5805614590644836)]

In [11]:
# The word vectors capture many linguistic regularities.
# For example, the vector operation vector('Paris') - vector('France') + vector('Italy')
# results in a vector that is very close to vector('Rome'), and
# vector('king') - vector('man') + vector('woman') is close to vector('queen').
# Words in `positive` contribute positively and words in `negative` contribute
# negatively to the query before the nearest words are looked up.
model.most_similar(positive=['Paris', 'Italy'], negative=['France'])

[('Milan', 0.7222141027450562),
 ('Rome', 0.7028310298919678),
 ('Palermo_Sicily', 0.5967569947242737),
 ('Italian', 0.5911272764205933),
 ('Tuscany', 0.563281238079071),
 ('Bologna', 0.5608358383178711),
 ('Sicily', 0.5596384406089783),
 ('Bologna_Italy', 0.5470059514045715),
 ('Berna_Milan', 0.5464027523994446),
 ('Genoa', 0.5308901071548462)]

In [12]:
# Analogy query: vector('king') - vector('man') + vector('woman').
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.549946129322052),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [13]:
# Look up the nearest neighbours of 'ganga' in the Google News vectors.
model.most_similar('ganga')

[('jal', 0.596185028553009),
 ('Nimbu', 0.5948660969734192),
 ('pura', 0.5872788429260254),
 ('saathi', 0.5830987095832825),
 ('chillum', 0.5760275721549988),
 ('paani', 0.574300229549408),
 ('pana', 0.572252094745636),
 ('mahua', 0.5707277059555054),
 ('phensidyl', 0.5696573257446289),
 ('sada', 0.5695992708206177)]

## Visualization - Demo
https://ronxin.github.io/wevi/