In [2]:
# he neighborhood of a word carries important information in terms of what context the word is carrying in a sentence.
# The relationship between the word and its neighborhood tends to define the semantics of a word and its overall 
# positioning and presence in a sentence

In [3]:
# Word2vec captures relationships in text; consequently, similar words have similar representations

In [5]:
# relationship of Man:King is the same as Woman:Queen.

In [6]:
# Word2vec uses a simple neural network to build this architecture

In [7]:
# Word2vec is an unsupervised methodology for building word embeddings. In the
# Word2vec architecture, an attempt is made to do either of the following:
# Predict the target word based on the context word 
# Predict the context word based on the target word

In [8]:
# There are two main learning algorithms in word2vec : continuous bag-of-words and continuous skip-gram.

In [9]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.0.1-cp38-cp38-macosx_10_9_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 995 kB/s eta 0:00:01    |███████▋                        | 5.7 MB 1.5 MB/s eta 0:00:12     |████████████████████████▉       | 18.5 MB 416 kB/s eta 0:00:13
Collecting smart-open>=1.8.1
  Downloading smart_open-5.0.0-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 228 kB/s eta 0:00:01
[?25hInstalling collected packages: smart-open, gensim
Successfully installed gensim-4.0.1 smart-open-5.0.0
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [10]:
import gensim
from gensim.models import KeyedVectors

In [13]:
# after extracting the model is 3.64 gb

In [15]:
model_w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)

In [19]:
modelw2v.vector_size

300

In [39]:
# model.wv

In [22]:
model.most_similar('Nepal')

[('Bhutan', 0.8170939683914185),
 ('Nepali', 0.781296968460083),
 ('Nepalese', 0.7759099006652832),
 ('Kathmandu', 0.734721839427948),
 ('Himalayan_kingdom', 0.7084589004516602),
 ('Bhattarai', 0.7057297825813293),
 ('Thapa', 0.6967089176177979),
 ('Bhutanese', 0.6812286972999573),
 ('Dahal', 0.6810930371284485),
 ('Pokhara', 0.6747621297836304)]

In [11]:
# Skip-gram model

In [12]:
# trying to predict a context word when a target word is taken as input.

In [14]:
# window_size --- When the window size is defined as 5, the model takes in two words from the left and two words from the 
# right of the target word as the context words.

###  Training word2vec model

In [24]:
from gensim.models import Word2Vec

In [25]:
sentences = [['I','am','trying','to','understand','Natural','Language','Processing'],
            ['Natural','Language','Processing','is','fun','to','learn'],
            ['There','are','numerous','use','cases','of','Natural','Language','Processing']]

In [26]:
model = Word2Vec(sentences,min_count=1)

In [27]:
# The value of min_count sets a minimum threshold so that vectors are built only for words that occur more 
# often than the value specified in the min_count parameter.

In [28]:
model.vector_size

100

In [30]:
len(model.wv.key_to_index)

17

In [33]:
# Our vocabulary has a size of 17
# equal to number of unique words in the sentence we have defined

In [34]:
model.wv.key_to_index

{'Processing': 0,
 'Natural': 1,
 'Language': 2,
 'to': 3,
 'of': 4,
 'am': 5,
 'trying': 6,
 'understand': 7,
 'is': 8,
 'cases': 9,
 'fun': 10,
 'learn': 11,
 'There': 12,
 'are': 13,
 'numerous': 14,
 'use': 15,
 'I': 16}

In [35]:
# modifying min_count

In [36]:
model = Word2Vec(sentences,min_count=2)

In [37]:
model.wv.key_to_index

{'Processing': 0, 'Language': 1, 'Natural': 2, 'to': 3}

In [38]:
# Higher-dimensional vectors capture more information across dimensions, 
# especially when the corpus and vocabulary are big and the data is highly varied.


In [40]:
# playing with the vector size

In [41]:
model = Word2Vec(sentences,min_count=2,vector_size=300)

In [42]:
model.vector_size

300

In [43]:
# each of the four words that occur more than once is now represented using 300 dimensions.

In [44]:
# other parameters 
# sg =1 , uses skip-gram and if sg=0 uses CBOW

In [45]:
# negative = if value is greteaer than 1 signoifeis negative samples to use
# the interger value signifies number of negative samples to use

# workers = numbers of threads to use for training

In [47]:
model.save('modelw2v.sav')

In [48]:
## Applications of Word2vec

# document similarity or how related two or more documents are to each other
# search engines, building classification, and clustering models where sentences can be represented by 
# using embeddings of the words in them.

### Word Mover's Distance (WMD)

In [55]:
! pip install pyemd

Collecting pyemd
  Downloading pyemd-0.5.1.tar.gz (91 kB)
[K     |████████████████████████████████| 91 kB 329 kB/s eta 0:00:01
Building wheels for collected packages: pyemd
  Building wheel for pyemd (setup.py) ... [?25ldone
[?25h  Created wheel for pyemd: filename=pyemd-0.5.1-cp38-cp38-macosx_10_9_x86_64.whl size=78955 sha256=b94803d1445786a8e160bd9f76b81d4ecaf5ff1aeb3566f781e000eb36e16f1c
  Stored in directory: /Users/aayush/Library/Caches/pip/wheels/a2/a5/34/f960a47ca5c06b0e91b6f48117a79a66f53a879f8fac9529bf
Successfully built pyemd
Installing collected packages: pyemd
Successfully installed pyemd-0.5.1
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [60]:
# from pyemd import emd
# from gensim.similarities import WmdSimilarity

In [49]:
# for document similarity
# for finding resumes similar to a job description

In [50]:
# dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to 
# travel to reach the embedded words of another document.

In [51]:
model_w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)

In [52]:
sentence1= "Obama speaks to the media in Illinois"
sentence2 = "President greets the press in Chicago"
sentence3 = "Apple is my favorite company"

In [61]:
distance1 = model_w2v.wmdistance(sentence1,sentence2)
distance1

0.4277553083600646

In [64]:
# least the distance, more is the similarity between them

In [62]:
distance2 = model_w2v.wmdistance(sentence1,sentence3)
distance2

0.47793400675650705

In [63]:
distance3 = model_w2v.wmdistance(sentence2,sentence3)
distance3

0.5709657015168149