In [6]:
# Basic libraries import
import numpy as np
import pandas as pd
import seaborn as sns
import gensim, logging

# Plotting
%matplotlib notebook
%matplotlib inline

sns.set_context("notebook", font_scale=1.5)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Intro
This notebook explores word embeddings.

A first test is done with Word2Vec, using Gensim.

**Resources**

* https://rare-technologies.com/word2vec-tutorial/
* https://radimrehurek.com/gensim/models/word2vec.html

# Prepare Data

In [4]:
# Tiny hand-written corpus: one plain-text sentence per list entry.
sentences = [
    "A brown fox jumped on the lazy dog",
    "A brown fox jumped on the brown duck",
    "A brown fox jumped on the lazy elephant",
    "An elephant is eating green grass near the alpaca",
    "A green alpaca tried to jump over an elephant",
    "May you rest in a deep and dreamless slumber",
]

# Train Word2Vec

In [10]:
# Word2Vec hyper-parameters (passed to the training call)
size = 200           # word vector dimensionality (size of the NN layers)
min_count = 1        # minimum number of occurrences for a word to be considered
workers = 4          # parallel training threads (only effective with Cython installed)
window = 10          # context window size
downsampling = 1e-3  # down-sampling setting for frequent words

In [12]:
# Fit Word2Vec on the corpus; each sentence is tokenized by whitespace splitting.
print("Training model...")
tokenized_sentences = [sentence.split() for sentence in sentences]
model = gensim.models.Word2Vec(
    tokenized_sentences,
    size=size,
    window=window,
    min_count=min_count,
    sample=downsampling,
    workers=workers,
)

Training model...


In [None]:
# If you don't plan to train the model any further, calling
# init_sims(replace=True) will make the model much more memory-efficient.
# Skip this cell if training is going to be continued later.
# NOTE(review): init_sims is reported as deprecated in newer gensim
# releases -- confirm against the installed version.
model.init_sims(replace=True)

In [None]:
# save model
# `corpus_name` is not assigned in any visible cell of this notebook, so the
# original cell raised a NameError on a fresh kernel. Label the hand-written
# toy corpus explicitly; change this value when training on another corpus.
corpus_name = "toy_sentences"

# Encode the corpus label and the main hyper-parameters in the file name so
# that models trained with different settings do not overwrite each other.
model_name = "w2v_{}_size{}_mincount{}_window{}".format(corpus_name, size, min_count, window)
model.save(model_name)

# Test Word2Vec Model

In [None]:
# load model
# Read the model back from the path stored in `model_name` and rebind `model`
# to the loaded instance.
model = gensim.models.Word2Vec.load(model_name)

In [None]:
# Sanity-check queries against the trained word vectors.
#
# Fixes relative to the original cell:
#   * a stray ';' before `.split()` made the cell a SyntaxError;
#   * the query words (woman/king/man, breakfast/...) do not occur in the toy
#     corpus, so every call raised KeyError -- in-vocabulary words are used;
#   * only the last expression of a cell is displayed, so the three results
#     are collected and shown together;
#   * queries go through `model.wv` (the KeyedVectors), the documented entry
#     point, instead of the deprecated model-level methods.
#
# With such a tiny corpus the returned values are not semantically meaningful;
# the cell only demonstrates the query API.

# Analogy-style query: words close to 'elephant' + 'fox' - 'dog'.
analogy = model.wv.most_similar(positive=['elephant', 'fox'], negative=['dog'], topn=1)

# Word that fits least with the others in the list.
odd_one_out = model.wv.doesnt_match("fox dog duck grass".split())

# Cosine similarity between two word vectors.
pair_similarity = model.wv.similarity('fox', 'elephant')

analogy, odd_one_out, pair_similarity

In [19]:
# sentence to tensor
# Look up several words at once: the result stacks one vector per word, so its
# shape is (number of words, vector size). The lookup goes through `model.wv`
# because indexing the Word2Vec model object directly is deprecated.
model.wv[['brown', 'fox']].shape

(2, 200)

In [23]:
import nltk
import itertools
import collections
# Token frequencies over the whole corpus (whitespace tokenization, case-sensitive).
nltk.FreqDist(itertools.chain.from_iterable(sentence.split() for sentence in sentences))

FreqDist({'A': 4,
          'An': 1,
          'May': 1,
          'a': 1,
          'alpaca': 2,
          'an': 1,
          'and': 1,
          'brown': 4,
          'deep': 1,
          'dog': 1,
          'dreamless': 1,
          'duck': 1,
          'eating': 1,
          'elephant': 3,
          'fox': 3,
          'grass': 1,
          'green': 2,
          'in': 1,
          'is': 1,
          'jump': 1,
          'jumped': 3,
          'lazy': 2,
          'near': 1,
          'on': 3,
          'over': 1,
          'rest': 1,
          'slumber': 1,
          'the': 4,
          'to': 1,
          'tried': 1,
          'you': 1})

In [24]:
# Same token count using only the standard library.
collections.Counter(token for sentence in sentences for token in sentence.split())

Counter({'A': 4,
         'An': 1,
         'May': 1,
         'a': 1,
         'alpaca': 2,
         'an': 1,
         'and': 1,
         'brown': 4,
         'deep': 1,
         'dog': 1,
         'dreamless': 1,
         'duck': 1,
         'eating': 1,
         'elephant': 3,
         'fox': 3,
         'grass': 1,
         'green': 2,
         'in': 1,
         'is': 1,
         'jump': 1,
         'jumped': 3,
         'lazy': 2,
         'near': 1,
         'on': 3,
         'over': 1,
         'rest': 1,
         'slumber': 1,
         'the': 4,
         'to': 1,
         'tried': 1,
         'you': 1})