In [1]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Toy training corpus: five short English sentences about ML / NLP.
# Each string is one "document"; tokenization happens in a later cell.
corpus = [
    "I love machine learning and natural language processing",
    "Deep learning models can achieve state of the art results",
    "Word2Vec is a technique to represent words as vectors",
    "Natural language understanding involves machine learning models",
    "Neural networks are the foundation of deep learning",
]

In [2]:
# Tokenize every sentence with gensim's simple_preprocess.
# As the recorded output shows, tokens come back lower-cased, the
# one-letter words "I" and "a" are dropped, and "Word2Vec" is split
# into 'word' and 'vec' (the digit is not kept).
tokenized_corpus = [
    simple_preprocess(text)
    for text in corpus
]
tokenized_corpus

[['love', 'machine', 'learning', 'and', 'natural', 'language', 'processing'],
 ['deep',
  'learning',
  'models',
  'can',
  'achieve',
  'state',
  'of',
  'the',
  'art',
  'results'],
 ['word',
  'vec',
  'is',
  'technique',
  'to',
  'represent',
  'words',
  'as',
  'vectors'],
 ['natural',
  'language',
  'understanding',
  'involves',
  'machine',
  'learning',
  'models'],
 ['neural', 'networks', 'are', 'the', 'foundation', 'of', 'deep', 'learning']]

In [3]:
# Train a small Word2Vec model on the tokenized sentences.
#   vector_size=10 -> each word is embedded as a 10-dimensional vector
#   window=5       -> context window size handed to gensim
#   min_count=1    -> keep every token, including those that occur once
#   seed=1         -> gensim's documented default seed, written out explicitly
#   workers=1      -> train on a single thread; the gensim docs state this is
#                     required for a deterministic run (the default uses
#                     several worker threads, whose scheduling adds jitter)
# NOTE(review): per the gensim docs, reproducibility across interpreter
# launches additionally needs PYTHONHASHSEED to be fixed in the environment.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=10,
    window=5,
    min_count=1,
    seed=1,
    workers=1,
)

In [5]:
# Show the learned vocabulary as a {token: integer index} mapping.
# In the recorded output 'learning' (the most frequent token in the corpus)
# has index 0, and all 31 distinct tokens are present because min_count=1.
print("vocabulary: ")
model.wv.key_to_index

vocabulary: 


{'learning': 0,
 'machine': 1,
 'natural': 2,
 'language': 3,
 'deep': 4,
 'models': 5,
 'of': 6,
 'the': 7,
 'foundation': 8,
 'art': 9,
 'and': 10,
 'processing': 11,
 'can': 12,
 'achieve': 13,
 'state': 14,
 'results': 15,
 'are': 16,
 'word': 17,
 'vec': 18,
 'is': 19,
 'technique': 20,
 'to': 21,
 'represent': 22,
 'words': 23,
 'as': 24,
 'vectors': 25,
 'understanding': 26,
 'involves': 27,
 'neural': 28,
 'networks': 29,
 'love': 30}

In [7]:
# Look up the embedding for the token 'learning'.
# The recorded output is a 10-element float32 array, matching vector_size=10.
word_vector = model.wv['learning']
word_vector

array([-0.00538373,  0.00234567,  0.05099428,  0.09004445, -0.09300488,
       -0.07116753,  0.0645984 ,  0.08973689, -0.05016401, -0.03759819],
      dtype=float32)

In [8]:
# Ask for the 5 vocabulary entries closest to 'learning'.
# The result is a list of (token, similarity score) pairs, highest score first.
# NOTE(review): the corpus is only five sentences, so these rankings are
# probably not meaningful beyond demonstrating the API — confirm before
# drawing conclusions from them.
similar_words = model.wv.most_similar('learning', topn=5)

similar_words

[('machine', 0.5435398817062378),
 ('to', 0.5109580159187317),
 ('and', 0.4318232536315918),
 ('understanding', 0.4006551206111908),
 ('foundation', 0.37922871112823486)]

In [None]:
# Persist the trained model to "word2vec.model"; the path is relative, so the
# file lands in the kernel's current working directory.
model.save("word2vec.model")