# Get word embeddings using Word2vec

### 0. Before we get started

In [9]:
import io

def load_corpus(fname):
    """Read a text corpus with one document per line.

    Each line of the file is split on whitespace into a list of tokens.

    Parameters
    ----------
    fname : str
        Path to a UTF-8 text file. Bytes that cannot be decoded are
        dropped (``errors='ignore'``).

    Returns
    -------
    list of list of str
        One token list per line, in file order. A blank line yields an
        empty list.
    """
    # newline='\n' makes '\n' the only line terminator, so a stray '\r'
    # inside a document does not start a new document.
    # The context manager closes the handle even if reading fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        return [line.split() for line in fin]

In [10]:
def save_dictionary(fname, dictionary, args):
    """Write a word -> vector mapping to a text file.

    The file starts with a header line ``"<length> <dimension>"`` followed
    by one line per word: the word, then its vector components, all
    separated by single spaces.

    Parameters
    ----------
    fname : str
        Destination path; the file is created or overwritten, UTF-8 encoded.
    dictionary : mapping
        Maps each word to an iterable of numbers. Components are written
        with ``str()``.
    args : tuple of (int, int)
        ``(length, dimension)`` values written verbatim into the header.
        They are not checked against the contents of ``dictionary``.
    """
    length, dimension = args
    # The context manager flushes and closes the file, so every row is on
    # disk by the time this function returns.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write('%d %d\n' % (length, dimension))
        for word in dictionary:
            fout.write('%s %s\n' % (word, ' '.join(map(str, dictionary[word]))))
        
def load_dictionary(fname):
    """Read a word -> vector mapping from a text file.

    The expected layout is a header line ``"<length> <dimension>"``
    followed by one line per word: the word, then its vector components,
    separated by single spaces.

    Parameters
    ----------
    fname : str
        Path to a UTF-8 text file. Bytes that cannot be decoded are
        dropped (``errors='ignore'``).

    Returns
    -------
    dict
        Maps each word to a ``list`` of ``float`` components.

    Raises
    ------
    ValueError
        If the header is not exactly two integers, or a vector component
        cannot be parsed as a float.
    """
    dictionary = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed to read the rows; parsing them
        # only rejects files that do not start with a valid header.
        _length, _dimension = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the vector: on Python 3 a bare map() is a
            # single-use iterator, not a reusable sequence.
            dictionary[tokens[0]] = list(map(float, tokens[1:]))
    return dictionary

### 1. Load our corpus

In [None]:
# Each corpus line becomes one document: a list of whitespace-separated tokens.
documents = load_corpus('origin_corpus_oss.txt')

As a sanity check: the corpus should contain 12692 documents.

In [None]:
# Number of documents loaded (one per line of the corpus file).
len(documents)

### 2. Train word2vec model

In [None]:
%%time
# NOTE: the %%time cell magic must remain the first line of this cell.
from gensim.models import Word2Vec

# Size of each word vector; reused later for the header of the saved dictionary file.
dimension = 20
# min_count=1 keeps every token, including those that occur only once.
# NOTE(review): `sg` is left at its default — presumably CBOW, matching the
# 'cbow_' prefix of the output file name; confirm against the gensim docs.
model = Word2Vec(sentences=documents, vector_size=dimension, min_count=1)

In [None]:
# Plain {word: vector} mapping covering every key in the trained model's vocabulary.
dictionary = {key : model.wv[key] for key in model.wv.key_to_index}

As a sanity check: the dictionary should contain 192881 distinct words (with `min_count=1`).

In [None]:
# Vocabulary size: number of distinct words that received a vector.
len(dictionary)

With a word2vec model, you can look up the words closest to a given word (e.g. `model.wv.most_similar(word)`). For instance, I test the quality of the model on abstract nouns.

### 3. Save dictionary in file 

In [None]:
# Header of the output file is "<number of words> <vector dimension>".
save_dictionary('cbow_oss_dict.txt', dictionary, (len(dictionary), dimension))

### 4. Check that everything is saved correctly (optional)

In [None]:
# Round-trip check: read the saved file back and compare entry counts.
loaded_dictionary = load_dictionary('cbow_oss_dict.txt')
# Displays True when no words were lost; only the number of entries is
# compared, not the vector values.
len(dictionary) == len(loaded_dictionary)