In [12]:
from gensim.models import Word2Vec
import pickle 
import pandas as pd
import os
from time import time

### Get data

In [2]:
# Location of the training corpus: one whitespace-tokenisable sentence per line.
CORPUS_PATH = "/content/drive/MyDrive/ML/rnn/word2vec_emoji/Data/corpus.txt"

# Stream the file line by line instead of calling readlines(): this avoids
# holding a second full copy of the corpus in memory next to the token lists.
# A bare str.split() already discards leading/trailing whitespace (including
# the newline), so no separate strip() is needed. Blank lines still yield an
# empty token list, exactly as before, so the sentence count is unchanged.
with open(CORPUS_PATH, 'r', encoding='utf-8') as f:
    train_data = [line.split() for line in f]

In [5]:
# Peek at the first five tokenised lines to sanity-check the parsing.
train_data[:5]

[['they', 'know', 'what', 'i', 'love', '😊'],
 ['lemme', 'get', 'to', 'school', '🙄'],
 ['watching', 'the', 'ellen', 'degeneres', 'show', '😃'],
 ['i',
  'spent',
  'twenty',
  'minutes',
  'herding',
  'turkeys',
  'by',
  'myself',
  'so',
  "that's",
  'how',
  'my',
  'days',
  'going',
  '🦃'],
 ['hello', 'seattle', 'cityscape', '🏗']]

In [6]:
# Number of lines (one token list per line) loaded from the corpus file.
len(train_data)

1000000

### Get model
* With `sentences=None` the constructor only creates an untrained model; the vocabulary is built and training is run explicitly in the cells below.

In [15]:
# Create the model without a corpus (sentences=None); vocabulary building and
# training are performed explicitly in later cells.
# NOTE(review): `size` is the pre-4.0 gensim keyword; gensim >= 4.0 names it
# `vector_size` — confirm the installed gensim version before upgrading.
model = Word2Vec(
    sentences=None,
    size=300,
    window=5,
    min_count=10,
    negative=5,
    workers=4,
)

### Building the Vocabulary Table:
Word2Vec requires us to build the vocabulary table before training: a single pass over the corpus that collects the unique words and counts how often each one occurs (words seen fewer than `min_count` times are dropped):

In [16]:
# Wall-clock timing of the vocabulary pass over the whole corpus.
vocab_start = time()

model.build_vocab(train_data, progress_per=10000)

# Elapsed time converted from seconds to minutes, rounded to two decimals.
vocab_minutes = round((time() - vocab_start) / 60, 2)
print('Time to build vocab: {} mins'.format(vocab_minutes))

Time to build vocab: 0.13 mins


In [17]:
# Sentence count stored on the model; the training cell passes this value as
# total_examples.
model.corpus_count

1000000

In [19]:
# Wall-clock timing of the training run.
train_start = time()

# total_examples is the sentence count stored on the model after the
# vocabulary pass; the corpus is iterated for 30 epochs.
model.train(
    train_data,
    total_examples=model.corpus_count,
    epochs=30,
    report_delay=1,
)

# Elapsed time converted from seconds to minutes, rounded to two decimals.
train_minutes = round((time() - train_start) / 60, 2)
print('Time to train the model: {} mins'.format(train_minutes))

Time to train the model: 9.84 mins


### Save model

In [20]:
# Switch the working directory to the project folder. The relative paths used
# by the following cells (model files, the `src` package) resolve against it,
# so this cell must run before them.
os.chdir("/content/drive/MyDrive/ML/rnn/word2vec_emoji")

In [21]:
# Persist the full trained model (relative path, so it lands in the current
# working directory).
# NOTE(review): Word2Vec.save writes gensim's own format, not the original
# word2vec C binary format, despite the `.bin` extension — confirm that the
# consumer loads it with the matching gensim loader.
model.save("word2vec5.bin")
# Also export just the word vectors in word2vec text format.
model.wv.save_word2vec_format('word2vec5.txt')

### Exploring results

In [23]:
from src.prediction import Prediction

In [27]:
# Load the model file that the save cell wrote into the working directory.
# The previous path ("src/models/word2vec5.bin") is never written by this
# notebook, so a fresh Restart & Run All would either fail or silently explore
# a stale, manually copied model.
# NOTE(review): assumes Prediction takes the path of a model saved with
# Word2Vec.save — confirm in src/prediction.py.
predictor = Prediction("word2vec5.bin")

In [29]:
# Ask for the emoji associated with the word "cat". The recorded output is a
# list of (emoji, score) pairs in descending score order.
# NOTE(review): emoji_only=True presumably filters ordinary words out of the
# results — confirm in src/prediction.py.
predictor.getPrediction("cat", emoji_only=True)

[('🐱', 0.41397377848625183),
 ('🐈', 0.36478036642074585),
 ('🐶', 0.294055700302124),
 ('😺', 0.25501549243927),
 ('🐕', 0.24377763271331787),
 ('🐹', 0.2014644294977188)]

In [30]:
# Same query for the word "dog". The recorded output is a list of
# (emoji, score) pairs in descending score order.
# NOTE(review): emoji_only=True presumably filters ordinary words out of the
# results — confirm in src/prediction.py.
predictor.getPrediction("dog", emoji_only=True)

[('🐶', 0.4354158043861389),
 ('🐱', 0.3315426707267761),
 ('🐕', 0.30471840500831604),
 ('🐩', 0.2935483455657959),
 ('🐷', 0.24418601393699646),
 ('🐭', 0.2383570373058319),
 ('🌭', 0.2345047891139984)]

### Similarity

In [33]:
# Similarity score between the vectors for "cat" and "dog".
# NOTE(review): presumably cosine similarity — verify against
# Prediction.get_similarity.
predictor.get_similarity("cat", "dog")

0.5570577