# Training word2vec on a Twitter dataset

In [1]:
from sklearn.model_selection import train_test_split
import pickle 
import pandas as pd
import os
from gensim.models import Word2Vec

### Train a word2vec model on Twitter data

In [2]:
# Work from the project root so the relative paths used in later cells
# (src/..., Data/...) resolve. The path looks like a Colab Google Drive
# mount; adjust it when running anywhere else.
os.chdir("/content/drive/MyDrive/ML/rnn/word2vec_emoji")

In [39]:
# List the training script's command-line options.
!python src/train_word2vec.py --help

usage: train_word2vec.py [-h] [--data_path DATA_PATH] [--size SIZE]
                         [--window WINDOW] [--min_count MIN_COUNT]
                         [--workers WORKERS] [--vocab_size VOCAB_SIZE]
                         [--negative NEGATIVE]

optional arguments:
  -h, --help            show this help message and exit
  --data_path DATA_PATH
                        Path to text dataset which is expected to be .txt
                        file.
  --size SIZE           word2vec model's hidden layers size.
  --window WINDOW       word2vec model's window
  --min_count MIN_COUNT
                        word2vec model's min_count
  --workers WORKERS     word2vec model's window
  --vocab_size VOCAB_SIZE
                        word2vec vocab size
  --negative NEGATIVE   negative samples


In [42]:
# Train on Data/corpus.txt with the options given here. --vocab_size is not
# passed, and the run reports it as None in the output below.
!python src/train_word2vec.py --data_path="Data/corpus.txt" --size=300 --window=5 --min_count=10 --workers=4 --negative=5

training word2vec with Namespace(data_path='Data/corpus.txt', min_count=10, negative=5, size=300, vocab_size=None, window=5, workers=4)
saving model


### Get predictions for emojis

In [17]:
from src.prediction import Prediction

In [45]:
# Load a saved model through the project's Prediction helper.
# NOTE(review): the training run above does not print where it saves the
# model; confirm that word2vec4.bin is the file it wrote.
predictor = Prediction(model_path="src/models/word2vec4.bin")

In [46]:
# Query with emoji_only=True. The output below is a list of (emoji, score)
# pairs in descending score order.
predictor.getPrediction(pos="cat", emoji_only=True)

[('🐱', 0.646553099155426),
 ('🐈', 0.5324569940567017),
 ('🐶', 0.4572571814060211),
 ('🐭', 0.36624863743782043),
 ('🐕', 0.3622549772262573),
 ('🐰', 0.34097522497177124),
 ('🐩', 0.32535815238952637),
 ('😻', 0.3226172924041748)]

In [47]:
# NOTE(review): identical to the previous cell (same query, same output);
# looks like a leftover re-run that could be removed.
predictor.getPrediction(pos="cat", emoji_only=True)

[('🐱', 0.646553099155426),
 ('🐈', 0.5324569940567017),
 ('🐶', 0.4572571814060211),
 ('🐭', 0.36624863743782043),
 ('🐕', 0.3622549772262573),
 ('🐰', 0.34097522497177124),
 ('🐩', 0.32535815238952637),
 ('😻', 0.3226172924041748)]

In [48]:
# Same emoji-only query for the word "broke".
predictor.getPrediction(pos="broke", emoji_only=True)

[('😭', 0.41767802834510803),
 ('💀', 0.3983471691608429),
 ('😩', 0.3708803057670593),
 ('🚮', 0.3530346751213074),
 ('🙄', 0.3480338752269745),
 ('😤', 0.33949151635169983),
 ('😂', 0.3224979043006897),
 ('💔', 0.32222121953964233),
 ('😑', 0.3209999203681946),
 ('😐', 0.3200664520263672),
 ('😒', 0.31943196058273315),
 ('🙃', 0.3131222724914551),
 ('😪', 0.31249216198921204),
 ('😫', 0.3077697157859802),
 ('🤒', 0.29515308141708374),
 ('😕', 0.29288020730018616),
 ('💯', 0.291854590177536),
 ('💸', 0.2913130521774292),
 ('😖', 0.29010769724845886),
 ('🏌', 0.2857724726200104),
 ('😓', 0.28321224451065063),
 ('😔', 0.2779437303543091)]

In [49]:
# Same emoji-only query for "cake". The stored output below is cut off
# mid-list.
predictor.getPrediction(pos="cake", emoji_only=True)

[('🍰', 0.7112141251564026),
 ('🧀', 0.5774571895599365),
 ('🎂', 0.5486853718757629),
 ('🍖', 0.5420448780059814),
 ('🍝', 0.5398286581039429),
 ('🍫', 0.5373072028160095),
 ('🍪', 0.5362316370010376),
 ('🍞', 0.5303407311439514),
 ('🍚', 0.5089666843414307),
 ('🍤', 0.5054020881652832),
 ('🍮', 0.5007839202880859),
 ('🍔', 0.4943925142288208),
 ('🌯', 0.4926435947418213),
 ('🍜', 0.4918401837348938),
 ('🍛', 0.4744541049003601),
 ('🍲', 0.4719371497631073),
 ('🍌', 0.45822960138320923),
 ('🍟', 0.4499587118625641),
 ('🍍', 0.4442404806613922),
 ('🍳', 0.44404780864715576),
 ('🍋', 0.44189614057540894),
 ('🍡', 0.43670231103897095),
 ('🌭', 0.4355071783065796),
 ('🍇', 0.4322744309902191),
 ('🍼', 0.41948920488357544),
 ('🍗', 0.4125286340713501),
 ('🍣', 0.406379759311676),
 ('🍯', 0.4054545760154724),
 ('🍅', 0.3853513300418854),
 ('😋', 0.38439735770225525),
 ('🍱', 0.3837733566761017),
 ('🌽', 0.37885284423828125),
 ('🎁', 0.373563289642334),
 ('🍶', 0.37098848819732666),
 ('🍈', 0.37015286087989807),
 ('🍽', 0.3695

In [50]:
# A two-word query. The stored output below is cut off mid-list.
# NOTE(review): how Prediction combines several words into one query is not
# visible in this notebook; see src/prediction.py.
predictor.getPrediction(pos="happy birthday", emoji_only=True)

[('🎂', 0.5828697681427002),
 ('🎈', 0.5597606897354126),
 ('🎉', 0.4761180877685547),
 ('🎊', 0.42458921670913696),
 ('💘', 0.37494543194770813),
 ('💗', 0.3692004978656769),
 ('😘', 0.3394244909286499),
 ('🎁', 0.3365127444267273),
 ('💞', 0.3354981243610382),
 ('💖', 0.33199426531791687),
 ('💓', 0.3303772211074829),
 ('💕', 0.3193649351596832),
 ('😚', 0.3161160945892334),
 ('💜', 0.3084559142589569),
 ('💝', 0.2958551347255707),
 ('💐', 0.2948242425918579),
 ('💟', 0.28993913531303406),
 ('💙', 0.27523234486579895),
 ('😙', 0.27499625086784363),
 ('🍰', 0.2576941251754761),
 ('💛', 0.24958959221839905),
 ('😻', 0.24478691816329956),
 ('🤗', 0.2437189817428589),
 ('🍾', 0.2403097152709961),
 ('❤', 0.23027075827121735),
 ('🦄', 0.22660471498966217),
 ('🌹', 0.21441985666751862),
 ('👸', 0.21304695308208466),
 ('😽', 0.2109816074371338),
 ('👑', 0.2105635106563568),
 ('🎀', 0.20109213888645172),
 ('🍌', 0.19830679893493652),
 ('😍', 0.19626103341579437),
 ('🌻', 0.1934908628463745),
 ('☺', 0.19319063425064087),
 ('👶

### Get similarity

In [51]:
# Similarity score between two emoji tokens (keyword-argument form).
predictor.get_similarity(w1="😘", w2="😙")

0.5593131

In [52]:
# Same call with positional arguments, this time for two plain words.
predictor.get_similarity("cat", "dog")

0.73376894

In [53]:
# Word vs. emoji. The result matches (to float precision) the top score that
# getPrediction returned for "cat" earlier in the notebook.
predictor.get_similarity("cat", "🐱")

0.64655304

### Get vectors

In [54]:
# Fetch the embedding for a single emoji. The bare `vector` on the last line
# makes the notebook display it.
vector = predictor.get_vector_embedding("😙")
vector

# Visualize embeddings

In [55]:
# Same directory as the chdir near the top of the notebook; repeated here,
# presumably so this section can be run on its own. The TSV files written
# below use relative names and therefore land in this directory.
os.chdir("/content/drive/MyDrive/ML/rnn/word2vec_emoji")

In [57]:
# Load the full gensim model object. This is the same word2vec4.bin file that
# was handed to Prediction earlier, addressed here by its absolute path.
model = Word2Vec.load("/content/drive/MyDrive/ML/rnn/word2vec_emoji/src/models/word2vec4.bin")

In [58]:
# Raw embedding matrix held by the model's KeyedVectors (one row per word).
embd_weights = model.wv.vectors

In [59]:
# All words known to the model, as the keys of the `wv.vocab` dict.
# NOTE: the iteration order of this dict is not the row order of
# `wv.vectors`. A word's row is `model.wv.vocab[word].index`, or use
# `model.wv[word]` to get its vector directly. (`wv.vocab` exists only in
# gensim < 4.0.)
vocab = model.wv.vocab.keys()

### Save embeddings

In [60]:
import io
from src.utils import is_emoji

# Export every vocabulary word and its vector as a pair of TSV files: line i
# of the vectors file holds the tab-separated embedding of the word on line i
# of the metadata file.
#
# Each vector is looked up by word (model.wv[word]) instead of by the word's
# position while iterating `vocab`. `vocab` comes from a dict whose iteration
# order is not the row order of the embedding matrix, so a positional lookup
# can pair words with the wrong vectors.
#
# No entry is skipped: a word2vec model has no padding row, so the first
# entry is a real word.
#
# The `with` block closes both files even if a write fails part-way.
with io.open('embd_vectors4.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('embd_metadata4.tsv', 'w', encoding='utf-8') as out_m:
  for word in vocab:
    vec = model.wv[word]
    out_v.write('\t'.join(str(x) for x in vec) + "\n")
    out_m.write(word + "\n")

### Save only emoji embeddings

In [61]:
import io
from src.utils import is_emoji

# Export only the emoji tokens and their vectors as a pair of TSV files:
# line i of the vectors file holds the tab-separated embedding of the emoji
# on line i of the metadata file.
#
# Each vector is looked up by word (model.wv[word]) instead of by the word's
# position while iterating `vocab`. `vocab` comes from a dict whose iteration
# order is not the row order of the embedding matrix, so a positional lookup
# can pair emojis with the wrong vectors.
#
# No entry is skipped: a word2vec model has no padding row, so the first
# entry is a real token.
#
# The `with` block closes both files even if a write fails part-way.
with io.open('embd_vectors_emoji4.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('embd_metadata_emoji4.tsv', 'w', encoding='utf-8') as out_m:
  for word in vocab:
    if not is_emoji(word):
      continue
    vec = model.wv[word]
    out_v.write('\t'.join(str(x) for x in vec) + "\n")
    out_m.write(word + "\n")

In [63]:
# Offer the full-vocabulary TSV files as browser downloads so they can be
# uploaded to the Embedding Projector. This only works inside Google Colab;
# anywhere else the import fails and the cell does nothing.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ("embd_vectors4.tsv", "embd_metadata4.tsv"):
    files.download(tsv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Now we can use the downloaded TSV files to visualize the embeddings with [TensorFlow's Embedding Projector](https://projector.tensorflow.org/)