In [1]:
import numpy as np
import pandas as pd
import json, os
import spacy
from tqdm import tqdm
from pathlib import Path

In [2]:
nlp = spacy.load('en_core_web_sm')
current_dir = Path(os.getcwd()).absolute()

In [3]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [4]:
glove_file = datapath(current_dir / 'glove.6B' / 'glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile(current_dir / 'glove.6B' / "glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

(400001, 100)

In [5]:
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)
# with open(current_dir / 'glove.6B' / 'glove.6B.100d.txt', 'r', encoding='utf-8') as f:
#     for line in tqdm(f, total=400000):
#         parts = line.split()
#         word = parts[0]
#         vec = np.array([float(v) for v in parts[1:]], dtype='f')
#         nlp.vocab.set_vector(word, vec)


In [6]:
model.most_similar("mouse")

[('cat', 0.6915251016616821),
 ('rabbit', 0.6907329559326172),
 ('monkey', 0.6769239902496338),
 ('mice', 0.6706715226173401),
 ('rat', 0.644423246383667),
 ('spider', 0.6091823577880859),
 ('bugs', 0.6057944297790527),
 ('dog', 0.5969077944755554),
 ('robot', 0.590456485748291),
 ('clone', 0.5791175365447998)]

In [67]:
def load_emojis():
    rows = []
    with open(current_dir / './emojis.json/emojis.json', encoding='utf-8') as f:
        for emoji in json.loads(f.read()):
            # Skip emoji with several unicodes
            if len(emoji['unicode'].split(' ')) > 1:
                continue
            rows.append({'name': emoji['name'], 'unicode': emoji['unicode'], 'keywords': ' '.join(emoji['keywords']), 'definition': emoji['definition']})
    return np.array(rows)


In [68]:
emojis = load_emojis()
emojis[:5]

array([{'name': 'squid', 'unicode': 'U+1F991', 'keywords': 'squid molusc food', 'definition': 'Squid was approved as part of Unicode 9.0 in 2016 and added to Emoji 3.0 in 2016. Copy and paste this emoji: Copy ?? New in\xa0iOS 10.2 http://emojipedia.org/squid/'},
       {'name': 'shrimp', 'unicode': 'U+1F990', 'keywords': 'small shrimp shellfish food', 'definition': 'Shrimp was approved as part of Unicode 9.0 in 2016 and added to Emoji 3.0 in 2016. Copy and paste this emoji: Copy ?? New in\xa0iOS 10.2 http://emojipedia.org/shrimp/'},
       {'name': 'rhinoceros', 'unicode': 'U+1F98F', 'keywords': 'rhinoceros', 'definition': 'A rhinoceros, displayed either as the entire animal, or just the face. Rhinoceros was approved as part of Unicode 9.0 in 2016 and added to E http://emojipedia.org/rhinoceros/'},
       {'name': 'lizard', 'unicode': 'U+1F98E', 'keywords': 'lizard reptile', 'definition': 'A small lizard, shown in green or brown colors, often from above. Lizard was approved as part of 

In [61]:
# df = pd.DataFrame(emojis, columns=['name', 'unicode', 'keywords', 'definition'])
# print(df.head())

In [69]:
sentence_vectors = []
valid_emojis = []

for i in tqdm(range(len(emojis))):
    row = emojis[i]
    desc = row['name'] + row['keywords']
    doc = nlp(desc)
    vecs = [model.wv[token.text] for token in doc if token.text in model.wv]
    # If we cannot generate a vector for the emoji, skip it
    if len(vecs) == 0:
        continue
    vec = np.vstack(vecs).mean(axis=0)
    sentence_vectors.append(vec)
    valid_emojis.append(row)

sentence_vectors = np.asarray(sentence_vectors)

  
100%|██████████| 1088/1088 [00:06<00:00, 177.41it/s]


In [70]:
print(sentence_vectors.shape)

(917, 100)


In [97]:
np.save(current_dir / "vectors.npy", sentence_vectors)
valid_emojis = pd.DataFrame(valid_emojis)
valid_emojis.to_csv(current_dir / "vocab.csv", index=False)


In [74]:
from numpy import dot
from numpy.linalg import norm

def most_similar(vectors, vec):
    # cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
    dst = np.dot(vectors, vec) / (norm(vectors) * norm(vec))
    return np.argsort(-dst)

In [98]:
from IPython.core.display import display, HTML

def query(v, vectors, most_n=5):
    ids = most_similar(vectors, v)[:most_n]
    print(ids)
    html = []
    for i in ids:
        row = valid_emojis.iloc[i]
        unicode = row["unicode"]
        name = row["name"]
        c = chr(int(unicode.replace('U+', ''), 16))
        print(c, name)
        html.append(c)
    display(HTML('<font size="+3">{}</font>'.format(' '.join([x for x in html]))))

In [101]:
v = model.wv["sad"]
query(v, sentence_vectors)

[677 220 615 202 309]
🐉 dragon
😭 loudly crying face
👌 OK hand
😿 crying cat face
🕺 man dancing


  """Entry point for launching an IPython kernel.


In [100]:
import emoji

values = "U+1F643".split(' ')
for v in values:
    c = chr(int(v.replace('U+', ''), 16))
    print(c, emoji.demojize(c))

🙃 :upside-down_face:
