In [1]:
import spacy
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
current_path = Path(".").resolve()
project_path = current_path.parent

In [3]:
def load_vocab(fpath):
    """
    Load the emoji vocab.
    """
    vocab = {}
    with open(fpath, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f):
            emoji = line.strip()
            vocab[emoji] = idx
    
    return vocab

In [4]:
vocab = load_vocab(current_path / "data" / "vocab.txt")

In [5]:
list(vocab)[:10]

['PAD', '🏊', '💃🏻', '🅱️', '⬛', '☔️', '📱', '🇯🇲', '👶🏿', '3⃣️']

In [6]:
nlp = spacy.load("en_core_web_md")

In [23]:
import json
import emoji
import string
import re

# A dict that maps the emoji's unicode to its info
emojinet = {}

with open(current_path / '..' / 'preprocess_emojinet' / 'emojis.json' / 'emojis.json', encoding='utf-8') as f:
    # for emoji_dict in json.loads(f.read()):
    #     # If the emoji doeasn't have a shortcode
    #     if "shortcode" not in emoji_dict or emoji_dict["shortcode"] is None:
    #         c_list = []
    #         for unicode in emoji_dict["unicode"].split(" "):
    #             c_list.append(chr(int(unicode.replace('U+', ''), 16)))

    #         # if len(emoji_dict["unicode"].split(" ")) > 1:
    #         #     unicode = emoji_dict["unicode"].split(" ")[1]
    #         #     c = chr(int(unicode.replace('U+', ''), 16))
                
            
    #         # unicode = emoji_dict["unicode"].split(" ")[0]
    #         # c = chr(int(unicode.replace('U+', ''), 16))
    #         c_list = [c for c in c_list if c in vocab]
    #         if len(c_list) == 0:
    #             continue
    #     else: 
    #         c_list = [emoji.emojize(emoji_dict["shortcode"], use_aliases=True)]

    #     for c in c_list:
    #         emojinet[c] = {'name': emoji_dict['name'], 
    #         'keywords': emoji_dict['keywords'],
    #         'definition': emoji_dict['definition']}
    for emoji_dict in json.loads(f.read()):
        token_list = []
        desc = re.sub(r'^https?:\/\/.*[\r\n]*', '', emoji_dict['definition'])   # Remove the URL
        for token in nlp(desc):
            if token.text in spacy.lang.en.stop_words.STOP_WORDS or token.text in string.punctuation:
                continue
            token_list.append(token.text)
        
        # Usually 
        token_list = token_list[:-1]
        
        desc = " ".join(token_list)
        emojinet[emoji_dict["unicode"]] = {'name': emoji_dict['name'], 
            'keywords': emoji_dict['keywords'],
            'definition': desc}

In [24]:
emojinet

{'U+1F645 U+1F3FF U+200D U+2640 U+FE0F': {'name': 'woman gesturing NO: dark skin tone',
  'keywords': ['dark skin tone',
   'hand',
   'forbidden',
   'gesture',
   'woman',
   'prohibited',
   'no'],
  'definition': 'The Woman Gesturing Not OK Type-6 emoji sequence Face With No Good Gesture Emoji Modifier Fitzpatrick Type-6 Female S'},
 'U+1F482 U+1F3FF U+200D U+2640 U+FE0F': {'name': 'woman guard: dark skin tone',
  'keywords': ['dark skin tone', 'woman', 'guard'],
  'definition': 'The Female Guard Type-6 emoji sequence Guardsman Emoji Modifier Fitzpatrick Type-6 Female Sign Variation Select'},
 'U+1F3C3 U+200D U+2640 U+FE0F': {'name': 'woman running',
  'keywords': ['racing', 'running', 'woman', 'marathon'],
  'definition': 'The female version \xa0 Runner emoji The Woman Running emoji sequence Runner Female Sign Variation Selector-16'},
 'U+1F938 U+1F3FC U+200D U+2640 U+FE0F': {'name': 'woman cartwheeling: medium-light skin tone',
  'keywords': ['gymnastics', 'medium-light skin tone

To search the emoji inside EmojiNet, we need to convert each emoji into a string of unicode.

In [25]:
def emoji2unicode_str(e):
    return " ".join(['U+{:X}'.format(ord(c)) for c in e])

In [26]:
e = "💃🏻"
emoji2unicode_str(e)

'U+1F483 U+1F3FB'

In [27]:
i = 0
for e in vocab:
    if e == "PAD":
        continue
    unicode = emoji2unicode_str(e)
    if unicode in emojinet:
        i += 1

print("Vocab size for emoji2vec:", len(vocab))
print("Number of emojies covered by EmojiNet:", i)

Vocab size for emoji2vec: 1662
Number of emojies covered by EmojiNet: 1519


Generate new training samples using the descriptions and keywords from EmojiNet

In [28]:
"""
Generate new samples using the description and keywords.
"""

desc_list = []
emoji_list = []
keyword_list = []

for e in vocab:
    unicode = emoji2unicode_str(e)
    if unicode not in emojinet:
        continue
    desc_list.append(emojinet[unicode]["definition"])
    emoji_list.append(e)
    keyword_list.append(" ".join(emojinet[unicode]["keywords"]))

labels = [1 for i in range(len(emoji_list))]

desc_df = pd.DataFrame({"description": desc_list, "emoji": emoji_list, "label": labels})
keyword_df = pd.DataFrame({"keywords": keyword_list, "emoji": emoji_list, "label": labels})

In [29]:
desc_df

Unnamed: 0,description,emoji,label
0,A person swimming pool ocean body water Lack...,🏊,1
1,This version Woman Dancing emoji Light Skin To...,💃🏻,1
2,Black Large Square approved Unicode 5.1 2008 a...,⬛,1
3,A mobile phone referred cellular cell phone ...,📱,1
4,The flag Jamaica letters JM platforms Th...,🇯🇲,1
...,...,...,...
1514,A bar chart showing comparison figures data Ba...,📊,1
1515,This version Baby Angel emoji Medium dark Skin...,👼🏾,1
1516,An embarrassed face flushed red cheeks Thi...,😳,1
1517,A large ship transport people cargo ocean May ...,🚢,1


In [30]:
keyword_df

Unnamed: 0,keywords,emoji,label
0,swim,🏊,1
1,light skin tone dancing woman,💃🏻,1
2,square geometric,⬛,1
3,cell phone telephone mobile,📱,1
4,flag,🇯🇲,1
...,...,...,...
1514,chart bar graph,📊,1
1515,fairy tale medium-dark skin tone fantasy face ...,👼🏾,1
1516,flushed face dazed,😳,1
1517,passenger boat,🚢,1


In [31]:
desc_df.to_csv(current_path / "data" / "train_desc.txt", sep="\t", index=False, header=False)

In [32]:
keyword_df.to_csv(current_path / "data" / "train_keyword.txt", sep="\t", index=False, header=False)