In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
# !pip3 install googletrans==4.0.0rc1

In [3]:
import requests

In [4]:
URL = 'https://unicode.org/Public/emoji/15.0/emoji-test.txt'

In [5]:
def _raw_stream_unicodeorg_emojifile(url):
    resp = requests.request("GET", url, stream=True)

    POUNDSIGN = "#"
    POUNDSIGN_B = b"#"
    SEMICOLON = ";"
    SPACE = " "
    for line in resp.iter_lines():
        if not line or line.startswith(POUNDSIGN_B):
            continue
        line = line.decode("utf-8")
        codes, desc = line.split(SEMICOLON, 1)
        _, desc = desc.split(POUNDSIGN, 1)
        desc = desc.split(SPACE, 3)[-1]
        yield (codes.strip(), desc.strip())

In [6]:
def parse_unicode_sequence(string):
    return "".join((chr(int(i.zfill(8), 16)) for i in string.split()))


def parse_unicode_range(string):
    start, _, end = string.partition("..")
    start, end = map(lambda i: int(i.zfill(8), 16), (start, end))
    return (chr(i) for i in range(start, end + 1))


def stream_unicodeorg_emojifile(url=URL):
    for codes, desc in _raw_stream_unicodeorg_emojifile(url):
        if ".." in codes:
            for cp in parse_unicode_range(codes):
                yield cp, desc
        else:
            yield parse_unicode_sequence(codes), desc

In [7]:
codes = dict(stream_unicodeorg_emojifile(URL))

In [8]:
len(codes)

4733

In [9]:
texts = list(set(codes.values()))
len(texts)

3664

In [10]:
texts[:10]

['person gesturing NO: medium-light skin tone',
 'deaf person: medium-dark skin tone',
 'woman dancing: dark skin tone',
 'footprints',
 'black medium-small square',
 'flag: Solomon Islands',
 'flag: Papua New Guinea',
 'man scientist',
 'woman mountain biking: medium-light skin tone',
 'shark']

In [11]:
from googletrans import Translator

translator = Translator()

In [12]:
from tqdm import tqdm

results = {}
for t in tqdm(texts):
    r = translator.translate(t, src='en', dest = 'ms')
    results[t] = r.text

100%|████████████████████████████████████████████████████████████████████████████████████████████| 3664/3664 [39:28<00:00,  1.55it/s]


In [15]:
import json

with open('demoji-translated.json', 'w') as fopen:
    json.dump(results, fopen)

In [16]:
codes

{'😀': 'grinning face',
 '😃': 'grinning face with big eyes',
 '😄': 'grinning face with smiling eyes',
 '😁': 'beaming face with smiling eyes',
 '😆': 'grinning squinting face',
 '😅': 'grinning face with sweat',
 '🤣': 'rolling on the floor laughing',
 '😂': 'face with tears of joy',
 '🙂': 'slightly smiling face',
 '🙃': 'upside-down face',
 '\U0001fae0': 'melting face',
 '😉': 'winking face',
 '😊': 'smiling face with smiling eyes',
 '😇': 'smiling face with halo',
 '🥰': 'smiling face with hearts',
 '😍': 'smiling face with heart-eyes',
 '🤩': 'star-struck',
 '😘': 'face blowing a kiss',
 '😗': 'kissing face',
 '☺️': 'smiling face',
 '☺': 'smiling face',
 '😚': 'kissing face with closed eyes',
 '😙': 'kissing face with smiling eyes',
 '\U0001f972': 'smiling face with tear',
 '😋': 'face savoring food',
 '😛': 'face with tongue',
 '😜': 'winking face with tongue',
 '🤪': 'zany face',
 '😝': 'squinting face with tongue',
 '🤑': 'money-mouth face',
 '🤗': 'smiling face with open hands',
 '🤭': 'face with hand o

In [17]:
new_codes = {}
for k, v in codes.items():
    new_codes[k] = {'en': v, 'ms': results[v]}

In [18]:
with open('demoji.json', 'w') as fopen:
    json.dump(new_codes, fopen)

In [19]:
with open('demoji.json') as fopen:
    data = json.load(fopen)