# Emojis

In [1]:
import re
def clean_code_label(code, label):
    """Split a ``..``-delimited code field and label field into parallel lists.

    Both arguments are split on the literal ``".."`` separator and one label
    is taken, by position, for every code piece.

    Args:
        code: Code field, e.g. ``"A..B"`` or a single value without ``".."``.
        label: Label field using the same ``".."`` separator.

    Returns:
        A ``(codes, labels)`` tuple of two lists of equal length.

    Raises:
        IndexError: If ``label`` splits into fewer pieces than ``code``.
    """
    code_parts = code.split("..")
    label_parts = label.split("..")
    # Index by position (not zip) so a label field with too few pieces still
    # raises; surplus label pieces are simply not used.
    paired_labels = [label_parts[pos] for pos in range(len(code_parts))]
    return code_parts, paired_labels

In [2]:
def get_emojis_dict(emojis_path="emojis.txt"):
    """Build a ``{code: label}`` dictionary from an emoji data text file.

    Only lines containing ``"; Emoji"`` are considered. On such a line the
    label is the text following the first ``")"`` + whitespace that is
    immediately followed by a letter, and the code is the line's first
    whitespace-separated field. Code and label are passed through
    ``clean_code_label`` so a ``..`` range contributes one entry per listed
    end point. Lines without such a label are skipped.

    Args:
        emojis_path: Path of the text file to parse.

    Returns:
        dict mapping each code string to its label string.
    """
    # The file carries literal emoji glyphs, so decode it explicitly instead of
    # relying on the platform default encoding.
    # NOTE(review): assumes the file is UTF-8 (as Unicode's emoji data files are) - confirm.
    with open(emojis_path, "r", encoding="utf-8") as f:
        emojis_raw = f.read()

    emojis_dict = dict()  # {unicode: label}
    for line in emojis_raw.split("\n"):
        if "; Emoji" not in line:
            continue
        emoji_info = re.findall(r"\)\s+[A-Za-z]+.+", line)
        if not emoji_info:
            # No label starting with a letter on this line.
            continue
        # Raw-string pattern: "\)" and "\s" are regex escapes, not string escapes.
        label_raw = re.sub(r"\)\s+", "", emoji_info[0])
        code_raw = line.split()[0]

        codes, labels = clean_code_label(code_raw, label_raw)
        emojis_dict.update(zip(codes, labels))
    return emojis_dict

In [3]:
# Parse the default emoji file ("emojis.txt") into a {code: label} dictionary.
emojis_dict = get_emojis_dict()

# Emoticons

In [4]:
import urllib.request as ureq
from bs4 import BeautifulSoup

def download_emoticons(url="https://pc.net/emoticons/"):
    """Scrape an ``{emoticon: label}`` mapping from an HTML page.

    Every element with CSS class ``smiley`` is treated as one emoticon: its
    text is the key, and the label is the second ``/``-separated component of
    the ``href`` of its first ``<a>`` descendant. If the same emoticon text
    occurs more than once, the last occurrence wins.

    Args:
        url: Page to download.

    Returns:
        dict mapping emoticon text to its label.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # urlopen replaces the deprecated FancyURLopener; the context manager
    # closes the HTTP response once the body has been read.
    with ureq.urlopen(url) as response:
        content = response.read()

    # Name the parser explicitly. "html" is only a markup type, so Beautiful
    # Soup had to guess a parser and emitted a warning. "html.parser" ships
    # with Python and needs no extra package.
    # NOTE(review): the warning recorded in the notebook suggests lxml was the
    # guessed parser on the author's machine - switch to "lxml" if results differ.
    soup = BeautifulSoup(content, "html.parser")
    emoticons_html_tags = soup.find_all(class_="smiley")

    return {
        tag.text: tag.a.attrs["href"].split("/")[1]
        for tag in emoticons_html_tags
    }

In [5]:
# Download the emoticon -> label mapping from the default URL (needs network access).
emoticons_dict = download_emoticons()

  """


 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [6]:
import json

In [7]:
# json.dump({"emojis":emojis_dict, "emoticons":emoticons_dict},
#           open("e_labels.json","w"),
#           ensure_ascii=False)

In [8]:
# Reload the saved label dictionaries; the context manager makes sure the
# file handle is closed once the JSON has been parsed.
with open("e_labels.json", "r") as labels_file:
    e_labels = json.load(labels_file)
emojis_dict = e_labels["emojis"]
emoticons_dict = e_labels["emoticons"]

In [9]:
# Merge into a *new* dictionary. A plain `tags_dict = emojis_dict` only creates
# a second name for the same object, so the update would also add every
# emoticon to emojis_dict itself.
tags_dict = dict(emojis_dict)
tags_dict.update(emoticons_dict)

In [10]:
# Sanity check: number of entries in emoticons_dict.
len(emoticons_dict)

74

In [11]:
import copy

In [12]:
# Union of both dictionaries, built on an independent deep copy of
# emoticons_dict; when a key exists in both, the emojis_dict entry wins.
tags_dict = {**copy.deepcopy(emoticons_dict), **emojis_dict}

In [13]:
class Node:
    """One node of the emoticon trie.

    Attributes:
        char: The character stored at this node.
        children: List of child ``Node`` objects, initially empty.
        tag: Label attached to this node, or ``None`` when it has none.
        counter: Usage count for this node; a new node starts at 1.
    """

    def __init__(self, char: str):
        self.char = char
        self.children = []
        self.tag = None
        self.counter = 1  # a freshly created node counts as used once


def add(root, emotic: str, tag: str):
    """Insert ``emotic`` into the trie under ``root`` and tag its final node.

    The trie is walked one character at a time. An existing child holding the
    character is reused and its ``counter`` incremented; otherwise a new
    ``Node`` is appended to the current node's children. The node reached
    after the last character receives ``tag`` (for an empty ``emotic`` that
    node is ``root`` itself).

    Args:
        root: Trie node to start from.
        emotic: Character sequence to insert.
        tag: Label stored on the node of the last character.
    """
    current = root
    for symbol in emotic:
        # First child carrying this character, if any.
        match = next((kid for kid in current.children if kid.char == symbol), None)
        if match is None:
            # Unseen character at this depth: grow the trie.
            match = Node(symbol)
            current.children.append(match)
        else:
            match.counter += 1
        current = match
    current.tag = tag  # node of the last character carries the label

# Build the trie: start from a root node holding the empty string and insert
# every key of tags_dict together with its tag.
root = Node("")
for emot, tag in tags_dict.items():
    add(root, emot, tag)

In [14]:
import pickle

In [15]:
# Persist the trie; the context manager flushes and closes the file so the
# pickle is complete on disk before anything reads it back.
with open("emoticons_trie.pkl", "wb") as trie_file:
    pickle.dump(root, trie_file)

In [16]:
# Reload the trie. pickle.load can execute arbitrary code, so only use it on a
# file you produced yourself. The context manager closes the file afterwards.
with open("emoticons_trie.pkl", "rb") as trie_file:
    root = pickle.load(trie_file)

In [17]:
def get_emoticon_tag(token:str):
    """Return the tag of the longest tagged trie prefix of ``token``.

    The module-level trie ``root`` is walked character by character for as
    long as a child holds the next character of ``token``. The tag of the
    deepest *tagged* node on that path is returned, so trailing characters
    after a known entry (e.g. repeated or extra punctuation) do not prevent a
    match.

    Previously the tag was overwritten by every visited node, so stepping
    into an untagged node that belongs to a longer entry discarded a tag that
    had already been found.

    Args:
        token: Text to look up.

    Returns:
        The tag string, or ``None`` when no prefix of ``token`` is tagged.
    """
    node = root
    tag = None

    for char in token:
        child = next((c for c in node.children if c.char == char), None)
        if child is None:
            break  # the trie has no continuation for this character
        node = child
        if node.tag is not None:
            tag = node.tag  # remember the longest tagged prefix so far
    return tag

In [18]:
def generate_tagged_text(text:str):
    """Wrap every recognised token of ``text`` in an ``<emoticon>`` element.

    ``text`` is split on whitespace. Each token is looked up with
    ``get_emoticon_tag``, first upper-cased and then, if that yields nothing,
    unchanged. A token with a tag is replaced by
    ``<emoticon type=TAG>TOKEN</emoticon>`` (the original token text is kept);
    other tokens pass through. Tokens are re-joined with single spaces.

    Args:
        text: Input text.

    Returns:
        The text with recognised tokens wrapped.
    """
    def wrap(tag, emoticon):
        return "<emoticon type={}>{}</emoticon>".format(tag, emoticon)

    rendered = []
    for token in text.split():
        # Emojis codes are all upper case, so the upper-cased form is tried first.
        tag = get_emoticon_tag(token.upper()) or get_emoticon_tag(token)
        rendered.append(wrap(tag, token) if tag else token)
    return " ".join(rendered)

In [19]:
## used for unittesting
def is_emot(self, txt:str):
    """Loose check of whether ``txt`` counts as an emoticon.

    ``txt`` qualifies when it is a key of ``self.tags_dict``, or when some key
    occurs anywhere inside ``txt`` and shares its first character with ``txt``.

    Args:
        txt: Text to check.

    Returns:
        ``True`` if ``txt`` qualifies, ``False`` otherwise.
    """
    if txt in self.tags_dict:
        return True
    # Every key is examined (no early exit), matching a plain full scan.
    hits = [key for key in self.tags_dict if key in txt and key[0] == txt[0]]
    return bool(hits)

In [20]:
generate_tagged_text("I will always find it funny . Lol :)))))). However it is not so funny for my girlfriend :-O . Anyway, I love her <3")

'I will always find it funny \uf04a. Lol <emoticon type=smile>:)))))).</emoticon> However it is not so funny for my girlfriend :-O . Anyway, I love her <emoticon type=love><3</emoticon>'

In [21]:
# Membership test for the literal five-character key "uF04A" in tags_dict.
# NOTE(review): presumably probing why the '\uf04a' character in the output
# above was left untagged - confirm the intended key format.
"uF04A" in tags_dict

False