In [2]:
import sent2vec
from scipy.spatial.distance import cosine
from typing import List, Tuple
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Initialize the sent2vec model and load its weights from the local model file
s2v = sent2vec.Sent2vecModel()
s2v.load_model('../models/wiki_unigrams.bin') # https://drive.google.com/open?id=0B6VhzidiLvjSa19uYWlLUEkzX3c

# Initialize the NLTK lemmatizer (shared by clean_sentence below)
lemmatizer = WordNetLemmatizer() 

In [4]:
def clean_sentence(sent: str) -> str:
    """
    Lower-case, tokenize and lemmatize a sentence

    TODO: More complex cleaning when the dataset gets messier

    Args:
        sent(str): Sentence to clean
    Rets:
        (str): The lemmatized tokens joined back together with single spaces
    """
    # Tokenize the lower-cased text with NLTK
    tokens = word_tokenize(sent.lower())

    # Run every token through the module-level lemmatizer
    lemmas = map(lemmatizer.lemmatize, tokens)

    # Glue the lemmas back into one space-separated string
    return " ".join(lemmas)

In [7]:
# Define the list that stores the (emoji, embedded description) 2-tuples
emoji_embeddings = []
# Open the file that stores one "description<TAB>emoji" pair per line.
# The encoding is given explicitly: the file contains emoji, and the platform
# default encoding is not guaranteed to be able to decode them.
with open("emoji_joined.txt", encoding="utf-8") as emojis:
    for defn in emojis:
        # The file is tab-delim; drop the trailing newline before splitting
        split = defn.rstrip("\n").split("\t")

        # Skip blank or malformed lines (no tab separator) instead of
        # embedding them as if the whole line were both emoji and description
        if len(split) < 2:
            continue

        # Get the emoji (last field) and the description (first field)
        emoji = split[-1]
        desc = split[0]

        # Add each emoji and embedded description to the list
        emoji_embeddings.append((emoji, s2v.embed_sentence(desc)))

In [16]:
def closest_emoji(sent: str) -> Tuple[str, float]:
    """
    Get the emoji whose description embedding is closest to the given sentence

    Args:
        sent(str): Sentence to check
    Ret:
        (Tuple[str, float]) Closest emoji and its cosine distance to the
        sentence (lower means closer). If no emoji ever compares lower than
        the starting value (e.g. emoji_embeddings is empty), the placeholder
        pair ("", 1_000_000) is returned; callers use 1_000_000 as the
        "no match" marker.

    """

    # Embed the sentence using sent2vec
    emb = s2v.embed_sentence(sent)

    # Start the lowest distance at higher than it could ever be; this value
    # doubles as the "no match" marker, so it must not be changed lightly
    lowest_cos = 1_000_000

    # The best emoji starts as an empty string placeholder
    best_emoji = ""

    # Loop through the (emoji, embedding) 2-tuples
    for emoji, emoji_emb in emoji_embeddings:
        # Cosine distance between the emoji's embedding and the sentence's
        # embedding
        curr_cos = cosine(emoji_emb, emb)

        # Strict comparison: ties keep the earlier emoji, and a distance that
        # is not comparable (presumably NaN for an all-zero embedding --
        # TODO confirm) never replaces the current best
        if curr_cos < lowest_cos:
            lowest_cos = curr_cos
            best_emoji = emoji

    # Return a 2-tuple containing the best emoji and its cosine distance
    return best_emoji, lowest_cos

In [8]:
def summarize(sent: str) -> Tuple[str, List[float], List[str]]:
    """
    Summarize a sentence as a sequence of emoji using a growing n-gram window

    The sentence is cleaned and tokenized, then a window of tokens is grown one
    token at a time. While the closest emoji keeps getting closer, the window
    keeps growing; as soon as adding a token stops improving the distance, the
    best emoji seen for that window is emitted and a new window is started at
    the token that ended the streak.

    Args:
        sent(str): Sentence to summarize
    Rets:
        (Tuple[str, List[float], List[str]]): The emitted emoji joined into one
        string, the cosine distance recorded for each emitted emoji, and the
        n-gram recorded for each emitted emoji. Note that a recorded n-gram
        includes the token that ended the streak, so consecutive n-grams
        overlap by one token. An empty sentence yields ("", [], []).
    """
    # Distance closest_emoji returns when it found no emoji at all
    no_match = 1_000_000
    # Starting "best distance" of a fresh window; above any real distance but
    # below no_match so that a no-match result is never taken as the best
    unset = 1000

    # Clean and tokenize the sentence
    tokens = word_tokenize(clean_sentence(sent))
    n_tokens = len(tokens)

    # Create lists to store the return values
    emojis = []
    uncertainty = []
    grams = []

    # Best emoji / distance seen so far for the current window
    best_emoji = ""
    best_cos = unset

    # Start of the trailing n-gram; `end` is its exclusive end
    start = 0
    for end in range(1, n_tokens + 1):
        gram = " ".join(tokens[start:end])
        curr_emoji, curr_cos = closest_emoji(gram)

        if curr_cos < best_cos:
            # Still improving: keep growing the window
            best_emoji, best_cos = curr_emoji, curr_cos
        elif curr_cos != no_match:
            # The new token did not help: emit the best result of this window
            emojis.append(best_emoji)
            uncertainty.append(best_cos)
            grams.append(gram)

            # Reset BOTH trackers so a later window that never finds a match
            # cannot re-emit this window's emoji
            best_emoji, best_cos = "", unset

            # Nothing left to scan; the window was already emitted above
            if end == n_tokens:
                break

            # Restart the window at the token that ended the streak
            start = end - 1

        # Last token reached without emitting: flush whatever the current
        # window holds (start is unchanged here, so `gram` is still accurate)
        if end == n_tokens:
            emojis.append(best_emoji)
            uncertainty.append(best_cos)
            grams.append(gram)

    return ("".join(emojis), uncertainty, grams)

In [17]:
# Try the summarizer on a sample sentence; the result is (emoji string, distances, n-grams)
summarize("isn't perfect but it is a start")

('🙅💯🌱',
 [0.3903335928916931, 0.22808468341827393, 0.3317239284515381],
 ["is n't perfect", 'perfect but it', 'it is a start'])

3.665261890427453