In [2]:
import sent2vec
from scipy.spatial.distance import cosine
from typing import List, Tuple, Callable
from itertools import combinations
import numpy as np
from nltk import word_tokenize
from functools import lru_cache
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
import spacy
from abc import ABC, abstractmethod

In [7]:
class EmojiTranslation(ABC):
    """
    Abstract interface for objects that translate a sentence into emoji
    """

    @abstractmethod
    def summarize(self, sent: str) -> str:
        """
        Summarize the given sentence into emoji

        Args:
            sent(str): Sentence to summarize
        Rets:
            (str): Emoji summary of the sentence
        """
        # `self` is required so that subclasses can delegate with
        # super().summarize(sent) and get NotImplementedError, not TypeError
        raise NotImplementedError
        

class NGramGroupingTranslation(EmojiTranslation):
    """
    Translate a sentence into emoji by splitting it into contiguous n-grams
    and matching every n-gram to the emoji whose embedded description has the
    lowest cosine distance to the embedded n-gram.
    """

    # sent2vec model shared by all instances so it is only loaded once.
    # NOTE: once a model is cached here, the `sent2vec_model` path passed to
    # later constructors is ignored and the cached model is reused.
    sent2vec_model = None

    def __init__(self, emoji_file: str="emoji_joined.txt", sent2vec_model: str="../models/wiki_unigrams.bin", lemmatizer: Callable[[str],str]=WordNetLemmatizer().lemmatize, remove_stops: bool=False):
        """
        Load (or reuse) the sent2vec model and embed the emoji descriptions

        Args:
            emoji_file(str): Path of the tab separated emoji definition file
            sent2vec_model(str): Path of the sent2vec model to load when no
                model has been cached on the class yet
            lemmatizer(Callable[[str], str]): Function applied to every token
                when cleaning a sentence
            remove_stops(bool): Whether stop words are dropped when cleaning
        """
        self.emoji_file = emoji_file

        if NGramGroupingTranslation.sent2vec_model is None:
            self.s2v = sent2vec.Sent2vecModel()
            self.s2v.load_model(sent2vec_model)
            NGramGroupingTranslation.sent2vec_model = self.s2v
        else:
            self.s2v = NGramGroupingTranslation.sent2vec_model

        # These must be set before the embeddings are loaded because
        # load_emoji_embeddings cleans every description with them
        self.lemma_func = lemmatizer
        self.remove_stops = remove_stops
        self.emoji_embeddings = self.load_emoji_embeddings()

    def clean_sentence(self, sent: str) -> str:
        """
        Clean the given sentence using the given lemmatization technique and
        removing stop words if the flag is set

        Args:
            sent(str): Sentence to clean
        Rets:
            (str): Cleaned sentence
        """
        # The stop word set is lowercase, so compare the lowercased token;
        # otherwise capitalized stop words such as "The" would be kept
        return " ".join(
            self.lemma_func(token)
            for token in word_tokenize(sent)
            if not (self.remove_stops and token.lower() in stopwords)
        )

    def load_emoji_embeddings(self) -> List[Tuple[str, List[float]]]:
        """
        Load the emoji embeddings by embedding the emoji definitions loaded in from a file

        Every line is split on tabs: the first field is the description and
        the last field is the emoji.

        Rets:
            (List[Tuple[str, List[float]]]): (emoji, embedded description)
            pairs in file order
        """
        emoji_embeddings = []
        # Emoji are non-ASCII, so do not rely on the platform default encoding
        with open(self.emoji_file, encoding="utf-8") as emojis:
            for defn in emojis:
                # Blank lines carry no emoji and would only add empty entries
                if not defn.strip():
                    continue

                split = defn.split("\t")

                # Get the emoji and the description from the current line
                emoji = split[-1].replace("\n", "")
                desc = self.clean_sentence(split[0])

                # Add each emoji and embedded description to the list
                emoji_embeddings.append((emoji, self.s2v.embed_sentence(desc)))

        return emoji_embeddings

    # NOTE: lru_cache on a method keys on (self, sent), is shared by all
    # instances and keeps instances alive until the cache is cleared;
    # summarize() clears it after every run.
    @lru_cache(maxsize=100)
    def closest_emoji(self, sent: str) -> Tuple[str, float]:
        """
        Get the closest emoji to the given sentence

        Args:
            sent(str): Sentence (or n-gram) to check
        Ret:
            (Tuple[str, float]) Closest emoji, the respective cosine distance
            (lower means closer). If no emoji embeddings are loaded the result
            is ("", 1_000_000).
        """
        # Embed the sentence using sent2vec. The embedding is flattened
        # because scipy's cosine() expects 1-D input; presumably
        # embed_sentence returns a (1, dim) array -- TODO confirm.
        emb = np.ravel(self.s2v.embed_sentence(sent))

        # Start the lowest cosine distance at higher than it could ever be
        lowest_cos = 1_000_000

        # The best emoji starts as an empty string placeholder
        best_emoji = ""

        for emoji, emoji_emb in self.emoji_embeddings:
            # Cosine distance between the emoji's embedding and the
            # sentence's embedding
            curr_cos = cosine(np.ravel(emoji_emb), emb)

            # If it is lower than the lowest then it is the new best
            if curr_cos < lowest_cos:
                lowest_cos = curr_cos
                best_emoji = emoji

        # Return a 2-tuple containing the best emoji and its cosine distance
        return best_emoji, lowest_cos

    @staticmethod
    def sentence_combinations(sent):
        """
        Split the sentence into every possible sequence of contiguous n-grams

        The sentence is tokenized and, for every way of writing the token
        count as an ordered sum of positive integers, the tokens are grouped
        accordingly. Groupings are produced in lexicographic order of their
        group lengths (all single tokens first, the whole sentence last).

        Args:
            sent(str): Sentence to split
        Rets:
            (List[List[str]]): One list of space joined n-grams per grouping;
            empty when the sentence has no tokens
        """
        tokens = word_tokenize(sent)
        if not tokens:
            return []

        def compositions(total):
            # Yield every ordered list of positive ints summing to `total`,
            # each exactly once and in lexicographic order
            if total == 0:
                yield []
                return
            for first in range(1, total + 1):
                for rest in compositions(total - first):
                    yield [first] + rest

        sent_combos = []
        seen = set()
        for combo in compositions(len(tokens)):
            sent_combo = []
            curr_i = 0
            for combo_len in combo:
                space_joined = " ".join(tokens[curr_i:curr_i + combo_len])
                # NOTE: an n-gram repeated inside one grouping is only kept
                # once (existing behavior)
                if space_joined not in sent_combo:
                    sent_combo.append(space_joined)
                curr_i += combo_len

            # Dropping repeated n-grams can make two groupings identical
            key = tuple(sent_combo)
            if key not in seen:
                seen.add(key)
                sent_combos.append(sent_combo)

        return sent_combos

    def summarize(self, sent:str) -> Tuple[str, List[float], List[str]]:
        """
        Summarize the given sentence into emojis

        Every grouping of the cleaned sentence into n-grams is scored by the
        average cosine distance of its n-grams to their closest emoji; the
        grouping with the lowest average wins (the first one on ties).

        Args:
            sent(str): Sentence to summarize
        Rets:
            (Tuple[str, List[float], List[str]]): (Emoji Sentence,
            List of Uncertainty values for the corresponding emoji,
            list of n-grams used to generate the corresponding emoji).
            ("", [], []) when no grouping could be scored, e.g. for an
            empty sentence.
        """
        # Clean the sentence
        sent = self.clean_sentence(sent)

        # Generate all combinations of sentences
        sent_combos = self.sentence_combinations(sent)

        # Init "best" data members as empty; any real average beats infinity
        best_emojis = ""
        best_n_grams = []
        best_uncertainties = []
        best_average = float("inf")

        for sent_combo in sent_combos:
            # Closest emoji and its cosine distance for every n-gram
            matches = [self.closest_emoji(n_gram) for n_gram in sent_combo]
            uncertainties = [cos_diff for _, cos_diff in matches]

            # Check if the average uncertainty is less than the best
            # TODO: Maybe a median check would be helpful as well?
            average = sum(uncertainties) / len(uncertainties)
            if average < best_average:
                best_average = average
                best_emojis = "".join(emoji for emoji, _ in matches)
                best_n_grams = sent_combo
                best_uncertainties = uncertainties

        # Clear the function cache on closest_emoji because it is unlikely the next run will make use of them
        self.closest_emoji.cache_clear()

        # Return the emoji "sentence", list of all the cosine distances, and all of the n-grams
        return (best_emojis, best_uncertainties, best_n_grams)


In [None]:
# Demo: build a translator with the default emoji file and sent2vec model,
# then translate an example sentence into emoji
nggt = NGramGroupingTranslation()
nggt.summarize("christmas music rings from the clock tower")