In [1]:
import re  # Import regex module for text processing
import os  # Import os module for file and directory management
import numpy as np  # Import NumPy for numerical operations
import pandas as pd  # Import pandas for data manipulation and analysis
import torch  # Import PyTorch for deep learning tasks
from transformers import AutoTokenizer, AutoModel  # Import Hugging Face transformers for NLP models and tokenizers
from sklearn.metrics.pairwise import cosine_similarity  # Import cosine similarity function for comparing sentence embeddings
from nltk.corpus import stopwords  # Import stopwords from NLTK for text preprocessing
from nltk.tokenize import word_tokenize  # Import word_tokenize from NLTK for tokenizing sentences
import nltk  # Import Natural Language Toolkit (NLTK) for NLP tasks

# Fetch the NLTK resources that EmojifyAI.process_sentence() needs: the English stop-word
# list and the Punkt data that word_tokenize() loads.
nltk.download('stopwords')  # Download stopwords from NLTK
nltk.download('punkt')  # Download punkt tokenizer from NLTK (the resource older NLTK releases read)
nltk.download('punkt_tab')  # Recent NLTK releases (3.9+) load Punkt from 'punkt_tab' instead of 'punkt'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True


This cell defines the `EmojifyAI` class, which suggests emojis for an input sentence. It uses the pre-trained `sentence-transformers/bert-base-nli-mean-tokens` BERT model to compute mean-pooled sentence embeddings, and compares the embedding of the input sentence with the embeddings of the emoji descriptions using cosine similarity. The class also includes methods to preprocess and tokenize sentences and to compute embeddings for both sentences and emoji descriptions. Finally, it has methods to generate and save the emoji data CSV file.


In [2]:
class EmojifyAI:  # Define the EmojifyAI class for suggesting emojis
    """Suggest emojis for a sentence by comparing BERT sentence embeddings.

    Sentences and emoji descriptions are tokenized, passed through the
    pre-trained model named by ``MODEL_NAME``, and mean-pooled over the
    non-padding tokens. Emojis are then ranked by cosine similarity between
    the sentence vector and each description vector.

    Attributes set by the methods below:
        tokenizer, model: loaded by ``load_model``.
        tokens: dict with 'input_ids' and 'attention_mask' tensors of the most
            recently tokenized batch.
        embeddings: ``last_hidden_state`` of the most recent forward pass.
        mean_pooled: NumPy array with the most recent mean-pooled vectors.
        all_emoji_df, emoji_df: full emoji table and the selected subset
            (set by ``process_csv``).
    """

    MODEL_NAME = 'sentence-transformers/bert-base-nli-mean-tokens'

    def __init__(self):  # Initialize the class and load the model
        """Create the recommender and load the tokenizer and model."""
        self.load_model()

    def load_model(self):  # Load the pre-trained BERT model for sentence embeddings
        """Load the tokenizer and model identified by ``MODEL_NAME``."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = AutoModel.from_pretrained(self.MODEL_NAME)

    def _get_stop_words(self):
        """Return the English stop-word set, building it only on first use."""
        # getattr keeps this working for instances created without __init__.
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            cached = set(stopwords.words('english'))
            self._stop_words = cached
        return cached

    def process_sentence(self, sentence):  # Preprocess and tokenize input sentence
        """Clean ``sentence`` and return its mean-pooled embedding.

        The sentence is lowercased, every run of characters other than a-z is
        replaced by a single space, and English stop words are removed.

        Args:
            sentence: Input text (str).

        Returns:
            NumPy array with one row (the embedding of the cleaned sentence).
        """
        cleaned = re.sub('[^a-z]+', ' ', sentence.lower())  # Keep letters only
        stop_words = self._get_stop_words()
        # The text is already lowercase, so tokens can be compared directly.
        kept = [w for w in word_tokenize(cleaned) if w not in stop_words]
        return self.get_mean_tokens([' '.join(kept)])

    def process_csv(self, start=1800, stop=2000):  # Process and filter emoji descriptions from CSV file
        """Load the emoji table and embed the descriptions of a row range.

        Args:
            start: First row position (inclusive) of the subset to embed.
                Defaults to 1800; ``None`` means "from the first row".
            stop: End row position (exclusive) of the subset to embed.
                Defaults to 2000; ``None`` means "through the last row".

        Returns:
            NumPy array with one mean-pooled embedding per selected row, in
            the same order as ``self.emoji_df``.
        """
        self.all_emoji_df = pd.read_csv("data/emoji-data.csv")  # Store all emoji data
        # Positional slice; re-index from 0 so rows line up with the embedding rows.
        self.emoji_df = self.all_emoji_df.iloc[start:stop].reset_index(drop=True)
        return self.get_mean_tokens(self.emoji_df['description'])

    def get_mean_tokens(self, sentences):  # Calculate mean tokens for sentences
        """Tokenize ``sentences``, run the model and return mean-pooled embeddings.

        Args:
            sentences: Iterable of strings (a list or a pandas Series).

        Returns:
            NumPy array with one row per sentence.
        """
        self.obtain_tokens(sentences)  # Obtain tokens for sentences
        self.compute_embeddings()  # Compute embeddings for tokens
        return self.calculate_mean_value()  # Calculate and return mean values of embeddings

    def obtain_tokens(self, sentences):  # Tokenize input sentences
        """Tokenize ``sentences`` into ``self.tokens``.

        Every sentence is truncated/padded to 128 tokens. Only 'input_ids' and
        'attention_mask' are kept, each as one stacked tensor for the batch.

        Raises:
            ValueError: If ``sentences`` is empty.
        """
        # Materialise first: truth-testing a pandas Series directly is ambiguous.
        batch = list(sentences)
        if not batch:
            raise ValueError("obtain_tokens() needs at least one sentence")

        # One batched call replaces the per-sentence encode_plus loop + torch.stack.
        encoded = self.tokenizer(batch, max_length=128,
                                 truncation=True, padding='max_length',
                                 return_tensors='pt')
        self.tokens = {'input_ids': encoded['input_ids'],
                       'attention_mask': encoded['attention_mask']}

    def compute_embeddings(self):  # Compute embeddings for tokens
        """Run the model on ``self.tokens`` and store ``last_hidden_state``."""
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = self.model(**self.tokens)  # Forward pass through the model
        self.embeddings = outputs.last_hidden_state  # Extract embeddings

    def calculate_mean_value(self):  # Calculate mean values of embeddings
        """Mean-pool ``self.embeddings`` over tokens whose attention mask is 1.

        Returns:
            NumPy array with one pooled vector per sentence (also stored in
            ``self.mean_pooled``).
        """
        attention_mask = self.tokens['attention_mask']  # Retrieve attention_mask
        # Broadcast the per-token mask across the embedding dimension.
        mask = attention_mask.unsqueeze(-1).expand(self.embeddings.size()).float()
        summed = torch.sum(self.embeddings * mask, 1)  # Sum of unmasked token vectors
        # Clamp so a row with no unmasked tokens cannot divide by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        # .cpu() is a no-op on CPU tensors and keeps .numpy() valid otherwise.
        self.mean_pooled = (summed / counts).detach().cpu().numpy()
        return self.mean_pooled  # Return mean pooled embeddings

    def find_similarity(self, sentence_tokens, mean_tokens):  # Find similarity between sentence and emoji tokens
        """Return cosine similarities between one vector and a matrix of vectors.

        Args:
            sentence_tokens: 1-D embedding of the sentence.
            mean_tokens: 2-D array with one embedding per row.

        Returns:
            The result of ``cosine_similarity`` for the single sentence row
            against every row of ``mean_tokens``.
        """
        return cosine_similarity([sentence_tokens], mean_tokens)

    def generate_emoji_csv(self):  # Generate emoji data CSV file
        """Build ``data/emoji-data.csv`` from ``data/raw-emoji-data.csv``.

        Reads columns 1 and 3 of the raw file without a header, drops rows
        with missing values, drops the first remaining row, and saves the rest.
        """
        df = pd.read_csv("data/raw-emoji-data.csv", usecols=[1, 3], header=None)  # Read raw emoji data CSV
        df = df.dropna()  # Drop rows with missing values
        # NOTE(review): assumes the first remaining row is the header line - confirm against the raw file.
        df = df.iloc[1:, :]  # Select all rows except the header
        self.save_csv(df)  # Save processed data as CSV

    def save_csv(self, df):  # Save emoji data to CSV file
        """Write columns 1 and 3 of ``df`` as 'emoji' and 'description'.

        Args:
            df: DataFrame whose column labelled 1 holds the emoji and whose
                column labelled 3 holds the description.
        """
        out = pd.DataFrame({'emoji': df[1], 'description': df[3]})  # Create a DataFrame with emoji and description
        out.to_csv("data/emoji-data.csv", encoding='utf-8', index=False)  # Save DataFrame as CSV

The following cells demonstrate how to use the `EmojifyAI` class to suggest emojis for a given sentence. They create an instance of the class, generate the emoji CSV file, and process the CSV to obtain mean-pooled embeddings ("mean tokens") for a subset of the emojis (rows 1800–1999 of the CSV). They then define an example sentence, process it, compute its similarity to the emojis in that subset, and print the 5 most similar emojis with their descriptions; in the recorded output the top matches are all flag emojis, because only that subset is searched. Finally, a `suggestEmojis` function is defined that takes a sentence as input, loads precomputed emoji embeddings from `checkpoint/token-all.pt`, and suggests emojis based on the similarity between the sentence and the emoji descriptions.

In [3]:
# Build the recommender. The constructor calls load_model(), which loads the
# 'sentence-transformers/bert-base-nli-mean-tokens' tokenizer and model.
emoji_rec = EmojifyAI() # Instantiate the EmojifyAI class

In [4]:
# Rebuild data/emoji-data.csv (columns 'emoji' and 'description') from data/raw-emoji-data.csv.
emoji_rec.generate_emoji_csv() # Generate the emoji data CSV file

In [5]:
# process_csv() embeds only rows 1800-1999 of data/emoji-data.csv, so mean_tokens holds one
# 768-dimensional vector per emoji in that slice (16 rows in the recorded output below).
mean_tokens = emoji_rec.process_csv() # Process the CSV and obtain mean tokens for emojis
print(mean_tokens.shape) # Print the shape of the mean tokens

(16, 768)


In [6]:
# process_sentence() returns an array with a single row (it embeds a one-sentence batch),
# so sentence_token[0] is the sentence vector compared against every row of mean_tokens.
example_sentence = "Do you play games?" # Define an example sentence
sentence_token = emoji_rec.process_sentence(example_sentence) # Process the example sentence
similarity = emoji_rec.find_similarity(sentence_token[0], mean_tokens) # Find the similarity between the sentence and emojis

In [7]:
# Negating the scores makes an ascending argsort list the best matches first; keep five.
top_indices = np.argsort(-similarity[0])[:5]
for i in top_indices:
    # Row positions in emoji_df line up with the rows of mean_tokens.
    print(i, emoji_rec.emoji_df.loc[i, 'emoji'], emoji_rec.emoji_df.loc[i, 'description'])

12 🇿🇼 flag: Zimbabwe
0 🇻🇪 flag: Venezuela
15 🏴󠁧󠁢󠁷󠁬󠁳󠁿 flag: Wales
13 🏴󠁧󠁢󠁥󠁮󠁧󠁿 flag: England
4 🇻🇺 flag: Vanuatu


In [88]:
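# Run this only once, to create the checkpoint that suggestEmojis() loads.
# NOTE: row i of mean_tokens must be the embedding of row i of data/emoji-data.csv
# (not the 1800:2000 subset that process_csv() selects), because suggestEmojis()
# uses the similarity indices to look up rows of that CSV.
# torch.save(mean_tokens, 'checkpoint/token-all.pt') # Save the mean tokens to a file named 'token-all.pt' in the 'checkpoint' directory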

In [8]:
def suggestEmojis(sentence, top_k=5): # Define the suggestEmojis function
    """Print the emojis whose descriptions are most similar to ``sentence``.

    The sentence is embedded with the module-level ``emoji_rec`` and compared
    (cosine similarity) against the precomputed description embeddings stored
    in ``checkpoint/token-all.pt``. The best matches are looked up by row in
    ``data/emoji-data.csv`` and printed as "<emoji> <description>", one per line.

    Args:
        sentence: Input text (str).
        top_k: Number of suggestions to print. Defaults to 5, the previously
            hard-coded value.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    name = 'token-all.pt' # Define the name of the token file
    # NOTE(review): torch.load unpickles the file, so only load checkpoints you created
    # yourself. The checkpoint appears to be a pickled NumPy array; PyTorch releases that
    # default to weights_only=True may refuse to load it - confirm on the target version.
    all_tokens = torch.load('checkpoint/'+name) # Load the precomputed tokens
    sentence_token = emoji_rec.process_sentence(sentence) # Process the input sentence
    similarity = emoji_rec.find_similarity(sentence_token[0], all_tokens) # Find the similarity between the sentence and emojis
    indices = (-similarity[0]).argsort()[:top_k] # Get the indices of the top_k most similar emojis
    emoji_df = pd.read_csv("data/emoji-data.csv") # Read the emoji data CSV file
    for j in indices: # Iterate through the top indices
        # Checkpoint rows are expected to be in the same order as the CSV rows.
        print(emoji_df.loc[j, 'emoji'], emoji_df.loc[j, 'description']) # Print the emoji and its description

In [9]:
test_sentence1 = "I am going to the movies"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 1", test_sentence1, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence1)

Example - 1
I am going to the movies
Following are the suggested emojis:
-----------------
🎥 movie camera
🎦 cinema
📽 film projector
📀 dvd
🎞 film frames


In [10]:
test_sentence2 = "I love eating pizza"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 2", test_sentence2, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence2)

Example - 2
I love eating pizza
Following are the suggested emojis:
-----------------
🍕 pizza
😋 face savoring food
🍟 french fries
🍔 hamburger
🌮 taco


In [63]:
test_sentence3 = "The weather is sunny today"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 3", test_sentence3, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence3)

Example - 3
The weather is sunny today
Following are the suggested emojis:
-----------------
☀ sun
😁 beaming face with smiling eyes
🌞 sun with face
🌤 sun behind small cloud
⛅ sun behind cloud


In [64]:
test_sentence4 = "I am feeling tired and sleepy"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 4", test_sentence4, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence4)

Example - 4
I am feeling tired and sleepy
Following are the suggested emojis:
-----------------
😫 tired face
😪 sleepy face
😩 weary face
😞 disappointed face
🙁 slightly frowning face


In [65]:
test_sentence5 = "My favorite sport is soccer"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 5", test_sentence5, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence5)

Example - 5
My favorite sport is soccer
Following are the suggested emojis:
-----------------
⚽ soccer ball
🤟 love you gesture
👩‍❤️‍👩 couple with heart woman woman
😍 smiling face with heart eyes
✌ victory hand


In [66]:
test_sentence6 = "Let's go to the beach this weekend"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 6", test_sentence6, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence6)

Example - 6
Let's go to the beach this weekend
Following are the suggested emojis:
-----------------
🏖 beach with umbrella
🏕 camping
🌇 sunset
☀ sun
🦪 oyster


In [67]:
test_sentence7 = "I am so excited for the party tonight"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 7", test_sentence7, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence7)

Example - 7
I am so excited for the party tonight
Following are the suggested emojis:
-----------------
🥳 partying face
🎉 party popper
👏 clapping hands
🔆 bright button
💖 sparkling heart


In [68]:
test_sentence8 = "I am working on a new project at my job"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 8", test_sentence8, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence8)

Example - 8
I am working on a new project at my job
Following are the suggested emojis:
-----------------
👷 construction worker
🆕 new button
🧑‍🏭 factory worker
🚧 construction
🏗 building construction


In [69]:
test_sentence9 = "My dog loves to play fetch"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 9", test_sentence9, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence9)

Example - 9
My dog loves to play fetch
Following are the suggested emojis:
-----------------
🐩 poodle
🦮 guide dog
🐕 dog
🐶 dog face
🐕‍🦺 service dog


In [70]:
test_sentence10 = "The traffic is terrible during rush hour"
# One newline-separated print emits the same header lines as three separate calls.
print("Example - 10", test_sentence10, "Following are the suggested emojis:\n-----------------", sep="\n")
suggestEmojis(test_sentence10)

Example - 10
The traffic is terrible during rush hour
Following are the suggested emojis:
-----------------
👿 angry face with horns
🌁 foggy
😠 angry face
😱 face screaming in fear
😨 fearful face
