In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import re

import typing as tp

import torchview

import transformers
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline

In [6]:
# Bare expression: shows the repr (signature) of the `pipeline` factory as the
# cell output. The value is not assigned to any name.
transformers.pipeline

<function transformers.pipelines.pipeline(task: str = None, model: Union[str, ForwardRef('PreTrainedModel'), ForwardRef('TFPreTrainedModel'), NoneType] = None, config: Union[str, transformers.configuration_utils.PretrainedConfig, NoneType] = None, tokenizer: Union[str, transformers.tokenization_utils.PreTrainedTokenizer, ForwardRef('PreTrainedTokenizerFast'), NoneType] = None, feature_extractor: Union[str, ForwardRef('SequenceFeatureExtractor'), NoneType] = None, image_processor: Union[str, transformers.image_processing_utils.BaseImageProcessor, NoneType] = None, processor: Union[str, transformers.processing_utils.ProcessorMixin, NoneType] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Union[str, bool, NoneType] = None, device: Union[int, str, ForwardRef('torch.device'), NoneType] = None, device_map=None, torch_dtype=None, trust_remote_code: Optional[bool] = None, model_kwargs: Dict[str, Any] = None, pipeline_class: Optional[Any] 

In [8]:
# Load the text -> emoji pairs and discard every row that has a missing value.
# `dropna()` is chained (instead of `inplace=True`) so the name is bound exactly
# once to the cleaned frame and no frame is mutated in place.
text2emoji_dataset = pd.read_csv('data/text2emoji.csv').dropna()

In [9]:
# Plain Python list holding the values of the 'text' column.
text_list = text2emoji_dataset['text'].tolist()

In [10]:
class Msg2EmojiTranslator:
    def __init__(
        self,
        tokenizer,
        generator,
        device: torch.device
    ) -> None:
        self.device = device
        self.tokenizer = tokenizer
        self.generator = generator.to(self.device)
        
    def translate(self, sentence: str | list[str], sep: str = '.', **kwargs) -> torch.Tensor:
        decoded_emojis_list = []
        
        if isinstance(sentence, str):
            sentence = [sentence]

        for s in sentence:
            text_tokens = self.tokenizer(s, return_tensors="pt")
            generated_emoji_tokens = self.generator.generate(text_tokens["input_ids"].to(self.device), **kwargs)
            decoded_emojis = self.tokenizer.decode(generated_emoji_tokens[0].cpu(), skip_special_tokens=True).replace(" ", "")
            decoded_emojis_list.append(decoded_emojis)
            
        return sep.join(decoded_emojis_list)

In [11]:
# Fetch the tokenizer and the BART generator from their Hugging Face Hub repos.
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path='AiratNazmiev/text2emoji-tokenizer',
)
generator = BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path='AiratNazmiev/text2emoji-bart-base',
)

tokenizer_config.json:   0%|          | 0.00/401k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/506k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/43.2k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/565M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/305 [00:00<?, ?B/s]

In [12]:
# Translation pipelines used to bring Russian / Chinese input into English.
ru_en_translator = pipeline(task="translation_ru_to_en", model="Helsinki-NLP/opus-mt-ru-en")
zh_en_translator = pipeline(task="translation_zh_to_en", model="Helsinki-NLP/opus-mt-zh-en")

Device set to use cuda:0
Device set to use cuda:0


In [13]:
# Token ids of a sample sentence, used as the example input for the graph.
input_data_vis = tokenizer(
    "To be or not to be. That's the question",
    return_tensors="pt",
)['input_ids']

# Draw the generator's graph; note that `.cpu()` is called on the generator here.
generator_graph = torchview.draw_graph(
    generator.cpu(),
    expand_nested=True,
    depth=3,
    input_data=input_data_vis,
)

In [14]:
# Use the GPU when CUDA is available and fall back to the CPU otherwise, so the
# cell does not fail on machines without a CUDA device.
msg2emoji_translator = Msg2EmojiTranslator(
    tokenizer=tokenizer,
    generator=generator,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)

In [15]:
twitter_magic_number = 280

def text_preprocessing(text: str, language: str = 'en') -> list[str]:
    """Translate the text to English if requested and split it into sentences.

    Args:
        text: Input text.
        language: ``'ru'`` or ``'zh'`` routes the text through the matching
            translation pipeline first; any other value leaves it unchanged.

    Returns:
        The stripped text split on whitespace that follows ``.``, ``!`` or
        ``?``. The punctuation stays attached to the preceding piece.
    """
    if language == 'ru':
        text = ru_en_translator(text)[0]['translation_text']
    elif language == 'zh':
        text = zh_en_translator(text)[0]['translation_text']

    # Only a notice is printed; over-long text is still processed in full.
    if len(text) > twitter_magic_number:
        print(f"It's twit translator. The max length of the input is {twitter_magic_number} characters")

    # The character class lists just the sentence terminators. The previous
    # pattern also contained a literal '|' inside the class, which wrongly split
    # the text after pipe characters; an ellipsis is covered by its final '.'.
    text_re = re.split(r"(?<=[.!?])\s+", text.strip())

    return text_re

In [16]:
# Sample input: four lyric lines joined with newlines.
sentence = (
    "As I walk through the valley of the shadow of death\n"
    "I take a look at my life and realize there's nothing left.\n"
    "Cause I've been blasting and laughing so long that\n"
    "Even my momma thinks that my mind is gone"
)

# Show how the sample is split before it is handed to the translator.
sentence_re = text_preprocessing(sentence, language='en')
sentence_re

["As I walk through the valley of the shadow of death\nI take a look at my life and realize there's nothing left.",
 "Cause I've been blasting and laughing so long that\nEven my momma thinks that my mind is gone"]

In [17]:
# Options forwarded to the generator's `generate` call.
generation_kwargs = dict(num_beams=5, do_sample=True, max_length=20)

# Split the sample into sentences, translate each one and join them with '.'.
decoded = msg2emoji_translator.translate(
    text_preprocessing(sentence),
    sep='.',
    **generation_kwargs,
)
print(decoded)

🚶‍♀️🏞️😔💔.😂👩‍👧‍👦💭
