# Embed emojis for our models

In [1]:
# install advertools to easily identify emojis in dataset
!pip install advertools

Collecting advertools
  Downloading advertools-0.13.1-py2.py3-none-any.whl (309 kB)
[?25l[K     |█                               | 10 kB 23.8 MB/s eta 0:00:01[K     |██▏                             | 20 kB 13.4 MB/s eta 0:00:01[K     |███▏                            | 30 kB 10.2 MB/s eta 0:00:01[K     |████▎                           | 40 kB 9.1 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 4.8 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 5.6 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 5.8 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 5.7 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 6.3 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 5.4 MB/s eta 0:00:01[K     |███████████▋                    | 112 kB 5.4 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 5.4 MB/s eta 0:00:01[K     |█████████████▊                  | 133 kB 5.4 MB/s eta 

In [2]:
import pandas as pd
import numpy as np
import advertools as adv

In [7]:
df_tweets = pd.read_excel("tweets.xlsx")
df_tweets = df_tweets[df_tweets['GR']!='unclear']
df_tweets

Unnamed: 0,Tweet_id,text_clean,GR,iaa
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270
1,2,Olive Garden - SNL,neutral,0.556808
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000
4,5,Gedraag je maar als een hoe,negative,0.464437
...,...,...,...,...
2995,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000
2996,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596
2997,2998,Ik mis mijn fiets,negative,0.575348
2998,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000


## Emoji group nr

In [4]:
adv.emoji_df

Unnamed: 0,codepoint,status,emoji,name,group,sub_group
0,1F600,fully-qualified,😀,grinning face,Smileys & Emotion,face-smiling
1,1F603,fully-qualified,😃,grinning face with big eyes,Smileys & Emotion,face-smiling
2,1F604,fully-qualified,😄,grinning face with smiling eyes,Smileys & Emotion,face-smiling
3,1F601,fully-qualified,😁,beaming face with smiling eyes,Smileys & Emotion,face-smiling
4,1F606,fully-qualified,😆,grinning squinting face,Smileys & Emotion,face-smiling
...,...,...,...,...,...,...
4576,1F1FF 1F1F2,fully-qualified,🇿🇲,flag: Zambia,Flags,country-flag
4577,1F1FF 1F1FC,fully-qualified,🇿🇼,flag: Zimbabwe,Flags,country-flag
4578,1F3F4 E0067 E0062 E0065 E006E E0067 E007F,fully-qualified,🏴󠁧󠁢󠁥󠁮󠁧󠁿,flag: England,Flags,subdivision-flag
4579,1F3F4 E0067 E0062 E0073 E0063 E0074 E007F,fully-qualified,🏴󠁧󠁢󠁳󠁣󠁴󠁿,flag: Scotland,Flags,subdivision-flag


In [5]:
# give each sub_group an unique id (this unique id will be added to the feature vector later on)
df_emoji_sub_group = adv.emoji_df[['sub_group']].drop_duplicates(ignore_index=True)
df_emoji_sub_group['sub_group_id'] = df_emoji_sub_group.index+1
df_emoji_sub_group

Unnamed: 0,sub_group,sub_group_id
0,face-smiling,1
1,face-affection,2
2,face-tongue,3
3,face-hand,4
4,face-neutral-skeptical,5
...,...,...
93,alphanum,94
94,geometric,95
95,flag,96
96,country-flag,97


In [6]:
# get the distinct list of emojis assigned to a tweet
df_tweets['emojis'] = adv.extract_emoji(df_tweets['text_clean'])['emoji']
max_group_length = 0

dist_emojis = []
for index, row in df_tweets.iterrows():
    tweet_dist_emojis = (list(set(row['emojis'])))
    dist_emojis.append(tweet_dist_emojis)

    # Look for the highest number of different sub_groups assigned to a tweet
    if tweet_dist_emojis: 
      tweet_emoji_sub_groups_id = []
      for sg in adv.extract_emoji(tweet_dist_emojis)['top_emoji_sub_groups']:
        tweet_emoji_sub_groups_id.append(int(df_emoji_sub_group[df_emoji_sub_group['sub_group']==sg[0]].sub_group_id))

      if len(tweet_emoji_sub_groups_id)>max_group_length:
        max_group_length = len(tweet_emoji_sub_groups_id)

df_tweets['dist_emojis'] = dist_emojis
df_tweets

Unnamed: 0,Tweet_id,text_clean,GR,iaa,emojis,dist_emojis
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270,[],[]
1,2,Olive Garden - SNL,neutral,0.556808,[],[]
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758,[],[]
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000,[],[]
4,5,Gedraag je maar als een hoe,negative,0.464437,[],[]
...,...,...,...,...,...,...
2995,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000,[😌],[😌]
2996,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596,[],[]
2997,2998,Ik mis mijn fiets,negative,0.575348,[],[]
2998,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000,[],[]


In [None]:
print(max_group_length)

6


In [None]:
emojis_sub_groups_id = []
for index, row in df_tweets.iterrows():
    if row['dist_emojis']:
      emoji_sub_groups = adv.extract_emoji(row['dist_emojis'])['top_emoji_sub_groups']
      emoji_sub_groups_id = []

      # add the sub group beloning to the tweet to a list
      i = 0
      for sg in emoji_sub_groups:
          emoji_sub_groups_id.append(int(df_emoji_sub_group[df_emoji_sub_group['sub_group']==sg[0]].sub_group_id))
          i +=1

      # add suplementare zeros if the list doesn't contain the maximum number of sub_groups 
      # (important beceause every item in the feature vector should have the same length)
      while i < max_group_length:
          emoji_sub_groups_id.append(0)
          i += 1

      emojis_sub_groups_id.append(emoji_sub_groups_id)
    else:
      # if the tweet doesn't containt emojies a list of zeros is added
      emoji_sub_groups_id = []
      for x in range(max_group_length):
        emoji_sub_groups_id.append(0)
      emojis_sub_groups_id.append(emoji_sub_groups_id)


df_tweets['emoji_sub_groups_id'] = emojis_sub_groups_id

In [None]:
df_tweets

Unnamed: 0,Tweet_id,text_clean,GR,iaa,emojis,dist_emojis,emoji_sub_groups_id
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270,[],[],"[0, 0, 0, 0, 0, 0]"
1,2,Olive Garden - SNL,neutral,0.556808,[],[],"[0, 0, 0, 0, 0, 0]"
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758,[],[],"[0, 0, 0, 0, 0, 0]"
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000,[],[],"[0, 0, 0, 0, 0, 0]"
4,5,Gedraag je maar als een hoe,negative,0.464437,[],[],"[0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...
2913,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000,[😌],[😌],"[6, 0, 0, 0, 0, 0]"
2914,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596,[],[],"[0, 0, 0, 0, 0, 0]"
2915,2998,Ik mis mijn fiets,negative,0.575348,[],[],"[0, 0, 0, 0, 0, 0]"
2916,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000,[],[],"[0, 0, 0, 0, 0, 0]"


In [None]:
!pip install tensorflow_text
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
import tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.8.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 6.7 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 44.0 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.2 tf-estimator-nightly-2.8.0.dev2021122109


In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
embedding = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return embedding(input)

2022-05-15 10:14:07,885 | INFO | resolver.py:106 | tfhub_cache_dir | Using /tmp/tfhub_modules to cache modules.
2022-05-15 10:14:07,901 | INFO | resolver.py:416 | atomic_download | Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'.
2022-05-15 10:14:13,511 | INFO | resolver.py:154 | _print_download_progress_msg | Downloaded https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3, Total size: 334.32MB
2022-05-15 10:14:13,519 | INFO | resolver.py:431 | atomic_download | Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'.


module https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3 loaded


In [None]:
embedded_tweets = embed(df_tweets['text_clean'].tolist()).numpy()
embedded_tweets

array([[ 0.08971003,  0.02642841, -0.04346822, ..., -0.02482886,
        -0.01913874, -0.02431319],
       [ 0.05951805,  0.10216852, -0.03719075, ..., -0.05552849,
         0.06851763,  0.05723611],
       [-0.01087562,  0.02928172,  0.06526016, ...,  0.04664649,
        -0.0222609 , -0.0344855 ],
       ...,
       [-0.06531661,  0.01648606, -0.01719313, ...,  0.00969632,
         0.03454359, -0.02602193],
       [ 0.03892171, -0.09024114,  0.01263994, ..., -0.05475952,
         0.04965481, -0.00746642],
       [-0.03458066,  0.0634369 ,  0.05479438, ...,  0.02760923,
        -0.03043297, -0.00257517]], dtype=float32)

In [None]:
embedded_tweets.shape

(2918, 512)

In [None]:
np_emoji  = df_tweets['emoji_sub_groups_id'].to_numpy()
np_emoji

array([list([0, 0, 0, 0, 0, 0]), list([0, 0, 0, 0, 0, 0]),
       list([0, 0, 0, 0, 0, 0]), ..., list([0, 0, 0, 0, 0, 0]),
       list([0, 0, 0, 0, 0, 0]), list([0, 0, 0, 0, 0, 0])], dtype=object)

In [None]:
np_emoji = np.array(df_tweets['emoji_sub_groups_id'].values.tolist())

In [None]:
embedded_tweets_emoji = np.c_[ embedded_tweets, np_emoji]  

In [None]:
embedded_tweets_emoji.shape

(2918, 518)

In [None]:
embedded_tweets_emoji

array([[ 0.08971003,  0.02642841, -0.04346822, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.05951805,  0.10216852, -0.03719075, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01087562,  0.02928172,  0.06526016, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.06531661,  0.01648606, -0.01719313, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03892171, -0.09024114,  0.01263994, ...,  0.        ,
         0.        ,  0.        ],
       [-0.03458066,  0.0634369 ,  0.05479438, ...,  0.        ,
         0.        ,  0.        ]])