# 0. Before We Start

In this paper, to generate the representation of an emoji, we use the approach proposed by Eisner et al. (2016). The link to the paper is given below:

https://arxiv.org/pdf/1609.08359.pdf

To run the code, please first go to the paper's GitHub page and download the repository to a local path:

https://github.com/uclmr/emoji2vec

Then use Python's `os` module to change the current working directory to the path where you saved the repository:

In [6]:
import os

# Location of the local emoji2vec checkout. The relative './data' and
# './results' paths used further down are resolved against this directory.
# Set the EMOJI2VEC_PATH environment variable to point at a different checkout
# without editing this cell; the original path remains the default.
path = os.environ.get('EMOJI2VEC_PATH', r'F:\CityU\Hong Kong Twitter 2016\emoji2vec')
os.chdir(path)

In [7]:
# Display the current working directory to confirm the change took effect
os.getcwd()

'F:\\CityU\\Hong Kong Twitter 2016\\emoji2vec'

# 1. Introduction

Load some packages

In [9]:
# Commonly used
import gensim.models as gs
import pickle as pk
import numpy as np
import pandas as pd
from collections import Counter

# This paper requires
import twitter_sentiment_dataset as tsd
import phrase2vec as p2v
from twitter_sentiment_dataset import TweetTrainingExample
from model import ModelParams

# tokenization
import nltk.tokenize as tk

# Suppress DeprecationWarning and FutureWarning messages from here on
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", FutureWarning)

# 2. Set Global Variables

In [10]:
# Folder from which the pre-trained word vectors are loaded
w2v_path = './data/word2vec/'

# Embedding sizes
in_dim = 100   # dimensionality of the word2vec input vectors
out_dim = 100  # dimensionality wanted for the output vectors

# Remaining hyper-parameters handed to ModelParams
pos_ex = 4
neg_ratio = 1
max_epochs = 40
dropout = 0.1

params = ModelParams(
    in_dim=in_dim,
    out_dim=out_dim,
    pos_ex=pos_ex,
    max_epochs=max_epochs,
    neg_ratio=neg_ratio,
    learning_rate=0.001,
    dropout=dropout,
    class_threshold=0.5,
)

# The emoji vectors live in the model folder derived from the parameters above
e2v_ours_path = params.model_folder('unicode') + '/emoji2vec_100.bin'

# 3. Load the Pre-trained Word Vectors and Emoji Representations

In [11]:
# Show the resolved location of the emoji2vec binary file
e2v_ours_path

'./results/unicode/k-100_pos-4_rat-1_ep-40_dr-1/emoji2vec_100.bin'

In [12]:
# Pre-trained FastText word vectors (the model was pretrained on sentiment140)
fasttext_model_file = os.path.join(w2v_path, 'fasttext_model')
w2v = gs.FastText.load(fasttext_model_file)

# Emoji vectors, stored in the binary word2vec format
e2v_ours = gs.KeyedVectors.load_word2vec_format(e2v_ours_path, binary=True)

In [13]:
# Combine the word embedding and emoji embedding together:
# Phrase2Vec receives the output dimension, the word vectors and the emoji vectors,
# and is indexed by token further down to obtain each token's vector
p2v_our_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_ours)

# 4. Generate Representations

In [28]:
# Load a sample dataframe which contains processed tweets, geoinformation, etc.
# The directory can be overridden with the TWEET_2017_PATH environment variable;
# the original location remains the default.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
tweet_2017_path = os.environ.get('TWEET_2017_PATH', r'F:\CityU\Datasets\Hong Kong Tweets 2017')
final_zh_sample_cleaned_and_translated = pd.read_pickle(
        os.path.join(tweet_2017_path, 'final_sample_cleaned_and_translated_2.pkl'))

In [29]:
# Rows whose cleaned text is an empty string fall back to the value of the 'text' column
has_empty_cleaned_text = final_zh_sample_cleaned_and_translated['cleaned_text'] == ''
final_zh_sample_cleaned_and_translated.loc[has_empty_cleaned_text, 'cleaned_text'] = \
    final_zh_sample_cleaned_and_translated['text']

In [30]:
# Restrict the dataframe to the user id, cleaned text, language, coordinates and month columns
columns_to_keep = ['user_id_str', 'cleaned_text', 'lang', 'lat', 'lon', 'month']
final_zh_sample_cleaned_and_translated = final_zh_sample_cleaned_and_translated[columns_to_keep]

In [31]:
# Preview the first ten rows
final_zh_sample_cleaned_and_translated.head(10)

Unnamed: 0,user_id_str,cleaned_text,lang,lat,lon,month
0,378134755,taste food festival 🥙 🍻 🍜 🥙 🍻 🍜 🥙 🍻 🍜 🥙 🍻 🍜 ce...,zh,22.283612,114.162818,Mar
1,80997783,😋 godiva chocolate ice cream holiday internati...,zh,22.315168,113.934905,Jan
2,785638,because of the church band camp receive the ba...,zh,22.27807,114.18471,Aug
3,1473379776,if you want to leave you have to be refuel you...,zh,22.371072,114.111811,Nov
4,5417862,a big share in fact a few good food be expensi...,zh,22.29469,114.168125,May
5,53893147,three paste of brine clam meat and the classic...,zh,22.28025,114.15833,Feb
6,213927190,enter the market,zh,22.285359,114.157874,Dec
7,2422020475,for six year i use to be the enemy of the left...,zh,22.3336,114.159,Feb
8,28544855,memory repeat taste different now repeat victo...,zh,22.281611,114.188718,Jan
9,53893147,hand hold with a hand so tasty 😋 causeway poin...,zh,22.27923,114.18187,Aug


In [40]:
# (number of rows, number of columns) of the sample
final_zh_sample_cleaned_and_translated.shape

(886, 6)

In [33]:
# Collect the cleaned text of every tweet in the dataset into a plain Python list
cleaned_text_column = final_zh_sample_cleaned_and_translated['cleaned_text']
sample_zh_tweets = cleaned_text_column.tolist()

In [37]:
def prepare_tweet_vector_averages_for_prediction(tweets, p2v):
    """
    Take the average of the vectors of all tokens in each tweet

    Each tweet is tokenized with NLTK's TweetTokenizer (case not preserved,
    long character repetitions shortened, @handles stripped); every token is
    looked up in ``p2v`` and the resulting vectors are averaged.

    A tweet that yields no tokens (for example an empty string, or a tweet made
    up only of @handles) gets an all-zero vector instead of the NaN that
    dividing by a token count of zero would produce.

    Args:
        tweets: All tweets (an iterable of strings)
        p2v: Phrase2Vec model; ``p2v[token]`` must return the vector of a token

    Returns:
        A list with one average vector per tweet, in the same order as ``tweets``

    Raises:
        ValueError: If there are tweets but none of them yields any token, so
            the size of the zero vector cannot be inferred
    """
    tokenizer = tk.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

    avg_vecs = list()
    empty_positions = list()  # indices of the tweets that produced no token

    for position, tweet in enumerate(tweets):
        tokens = tokenizer.tokenize(tweet)
        if tokens:
            avg_vecs.append(np.sum([p2v[x] for x in tokens], axis=0) / len(tokens))
        else:
            # Placeholder; replaced below once the shape of a real vector is known
            avg_vecs.append(None)
            empty_positions.append(position)

    if empty_positions:
        # Borrow shape and dtype from any tweet that did produce a vector
        template = next((vec for vec in avg_vecs if vec is not None), None)
        if template is None:
            raise ValueError('Cannot infer the vector size: no tweet contains any token')
        for position in empty_positions:
            avg_vecs[position] = np.zeros_like(template)

    return avg_vecs


def list_of_array_to_array(list_array):
    """
    Transform a list of equally-shaped arrays to a single numpy array

    The arrays are stacked along a new leading axis, so ``n`` arrays of shape
    ``(d,)`` become one array of shape ``(n, d)``.

    Args:
        list_array: a non-empty list of arrays that all share the same shape

    Returns:
        A numpy array whose first axis indexes the input arrays

    Raises:
        ValueError: if the list is empty or the arrays do not all have the
            same shape
    """
    # np.stack verifies that every array has the same shape. Concatenating and
    # reshaping instead would silently scramble the rows whenever the shapes
    # differ but the total number of elements happens to fit.
    return np.stack(list_array)

In [35]:
# One averaged token vector per tweet, looked up through the combined word/emoji model
tweets_representations_zh_sample = prepare_tweet_vector_averages_for_prediction(sample_zh_tweets, p2v_our_emoji)

In [38]:
# Turn the list of per-tweet vectors into a single numpy array
tweets_representations_zh_sample_array = list_of_array_to_array(tweets_representations_zh_sample)

In [39]:
# Display the resulting array of tweet representations
tweets_representations_zh_sample_array

array([[-0.44123602, -0.09781744, -0.18259196, ..., -0.2746119 ,
        -0.26236635, -0.47610205],
       [ 0.00924975,  0.03655616, -0.55427706, ..., -0.10963333,
         0.5381336 , -1.3387452 ],
       [-1.0994582 , -0.7486428 , -0.96823746, ...,  0.48931405,
        -0.8802878 , -1.2700877 ],
       ...,
       [-0.41797507, -0.18330587, -0.9719057 , ...,  0.16747317,
        -0.81420517, -0.95749223],
       [ 0.39502248,  0.11920539, -1.9165957 , ..., -0.40935957,
        -0.6471186 , -1.3098006 ],
       [-1.4964355 , -0.16747546, -1.0323752 , ...,  0.30381542,
        -0.26012287, -1.1634724 ]], dtype=float32)

In [41]:
# Shape of the representation array
np.shape(tweets_representations_zh_sample_array)

(886, 100)