In [18]:
import json
import os
import random
import re
import sys
import tqdm

# data
from collections import defaultdict
from collections import Counter
import numpy as np

# viz
import matplotlib.pyplot as plt

# torch
import torch
import torch.nn as nn
from torch.utils.data import Dataset

# pretrained embeddings
import gensim.models as gsm

## Config

In [5]:
#Pre-filtered GloVe embeddings
!wget https://raw.githubusercontent.com/aritter/aritter.github.io/master/files/glove.840B.300d.conll_filtered.txt

--2022-05-03 02:40:05--  https://raw.githubusercontent.com/aritter/aritter.github.io/master/files/glove.840B.300d.conll_filtered.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69798443 (67M) [text/plain]
Saving to: ‘glove.840B.300d.conll_filtered.txt’


2022-05-03 02:40:07 (241 MB/s) - ‘glove.840B.300d.conll_filtered.txt’ saved [69798443/69798443]



In [3]:
root_path = '/content/'
clean_data_folder_path = os.path.join(root_path, 'data', 'clean_data')
glove_path = os.path.join(root_path, "glove.840B.300d.conll_filtered.txt")

# target emojis
mapping = { 
    '❤':'0' , '😍':'1' , '😂':'2' , '💕':'3' , 
    '🔥':'4' , '😊':'5' , '😎':'6' , '✨':'7' , 
    '💙':'8' , '😘':'9' , '📷':'10' , '🇺🇸':'11' , 
    '☀':'12' , '💜':'13' , '😉':'14' , '💯':'15' , 
    '😁':'16' , '🎄':'17' , '📸':'18' , '😜':'19'
}

# Dataset Preparation

In [4]:
class EmojiDataset(Dataset):
    def __init__(self, dataset_path, transforms=None):
        tweet_text_path = os.path.join(dataset_path, 'tweets.text')
        tweet_label_path = os.path.join(dataset_path, 'tweets.labels')
        tweet_tokenized_path = os.path.join(dataset_path, 'tweets.tokenized')
        
        # init glove
        self.glove_emb = self.read_GloVe(glove_path)
        
        self.word_sentences = []
        self.labels = []
        
        # curate the sentences
        count = 0
        for line in open(tweet_tokenized_path).readlines():
            current_sentence = ['<START>']
            current_sentence.extend(line.rstrip().split(' '))
            current_sentence.append('<END>')
            self.word_sentences.append(current_sentence)
            
            # count += 1
            # if count > 50:
            #     break
                
        # curate the labels
        count = 0
        for line in open(tweet_label_path).readlines():
            emojis = line.rstrip().split(' ')
            
            try:
                emoji_code = int(emojis[0].split(',')[0][1:])
                self.labels.append(emoji_code)
            except Exception as e:
                # no emoji for this tweet
                print(line)
                self.labels.append(-1)
            
            # count += 1
            # if count > 50:
            #     break
        
        # compute char sentences from word sentences
        self.char_sentences = self.sentences2char(self.word_sentences)
        
        # compute counts
        self.word_counts = Counter([w for l in self.word_sentences for w in l])
        self.char_counts = Counter([c for l in self.word_sentences for w in l for c in w])
        self.singletons = set([w for (w,c) in self.word_counts.items() if c == 1 and not w in self.glove_emb.keys()])
        self.char_singletons = set([w for (w,c) in self.char_counts.items() if c == 1])
        
        # Build dictionaries to map from words, characters to indices and vice versa.
        # Save first two words in the vocabulary for padding and "UNK" token.
        self.word2i = {w:i+2 for i,w in enumerate(set([w for l in self.word_sentences for w in l] + list(self.glove_emb.keys())))}
        self.char2i = {w:i+2 for i,w in enumerate(set([c for l in self.char_sentences for w in l for c in w]))}
        self.i2word = {i:w for w,i in self.word2i.items()}
        self.i2char = {i:w for w,i in self.char2i.items()}
        
        # compute vocab size
        self.vocab_size = max(self.word2i.values()) + 1
        self.char_vocab_size = max(self.char2i.values()) + 1
        
        # emoji dictionaries.
        self.emoji2i = {e:int(i) for e,i in mapping.items()}
        self.i2emoji = {i:e for e,i in self.emoji2i.items()}
    
    def sentences2char(self, sentences):
        return [[['start'] + [c for c in w] + ['end'] for w in l] for l in sentences]
    
    def read_GloVe(self, filename):
        embeddings = {}
        for line in open(filename).readlines():
            #print(line)
            fields = line.strip().split(" ")
            word = fields[0]
            embeddings[word] = [float(x) for x in fields[1:]]
        return embeddings
    
    #When training, randomly replace singletons with UNK tokens sometimes to simulate situation at test time.
    def getDictionaryRandomUnk(self, w, dictionary, train=False):
        if train and (w in self.singletons and random.random() > 0.5):
            return 1
        else:
            return dictionary.get(w, 1)
        
    #Map a list of sentences from words to indices.
    def sentences2indices(self, words, dictionary, train=False):
        #1.0 => UNK
        return [[self.getDictionaryRandomUnk(w,dictionary, train=train) for w in l] for l in words]
    
    #Map a list of sentences containing to indices (character indices)
    def sentences2indicesChar(self, chars, dictionary):
        #1.0 => UNK
        return [[[dictionary.get(c,1) for c in w] for w in l] for l in chars]


## Test the dataset class

In [5]:
dataset = EmojiDataset(clean_data_folder_path)

test_idx = 5
print(dataset.word_sentences[test_idx])
print(dataset.char_sentences[test_idx])
print(dataset.i2emoji[dataset.labels[test_idx]])







































































































































































































































['<START>', 'So', 'lovely', 'catching', 'up', 'with', 'my', 'soul', 'sister', '<EMOJI>', 'University', 'of', 'Victoria', '<END>']
[['start', '<', 'S', 'T', 'A', 'R', 'T', '>', 'end'], ['start', 'S', 'o', 'end'], ['start', 'l', 'o', 'v', 'e', 'l', 'y', 'end'], ['start', 'c', 'a', 't', 'c', 'h', 'i', 'n', 'g', 'end'], ['start', 'u', 'p', 'end'], ['start', 'w', 'i', 't', 'h', 'end'], ['start', 'm', 'y', 'end'], ['start', 's', 'o', 'u', 'l', 'end'], ['start', 's', 'i', 's', 't', 'e', 'r', 'end'], ['start', '<', 'E', 'M', 'O', 'J', 'I', '>', 'end'], ['start', 'U', 'n', 'i', 'v', 'e', 'r', 's', 'i', 't', 'y', 'end'], ['start', 'o', 'f', 'end'], ['start', 'V', 'i', 'c', 't', 'o', 'r', 'i', 'a', 'end'], ['start', '<', 'E', 'N', 'D', '>', 'end']]
💜


## Utility Methods

### Pad inputs to max sequence length (for batching)

In [6]:
def prepare_input(X_list):
    X_padded = torch.nn.utils.rnn.pad_sequence([torch.as_tensor(l) for l in X_list], batch_first=True).type(torch.LongTensor) # padding the sequences with 0
    X_mask   = torch.nn.utils.rnn.pad_sequence([torch.as_tensor([1.0] * len(l)) for l in X_list], batch_first=True).type(torch.FloatTensor) # consisting of 0 and 1, 0 for padded positions, 1 for non-padded positions
    return (X_padded, X_mask)

In [7]:
#Maximum word length (for character representations)
MAX_CLEN=32

def prepare_input_char(X_list):
    MAX_SLEN = max([len(l) for l in X_list])
    X_padded  = [l + [[]]*(MAX_SLEN-len(l))  for l in X_list]
    X_padded  = [[w[0:MAX_CLEN] for w in l] for l in X_padded]
    X_padded  = [[w + [1]*(MAX_CLEN-len(w)) for w in l] for l in X_padded]
    return torch.as_tensor(X_padded).type(torch.LongTensor)

### Pad outputs using one-hot encoding

In [8]:
def prepare_output_onehot(Y_list, NUM_TAGS=max(dataset.emoji2i.values())+1):
    Y_onehot = [torch.zeros(len(l), NUM_TAGS) for l in Y_list]
    for i in range(len(Y_list)):
        for j in range(len(Y_list[i])):
            Y_onehot[i][j,Y_list[i][j]] = 1.0
    Y_padded = torch.nn.utils.rnn.pad_sequence(Y_onehot, batch_first=True).type(torch.FloatTensor)
    return Y_padded

Output Emojis to 300D vector using emoji2vec

In [64]:
e2v_model = gsm.KeyedVectors.load_word2vec_format('/content/emoji2vec.txt', binary=False)

In [67]:
#check
happy_vector = e2v_model['❤'] 
print(happy_vector.shape)

(300,)


In [121]:
emoji2vec_weights = torch.FloatTensor(e2v_model.wv.vectors)
print(emoji2vec_weights.shape)

torch.Size([1661, 300])


  """Entry point for launching an IPython kernel.


In [82]:
# labels - list of size N, each element is an emoji
# return N x n_dim embeddings
def prepare_output_vector(labels, n_dim=300):
    N = len(labels)
    Y_vector = torch.zeros((N, n_dim))
    for i in range(0, N):
        if labels[i][0] >= 0:
          emoj = dataset.i2emoji[labels[i][0]]
          #if emoj < 0 or emoj > 20:
          #    print(i)
          Y_vector[i, :] = torch.from_numpy(e2v_model[emoj])
    return Y_vector

inp_emoj_test = [ dataset.emoji2i['📸'], dataset.emoji2i['❤'], 
                             dataset.emoji2i['😂'], dataset.emoji2i['🔥'], dataset.emoji2i['😍']]
inp_emoj_test = np.array(inp_emoj_test).reshape(-1, 1)
print(inp_emoj_test)
print(prepare_output_vector(inp_emoj_test).shape)

[[18]
 [ 0]
 [ 2]
 [ 4]
 [ 1]]
torch.Size([5, 300])


Word2Vec

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
word2vec_model = gsm.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/NLP/GoogleNews-vectors-negative300.bin.gz', binary=True)
word2vec_weights = torch.FloatTensor(word2vec_model.wv.vectors)

  


In [119]:
print(word2vec_weights.shape)

torch.Size([3000000, 300])


In [17]:
example = word2vec_model.wv["california"]
print(example.shape)

(300,)


  """Entry point for launching an IPython kernel.


## Define training set and labels

In [9]:
#Indices
X       = dataset.sentences2indices(dataset.word_sentences, dataset.word2i, train=True)
#X_char  = dataset.sentences2indicesChar(dataset.char_sentences, dataset.char2i)
Y       = dataset.labels

In [98]:
print(len(dataset.word_sentences), len(Y))

385351 385351


In [13]:
#print("max slen:", max([len(x) for x in X_char]))

max slen: 43


In [10]:
(X_padded, X_mask) = prepare_input(X)
#X_padded_char      = prepare_input_char(X_char)
#Y_onehot           = prepare_output_onehot(Y)

In [28]:
print("X_padded:", X_padded.shape)
print("X_mask:", X_mask.shape)
#print("X_padded_char:", X_padded_char.shape)
print("Y shape:", len(Y))

X_padded: torch.Size([385351, 43])
X_mask: torch.Size([385351, 43])
Y shape: 385351


In [93]:
Y = np.array(Y).reshape(-1, 1)

In [31]:
print(X_padded[430], Y[430])

tensor([ 53855, 164628, 105972, 129111,  21328, 111177, 132491,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0]) 2


Train Test split

In [109]:
N = len(Y)

In [113]:
np.random.seed(1)
ran = np.random.uniform(size=(N))

In [114]:
fil = np.array(ran < 0.8, dtype=bool)
word_sentences_train = [item for i, item in enumerate(dataset.word_sentences) if fil[i]]
labels_train = [item for i, item in enumerate(dataset.labels) if fil[i]]
ws_rem = [item for i, item in enumerate(dataset.word_sentences) if ~fil[i]]
lab_rem = [item for i, item in enumerate(dataset.labels) if ~fil[i]]
print(len(word_sentences_train), len(labels_train), len(ws_rem), len(lab_rem))

308396 308396 76955 76955


In [115]:
rem = len(lab_rem)
np.random.seed(2)
ran = np.random.uniform(size=(rem))

In [116]:
fil = np.array(ran < 0.5, dtype=bool)
word_sentences_val = [item for i, item in enumerate(ws_rem) if fil[i]]
labels_val = [item for i, item in enumerate(lab_rem) if fil[i]]
word_sentences_test = [item for i, item in enumerate(ws_rem) if ~fil[i]]
labels_test = [item for i, item in enumerate(lab_rem) if ~fil[i]]
print(len(word_sentences_val), len(labels_val), len(word_sentences_test), len(labels_test))

38544 38544 38411 38411


# Start Modeling

In [19]:
print(torch.cuda.is_available())
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print("Using device:", device)

False
Using device: cpu


In [127]:
e2v_weights_mag = emoji2vec_weights.norm(dim=1)[:, None]
e2v_weights_norm = emoji2vec_weights / torch.clamp(e2v_weights_mag, min=1e-8)
print(e2v_weights_norm.shape)

torch.Size([1661, 300])


In [140]:
class EmojiPredictor2(nn.Module):
    def __init__(self, DIM_EMB=300, DIM_HID=500, DIM_OUTPUT=300):
        super(EmojiPredictor2, self).__init__()

        #self.emb = nn.Embedding(vocab_size, DIM_EMB)
        self.emb = nn.Embedding.from_pretrained(word2vec_weights)
        self.emb.requires_grad = False
        #self.init_glove(GloVe)
        self.rnn = nn.LSTM(DIM_EMB, DIM_HID, 1, bidirectional=True, batch_first=True)
        
        self.lin = nn.Linear(DIM_HID*2, DIM_OUTPUT)


    def forward(self, X, train=False):
        #print('X', X.shape)
        embe = self.emb(X)
        #print('embe', embe.shape)
        op, (h_n, c_n) = self.rnn(embe)
        #print('op', op.shape)
        o = torch.max(op, dim=1).values
        #print('o', o.shape)

        out_vec = self.lin(o)
        #print('out_vec', out_vec.shape)

        return out_vec
    '''
    def init_glove(self, GloVe):
        #TODO: initialize word embeddings using GloVe (you can skip this part in your first version, if you want, see instructions below).
        #self.emb.weight.data.uniform_(-1, 1)
        #self.emb = torch.nn.Embedding.from_pretrained(GloVe, freeze=False)
        embeddings_matrix = np.zeros((vocab_size, self.DIM_EMB))

        for i in i2word.keys():
          try:
            embeddings_matrix[i] = GloVe[i2word[i]]
          except KeyError:
            embeddings_matrix[i] = np.random.normal(scale=0.6, size=(self.DIM_EMB, ))

        self.emb.load_state_dict({'weight': torch.from_numpy(embeddings_matrix)})
        #pass
    '''
    def inference(self, sentences):
        X = dataset.sentences2indices(sentences, dataset.word2i, train=False)
        X = prepare_input(X)[0].to(device)
        pred = self.forward(X)
        pred_mag = pred.norm(dim=1)[:, None] 
        pred_norm = pred / torch.clamp(pred_mag, min=1e-8)
        sim_mt = torch.mm(pred_norm, e2v_weights_norm.transpose(0, 1))
        print(sim_mt.shape)
        res = torch.argmax(sim_mt, dim=1)
        print(res.shape)
        #cosine similarity
        return res
        #return [dataset.i2emoji[i] for i in res]
        #TO DO to map indices to emojis in emojitovec
        #return [[i2tag[pred[i,j].item()] for j in range(len(sentences[i]))] for i in range(len(sentences))]

    def print_predictions(self, words, tags):
        Y_pred = self.inference(words)
        print('Y_pred:', Y_pred)
        print('Gold:', tags)
        '''for i in range(len(words)):
            print("----------------------------")
            print(" ".join([f"{words[i][j]}/{Y_pred[i][j]}/{tags[i][j]}" for j in range(len(words[i]))]))
            print("Predicted:\t", Y_pred[i])
            print("Gold:\t\t", tags[i])'''

    def write_predictions(self, sentences, outFile):
        fOut = open(outFile, 'w')
        for s in sentences:
            y = self.inference([s])[0]
            #print("\n".join(y[1:len(y)-1]))
            fOut.write("\n".join(y[1:len(y)-1]))  #Skip start and end tokens
            fOut.write("\n\n")

#The following code will initialize a model and test that your forward computation runs without errors.
lstm_test   = EmojiPredictor2(DIM_HID=500, DIM_EMB=300)
lstm_output = lstm_test.forward(X_padded[11:16])
Y_onehot    = prepare_output_vector(Y[11:16])

#Check the shape of the lstm_output and one-hot label tensors.
print("lstm output shape:", lstm_output.shape)
print("Y onehot shape:", Y_onehot.shape)

lstm output shape: torch.Size([5, 300])
Y onehot shape: torch.Size([5, 300])


In [141]:
def shuffle_sentences(sentences, tags):
    shuffled_sentences = []
    shuffled_tags      = []
    indices = list(range(len(sentences)))
    random.shuffle(indices)
    for i in indices:
        shuffled_sentences.append(sentences[i])
        shuffled_tags.append(tags[i])
    return (shuffled_sentences, shuffled_tags)

In [None]:
nEpochs = 5

def train_emoji_predictor2(sentences, tags, lstm):
  #optimizer = optim.Adadelta(lstm.parameters(), lr=0.0001)
  #TODO: initialize optimizer
    optimizer = torch.optim.Adadelta(lstm.parameters(), lr = 0.1)
    batchSize = 5
    tags = np.array(tags).reshape(-1, 1)
    print(tags.shape)
    #sentences = sentences[11:16] # TO DO remove
    #tags = tags[11:16] # TO DO remove
    #print(sentences, tags)

    for epoch in range(nEpochs):
        totalLoss = 0.0
        lstm.train()
        #(sentences_shuffled, tags_shuffled) = shuffle_sentences(sentences, tags) #TO DO remove
        (sentences_shuffled, tags_shuffled) = (sentences, tags)
        i = 0
        sent = dataset.sentences2indices(sentences_shuffled, dataset.word2i, train=True)
        #print(sent)
        (sentences_input, sentence_mask) = prepare_input(sent)
        #print(sentences_input)
        
        #print("sentences_input", sentences_input.shape, sentences_input)
        #gt_lab = sentences2indices(tags_shuffled, tag2i) #
        #gt_lab = get_dummy_emoticons() #TO DO remove
        #print(gt_lab)
        gt_labels = prepare_output_vector(tags_shuffled)
        #gt_labels = prepare_output_onehot(tags)
        print("gt_labels", gt_labels.shape) #, gt_labels
        for batch in tqdm.notebook.tqdm(range(0, len(sentences), batchSize), leave=False):
            lstm.zero_grad()
            #TODO: Impelement gradient update.
            optimizer.zero_grad()
            #print(len(sentences_input))
            #print(sentences_shuffled[i:max(i+batchSize, len(sentences))])
            
            input = sentences_input[i:min(i+batchSize, len(sentences))]
            #print(input)
            output = lstm.forward(input, train=True).to(device)
            exp_output = gt_labels[i:min(i+batchSize, len(sentences))].to(device)
            
            #print(output.shape, exp_output.shape, torch.mul(output, exp_output).shape)
            #print("out", output, exp_output)

            #mask = sentence_mask[i:min(i+batchSize, len(sentences))]
            #prod = output.reshape(-1, output.shape[2]) * exp_output.reshape(-1, exp_output.shape[2])
            
            loss_fn = nn.MSELoss(reduction='sum')

            loss = loss_fn(output, exp_output)
            print('loss: ', loss)
            #print("prod", prod)
            #loss = - torch.sum(prod) / torch.sum(exp_output)
            
            totalLoss += loss
            loss.backward()
            optimizer.step()
            i += batchSize
            #if i >=50:
            #  break
        
        lstm.eval()
        print(f"loss on epoch {epoch} = {totalLoss}")
        #lstm.write_predictions(sentences_dev, 'dev_pred')   #Performance on dev set
        #lstm.write_predictions(sentences_dev, 'dev_pred')   #Performance on dev set
        #print('conlleval:')
        #print(subprocess.Popen('paste dev dev_pred | perl conlleval.pl -d "\t"', shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].decode('UTF-8'))

        if epoch % 5 == 0:
            s = [1,2,3,4,5]#sample(range(len(word_sentences_val)), 5)
            lstm.print_predictions([word_sentences_val[i] for i in s], [labels_val[i] for i in s])
            #TO DO to map indices to emojis in emojitovec

lstm = EmojiPredictor2(DIM_HID=500, DIM_EMB=300).to(device)
train_emoji_predictor2(word_sentences_train, labels_train, lstm)

(308396, 1)
gt_labels torch.Size([308396, 300])


  0%|          | 0/61680 [00:00<?, ?it/s]

loss:  tensor(1925.7181, grad_fn=<MseLossBackward0>)
loss:  tensor(2115.2371, grad_fn=<MseLossBackward0>)
loss:  tensor(1926.1086, grad_fn=<MseLossBackward0>)
loss:  tensor(2125.3462, grad_fn=<MseLossBackward0>)
loss:  tensor(2396.4229, grad_fn=<MseLossBackward0>)
loss:  tensor(2286.7473, grad_fn=<MseLossBackward0>)
loss:  tensor(2435.9158, grad_fn=<MseLossBackward0>)
loss:  tensor(1937.5304, grad_fn=<MseLossBackward0>)
