In [None]:
import json
import os
import random
import re
import sys
import tqdm

# data
from collections import defaultdict
from collections import Counter
import numpy as np

# viz
import matplotlib.pyplot as plt

# torch
import torch
import torch.nn as nn
from torch.utils.data import Dataset

# pretrained embeddings
import gensim.models as gsm

from google.colab import files, drive

## Config

In [None]:
#Pre-filtered GloVe embeddings
!wget https://raw.githubusercontent.com/aritter/aritter.github.io/master/files/glove.840B.300d.conll_filtered.txt

--2022-05-04 23:28:43--  https://raw.githubusercontent.com/aritter/aritter.github.io/master/files/glove.840B.300d.conll_filtered.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69798443 (67M) [text/plain]
Saving to: ‘glove.840B.300d.conll_filtered.txt’


2022-05-04 23:28:45 (324 MB/s) - ‘glove.840B.300d.conll_filtered.txt’ saved [69798443/69798443]



In [None]:
# Base directory; every other path in this cell is derived from it.
root_path = '/content/'
# Folder that EmojiDataset is pointed at further down (it reads tweets.* files from it).
clean_data_folder_path = os.path.join(root_path, 'data', 'clean_data')
# GloVe file name matches the wget cell above.
# NOTE(review): presumably wget saved it under root_path (Colab working dir) - confirm.
glove_path = os.path.join(root_path, "glove.840B.300d.conll_filtered.txt")

# target emojis: emoji -> class id, stored as a decimal string
# (EmojiDataset converts the ids to int when it builds emoji2i).
mapping = { 
    '❤':'0' , '😍':'1' , '😂':'2' , '💕':'3' , 
    '🔥':'4' , '😊':'5' , '😎':'6' , '✨':'7' , 
    '💙':'8' , '😘':'9' , '📷':'10' , '🇺🇸':'11' , 
    '☀':'12' , '💜':'13' , '😉':'14' , '💯':'15' , 
    '😁':'16' , '🎄':'17' , '📸':'18' , '😜':'19'
}

# Dataset Preparation

In [None]:
class EmojiDataset(Dataset):
    """Tweet corpus held in memory together with the vocabularies derived from it.

    Reads ``tweets.notoken`` (one space-separated tweet per line) and
    ``tweets.labels`` from ``dataset_path`` and the GloVe table from the
    notebook-level ``glove_path``. After construction the instance exposes:

    * ``word_sentences`` - token lists wrapped in ``<START>`` / ``<END>``
    * ``char_sentences`` - the same sentences split into characters per word
    * ``labels`` / ``positions`` - emoji id and emoji position per tweet
      (``-1`` when the label line has no parsable value)
    * ``word2i`` / ``i2word`` / ``char2i`` / ``i2char`` - index maps starting at 2
      (0 is reserved for padding, 1 for UNK)
    * ``emoji2i`` / ``i2emoji`` - built from the notebook-level ``mapping``
    """

    def __init__(self, dataset_path, transforms=None):
        """Load the corpus and build all vocabularies.

        Args:
            dataset_path: folder containing ``tweets.notoken`` and ``tweets.labels``.
            transforms: accepted for interface compatibility; not used.
        """
        tweet_label_path = os.path.join(dataset_path, 'tweets.labels')
        tweet_tokenized_path = os.path.join(dataset_path, 'tweets.notoken')

        # init glove
        self.glove_emb = self.read_GloVe(glove_path)

        # curate the sentences; `with` closes the handle, and the explicit encoding
        # keeps non-ASCII tweet text independent of the platform default.
        self.word_sentences = []
        with open(tweet_tokenized_path, encoding='utf-8') as token_file:
            for line in token_file:
                self.word_sentences.append(
                    ['<START>'] + line.rstrip().split(' ') + ['<END>'])

        # curate the labels
        self.labels = []
        self.positions = []
        with open(tweet_label_path, encoding='utf-8') as label_file:
            for line in label_file:
                emoji_code, emoji_position = self._parse_label_line(line)
                if emoji_code == -1 or emoji_position == -1:
                    # no (complete) emoji annotation for this tweet
                    print(line)
                self.labels.append(emoji_code)
                self.positions.append(emoji_position)

        # compute char sentences from word sentences
        self.char_sentences = self.sentences2char(self.word_sentences)

        # compute counts
        self.word_counts = Counter(w for l in self.word_sentences for w in l)
        self.char_counts = Counter(c for l in self.word_sentences for w in l for c in w)
        # Words seen once and absent from GloVe are candidates for random UNK replacement.
        self.singletons = {w for (w, c) in self.word_counts.items()
                           if c == 1 and w not in self.glove_emb}
        self.char_singletons = {ch for (ch, c) in self.char_counts.items() if c == 1}

        # Build dictionaries to map from words, characters to indices and vice versa.
        # Save first two words in the vocabulary for padding and "UNK" token.
        self.word2i = self._build_index(list(self.word_counts) + list(self.glove_emb))
        self.char2i = self._build_index(
            c for l in self.char_sentences for w in l for c in w)
        self.i2word = {i: w for w, i in self.word2i.items()}
        self.i2char = {i: c for c, i in self.char2i.items()}

        # compute vocab size (entries + the two reserved slots)
        self.vocab_size = len(self.word2i) + 2
        self.char_vocab_size = len(self.char2i) + 2

        # emoji dictionaries.
        self.emoji2i = {e: int(i) for e, i in mapping.items()}
        self.i2emoji = {i: e for e, i in self.emoji2i.items()}

    @staticmethod
    def _build_index(tokens):
        """Map each distinct token to an index >= 2, in sorted token order.

        Sorting makes the mapping reproducible: iterating a set of strings yields
        a different order in different interpreter sessions, which would make
        embedding rows of a saved checkpoint point at the wrong words after reload.
        """
        return {tok: i + 2 for i, tok in enumerate(sorted(set(tokens)))}

    @staticmethod
    def _parse_label_line(line):
        """Return ``(emoji_code, emoji_position)`` parsed from one label line.

        Only the first space-separated entry is used; it is split on ``,`` and
        field 0 is the emoji code, field 2 the position (index of the word after
        which the emoji is expected). A missing or non-integer field yields ``-1``.
        """
        fields = line.rstrip().split(' ')[0].split(',')

        def int_field(idx):
            try:
                return int(fields[idx])
            except (IndexError, ValueError):
                return -1

        return int_field(0), int_field(2)

    def sentences2char(self, sentences):
        """Split every word into characters, wrapped in 'start' / 'end' markers."""
        return [[['start'] + [c for c in w] + ['end'] for w in l] for l in sentences]

    def read_GloVe(self, filename):
        """Read a text GloVe file into ``{word: [float, ...]}``.

        Each line is ``word v1 v2 ...`` separated by single spaces.
        """
        embeddings = {}
        with open(filename, encoding='utf-8') as glove_file:
            for line in glove_file:
                fields = line.strip().split(" ")
                embeddings[fields[0]] = [float(x) for x in fields[1:]]
        return embeddings

    #When training, randomly replace singletons with UNK tokens sometimes to simulate situation at test time.
    def getDictionaryRandomUnk(self, w, dictionary, train=False):
        """Look up ``w``; return 1 (UNK) if unknown, or randomly for singletons when training."""
        if train and (w in self.singletons and random.random() > 0.5):
            return 1
        else:
            return dictionary.get(w, 1)

    #Map a list of sentences from words to indices.
    def sentences2indices(self, words, dictionary, train=False):
        """Convert sentences of words to sentences of indices (1 => UNK)."""
        return [[self.getDictionaryRandomUnk(w, dictionary, train=train) for w in l] for l in words]

    #Map a list of sentences containing to indices (character indices)
    def sentences2indicesChar(self, chars, dictionary):
        """Convert per-word character lists to character indices (1 => UNK)."""
        return [[[dictionary.get(c, 1) for c in w] for w in l] for l in chars]


## Test the dataset class

In [None]:
# Build the dataset once; `dataset` is read as a global by the helpers and models below.
dataset = EmojiDataset(clean_data_folder_path)

# Show one example: its tokens, its per-word characters, its emoji and the emoji position.
test_idx = 5
print(dataset.word_sentences[test_idx])
print(dataset.char_sentences[test_idx])
print(dataset.i2emoji[dataset.labels[test_idx]])
print(dataset.positions[test_idx])

['<START>', 'Perfect', 'for', 'this', 'weather', 'Snow', 'White', 'Cafe', '<END>']
[['start', '<', 'S', 'T', 'A', 'R', 'T', '>', 'end'], ['start', 'P', 'e', 'r', 'f', 'e', 'c', 't', 'end'], ['start', 'f', 'o', 'r', 'end'], ['start', 't', 'h', 'i', 's', 'end'], ['start', 'w', 'e', 'a', 't', 'h', 'e', 'r', 'end'], ['start', 'S', 'n', 'o', 'w', 'end'], ['start', 'W', 'h', 'i', 't', 'e', 'end'], ['start', 'C', 'a', 'f', 'e', 'end'], ['start', '<', 'E', 'N', 'D', '>', 'end']]
❤
5


## Utility Methods

### Pad inputs to max sequence length (for batching)

In [None]:
def prepare_input(X_list):
    """Pad variable-length index sequences into one batch and build its mask.

    Args:
        X_list: list of sequences of word indices.

    Returns:
        (X_padded, X_mask): a LongTensor of indices padded with 0 and a FloatTensor
        that is 1.0 at real positions and 0.0 at padded ones; both are batch-first.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    index_rows = [torch.as_tensor(seq) for seq in X_list]
    mask_rows = [torch.as_tensor([1.0] * len(seq)) for seq in X_list]
    # padding value defaults to 0 for both tensors
    X_padded = pad(index_rows, batch_first=True).type(torch.LongTensor)
    X_mask = pad(mask_rows, batch_first=True).type(torch.FloatTensor)
    return (X_padded, X_mask)

In [None]:
# Maximum word length (for character representations); longer words are truncated.
MAX_CLEN=32

def prepare_input_char(X_list):
    """Pad per-word character indices to a fixed (batch, max_sentence_len, MAX_CLEN) LongTensor.

    Short sentences are extended with empty words, every word is clipped to
    MAX_CLEN characters, and missing character slots are filled with 1.
    """
    longest = max(len(sent) for sent in X_list)
    batch = []
    for sent in X_list:
        rows = []
        for word in sent + [[]] * (longest - len(sent)):
            clipped = word[:MAX_CLEN]
            rows.append(clipped + [1] * (MAX_CLEN - len(clipped)))
        batch.append(rows)
    return torch.as_tensor(batch).type(torch.LongTensor)

### Pad outputs using one-hot encoding

In [None]:
def prepare_output_onehot(Y_list, NUM_TAGS=None):
    """One-hot encode per-example tag sequences and pad them into one batch.

    Args:
        Y_list: sequence of tag sequences (lists or arrays of integer tags).
        NUM_TAGS: width of the one-hot axis. When omitted it is derived from
            ``dataset.emoji2i`` at call time (not frozen at definition time).

    Returns:
        FloatTensor of shape (batch, max_len, NUM_TAGS), zero-padded.
        Negative tags (the dataset's -1 "no label" sentinel) yield an all-zero
        row instead of silently wrapping around to the last column.
    """
    if NUM_TAGS is None:
        NUM_TAGS = max(dataset.emoji2i.values()) + 1
    Y_onehot = [torch.zeros(len(l), NUM_TAGS) for l in Y_list]
    for row, tags in zip(Y_onehot, Y_list):
        for j, tag in enumerate(tags):
            tag = int(tag)
            if tag >= 0:
                row[j, tag] = 1.0
    Y_padded = torch.nn.utils.rnn.pad_sequence(Y_onehot, batch_first=True).type(torch.FloatTensor)
    return Y_padded

### Map output emojis to 300-d vectors using emoji2vec

In [None]:
# Load emoji2vec vectors stored in word2vec text format.
# NOTE(review): no cell in this notebook downloads /content/emoji2vec.txt - presumably uploaded by hand; confirm.
e2v_model = gsm.KeyedVectors.load_word2vec_format('/content/emoji2vec.txt', binary=False)

In [None]:
# Sanity check: look up one emoji vector and print its shape.
happy_vector = e2v_model['❤'] 
print(happy_vector.shape)

(300,)


In [None]:
# Full emoji2vec matrix (one row per emoji) as a float tensor.
# Read `.vectors` straight from the KeyedVectors object: the `.wv` indirection is
# deprecated on KeyedVectors and no longer exists in gensim 4.
emoji2vec_weights = torch.FloatTensor(e2v_model.vectors)
print(emoji2vec_weights.shape)

torch.Size([1661, 300])


  """Entry point for launching an IPython kernel.


In [None]:
# labels - sequence of N entries; entry[0] is the emoji id of that example
# returns an N x n_dim tensor holding the emoji2vec embedding of each emoji
def prepare_output_vector(labels, n_dim=300):
    """Turn emoji ids into their emoji2vec target vectors.

    Rows whose id is negative (no emoji) are left as zeros. Emoji ids are
    translated through ``dataset.i2emoji`` and looked up in ``e2v_model``.
    """
    total = len(labels)
    Y_vector = torch.zeros((total, n_dim))
    for row, entry in enumerate(labels):
        emoji_id = entry[0]
        if emoji_id >= 0:
            Y_vector[row, :] = torch.from_numpy(e2v_model[dataset.i2emoji[emoji_id]])
    return Y_vector

# Quick check of prepare_output_vector: five known emojis -> column of ids -> (5, 300) targets.
inp_emoj_test = [ dataset.emoji2i['📸'], dataset.emoji2i['❤'], 
                             dataset.emoji2i['😂'], dataset.emoji2i['🔥'], dataset.emoji2i['😍']]
# reshape to (N, 1) because prepare_output_vector reads labels[i][0]
inp_emoj_test = np.array(inp_emoj_test).reshape(-1, 1)
print(inp_emoj_test)
print(prepare_output_vector(inp_emoj_test).shape)

[[18]
 [ 0]
 [ 2]
 [ 4]
 [ 1]]
torch.Size([5, 300])


  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Row index -> emoji lookup in the emoji2vec vocabulary.
# NOTE(review): `index2word` is the gensim 3.x attribute name; gensim 4 calls it `index_to_key` - confirm installed version.
e2v_model.index2word[368]                        

'❤'

## Define training set and labels

In [None]:
#Indices
# train=True lets sentences2indices randomly replace singleton words with UNK (index 1).
X       = dataset.sentences2indices(dataset.word_sentences, dataset.word2i, train=True)
# Emoji class id per tweet (-1 where no label could be parsed).
Y       = dataset.labels

In [None]:
# Sentences and labels must be aligned one-to-one.
print(len(dataset.word_sentences), len(Y))

365671 365671


In [None]:
# Pad all sentences to the longest one; X_padded.shape[-1] is reused below as the position one-hot width.
(X_padded, X_mask) = prepare_input(X)


In [None]:
# One-hot position targets. Kept under their own name: rebinding `Y` here would
# replace the emoji labels assigned above, and later cells still pass slices of `Y`
# to prepare_output_vector, which expects emoji ids.
Y_positions = np.array(dataset.positions).reshape(-1, 1)
Y_pos = prepare_output_onehot(Y_positions, X_padded.shape[-1])
# drop the singleton sequence axis, then drop column 0
Y_pos = Y_pos.squeeze(dim=1)[:, 1:]
print(Y_pos.shape, Y_pos[0:3])

torch.Size([365671, 36]) tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])


In [None]:
# Shape summary of the tensors built so far.
print("X_padded:", X_padded.shape)
print("X_mask:", X_mask.shape)
print("Y shape:", len(Y))
# Previously printed only the literal string 'Y_pos'; show the actual shape instead.
print("Y_pos:", Y_pos.shape)

X_padded: torch.Size([365671, 37])
X_mask: torch.Size([365671, 37])
Y shape: 365671
Y_pos


In [None]:
# Column vector (N, 1): prepare_output_vector reads each label as labels[i][0].
Y = np.array(Y).reshape(-1, 1)

## Train / validation / test split

In [None]:
# Number of examples; sizes the random split mask below.
N = len(Y)
print(N)

365671


In [None]:
# Fixed seed so the train / held-out assignment is the same on every run.
np.random.seed(1)
# One uniform draw in [0, 1) per example.
ran = np.random.uniform(size=(N))

In [None]:
# Boolean mask: True -> training set (about 80%), False -> held out for val/test.
fil = np.array(ran < 0.8, dtype=bool)
word_sentences_train = [item for i, item in enumerate(dataset.word_sentences) if fil[i]]
labels_train = [item for i, item in enumerate(dataset.labels) if fil[i]]
positions_train = [item for i, item in enumerate(dataset.positions) if fil[i]]

# Remaining examples (mask inverted); split again into validation and test below.
ws_rem = [item for i, item in enumerate(dataset.word_sentences) if ~fil[i]]
lab_rem = [item for i, item in enumerate(dataset.labels) if ~fil[i]]
pos_rem = [item for i, item in enumerate(dataset.positions) if ~fil[i]]
print(len(word_sentences_train), len(labels_train), len(positions_train), len(ws_rem), len(lab_rem), len(pos_rem))

292743 292743 292743 72928 72928 72928


In [None]:
# Second seeded draw, one value per held-out example. Note that `ran` is rebound here.
rem = len(lab_rem)
np.random.seed(2)
ran = np.random.uniform(size=(rem))

In [None]:
# Split the held-out examples roughly in half: True -> validation, False -> test.
# `fil` is rebound here and now indexes the held-out lists, not the full dataset.
fil = np.array(ran < 0.5, dtype=bool)
word_sentences_val = [item for i, item in enumerate(ws_rem) if fil[i]]
labels_val = [item for i, item in enumerate(lab_rem) if fil[i]]
positions_val = [item for i, item in enumerate(pos_rem) if fil[i]]

word_sentences_test = [item for i, item in enumerate(ws_rem) if ~fil[i]]
labels_test = [item for i, item in enumerate(lab_rem) if ~fil[i]]
positions_test = [item for i, item in enumerate(pos_rem) if ~fil[i]]
print(len(word_sentences_val), len(labels_val), len(positions_val), len(word_sentences_test), len(labels_test), len(positions_test))

36478 36478 36478 36450 36450 36450


# Start Modeling

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

True
Using device: cuda


In [None]:
#normalized weights
# Scale each emoji2vec row to unit length so that a dot product with a unit-length
# prediction equals the cosine similarity; the clamp avoids division by zero.
e2v_weights_mag = emoji2vec_weights.norm(dim=1)[:, None]
e2v_weights_norm = emoji2vec_weights / torch.clamp(e2v_weights_mag, min=1e-8)
print(e2v_weights_norm.shape)
e2v_weights_norm = e2v_weights_norm.to(device)

torch.Size([1661, 300])


In [None]:
class EmojiPredictor2(nn.Module):
    """BiLSTM that regresses a sentence onto an emoji2vec-sized vector.

    Word indices are embedded (initialised from GloVe), run through a
    single-layer bidirectional LSTM, max-pooled over the time axis and projected
    to ``DIM_OUTPUT`` dimensions. Vocabulary and GloVe table come from the
    notebook-level ``dataset``.
    """

    def __init__(self, DIM_EMB=300, DIM_HID=500, DIM_OUTPUT=300):
        super(EmojiPredictor2, self).__init__()

        self.DIM_EMB = DIM_EMB
        self.emb = nn.Embedding(dataset.vocab_size, DIM_EMB)

        self.init_glove(dataset.glove_emb)
        self.rnn = nn.LSTM(DIM_EMB, DIM_HID, 1, bidirectional=True, batch_first=True)

        # both LSTM directions are concatenated, hence DIM_HID * 2
        self.lin = nn.Linear(DIM_HID*2, DIM_OUTPUT)

    def forward(self, X, train=False):
        """Map a (batch, seq_len) LongTensor of word indices to (batch, DIM_OUTPUT).

        ``train`` is accepted for call compatibility and is not used.
        """
        embe = self.emb(X)
        op, (h_n, c_n) = self.rnn(embe)
        # element-wise max over the sequence axis
        o = torch.max(op, dim=1).values

        out_vec = self.lin(o)
        return out_vec

    def init_glove(self, GloVe):
        """Fill the embedding table: GloVe vector when available, random normal otherwise.

        Rows with no entry in ``dataset.i2word`` (indices 0 and 1, reserved for
        padding and UNK) stay zero.
        """
        embeddings_matrix = np.zeros((dataset.vocab_size, self.DIM_EMB))

        for i in dataset.i2word.keys():
            try:
                embeddings_matrix[i] = GloVe[dataset.i2word[i]]
            except KeyError:
                embeddings_matrix[i] = np.random.normal(scale=0.6, size=(self.DIM_EMB, ))

        self.emb.load_state_dict({'weight': torch.from_numpy(embeddings_matrix)})

    def inference(self, sentences):
        """Return, per sentence, the emoji2vec row index with the highest cosine similarity.

        Args:
            sentences: list of token lists.

        Returns:
            LongTensor of shape (batch,) with indices into the emoji2vec matrix.
        """
        X = dataset.sentences2indices(sentences, dataset.word2i, train=False)
        X = prepare_input(X)[0].to(device)
        # Prediction only: skip autograd bookkeeping.
        with torch.no_grad():
            pred = self.forward(X)
            pred_mag = pred.norm(dim=1)[:, None]
            pred_norm = pred / torch.clamp(pred_mag, min=1e-8)
            # rows of e2v_weights_norm are unit length, so this is cosine similarity
            sim_mt = torch.mm(pred_norm, e2v_weights_norm.transpose(0, 1))
            res = torch.argmax(sim_mt, dim=1)
        return res
    

# Smoke test: an untrained model on five padded sentences; output and target shapes should agree.
lstm_test   = EmojiPredictor2(DIM_HID=500, DIM_EMB=300).to(device)
lstm_output = lstm_test.forward(X_padded[11:16].to(device))
# NOTE(review): despite the name, these are emoji2vec target vectors, not one-hot rows.
Y_onehot    = prepare_output_vector(Y[11:16])

print("lstm output shape:", lstm_output.shape)
print("Y onehot shape:", Y_onehot.shape)

lstm output shape: torch.Size([5, 300])
Y onehot shape: torch.Size([5, 300])


In [None]:
def compute_topk_accuracy(sentences, labels, k=5, model=None):
    """Top-k accuracy of the emoji predictor, by cosine similarity to emoji2vec.

    Args:
        sentences: list of token lists.
        labels: emoji class id per sentence (negative = no emoji, never counted correct).
        k: number of nearest emoji2vec entries considered per sentence.
        model: model to evaluate; defaults to the notebook-level ``lstm``.

    Returns:
        (top_preds_emoji, accuracy): an object array of shape (len(sentences), k)
        holding the predicted emoji strings, and the fraction of sentences whose
        gold emoji is among them.
    """
    if model is None:
        model = lstm
    n_total = len(sentences)
    if n_total == 0:
        return np.empty((0, k), dtype=object), 0.0

    X = dataset.sentences2indices(sentences, dataset.word2i, train=False)
    X = prepare_input(X)[0]
    batch_sz = 64
    count_correct = 0
    # dtype=object, not str: a `str` array has 1-character cells, which truncates
    # multi-codepoint emojis (e.g. flags) so they can never equal their label.
    top_preds_emoji = np.empty((n_total, k), dtype=object)
    # gensim 4 renamed index2word -> index_to_key; support either.
    idx2emoji = getattr(e2v_model, "index_to_key", None) or e2v_model.index2word

    for b in tqdm.notebook.tqdm(range(0, n_total, batch_sz), leave=False):
        inp = X[b:b + batch_sz].to(device)
        lab = labels[b:b + batch_sz]

        # Evaluation only: no autograd graph needed.
        with torch.no_grad():
            pred = model(inp)
            #Cosine similarity
            pred_mag = pred.norm(dim=1)[:, None]
            pred_norm = pred / torch.clamp(pred_mag, min=1e-8)
            sim_mt = torch.mm(pred_norm, e2v_weights_norm.transpose(0, 1))
            # one device->host transfer per batch instead of one per element
            top_preds = torch.topk(sim_mt, k=k, dim=1, largest=True).indices.cpu().tolist()

        for i, row in enumerate(top_preds):
            label = dataset.i2emoji[lab[i]] if lab[i] >= 0 else ''
            for j, emoji_idx in enumerate(row):
                top_preds_emoji[b + i, j] = idx2emoji[emoji_idx]
                if label == top_preds_emoji[b + i, j]:
                    count_correct += 1

    accuracy = count_correct / n_total
    return top_preds_emoji, accuracy


In [None]:
def shuffle_sentences(sentences, tags):
    """Return (sentences, tags) reordered by one shared random permutation.

    The inputs are not modified; element i of both outputs comes from the same
    original index, so sentence/tag pairs stay aligned.
    """
    order = list(range(len(sentences)))
    random.shuffle(order)
    permuted_sentences = [sentences[pos] for pos in order]
    permuted_tags = [tags[pos] for pos in order]
    return (permuted_sentences, permuted_tags)

In [None]:
# Number of passes over the training data; read by the training function at call time.
nEpochs = 3

def train_emoji_predictor2(sentences, tags, lstm):
    """Train the emoji predictor to regress emoji2vec vectors (summed squared error).

    Args:
        sentences: list of token lists.
        tags: emoji class id per sentence.
        lstm: model to optimise in place.

    Each epoch reshuffles the data, re-samples the random-UNK replacement, and
    prints the summed loss. On even epochs it also prints top-5 accuracy on the
    notebook-level train and validation splits.
    NOTE(review): compute_topk_accuracy is called without a model argument, so that
    evaluation presumably uses the global `lstm`, not this parameter - confirm they match.
    """
    optimizer = torch.optim.Adam(lstm.parameters(), lr = 0.0005)
    batchSize = 64
    tags = np.array(tags).reshape(-1, 1)
    print(tags.shape)

    # Stateless, so build it once rather than once per batch.
    loss_fn = nn.MSELoss(reduction='sum')
    n_examples = len(sentences)

    for epoch in range(nEpochs):
        totalLoss = 0.0
        lstm.train()
        (sentences_shuffled, tags_shuffled) = shuffle_sentences(sentences, tags)

        sent = dataset.sentences2indices(sentences_shuffled, dataset.word2i, train=True)
        (sentences_input, sentence_mask) = prepare_input(sent)

        gt_labels = prepare_output_vector(tags_shuffled)

        for i in tqdm.notebook.tqdm(range(0, n_examples, batchSize), leave=False):
            # the optimizer owns exactly the model's parameters, so one zeroing suffices
            optimizer.zero_grad()

            batch_input = sentences_input[i:i + batchSize].to(device)
            output = lstm(batch_input, train=True)
            exp_output = gt_labels[i:i + batchSize].to(device)

            loss = loss_fn(output, exp_output)
            loss.backward()
            optimizer.step()

            # .item() detaches the scalar; summing the tensor itself would keep
            # autograd history of every batch alive for the whole epoch.
            totalLoss += loss.item()

        lstm.eval()
        print(f"loss on epoch {epoch} = {totalLoss}")

        if epoch % 2 == 0:
            train_preds, train_accuracy = compute_topk_accuracy(word_sentences_train, labels_train, k=5)
            print('train accuracy: ', train_accuracy)
            preds, accuracy = compute_topk_accuracy(word_sentences_val, labels_val, k=5)
            print('validation accuracy:', accuracy)
            
# Fresh emoji predictor bound to the global name `lstm` (compute_topk_accuracy reads that global), then train it.
lstm = EmojiPredictor2(DIM_HID=500, DIM_EMB=300).to(device)
train_emoji_predictor2(word_sentences_train, labels_train, lstm)

(292743, 1)
gt_labels torch.Size([292743, 300])


  0%|          | 0/4575 [00:00<?, ?it/s]

loss on epoch 0 = 67009820.0


  0%|          | 0/4575 [00:00<?, ?it/s]

train accuracy:  0.48308243066443946


  0%|          | 0/570 [00:00<?, ?it/s]

validation accuracy: 0.4629913920719338
gt_labels torch.Size([292743, 300])


  0%|          | 0/4575 [00:00<?, ?it/s]

loss on epoch 1 = 62696172.0
gt_labels torch.Size([292743, 300])


  0%|          | 0/4575 [00:00<?, ?it/s]

loss on epoch 2 = 59338272.0


  0%|          | 0/4575 [00:00<?, ?it/s]

train accuracy:  0.5654208640343236


  0%|          | 0/570 [00:00<?, ?it/s]

validation accuracy: 0.4751356982290696


In [None]:
# Gold emojis for validation examples 50..69, for side-by-side comparison with the predictions below.
print([dataset.i2emoji[l] for l in labels_val[50:70]])

['😂', '🔥', '😁', '📷', '😁', '🇺🇸', '😂', '😍', '😜', '🔥', '😂', '😊', '✨', '😍', '📸', '😎', '😎', '❤', '❤', '😍']


In [None]:
# Top-5 predicted emojis (one row per sentence) and top-5 accuracy on the same 20 examples.
preds, accuracy = compute_topk_accuracy(word_sentences_val[50:70], labels_val[50:70], k=5)
print(accuracy, preds)

  0%|          | 0/1 [00:00<?, ?it/s]

0.45 [['😂' '😹' '😃' '😆' '😀']
 ['😁' '😎' '😃' '😊' '😀']
 ['❤' '💕' '😽' '😚' '😍']
 ['❤' '😍' '😚' '💛' '😽']
 ['😍' '😃' '😀' '😁' '😊']
 ['🇺' '🇲' '🇦' '🇦' '🇪']
 ['😍' '❤' '😂' '😚' '😽']
 ['🎄' '🎅' '🌲' '🎃' '🎋']
 ['❤' '😍' '😚' '😽' '💕']
 ['🔥' '🌋' '🍲' '🐉' '☕']
 ['😍' '😚' '😽' '😃' '😜']
 ['😂' '😃' '😹' '😆' '😀']
 ['😍' '❤' '😚' '😂' '😽']
 ['😍' '😚' '😽' '💜' '💕']
 ['📷' '🎥' '😍' '😚' '😘']
 ['😎' '😜' '😃' '😂' '😀']
 ['😎' '❤' '😍' '😅' '😁']
 ['❤' '😍' '😻' '😚' '😽']
 ['❤' '💕' '😍' '😚' '😽']
 ['❤' '💕' '😽' '😚' '😻']]


In [None]:
# The token lists behind the 20 examples inspected above.
word_sentences_val[50:70]

[['<START>',
  'alright',
  'baby',
  'boy',
  'I',
  'am',
  'going',
  'to',
  'give',
  'u',
  'ur',
  'bad',
  'news',
  'keep',
  'playing',
  'LOL',
  '<END>'],
 ['<START>', '', 'on', 'this', 'great', 'day', 'Corporate', '<END>'],
 ['<START>',
  'Birthday',
  'weekend',
  'was',
  'a',
  'success',
  '',
  'thank',
  'you',
  'to',
  'everyone',
  'that',
  'showed',
  'me',
  'love',
  'and',
  'helped',
  'celebrate',
  '<END>'],
 ['<START>',
  'Sunday',
  'night',
  'Christmas',
  'lightsSpreading',
  'cheer',
  'and',
  'good',
  'vibes',
  'to',
  'all',
  '<END>'],
 ['<START>', 'Smiles', 'for', 'days', '', 'Stockton', 'University', '<END>'],
 ['<START>', 'Camp', 'Creek', 'Atlanta', 'Ga', '<END>'],
 ['<START>',
  'Im',
  'a',
  'cruiser',
  'on',
  'a',
  'crotch',
  '',
  'Nashville',
  'Tennessee',
  '<END>'],
 ['<START>',
  'My',
  'nephew',
  'just',
  'won',
  'Christmas',
  '',
  'Santa',
  'Fe',
  'Kids',
  'Co',
  'of',
  'Whittier',
  '<END>'],
 ['<START>',
  'Opini

In [None]:
# Persist the trained weights, then hand the file to the browser via Colab's files.download.
torch.save(lstm.state_dict(), 'emoji_pred_0405.pt')
files.download('emoji_pred_0405.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Restore weights into the existing model, mapped onto the current device.
# NOTE(review): this reads "emoji_pred.pt", but the cell above saved 'emoji_pred_0405.pt' - confirm which checkpoint is intended.
lstm.load_state_dict(torch.load("emoji_pred.pt", map_location=device))

In [None]:
# Top-5 validation accuracy of the model currently bound to `lstm`.
preds, accuracy = compute_topk_accuracy(word_sentences_val, labels_val, k=5)
print(accuracy)                                                    

  0%|          | 0/603 [00:00<?, ?it/s]

18016
0.46741386467413865


In [None]:
# Train the existing `lstm` for another nEpochs epochs (the training function reads the global nEpochs).
nEpochs = 3
train_emoji_predictor2(word_sentences_train, labels_train, lstm)                

(308396, 1)
gt_labels torch.Size([308396, 300])


  0%|          | 0/4819 [00:00<?, ?it/s]

loss on epoch 0 = 41597524.0
torch.Size([10, 1661])
torch.Size([10])
Y_pred: tensor([147, 686, 191, 502, 191, 368, 686, 368, 686, 368], device='cuda:0')
Gold: [14, 1, 6, 6, 2, 0, 4, 7, 3, 7]
gt_labels torch.Size([308396, 300])


  0%|          | 0/4819 [00:00<?, ?it/s]

loss on epoch 1 = 39806368.0
gt_labels torch.Size([308396, 300])


  0%|          | 0/4819 [00:00<?, ?it/s]

loss on epoch 2 = 38195332.0


In [None]:
# Top-5 accuracy on the test, validation and training splits.
test_preds, test_accuracy = compute_topk_accuracy(word_sentences_test, labels_test, k=5)
print('Test acc: ', test_accuracy)
preds, accuracy = compute_topk_accuracy(word_sentences_val, labels_val, k=5)
print('Val acc: ', accuracy)
tr_preds, tr_accuracy = compute_topk_accuracy(word_sentences_train, labels_train, k=5)
print('Train acc: ', tr_accuracy)

  0%|          | 0/570 [00:00<?, ?it/s]

Test acc:  0.4778326474622771


  0%|          | 0/570 [00:00<?, ?it/s]

Val acc:  0.4751356982290696


  0%|          | 0/4575 [00:00<?, ?it/s]

Train acc:  0.5654208640343236


In [None]:
# Top-10 accuracy on the test, validation and training splits (same evaluation as above, larger k).
test_preds, test_accuracy = compute_topk_accuracy(word_sentences_test, labels_test, k=10)
print('Test acc: ', test_accuracy)
preds, accuracy = compute_topk_accuracy(word_sentences_val, labels_val, k=10)
print('Val acc: ', accuracy)
tr_preds, tr_accuracy = compute_topk_accuracy(word_sentences_train, labels_train, k=10)
print('Train acc: ', tr_accuracy)

  0%|          | 0/570 [00:00<?, ?it/s]

Test acc:  0.5744307270233197


  0%|          | 0/570 [00:00<?, ?it/s]

Val acc:  0.5742913536926366


  0%|          | 0/4575 [00:00<?, ?it/s]

Train acc:  0.6638894866828583


In [None]:
# Top-1 accuracy on the test, validation and training splits.
test_preds, test_accuracy = compute_topk_accuracy(word_sentences_test, labels_test, k=1)
print('Test acc: ', test_accuracy)
preds, accuracy = compute_topk_accuracy(word_sentences_val, labels_val, k=1)
print('Val acc: ', accuracy)
tr_preds, tr_accuracy = compute_topk_accuracy(word_sentences_train, labels_train, k=1)
print('Train acc: ', tr_accuracy)

  0%|          | 0/570 [00:00<?, ?it/s]

Test acc:  0.29939643347050754


  0%|          | 0/570 [00:00<?, ?it/s]

Val acc:  0.29346455397774


  0%|          | 0/4575 [00:00<?, ?it/s]

Train acc:  0.38296048069467076


## Position Prediction

In [None]:
class EmojiPositionPredictor(nn.Module):
    """BiLSTM that scores every gap between adjacent tokens as an emoji position.

    For a (batch, seq_len) input the model returns (batch, seq_len - 1) scores:
    score j is computed from the concatenated BiLSTM outputs of tokens j and
    j + 1. Vocabulary and GloVe table come from the notebook-level ``dataset``.
    """

    def __init__(self, DIM_EMB=300, DIM_HID=500, DIM_OUTPUT=1):
        super(EmojiPositionPredictor, self).__init__()

        self.DIM_EMB = DIM_EMB
        self.emb = nn.Embedding(dataset.vocab_size, DIM_EMB)

        self.init_glove(dataset.glove_emb)
        self.rnn = nn.LSTM(DIM_EMB, DIM_HID, 1, bidirectional=True, batch_first=True)

        # two neighbouring bidirectional states are concatenated: 2 * (2 * DIM_HID)
        self.lin = nn.Linear(DIM_HID*4, DIM_HID)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(DIM_HID, DIM_OUTPUT)

    def forward(self, X, train=False):
        """Return one non-negative score per gap between adjacent tokens.

        ``train`` is accepted for call compatibility and is not used.
        """
        embe = self.emb(X)
        op, (h_n, c_n) = self.rnn(embe)

        # states of token j and token j + 1, aligned on the same index
        left_states = op[:, :-1, :]
        right_states = op[:, 1:, :]

        conc = torch.cat([left_states, right_states], dim=-1)

        hid_vec = self.relu(self.lin(conc))

        out_vec = self.relu(self.lin2(hid_vec))
        # drop the trailing size-1 axis (DIM_OUTPUT == 1 by default)
        out_vec = out_vec.squeeze(dim=-1)

        return out_vec

    def init_glove(self, GloVe):
        """Fill the embedding table: GloVe vector when available, random normal otherwise.

        Rows with no entry in ``dataset.i2word`` (indices 0 and 1, reserved for
        padding and UNK) stay zero.
        """
        embeddings_matrix = np.zeros((dataset.vocab_size, self.DIM_EMB))

        for i in dataset.i2word.keys():
            try:
                embeddings_matrix[i] = GloVe[dataset.i2word[i]]
            except KeyError:
                embeddings_matrix[i] = np.random.normal(scale=0.6, size=(self.DIM_EMB, ))

        self.emb.load_state_dict({'weight': torch.from_numpy(embeddings_matrix)})

    def inference(self, sentences):
        """Return the index of the highest-scoring gap for each sentence (LongTensor, shape (batch,))."""
        X = dataset.sentences2indices(sentences, dataset.word2i, train=False)
        X = prepare_input(X)[0].to(device)
        # Prediction only: skip autograd bookkeeping.
        with torch.no_grad():
            pred = self.forward(X)
        return torch.argmax(pred, dim=1)

    def compute_accuracy(self, sentences, positions):
        """Fraction of sentences whose predicted gap matches the gold position.

        Gold positions are offset by one relative to gap indices, hence the
        ``pred + 1`` comparison.

        Returns:
            (accuracy, pred): accuracy as a float and the raw predicted gap indices.
        """
        pred = self.inference(sentences)

        # Compare tensor with tensor on the same device: the former list-vs-tensor
        # `==` followed by np.sum did not produce an element-wise match count.
        gold = torch.as_tensor(np.asarray(positions).reshape(-1), device=pred.device)
        count = int((pred + 1 == gold).sum().item())
        return count / len(positions), pred
    
# Smoke test: an untrained position model on five padded sentences; output and target shapes should agree.
lstm_test   = EmojiPositionPredictor(DIM_HID=500, DIM_EMB=300).to(device)
lstm_output = lstm_test.forward(X_padded[11:16].to(device))
Y_pos_test = np.array(dataset.positions[11:16]).reshape(-1, 1)
# one-hot over the padded length, then drop column 0 so the width matches the model's seq_len - 1 outputs
Y_onehot    = prepare_output_onehot(Y_pos_test, X_padded.shape[-1]).squeeze(dim=1)[:, 1:]

print("lstm output shape:", lstm_output.shape)
print("Y onehot shape:", Y_onehot.shape)

lstm output shape: torch.Size([5, 36])
Y onehot shape: torch.Size([5, 36])


In [None]:
# Number of passes over the training data; read by the training function at call time.
nEpochs = 5

# NOTE(review): this definition reuses the name of the emoji-vector trainer defined
# earlier in the notebook and replaces it from here on; this version trains the
# position model with a cross-entropy loss.
def train_emoji_predictor2(sentences, tags, lstm):
    """Train a position predictor against one-hot gap targets (cross-entropy).

    Args:
        sentences: list of token lists.
        tags: gold emoji position per sentence.
        lstm: model to optimise in place; it must return (batch, seq_len - 1) scores.

    Each epoch reshuffles the data, re-samples the random-UNK replacement, and
    prints the summed loss.
    """
    optimizer = torch.optim.Adam(lstm.parameters(), lr = 0.0005)
    batchSize = 64
    tags = np.array(tags).reshape(-1, 1)

    print(tags.shape)

    # Stateless, so build it once rather than once per batch.
    loss_fn = nn.CrossEntropyLoss()
    n_examples = len(sentences)

    for epoch in range(nEpochs):
        totalLoss = 0.0
        lstm.train()
        (sentences_shuffled, tags_shuffled) = shuffle_sentences(sentences, tags)

        sent = dataset.sentences2indices(sentences_shuffled, dataset.word2i, train=True)
        (sentences_input, sentence_mask) = prepare_input(sent)

        # Size the targets from the batch actually fed to the model (not from the
        # global X_padded): the model emits seq_len - 1 scores, and dropping
        # column 0 below leaves exactly that many target columns.
        Y_pos = prepare_output_onehot(tags_shuffled, sentences_input.shape[-1])
        gt_labels = Y_pos.squeeze(dim=1)[:, 1:]

        print("gt_labels", gt_labels.shape)
        for i in tqdm.notebook.tqdm(range(0, n_examples, batchSize), leave=False):
            # the optimizer owns exactly the model's parameters, so one zeroing suffices
            optimizer.zero_grad()

            batch_input = sentences_input[i:i + batchSize].to(device)
            output = lstm(batch_input, train=True)
            exp_output = gt_labels[i:i + batchSize].to(device)

            loss = loss_fn(output, exp_output)
            loss.backward()
            optimizer.step()

            # .item() detaches the scalar; summing the tensor itself would keep
            # autograd history of every batch alive for the whole epoch.
            totalLoss += loss.item()

        lstm.eval()
        print(f"loss on epoch {epoch} = {totalLoss}")

# Train a position model. `lstm` is rebound to a fresh EmojiPositionPredictor first:
# on a clean top-to-bottom run it would otherwise still be the emoji model, whose
# 300-d output cannot be scored against the per-gap position targets. The cells
# below (save, inference, accuracy) read `lstm` and expect the position model.
lstm = EmojiPositionPredictor(DIM_HID=500, DIM_EMB=300).to(device)
train_emoji_predictor2(word_sentences_train, positions_train, lstm)

(292743, 1)
gt_labels torch.Size([292743, 36])


  0%|          | 0/4575 [00:00<?, ?it/s]

loss on epoch 0 = 3818.191162109375
gt_labels torch.Size([292743, 36])


  0%|          | 0/4575 [00:00<?, ?it/s]

loss on epoch 1 = 2337.922607421875
gt_labels torch.Size([292743, 36])


  0%|          | 0/4575 [00:00<?, ?it/s]

loss on epoch 2 = 1404.463134765625
gt_labels torch.Size([292743, 36])


  0%|          | 0/4575 [00:00<?, ?it/s]

loss on epoch 3 = 681.5301513671875
gt_labels torch.Size([292743, 36])


  0%|          | 0/4575 [00:00<?, ?it/s]

loss on epoch 4 = 370.49578857421875


In [None]:
# Persist the weights of the model currently bound to `lstm`.
torch.save(lstm.state_dict(), 'emoji_pos_pred.pt')

In [None]:
# Fresh position model that receives the checkpoint loaded in the next cell.
lstm_pos = EmojiPositionPredictor(DIM_HID=500, DIM_EMB=300).to(device)

In [None]:
# Restore position-model weights, mapped onto the current device.
# NOTE(review): this reads "emoji_pos_pred_0405.pt", but the save cell above wrote 'emoji_pos_pred.pt' - confirm the intended file.
# NOTE(review): the weights go into `lstm_pos`, while the cells below call `lstm.inference` - confirm which model should be evaluated.
lstm_pos.load_state_dict(torch.load("emoji_pos_pred_0405.pt", map_location=device))

<All keys matched successfully>

In [None]:
# Compare predictions with gold positions on the first 50 validation sentences.
# The accuracy helpers treat a prediction as correct when it equals gold position - 1.
preds = lstm.inference(word_sentences_val[0:50])
exp = positions_val[0:50]           
print(preds)   
print(exp)

In [None]:
# Token lists of the first 20 validation sentences, for reading next to the predictions above.
word_sentences_val[0:20]

In [None]:
def compute_accuracy(sentences, positions, model=None, batch_size=1000, max_examples=35000):
    """Position accuracy of a model exposing ``inference(sentences)``, evaluated in chunks.

    A prediction counts as correct when it equals ``gold position - 1``.

    Args:
        sentences: list of token lists.
        positions: gold emoji position per sentence.
        model: model to evaluate; defaults to the notebook-level ``lstm``.
        batch_size: sentences per inference call.
        max_examples: evaluate at most this many leading examples
            (``None`` = all). Defaults to the previous hard-coded 35000.

    Returns:
        (acc, predictions): accuracy over the examples actually evaluated (the
        previous version divided by ``len(positions)`` even when fewer were
        scored), and a float32 tensor with one prediction per evaluated example.
    """
    if model is None:
        model = lstm
    n_eval = min(len(sentences), len(positions))
    if max_examples is not None:
        n_eval = min(n_eval, max_examples)
    if n_eval == 0:
        return 0.0, torch.empty(0)

    total_corr = 0
    chunks = []
    for start in range(0, n_eval, batch_size):
        stop = min(start + batch_size, n_eval)
        preds = model.inference(sentences[start:stop])
        # vectorised comparison on the predictions' device instead of a
        # Python-level element-by-element loop over tensor entries
        exp = torch.as_tensor(np.asarray(positions[start:stop]).reshape(-1), device=preds.device)
        total_corr += int((preds == exp - 1).sum().item())
        chunks.append(preds)

    # float32 keeps the dtype this function returned before
    predictions = torch.cat(chunks).float()
    acc = total_corr / n_eval
    print(acc)
    return acc, predictions


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [None]:
# Produce the validation predictions here: no earlier cell assigns `predictions_val`,
# so the metric cells below would fail on a fresh kernel.
val_position_acc, predictions_val = compute_accuracy(word_sentences_val, positions_val)
print(predictions_val.shape)
# Shift gold positions by one so they are comparable with the predictions;
# the slice matches the 35000-example cap applied by compute_accuracy.
positions_val_f1 = np.array(positions_val[0:35000]) - 1
# positions_val is a plain list (no .shape); report the array that is actually scored.
print(positions_val_f1.shape)

In [None]:
# Validation precision: per class (average=None), then the support-weighted mean.
# .cpu() moves the predictions off the GPU before sklearn reads them.
prec = precision_score(positions_val_f1, predictions_val.cpu(), average=None)
print(prec)
prec = precision_score(positions_val_f1, predictions_val.cpu(), average='weighted')
print(prec)

In [None]:
# Distinct positions the model ever predicts on the validation set.
np.unique(predictions_val.cpu())

In [None]:
# Validation recall: per class (average=None), then the support-weighted mean.
rec = recall_score(positions_val_f1, predictions_val.cpu(), average=None)
print(rec)
rec = recall_score(positions_val_f1, predictions_val.cpu(), average='weighted')
print(rec)

In [None]:
# Validation F1: per class (average=None), then the support-weighted mean.
f1 = f1_score(positions_val_f1, predictions_val.cpu(), average=None)
print(f1)
f1 = f1_score(positions_val_f1, predictions_val.cpu(), average='weighted')
print(f1)

In [None]:
# Produce the test predictions here: no earlier cell assigns `predictions_test`,
# so the metric cells below would fail on a fresh kernel.
test_position_acc, predictions_test = compute_accuracy(word_sentences_test, positions_test)
print(predictions_test.shape)
# Shift gold positions by one so they are comparable with the predictions;
# the slice matches the 35000-example cap applied by compute_accuracy.
positions_test_f1 = np.array(positions_test[0:35000]) - 1
print(positions_test_f1.shape)

In [None]:
# Test precision: per class (average=None), then the support-weighted mean.
prec = precision_score(positions_test_f1, predictions_test.cpu(), average=None)
print(prec)
prec = precision_score(positions_test_f1, predictions_test.cpu(), average='weighted')
print(prec)

In [None]:
# Test recall: per class (average=None), then the support-weighted mean.
rec = recall_score(positions_test_f1, predictions_test.cpu(), average=None)
print(rec)
rec = recall_score(positions_test_f1, predictions_test.cpu(), average='weighted')
print(rec)

In [None]:
# Test F1: per class (average=None), then the support-weighted mean.
f1 = f1_score(positions_test_f1, predictions_test.cpu(), average=None)
print(f1)
f1 = f1_score(positions_test_f1, predictions_test.cpu(), average='weighted')
print(f1)