In [None]:
import json
import os
import random
import re
import sys
import tqdm

# data
from collections import defaultdict
from collections import Counter
import numpy as np

# viz
import matplotlib.pyplot as plt

# torch
import torch
from torch.utils.data import Dataset

## Config

In [None]:
# Locations of the cleaned tweet data and the filtered GloVe vectors,
# both resolved relative to the directory above this notebook.
root_path = './../'
clean_data_folder_path = os.path.join(root_path, 'data', 'clean_data')
glove_path = os.path.join(root_path, "glove.840B.300d.conll_filtered.txt")

# Seed every random number generator imported by this notebook so that a
# fresh "Restart & Run All" reproduces the same results (the random UNK
# replacement performed while indexing the training set draws from `random`).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# target emojis: emoji character -> class id (stored as a decimal string)
mapping = { 
    '❤':'0' , '😍':'1' , '😂':'2' , '💕':'3' , 
    '🔥':'4' , '😊':'5' , '😎':'6' , '✨':'7' , 
    '💙':'8' , '😘':'9' , '📷':'10' , '🇺🇸':'11' , 
    '☀':'12' , '💜':'13' , '😉':'14' , '💯':'15' , 
    '😁':'16' , '🎄':'17' , '📸':'18' , '😜':'19'
}

# Dataset Preparation

In [None]:
class EmojiDataset(Dataset):
    """Tokenized tweets with integer emoji labels, GloVe vectors and vocabularies.

    Reads ``tweets.tokenized`` and ``tweets.labels`` from ``dataset_path``,
    wraps every tweet in ``<START>`` / ``<END>`` markers, derives a
    per-character view of every word, and builds word/character/emoji lookup
    tables in both directions.

    Index conventions: index 0 is reserved for padding and index 1 for
    unknown (UNK) entries, so real vocabulary entries start at 2.
    """

    def __init__(self, dataset_path, transforms=None, glove_file=None, emoji_mapping=None):
        """Load the dataset files and build all lookup tables.

        Args:
            dataset_path: Directory containing ``tweets.tokenized`` and
                ``tweets.labels``.
            transforms: Accepted for interface compatibility; not used by
                this class.
            glove_file: Path of the GloVe text file. Defaults to the
                notebook-level ``glove_path``.
            emoji_mapping: Dict mapping an emoji to its class id (anything
                ``int()`` accepts). Defaults to the notebook-level ``mapping``.
        """
        tweet_label_path = os.path.join(dataset_path, 'tweets.labels')
        tweet_tokenized_path = os.path.join(dataset_path, 'tweets.tokenized')

        # Fall back to the notebook configuration when nothing is passed in.
        if glove_file is None:
            glove_file = glove_path
        if emoji_mapping is None:
            emoji_mapping = mapping

        # init glove
        self.glove_emb = self.read_GloVe(glove_file)

        # curate the sentences: one tweet per line, tokens separated by spaces.
        # The files hold emoji / non-ASCII text, so the encoding is explicit
        # instead of depending on the platform default.
        with open(tweet_tokenized_path, encoding='utf-8') as tokenized_file:
            self.word_sentences = [
                ['<START>'] + line.rstrip().split(' ') + ['<END>']
                for line in tokenized_file
            ]

        # curate the labels: the class id is read from the first
        # space-separated token, up to its first comma, after dropping that
        # token's leading character.
        self.labels = []
        with open(tweet_label_path, encoding='utf-8') as label_file:
            for line in label_file:
                first_token = line.rstrip().split(' ')[0]
                try:
                    emoji_code = int(first_token.split(',')[0][1:])
                except ValueError:
                    # no emoji for this tweet: report the line, store sentinel -1
                    print(line)
                    emoji_code = -1
                self.labels.append(emoji_code)

        # compute char sentences from word sentences
        self.char_sentences = self.sentences2char(self.word_sentences)

        # compute counts
        all_words = [w for sentence in self.word_sentences for w in sentence]
        self.word_counts = Counter(all_words)
        self.char_counts = Counter(c for w in all_words for c in w)
        # Words seen exactly once that also have no GloVe vector.
        self.singletons = {
            w for w, n in self.word_counts.items()
            if n == 1 and w not in self.glove_emb
        }
        self.char_singletons = {c for c, n in self.char_counts.items() if n == 1}

        # Build dictionaries to map from words, characters to indices and vice versa.
        # Indices 0 and 1 are reserved for padding and the "UNK" token.
        # Entries are sorted so the assignment is identical on every run
        # (plain set iteration order for strings changes between processes).
        vocab_words = sorted(set(all_words) | set(self.glove_emb))
        vocab_chars = sorted({c for sentence in self.char_sentences for w in sentence for c in w})
        self.word2i = {w: i + 2 for i, w in enumerate(vocab_words)}
        self.char2i = {c: i + 2 for i, c in enumerate(vocab_chars)}
        self.i2word = {i: w for w, i in self.word2i.items()}
        self.i2char = {i: c for c, i in self.char2i.items()}

        # compute vocab size (largest index + 1; also valid for an empty vocabulary)
        self.vocab_size = len(self.word2i) + 2
        self.char_vocab_size = len(self.char2i) + 2

        # emoji dictionaries.
        self.emoji2i = {e: int(i) for e, i in emoji_mapping.items()}
        self.i2emoji = {i: e for e, i in self.emoji2i.items()}

    def __len__(self):
        """Return the number of tweets read from ``tweets.tokenized``."""
        return len(self.word_sentences)

    def __getitem__(self, idx):
        """Return ``(word_sentence, char_sentence, label)`` for tweet ``idx``."""
        return (self.word_sentences[idx], self.char_sentences[idx], self.labels[idx])

    def sentences2char(self, sentences):
        """Split every word into its characters, wrapped in 'start' / 'end' markers."""
        return [[['start'] + [c for c in w] + ['end'] for w in l] for l in sentences]

    def read_GloVe(self, filename):
        """Read a GloVe text file into a ``{word: [float, ...]}`` dict.

        Each line is expected to be a word followed by space-separated
        numbers. Blank lines are skipped.
        """
        embeddings = {}
        with open(filename, encoding='utf-8') as glove_file:
            for line in glove_file:
                stripped = line.strip()
                if not stripped:
                    # A blank line would otherwise register '' as a word
                    # with an empty vector.
                    continue
                fields = stripped.split(" ")
                embeddings[fields[0]] = [float(x) for x in fields[1:]]
        return embeddings

    #When training, randomly replace singletons with UNK tokens sometimes to simulate situation at test time.
    def getDictionaryRandomUnk(self, w, dictionary, train=False):
        """Look up ``w`` in ``dictionary``; unknown entries map to 1 (UNK).

        With ``train=True`` a singleton word is replaced by UNK whenever a
        fresh ``random.random()`` draw exceeds 0.5.
        """
        if train and (w in self.singletons and random.random() > 0.5):
            return 1
        else:
            return dictionary.get(w, 1)

    #Map a list of sentences from words to indices.
    def sentences2indices(self, words, dictionary, train=False):
        """Map a list of word sentences to lists of indices (1 => UNK)."""
        return [[self.getDictionaryRandomUnk(w, dictionary, train=train) for w in l] for l in words]

    #Map a list of character sentences to indices (character indices)
    def sentences2indicesChar(self, chars, dictionary):
        """Map sentences of per-word character lists to indices (1 => UNK)."""
        return [[[dictionary.get(c, 1) for c in w] for w in l] for l in chars]


## Test the dataset class

In [None]:
# Build the dataset once; later cells read it through the `dataset` name.
dataset = EmojiDataset(clean_data_folder_path)

# Show one example in its word view, its character view, and its emoji.
test_idx = 5
print(dataset.word_sentences[test_idx])
print(dataset.char_sentences[test_idx])
# Tweets without a parseable label are stored as -1, which has no entry in
# i2emoji, so fall back to a placeholder instead of raising KeyError.
print(dataset.i2emoji.get(dataset.labels[test_idx], '<no emoji>'))

## Utility Methods

### Pad inputs to max sequence length (for batching)

In [None]:
def prepare_input(X_list):
    """Pad index sequences to a common length and build the matching mask.

    Args:
        X_list: List of index sequences (one per sentence).

    Returns:
        Tuple ``(X_padded, X_mask)``: a LongTensor of indices padded with 0,
        and a FloatTensor holding 1 at real positions and 0 at padded ones.
        Both are batch-first.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    index_rows = [torch.as_tensor(seq) for seq in X_list]
    mask_rows = [torch.ones(len(seq)) for seq in X_list]

    # pad_sequence fills the missing tail of shorter sequences with 0.
    X_padded = pad(index_rows, batch_first=True).type(torch.LongTensor)
    X_mask = pad(mask_rows, batch_first=True).type(torch.FloatTensor)
    return (X_padded, X_mask)

In [None]:
#Maximum word length (for character representations)
MAX_CLEN=32

def prepare_input_char(X_list, max_clen=None, pad_idx=1):
    """Pad character-index sentences into one rectangular LongTensor.

    Every sentence is extended with empty words up to the longest sentence in
    the batch; every word is truncated to ``max_clen`` characters and then
    right-padded with ``pad_idx``.

    Args:
        X_list: List of sentences, each a list of words, each a list of
            character indices.
        max_clen: Characters kept per word. Defaults to the module-level
            ``MAX_CLEN``, read at call time.
        pad_idx: Index written into padded positions. The default of 1 keeps
            the existing behaviour. NOTE(review): 1 is also the value the
            lookup helpers in this notebook return for unknown entries, while
            `prepare_input` pads with 0; confirm which one the model expects.

    Returns:
        LongTensor of shape ``(len(X_list), longest sentence, max_clen)``;
        an empty ``(0, 0, max_clen)`` tensor when ``X_list`` is empty.
    """
    if max_clen is None:
        max_clen = MAX_CLEN
    if not X_list:
        # max() below would raise on an empty batch.
        return torch.empty((0, 0, max_clen), dtype=torch.long)

    max_slen = max(len(sentence) for sentence in X_list)
    padded = []
    for sentence in X_list:
        # Missing words become empty lists so they end up fully padded.
        words = sentence + [[]] * (max_slen - len(sentence))
        rows = []
        for word in words:
            kept = word[0:max_clen]
            rows.append(kept + [pad_idx] * (max_clen - len(kept)))
        padded.append(rows)
    return torch.as_tensor(padded).type(torch.LongTensor)

### Pad outputs using one-hot encoding

In [None]:
def prepare_output_onehot(Y_list, NUM_TAGS=None):
    """One-hot encode per-position tags and pad the sequences to equal length.

    Args:
        Y_list: List of tag sequences; each sequence is a list of integer
            tag ids.
        NUM_TAGS: Width of the one-hot vectors. When omitted it is derived
            from the notebook-level ``dataset`` at call time (largest emoji
            id + 1), so rebuilding ``dataset`` is always picked up.

    Returns:
        FloatTensor of shape ``(len(Y_list), longest sequence, NUM_TAGS)``.
        Padded positions and positions with a negative tag (the dataset uses
        -1 for tweets without a label) are all-zero rows. An empty ``Y_list``
        gives an empty ``(0, 0, NUM_TAGS)`` tensor.

    Raises:
        IndexError: If a tag is greater than or equal to ``NUM_TAGS``.
    """
    if NUM_TAGS is None:
        NUM_TAGS = max(dataset.emoji2i.values()) + 1

    Y_onehot = []
    for tags in Y_list:
        rows = torch.zeros(len(tags), NUM_TAGS)
        for j, tag in enumerate(tags):
            if tag < 0:
                # A negative index would wrap around and mark a real class.
                continue
            rows[j, tag] = 1.0
        Y_onehot.append(rows)

    if not Y_onehot:
        # pad_sequence rejects an empty list of sequences.
        return torch.zeros(0, 0, NUM_TAGS).type(torch.FloatTensor)
    Y_padded = torch.nn.utils.rnn.pad_sequence(Y_onehot, batch_first=True).type(torch.FloatTensor)
    return Y_padded

## Define training set and labels

In [None]:
# Index-encoded inputs and targets.
# train=True lets the word lookup randomly swap singleton words for UNK.
X = dataset.sentences2indices(
    words=dataset.word_sentences,
    dictionary=dataset.word2i,
    train=True,
)
X_char = dataset.sentences2indicesChar(
    chars=dataset.char_sentences,
    dictionary=dataset.char2i,
)
Y = dataset.labels

In [None]:
# Longest sentence, counted in words, over the character-level inputs.
print("max slen:", max(map(len, X_char)))

In [None]:
# Pad the word indices (with their mask) and the character indices into tensors.
X_padded, X_mask = prepare_input(X)
X_padded_char = prepare_input_char(X_char)
# One-hot targets are not built here; the call below is left disabled.
# Y_onehot = prepare_output_onehot(Y)

In [None]:
# Report the size of every prepared input and of the label list.
print("\n".join([
    f"X_padded: {X_padded.shape}",
    f"X_mask: {X_mask.shape}",
    f"X_padded_char: {X_padded_char.shape}",
    f"Y shape: {len(Y)}",
]))

# Start Modeling