# Emojify - V1
Implementaremos un modelo simple de recomendación de emojis basado en una oración de entrada ocupando Word Embeddings. Este modelo es una adaptación de la tarea "Emojify" del curso de Sequence Models de Andrew Ng. En esta adaptación ocuparemos Pytorch a cambio de Keras.

In [1]:
import numpy as np
from emo_utils import *
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F



In [2]:
X_train, Y_train = read_csv('data/train_emoji.csv')
X_test, Y_test = read_csv('data/tesss.csv')

In [3]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((132,), (132,), (56,), (56,))

Obtendremos los vectores de embeddings asociados a cada palabra de cada oración y el promedio será la entrada de la red.


<center>
<img src="arquitectura.png" style="width:900px;height:300px;">
    <caption><center><font color='purple'>Baseline model (Emojifier-V1).</center></caption>
</center></font>

In [4]:
Y_oh_train = convert_to_one_hot(Y_train, C = 5) # One hot encoding clases
Y_oh_test = convert_to_one_hot(Y_test, C = 5)

In [5]:
Y_oh_train[0].shape

(5,)

In [6]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [7]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [8]:
word_to_vec_map['the']

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01])

In [9]:
class Dataset(Dataset):
    def __init__(self, X_train, y_train, any_word):
        self.n_samples = len(X_train)
        self.x_data = X_train
        self.y_data = y_train
        self.any_word = any_word
        
    def __getitem__(self, idx):
        sentence = self.x_data[idx]
        words = sentence.lower().split() # Palabras de cada frase
        avg = np.zeros(word_to_vec_map[self.any_word].shape) # Vector de ceros - embeddings
        count = 0
        for w in words:
            if w in list(word_to_vec_map.keys()):
                avg += word_to_vec_map[w] # Vector asociado a la palabra
                count +=1
        if count > 0:
            avg = avg/count # Promedio
        return torch.tensor(avg, dtype=torch.float32), torch.tensor(self.y_data[idx], dtype=torch.float32)
    
    def __len__(self):
        return self.n_samples

In [10]:
batch_size = 8

dataset = Dataset(X_train, Y_oh_train, any_word = 'the')
train_loader = DataLoader(dataset = dataset, batch_size = batch_size, shuffle = True)

In [11]:
next(iter(train_loader))[0].shape

torch.Size([8, 50])

In [12]:
class Emojify(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.l1 = nn.Linear(input_size, num_classes)
        
    def forward(self, x):
        out = self.l1(x)
        out = F.softmax(out, dim=-1)
        return out

In [14]:
num_classes = 5 # Número de clases
input_size = 50 # Caracteristicas
learning_rate = 0.001
num_epochs = 100

device = torch.device("cuda") if torch.cuda.is_available() else 'cpu'
model = Emojify(input_size, num_classes).to(device)
model

Emojify(
  (l1): Linear(in_features=50, out_features=5, bias=True)
)

In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [16]:
for epoch in range(num_epochs):
    for (sentence, classes) in train_loader:
        sentence = sentence.to(device)
        classes = classes.to(device)
        
        outputs = model(sentence) 
        loss = criterion(outputs, classes)
        
        # backward
        optimizer.zero_grad() # gradientes en 0 para cada batch
        loss.backward() # Gradientes
        optimizer.step() # Actualizamos parámetros
    
    if (epoch + 1) % 10 == 0:
        print(f'epoch {epoch + 1}/{num_epochs}, loss={loss.item():.4f}')
        
print(f'final loss, loss = {loss.item():.4f}')

epoch 10/100, loss=1.6429
epoch 20/100, loss=1.4636
epoch 30/100, loss=1.5762
epoch 40/100, loss=1.3012
epoch 50/100, loss=1.5506
epoch 60/100, loss=1.1893
epoch 70/100, loss=1.5244
epoch 80/100, loss=1.3904
epoch 90/100, loss=1.0520
epoch 100/100, loss=1.2299
final loss, loss = 1.2299


In [69]:
emoji_dictionary = {"0": "\u2764\uFE0F",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

In [70]:
import emoji
examples = np.array(["i cherish you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy"])
predicts = []
for sentence in examples:
    words = sentence.lower().split() 
    avg = np.zeros(word_to_vec_map['the'].shape)
    count = 0
    for w in words:
        if w in list(word_to_vec_map.keys()):
            avg += word_to_vec_map[w] # Vector asociado a la palabra
            count +=1
        if count > 0:
            avg = avg/count # Promedio
    avg = torch.tensor(avg, dtype=torch.float32)  
    output = model(avg)
    max_index = torch.argmax(output).item()
    print(emoji.emojize(sentence + ' ' + emoji_dictionary[str(max_index)]))

i cherish you :disappointed:
i love you ❤️
funny lol :smile:
lets play with a ball ⚾
food is ready 🍴
not feeling happy :smile:
