In [2]:
import numpy as np
import emo_utils
import emoji
import matplotlib.pyplot as plt

%matplotlib inline

In [40]:
# Load the training and test splits (sentences and their integer emoji labels).
X_train, Y_train = emo_utils.read_csv('data/train_emoji.csv')
X_test, Y_test = emo_utils.read_csv('data/test.csv')

# Length, in words, of the longest training sentence.
# The word count must be maximised directly: picking the sentence with the most
# characters and counting its words can under-estimate the true maximum.
maxLen = max(len(sentence.split()) for sentence in X_train)

# Show one example together with the emoji its label stands for.
index  = 3
print(X_train[index], emo_utils.label_to_emoji(Y_train[index]))

Miss you so much ❤️


In [8]:
# One-hot encode the integer labels of both splits (C=5 emoji classes).
Y_oh_train, Y_oh_test = (
    emo_utils.convert_to_one_hot(labels, C=5)
    for labels in (Y_train, Y_test)
)
# Optional sanity check (kept disabled):
#   index = 0
#   print(Y_train[index], "->", Y_oh_train[index])

3对应的独热编码是[0. 0. 0. 1. 0.]


In [10]:
word_to_index, index_to_word, word_to_vec_map = emo_utils.read_glove_vecs('data/glove.6B.50d.txt')
# word_to_index: dict mapping each vocabulary word to its integer index.
#   NOTE(review): the original note gave ~400k words with indices 0-400,000; the exact
#   count and index range cannot be confirmed from this notebook -- TODO verify in emo_utils.
# index_to_word: dict with the reverse mapping (index -> word).
# word_to_vec_map: dict mapping each word to its GloVe vector (presumably 50-dimensional, going by the file name).

单词cucumber对应的索引是：113317
索引113317对应的单词是：cucumber


In [17]:
#1st step: transform and average
def sentence_to_avg(sentence, word_to_vec_map):
    """
    Encode a sentence as the average of the word vectors of its words.

    The sentence is lower-cased and split on whitespace. Words that are not in
    word_to_vec_map are skipped instead of raising KeyError, and a sentence with
    no known words (including the empty string) yields an all-zero vector rather
    than NaN from a 0/0 division.

    Arguments:
        sentence -- string, one example taken from X
        word_to_vec_map -- dict mapping a word to its 1-D word vector

    Returns:
        avg -- numpy array of shape (d,), where d is the length of the vectors
               in word_to_vec_map (50 is used if the map is empty)
    """
    # Lower-case and tokenise on whitespace
    words = sentence.lower().split()

    # Take the vector size from the map itself instead of hard-coding 50;
    # keep 50 only as a fallback for an empty map.
    sample_vec = next(iter(word_to_vec_map.values()), None)
    dim = 50 if sample_vec is None else np.shape(sample_vec)[0]
    avg = np.zeros(dim)

    # Sum the vectors of the words we actually have an embedding for
    known = 0
    for w in words:
        vec = word_to_vec_map.get(w)
        if vec is None:
            continue
        avg += vec
        known += 1

    # Average over the known words only; leave zeros when there are none
    if known > 0:
        avg = np.divide(avg, known)

    return avg

#test
# avg = sentence_to_avg("Morrocan couscous is my favorite dish", word_to_vec_map)
# print("avg = ", avg)

In [24]:
#2nd step: train and optimize
def model(X, Y, word_to_vec_map, learning_rate=0.01, num_iterations=400):
    """
    Train a softmax classifier on averaged word vectors with per-example
    gradient descent, implemented in NumPy.

    Arguments:
        X -- array of sentence strings, one per training example
        Y -- array of integer class labels in the range 0..4 (n_y = 5 classes),
             one per example
        word_to_vec_map -- dict mapping a word to its 50-dimensional word vector
        learning_rate -- step size of each parameter update
        num_iterations -- number of full passes over the training set

    Returns:
        pred -- result of emo_utils.predict on (X, Y) using the final W and b
        W -- weight matrix of shape (n_y, n_h)
        b -- bias vector of shape (n_y,)
    """
    np.random.seed(1)

    # Number of training examples, classes and input features.
    # n_h must equal the length of the vectors returned by sentence_to_avg.
    m = Y.shape[0]
    n_y = 5
    n_h = 50

    # Xavier-style initialisation of the weights, zero bias
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))

    # One-hot encode the labels
    Y_oh = emo_utils.convert_to_one_hot(Y, C=n_y)

    # Optimisation loop: one parameter update per training example
    for t in range(num_iterations):
        for i in range(m):
            # Average the word vectors of the i-th sentence
            avg = sentence_to_avg(X[i], word_to_vec_map)

            # Forward propagation
            z = np.dot(W, avg) + b
            a = emo_utils.softmax(z)

            # Cross-entropy loss of this single example
            cost = -np.sum(Y_oh[i] * np.log(a))

            # Backward propagation; dz and avg are rank-1, so reshape them
            # into a column and a row to form the (n_y, n_h) outer product
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y, 1), avg.reshape(1, n_h))
            db = dz

            # Gradient-descent update
            W = W - learning_rate * dW
            b = b - learning_rate * db

        if t % 100 == 0:
            # The reported loss is that of the last example of this pass
            print("第{t}轮, 损失为{cost}".format(t=t,cost=cost))
            # Called here for the accuracy it reports during training
            emo_utils.predict(X, Y, W, b, word_to_vec_map)

    # Compute the returned predictions from the FINAL parameters. Previously pred
    # was only assigned inside the periodic report above, so it was stale (from the
    # last multiple-of-100 iteration) and undefined when num_iterations == 0.
    pred = emo_utils.predict(X, Y, W, b, word_to_vec_map)

    return pred, W, b

第0轮, 损失为1.9520498812810072
Accuracy: 0.3484848484848485
第100轮, 损失为0.07971818726014807
Accuracy: 0.9318181818181818
第200轮, 损失为0.04456369243681402
Accuracy: 0.9545454545454546
第300轮, 损失为0.03432267378786059
Accuracy: 0.9696969696969697


In [26]:
# Train the averaging model, then report accuracy on the training and test sets.
# NOTE(review): `model` is the NumPy training function here; a later cell rebinds the
# name `model` to a Keras model, so this cell must run before that one.
pred, W, b = model(X_train, Y_train, word_to_vec_map)
print("=====训练集====")
pred_train = emo_utils.predict(X_train, Y_train, W, b, word_to_vec_map)
print("=====测试集====")
pred_test = emo_utils.predict(X_test, Y_test, W, b, word_to_vec_map)

第0轮, 损失为1.9520498812810072
Accuracy: 0.3484848484848485
第100轮, 损失为0.07971818726014807
Accuracy: 0.9318181818181818
第200轮, 损失为0.04456369243681402
Accuracy: 0.9545454545454546
第300轮, 损失为0.03432267378786059
Accuracy: 0.9696969696969697
=====训练集====
Accuracy: 0.9772727272727273
=====测试集====
Accuracy: 0.8571428571428571


In [29]:
# Try the averaging model on a few hand-written sentences.
X_my_sentences = np.array(["i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "you are not happy"])
# Expected class id for each sentence above, one row per sentence
# (ids presumably follow emo_utils.label_to_emoji -- verify there).
Y_my_labels = np.array([[0], [0], [2], [1], [4],[3]])

# Predict with the trained W and b, then print each sentence with its predicted emoji.
pred = emo_utils.predict(X_my_sentences, Y_my_labels , W, b, word_to_vec_map)
emo_utils.print_predictions(X_my_sentences, pred)

Accuracy: 0.8333333333333334

i adore you ❤️
i love you ❤️
funny lol 😄
lets play with a ball ⚾
food is ready 🍴
you are not happy ❤️


即使你只有128个训练样本，你也可以得到一个效果不错的表情符号模型，因为词向量是预先训练好的，它为模型提供了较好的泛化能力。

Emojifier-V1是有缺陷的：比如它不会把“This movie is not good and not enjoyable”归为负面情绪一类，因为它只是对所有单词的词向量取平均，完全没有考虑词序。

In [30]:
import numpy as np
np.random.seed(0)
import keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

np.random.seed(1)
from keras.initializers import glorot_uniform

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [34]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Convert an array of sentences into a zero-padded matrix of word indices,
    the input format expected by an Embedding layer.

    Each sentence is lower-cased and split on whitespace. Words missing from
    word_to_index are skipped instead of raising KeyError, and sentences with
    more than max_len known words are truncated to their first max_len words
    instead of raising IndexError.

    Arguments:
        X -- sequence (numpy array or list) of m sentence strings
        word_to_index -- dict mapping a word to its integer index
        max_len -- number of index slots per sentence (row length of the output)

    Returns:
        X_indices -- numpy array of shape (m, max_len); unused trailing
                     positions of each row are left at 0
    """
    # len() works for both numpy arrays and plain lists
    m = len(X)
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        j = 0  # next free slot in row i
        for w in X[i].lower().split():
            # Stop once the row is full (sentence longer than max_len)
            if j >= max_len:
                break
            # Skip out-of-vocabulary words
            if w not in word_to_index:
                continue
            X_indices[i, j] = word_to_index[w]
            j += 1

    return X_indices

#test
# X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
# X1_indices = sentences_to_indices(X1,word_to_index, max_len = 5)
# print("X1 =", X1)
# print("X1_indices =", X1_indices)

In [36]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Create a frozen Keras Embedding layer loaded with pre-trained word vectors.

    Arguments:
        word_to_vec_map -- dict mapping a word to its word vector
        word_to_index -- dict mapping a word to its integer index in the vocabulary

    Returns:
        embedding_layer -- built Keras Embedding layer (trainable=False) whose
                           weight matrix holds the vector of each word at the
                           row given by that word's index
    """
    # One more row than there are words.
    # NOTE(review): presumably so the largest word index is a valid row and
    # row 0 stays all-zero for padding -- confirm against the index range
    # produced by emo_utils.read_glove_vecs.
    vocab_len = len(word_to_index) + 1

    # Read the embedding width from any stored vector instead of probing the
    # hard-coded word "cucumber", which need not exist in every vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Embedding matrix, initialised to zeros
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Row `index` of the matrix is the vector of the word with that index
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Non-trainable Keras embedding layer of matching size
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # Build the layer so its weights exist before they are overwritten
    embedding_layer.build((None,))

    # Load the pre-trained vectors as the layer's weights
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

#test
# embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
# print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

weights[0][1][3] = -0.3403


In [38]:
def Emojify_V2(input_shape, word_to_vec_map, word_to_index):
    """
    Build the computation graph of the Emojify-V2 model.

    Arguments:
        input_shape -- shape of one input example, usually (max_len,)
        word_to_vec_map -- dict mapping a word to its word vector
        word_to_index -- dict mapping a word to its integer index in the vocabulary

    Returns:
        model -- Keras Model instance
    """
    # Graph input: a batch of int32 word-index sequences
    sentence_indices = Input(input_shape, dtype="int32")

    # Look the indices up in the pre-trained embedding layer
    hidden = pretrained_embedding_layer(word_to_vec_map, word_to_index)(sentence_indices)

    # Layers applied in order after the embedding:
    #   LSTM(128) returning the whole sequence, dropout,
    #   LSTM(128) returning only its last output (many-to-one), dropout,
    #   a 5-unit dense layer and a softmax activation.
    stack = (
        LSTM(128, return_sequences=True),
        Dropout(0.5),
        LSTM(128, return_sequences=False),
        Dropout(0.5),
        Dense(5),
        Activation("softmax"),
    )
    for layer in stack:
        hidden = layer(hidden)

    # Wrap input and output tensors into a Model
    return Model(inputs=sentence_indices, outputs=hidden)

In [41]:
# Build the LSTM model for sequences of 10 word indices and print its layer summary.
# NOTE(review): this rebinds `model`, which earlier named the NumPy training function,
# so that function can no longer be called by name after this cell has run.
model = Emojify_V2((10,), word_to_vec_map, word_to_index)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645 

In [42]:
# Compile the model: categorical cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [44]:
# Train on the training set.
# Sentences are converted to index rows of length 10, the input length the model was built with.
X_train_indices = sentences_to_indices(X_train, word_to_index, 10)
# One-hot labels for the 5 classes (same conversion as the earlier Y_oh_train).
Y_train_oh = emo_utils.convert_to_one_hot(Y_train, C = 5)
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x2c1127358d0>

In [45]:
# Evaluate on the test set, using the same preprocessing as for training.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = 10)
Y_test_oh = emo_utils.convert_to_one_hot(Y_test, C = 5)
# evaluate() returns the loss followed by the metrics given at compile time (accuracy).
loss, acc = model.evaluate(X_test_indices, Y_test_oh)

print("Test accuracy = ", acc)

Test accuracy =  0.8392857313156128


In [46]:
# Inspect the misclassified test examples.
C = 5
y_test_oh = np.eye(C)[Y_test.reshape(-1)]
X_test_indices = sentences_to_indices(X_test, word_to_index, 10)
pred = model.predict(X_test_indices)
for i in range(len(X_test)):
    # Predicted class = position of the largest softmax output.
    # (The unused per-iteration assignment `x = X_test_indices` was removed.)
    num = np.argmax(pred[i])
    if num != Y_test[i]:
        # Prints the true emoji, then the sentence followed by the predicted emoji
        print('正确表情：'+ emo_utils.label_to_emoji(Y_test[i]) + '   预测结果： '+ X_test[i] + emo_utils.label_to_emoji(num).strip())

正确表情：😄   预测结果： he got a very nice raise	❤️
正确表情：😄   预测结果： she got me a nice present	❤️
正确表情：😄   预测结果： he is a good friend	❤️
正确表情：😞   预测结果： work is hard	😄
正确表情：😞   预测结果： This girl is messing with me	❤️
正确表情：❤️   预测结果： I love taking breaks	😞
正确表情：😄   预测结果： you brighten my day	❤️
正确表情：😄   预测结果： will you be my valentine	❤️
正确表情：😄   预测结果： I like to laugh	❤️


In [60]:
# Try the trained LSTM model on a custom sentence.
x_test = np.array(['you are not happy'])
# Pad to the sequence length the model was actually built with, read from the model
# itself. Using maxLen here only worked while it happened to equal that length.
X_test_indices = sentences_to_indices(x_test, word_to_index, model.input_shape[1])
print(x_test[0] +' '+  emo_utils.label_to_emoji(np.argmax(model.predict(X_test_indices))))

you are not happy 😞
