Implementación de un modelo de Sentiment Analysis con alguna arquitectura de Deep Learning.

In [0]:
#Importamos librerías necesarias

%matplotlib inline
import keras
import numpy as np
from keras.layers import Conv1D, Conv2D, SimpleRNN, LSTM, Dense, Dropout
from keras.models import Sequential

In [0]:
emb_size = 10
# Model
rnn = SimpleRNN(5)

cnn_1d = Conv1D(filters=2, kernel_size=3)
cnn_2d = Conv2D(filters=2, kernel_size=(3, emb_size), strides=(1, 1),)

dnn = Dense(5)

Modelo Clasificación clásico


In [63]:
!pip install spacy
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [0]:
#Importa librerías necesarias
import spacy
import pickle
import json
import os
import csv
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import io

from random import sample
from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings('ignore')

In [0]:
# parámetos del NLP
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner', 'textcat'])

In [0]:
# Función que separa el conjunto de train del de test
def split_train_val_test(dataset, split=0.2):

    x, y = zip(*dataset)
    x = np.array(list(x))
    y = np.array(list(y))
    sss = StratifiedShuffleSplit(n_splits=1, test_size=split, random_state=1337) #l33t seed
    for train_index, test_index in sss.split(x, y):
        x_train, x_val = x[train_index], x[test_index]
        y_train, y_val = y[train_index], y[test_index]
    splits = {'train':(x_train, y_train), 'test':(x_val, y_val)}
    return splits

In [67]:
# Se carga el dataset para la práctica desde un xlsx (no tengo la versión buena de excel, y por no tanto no puedo
# recortar documentos tipo csv). Reducido, pues no soporta todos los registros.

import openpyxl

import numpy as np 
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

# Crea un dataframe con las columnas :
# ItemID : número de registro
# Sentiment : 0 o 1 (negativo o positivo)
# SentimentText : texto
df = pd.read_excel('/content/drive/My Drive/BootCamp - BigDataIV - NLP/data_practica_train_sentiment_20000.xlsx',names=['ItemID', 'Sentiment', 'SentimentText'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Descartamos del dataset el ItemID, pues el número de fila no lo usamos para nada
df = df.drop(['ItemID'], axis=1)

In [69]:
#Pidiendo información, vemos que hay nulos en la columna de texto.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19989 entries, 0 to 19988
Data columns (total 2 columns):
Sentiment        19989 non-null int64
SentimentText    19861 non-null object
dtypes: int64(1), object(1)
memory usage: 312.5+ KB


In [0]:
# Eliminamos las filas con nulos en el campo texto 
df=df.dropna()

In [0]:
labels = ['0','1']  # etiquetas, sólo hay 2

In [0]:
dataset = []                # inicializa lista
for row in df.iterrows():   # para cada registro tenemos SentimetText, y Sentiment(0/1)
    ix, data = row          # info de cada registro  
    dataset.append((data[1], data[0])) # dataset en dos columnas

In [73]:
from collections import Counter

tokens = []
tokenized = []

#tokenizamos el dataset
for x, y in dataset:  # para cada registro, tenemos texto y label
    x_t = nlp(x)      # texto pasado por el nlp
    toks = [t.text for t in x_t] # frase tokenizada palabra a palabra
    tokens+= toks
    tokenized.append((toks, y)) # tokenización más label
    
vocab_counter = Counter(tokens)
vocab = set(tokens) # todas las palabras en todos las frases de forma única
print('Num de features a usar: ', len(vocab))  # num de palabras del vocabulario
print(len(tokenized))                          # num de tokens en total 

Num de features a usar:  39507
19861


In [74]:
maxlen = max([len(x) for x, _ in tokenized]) # número de caracteres que hay en la frase con más caracteres
maxlen

186

In [75]:
lens = [len(x) for x, _ in tokenized]
median = np.median(np.array(lens))
mean = np.mean(np.array(lens))
maxlen = int(median)*2
print(median, mean, maxlen)  # mediana, media, longmax de los tokens

16.0 17.536830975278182 32


In [76]:
# los tokens no pueden medir más de la long max calculada arriba
tokenized_filtered = [(x, y) for x, y in tokenized if len(x) < maxlen]
len(tokenized_filtered)  # se tiran más de 1400 tokens

18401

In [0]:
# Se asigna un índice a cada palabra del diccionario creado   
w2id = {k:i for i, k in enumerate(vocab)}
w2id['<UNK>'] = len(w2id) # se añade el UNK

Relleno e Input preparation


In [0]:
l2id = {label:i for i, label in enumerate(labels)} # dict de labels

In [0]:
# maxlen = min(maxlen, 50)
input_ready = []
for x, y in tokenized_filtered: # para cada palabra filtradas arriba
    sentence = np.zeros((maxlen))
    label = np.zeros((len(labels)))
    label[int(y)-1] = 1
    for i, t in enumerate(x): # para número de palabra y token de cada frase
        #crea un array para cada frase que contiene el índice de cada palabra del dict creado arriba 
        #si las palabras son tan raras que aparecen menos de 5 veces, se cambia por Unknown
        sentence[i] = w2id[t] if t in vocab_counter and vocab_counter[t]>=5 else w2id['<UNK>']
    input_ready.append((sentence,label))

In [122]:
maxlen

32

In [80]:
# usamos la función definida al principio, que crea train/test, pasándole el dataset recién creado
splits = split_train_val_test(input_ready)
print(splits['train'][0].shape) # (14720, 32) -> array con los tokens
print(splits['train'][1].shape) # (14720, 2) -> array con las labels

(14720, 32)
(14720, 2)


In [121]:
df

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...
...,...,...
19984,0,@ no more going out on school days &lt;3
19985,0,@ o my God this class is tooooo long. I still ...
19986,1,@ ok now i believe what you are saying!!!!!
19987,1,@ ontario mills. Got a good parking spot!


Modelo simple. No deberíamos usarla, ni se nos ocurre montar este clasificador, pues no lleva ni Embeddings. Es malísimo, pero sirve para aprender.


In [81]:
model = Sequential()
model.add(Dense(100, input_dim=maxlen, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(l2id), activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 100)               3300      
_________________________________________________________________
dropout_13 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_14 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 2)                 202       
Total params: 13,602
Trainable params: 13,602
Non-trainable params: 0
_________________________________________________________________


In [82]:
model.fit(splits['train'][0], splits['train'][1],
          epochs=100,
          batch_size=16)

#Se comprueba qué malo es por la acc que no sube de 0.54, y la loss no se mueve 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fc973153d68>

Modelo con unigrams. Tampoco se debería hacer.

In [83]:
input_ready = []
for x, y in tokenized_filtered:
     sentence = np.zeros((len(w2id))) # es lo mismo que el modelo anterior, pero cambiando maxlen por len(w2id)
     label = np.zeros((len(labels)))
     label[int(y)-1] = 1
     for t in x:
         idx = w2id[t] if t in vocab_counter and vocab_counter[t]>5 else w2id['<UNK>']
         sentence[idx] += 1 
     input_ready.append((sentence, label))
    
splits = split_train_val_test(input_ready)
print(splits['train'][0].shape)
print(splits['train'][1].shape)

(14720, 39508)
(14720, 2)


In [84]:
model = Sequential()
model.add(Dense(100, input_dim=len(w2id), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(l2id), activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 100)               3950900   
_________________________________________________________________
dropout_15 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_16 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 2)                 202       
Total params: 3,961,202
Trainable params: 3,961,202
Non-trainable params: 0
_________________________________________________________________


In [85]:
model.fit(splits['train'][0], splits['train'][1],
          epochs=100,
          batch_size=16)
# La acc ha subido respecto del modelo anterior

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fc972e738d0>

Modelo con embeddings.

In [0]:
from keras.layers import Embedding
from keras.layers import Flatten, Input
from keras.models import Model

In [87]:
input_layer = Input(shape=(maxlen,) )
embedding = Embedding(output_dim=100, input_dim=len(w2id), input_length=maxlen)(input_layer)
dense_1 = Dense(300, activation='relu')(embedding)
drop_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(300, activation='relu')(drop_1)
drop_2 = Dropout(0.5)(dense_2)
out = Dense(len(l2id), activation='softmax')(drop_2)

model = Model(inputs=input_layer, outputs=out)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 32, 100)           3950800   
_________________________________________________________________
dense_27 (Dense)             (None, 32, 300)           30300     
_________________________________________________________________
dropout_17 (Dropout)         (None, 32, 300)           0         
_________________________________________________________________
dense_28 (Dense)             (None, 32, 300)           90300     
_________________________________________________________________
dropout_18 (Dropout)         (None, 32, 300)           0         
_________________________________________________________________
dense_29 (Dense)             (None, 32, 2)             602 

Modelo Deep Averaging Networks (DAN). Usamos embeddings, y usa su media más pasarlo por dos capas.


In [0]:
from keras.layers import Input, Average, average, Lambda
from keras.models import Model

from keras import backend as K

In [0]:
# genera una máscara para implementar capa custom
def mask_aware_mean(x):
    # genera una máscara filas cero
    mask = K.not_equal(K.sum(K.abs(x), axis=2, keepdims=True), 0)
    # number of that rows are not all zeros
    n = K.sum(K.cast(mask, 'float32'), axis=1, keepdims=False)
    # compute mask-aware mean of x
    x_mean = K.sum(x, axis=1, keepdims=False) / n
    return x_mean

def mask_aware_mean_output_shape(input_shape):
    shape = list(input_shape)
    assert len(shape) == 3
    return (shape[0], shape[2])

In [90]:
input_layer = Input(shape=(maxlen,) )
embedding = Embedding(output_dim=100, input_dim=len(w2id), input_length=maxlen)(input_layer)
doc_representation = Lambda(mask_aware_mean, mask_aware_mean_output_shape, name='embedding_average')(embedding)
dense_1 = Dense(100, activation='relu')(doc_representation)
drop_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(100, activation='relu')(drop_1)
drop_2 = Dropout(0.5)(dense_2)
out = Dense(len(l2id), activation='softmax')(drop_2)

model = Model(inputs=input_layer, outputs=out)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 32, 100)           3950800   
_________________________________________________________________
embedding_average (Lambda)   (None, 100)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_19 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_31 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_20 (Dropout)         (None, 100)               0   

In [91]:
input_ready = []
for x, y in tokenized_filtered:
    sentence = np.zeros((maxlen))
    label = np.zeros((len(labels)))
    label[int(y)-1] = 1
    for i, t in enumerate(x):
        sentence[i] = w2id[t] if t in vocab_counter and vocab_counter[t]>2 else w2id['<UNK>']
    input_ready.append((sentence,label))
    
splits = split_train_val_test(input_ready)
print(splits['train'][0].shape)
print(splits['train'][1].shape)
   
model.fit(splits['train'][0], splits['train'][1],
          epochs=100,
          batch_size=16)

# el modelo supera la acc de 0.95

(14720, 32)
(14720, 2)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/10

<keras.callbacks.History at 0x7fc972b972b0>

In [0]:
from keras.layers import MaxPooling1D, GlobalMaxPooling2D
from keras.layers import Input, Embedding, Concatenate, Reshape
from keras.models import Model

In [93]:
embedding_dim = 100

input_layer = Input(shape=(maxlen,) )# maxlen
embedding = Embedding(output_dim=embedding_dim, input_dim=len(w2id), input_length=maxlen)(input_layer)#w2id maxlen
reshape = Reshape((maxlen,embedding_dim,1))(embedding)

conv_1 = Conv2D(filters=50, kernel_size=(1, embedding_dim), activation='relu', padding='valid')(reshape)
mp_1 = GlobalMaxPooling2D()(conv_1)

conv_2 = Conv2D(filters=50, kernel_size=(2, embedding_dim), activation='relu', padding='valid')(reshape)
mp_2 = GlobalMaxPooling2D()(conv_2)

conv_5 = Conv2D(filters=50, kernel_size=(5, embedding_dim), activation='relu', padding='valid')(reshape)
mp_5 = GlobalMaxPooling2D()(conv_5)

doc_representation = Concatenate()([mp_1, mp_2, mp_5])

dense_1 = Dense(100, activation='relu')(doc_representation)
drop_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(100, activation='relu')(drop_1)
drop_2 = Dropout(0.5)(dense_2)
out = Dense(len(l2id), activation='softmax')(drop_2)

model = Model(inputs=input_layer, outputs=out)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()


Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 32, 100)      3950800     input_7[0][0]                    
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 32, 100, 1)   0           embedding_7[0][0]                
__________________________________________________________________________________________________
conv2d_6 (Conv2D)               (None, 32, 1, 50)    5050        reshape_2[0][0]                  
____________________________________________________________________________________________

In [94]:
print(splits['train'][0].shape)
print(splits['train'][1].shape)
    

model.fit(splits['train'][0], splits['train'][1],
          epochs=50,
          batch_size=16)

(14720, 32)
(14720, 2)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fc972c0a7f0>

# Recurrent Neural Network para texto



In [0]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0) # only difference


In [0]:
vocabulary = 123
hidden_size = 512
max_t = 12

U = np.random.uniform(size=(hidden_size, vocabulary))
W = np.random.uniform(size=(hidden_size, hidden_size))
V = np.random.uniform(size=(vocabulary, hidden_size))

t=1

In [97]:
x = np.eye(max_t, vocabulary)
print(x.shape, x[0,:].shape)

(12, 123) (123,)


In [98]:
s = np.zeros(shape=(max_t, hidden_size))
s.shape

(12, 512)

In [0]:
s[1]=np.tanh(U.dot(x[t,:]) + W.dot(s[t-1,:]))

In [0]:
yt = softmax(V.dot(s[t,:]))

In [101]:
# Comprobemos shapes!
print(s[1].shape)
print(yt.shape)

(512,)
(123,)


De hecho lo que hemos hecho aquí arriba practicamente sería el forward pass implementado en numpy.

Vamos a ver la implementación en Keras, y a entrenar una para un problema de clasificación

In [0]:
from keras.layers import LSTM, SimpleRNN

In [0]:
rnn_type = SimpleRNN(100)

In [104]:
embedding_dim = 100

input_layer = Input(shape=(maxlen,) )
embedding = Embedding(output_dim=embedding_dim, input_dim=len(w2id), input_length=maxlen)(input_layer)
rnn = rnn_type(embedding)
dense_1 = Dense(100, activation='relu')(rnn)
drop_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(100, activation='relu')(drop_1)
drop_2 = Dropout(0.5)(dense_2)
out = Dense(len(l2id), activation='softmax')(drop_2)

model = Model(inputs=input_layer, outputs=out)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()


Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 32, 100)           3950800   
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dense_36 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_23 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_37 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_24 (Dropout)         (None, 100)               0   

In [105]:

model.fit(splits['train'][0], splits['train'][1],
          epochs=5,
          batch_size=16)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc972e730f0>

In [106]:
score_rnn = model.evaluate(splits['test'][0], splits['test'][1], batch_size=16)
score_rnn



[0.6480603786921637, 0.637055148057593]

# LSTM



In [0]:
# introducir ecuaciones como en Vanilla RNN
import numpy as np

In [0]:
# cosas
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [109]:
# definamos hidden_size
vocab_size = 25
emb_size = 50
use_embeddings = True

if use_embeddings:
    x_size = emb_size
    x = np.random.uniform(size=(max_t, x_size))
else:
    x_size = vocab_size
    x = np.eye(max_t, x_size)


hidden_size = 100
max_t = 10
HX_size = hidden_size + x_size
# Que parametros necesitamos? Ignoraremos los bias al ser un ejemplo
# Tenemos 4 capas, forget, input, output, cell state, por lo tanto, 4 matrices.

W_f = np.random.uniform(size=(hidden_size, HX_size))

W_i = np.random.uniform(size=(hidden_size, HX_size))

W_o = np.random.uniform(size=(hidden_size, HX_size))

W_c = np.random.uniform(size=(hidden_size, HX_size))
C = np.zeros(shape=(max_t, hidden_size))


h = np.zeros(shape=(max_t, hidden_size))

for w, name in zip([W_f, W_i, W_o, W_c, h, x], ['w_forget', 'w_input', 'w_output', 'w_cell_state', 'hidden_state', 'x']):
    print("{}:  {}".format(name, w.shape))

w_forget:  (100, 150)
w_input:  (100, 150)
w_output:  (100, 150)
w_cell_state:  (100, 150)
hidden_state:  (10, 100)
x:  (12, 50)


In [110]:
t = 3

z = np.concatenate((h[t-1], x[t-1]))

f_t = sigmoid(W_f.dot(z)) # Olvidamos
print(f_t.shape)
i_t = sigmoid(W_i.dot(z)) # Que queremos de nuevo
print(i_t.shape)
Casi_C = np.tanh(W_c.dot(z)) # Como queda esto?
print(Casi_C.shape)
C_t = f_t * C[t-1] + i_t * Casi_C # la cell queda así.
C[t] = C_t
print(C_t.shape)
o_t = sigmoid(W_o.dot(z))
print(o_t.shape)
h_t = o_t * np.tanh(C_t) # output y pafuera!
print(h_t.shape)

(100,)
(100,)
(100,)
(100,)
(100,)
(100,)


### Implementación en Keras

In [0]:
from keras.layers import CuDNNLSTM

In [0]:
rnn_type = CuDNNLSTM(100)

In [113]:
embedding_dim = 100

input_layer = Input(shape=(maxlen,) )
embedding = Embedding(output_dim=embedding_dim, input_dim=len(w2id), input_length=maxlen)(input_layer)
rnn = rnn_type(embedding)
dense_1 = Dense(100, activation='relu')(rnn)
drop_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(100, activation='relu')(drop_1)
drop_2 = Dropout(0.5)(dense_2)
out = Dense(len(l2id), activation='softmax')(drop_2)

model = Model(inputs=input_layer, outputs=out)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()


Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 32, 100)           3950800   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 100)               80800     
_________________________________________________________________
dense_39 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_25 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_40 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_26 (Dropout)         (None, 100)               0   

In [114]:
model.fit(splits['train'][0], splits['train'][1],
          epochs=5,
          batch_size=16)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc97250e160>

In [115]:
score_lstm = model.evaluate(splits['test'][0], splits['test'][1], batch_size=16)
score_lstm



[0.5078811290405976, 0.7530562347188264]

No tengo predicciones porque me he quedado sin tiempo de convertir frases de prueba a las dimensiones de entrada para probar el predict.