# Text Autoencoder

In this notebook we generate the text autoencoder in order to reduce the dimension of our text vector

In [1]:
import pandas as pd
import os
import numpy as np
import nltk.data
from nltk import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Flatten, Dense, Conv1D, MaxPooling1D, GlobalMaxPool1D, SpatialDropout1D, \
                          UpSampling1D, LSTM, RepeatVector, TimeDistributed, GRU, Bidirectional, concatenate, \
                          GlobalAveragePooling1D
from keras.utils import plot_model, to_categorical
from sklearn.neighbors import NearestNeighbors
from keras.utils.vis_utils import plot_model

from keras.models import model_from_json

pd.set_option('max_colwidth', 250)

Using TensorFlow backend.


In [2]:
path = os.path.join('../Data/')
path_models = os.path.join('../Models/')

In [3]:
data = pd.read_csv(path + 'Texto_PreProcesado.csv', sep = ';', index_col = False)
data.head()

Unnamed: 0,item_id,brand,text
0,A28233506,Woman Limited El Corte Inglés,abrigo masculino textura mujer
1,A29054782,Woman Limited El Corte Inglés,abrigo doble faz mujer cinturon tono
2,A27354432,Woman El Corte Inglés,abrigo largo antelina mujer woman corte_ingles
3,A28302706,Lloyd's,chaqueta termica mujer efecto cortavientos
4,A27435502,Lloyd's,parka algodon mujer capucha


## Vectorize Sentences

- Initialize tokenizer with num_words = MAX_NB_WORDS (200K). i.e. The tokenizer will perform a word count, sorted by number of occurences in descending order and pick top N words, 200K in this case 
- Use tokenizer's texts_to_sequences method to convert text to array of integers.
- The arrays obtained from previous step might not be of uniform length, use pad_sequences method to obtain arrays with length equal to MAX_SEQUENCE_LENGTH (30)

In [4]:
MAX_NB_WORDS = 30_000 #decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24 #decided by max words in a product (Script 00)
EMBEDDING_DIM = 100 #Same dim as our W2V embedding

In [6]:
data['text'] = data['text'].astype(str)
all_text = data['text']
all_text = all_text.drop_duplicates (keep = False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, )
tokenizer.fit_on_texts(all_text)

data_sequences = tokenizer.texts_to_sequences(data['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [7]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 48964 unique tokens.


In [8]:
data_vec[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,  173, 9335,
       1135,    1])

#### Lets load our Custom Embedding

In [9]:
modelWV = Word2Vec.load(path_models + "word2vec_model")

#### Build Keras Embedding 

In [10]:
word_vectors = modelWV.wv
vocabulary_size = len(word_index) + 1
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in modelWV:
        embedding_matrix[i] = modelWV[word]
    else:
        embedding_matrix[i] = np.random.rand(1, EMBEDDING_DIM)[0]
            

del(word_vectors)

embedding_layer = Embedding(input_dim = vocabulary_size,
                            output_dim = EMBEDDING_DIM,
                            input_length = MAX_SEQUENCE_LENGTH,
                            weights=[embedding_matrix],
                            name='w2v_embedding',
                            trainable=False)

  
  import sys


In [11]:
model = Sequential()
model.add(embedding_layer)
model.compile('adam', 'mse')
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
w2v_embedding (Embedding)    (None, 24, 100)           4896500   
Total params: 4,896,500
Trainable params: 0
Non-trainable params: 4,896,500
_________________________________________________________________


In [12]:
data_vec.shape

(204812, 24)

In [13]:
data_embedded = model.predict(data_vec, verbose = 1)



## Model 1

Dense Layer based, low quantity of params and easy to train

In [91]:
input_i = Input(shape=(MAX_SEQUENCE_LENGTH, ))

text_embedding = Embedding(*embedding_matrix.shape, weights = [embedding_matrix], trainable = False)(input_i)

encoded_h1 = Dense(128, activation='relu')(text_embedding)
encoded_h2 = Dense(64, activation='relu')(encoded_h1)
encoded_h3 = Dense(32, activation='relu')(encoded_h2)
encoded_h4 = Dense(16, activation='relu')(encoded_h3)
#encoded_h5 = Dense(8, activation='relu')(encoded_h4)

latent = Dense(8, activation='relu', name = 'ENCODER')(encoded_h4)

#decoder_h1 = Dense(8, activation='relu')(latent)
decoder_h2 = Dense(16, activation='relu')(latent)
decoder_h3 = Dense(32, activation='relu')(decoder_h2)
decoder_h4 = Dense(64, activation='relu')(decoder_h3)
decoder_h5 = Dense(128, activation='relu')(decoder_h4)

#output = Dense(EMBEDDING_DIM, activation='relu')(decoder_h5)
glob = Flatten()(decoder_h5)
output = Dense(MAX_SEQUENCE_LENGTH, activation = 'relu')(glob)

autoencoder = Model(input_i, output)

autoencoder.compile(optimizer = 'rmsprop', loss = 'mse', metrics = ['acc'])
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_39 (InputLayer)        (None, 24)                0         
_________________________________________________________________
embedding_39 (Embedding)     (None, 24, 100)           4941900   
_________________________________________________________________
dense_120 (Dense)            (None, 24, 128)           12928     
_________________________________________________________________
dense_121 (Dense)            (None, 24, 64)            8256      
_________________________________________________________________
dense_122 (Dense)            (None, 24, 32)            2080      
_________________________________________________________________
dense_123 (Dense)            (None, 24, 16)            528       
_________________________________________________________________
ENCODER (Dense)              (None, 24, 8)             136       
__________

In [92]:
%%time
autoencoder.fit(data_vec, data_vec, epochs=10,
            batch_size=128, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 13min 24s


<keras.callbacks.History at 0x20c927561d0>

## Model 2

### The more complex the model is the better?

- Basic LSTM Layer based

- Take nothe that we are compressing more the encoder layer, instead of LENTGH = 192 in dense approach, now we are compressing to LENGHT = 128.


In [84]:
input_i = Input(shape=(MAX_SEQUENCE_LENGTH, ))

text_embedding = Embedding(*embedding_matrix.shape, weights = [embedding_matrix], trainable = False)(input_i)

x1 = LSTM(128, return_sequences=False, name = 'ENCODER')(text_embedding)
x2 = RepeatVector(24)(x1)
x3 = LSTM (24, return_sequences=False)(x2)

autoencoder = Model(inputs = input_i, outputs = x3)
autoencoder.compile(optimizer = 'rmsprop', loss = 'mse' , metrics = ['acc'])
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_35 (InputLayer)        (None, 24)                0         
_________________________________________________________________
embedding_35 (Embedding)     (None, 24, 100)           4941900   
_________________________________________________________________
ENCODER (LSTM)               (None, 128)               117248    
_________________________________________________________________
repeat_vector_15 (RepeatVect (None, 24, 128)           0         
_________________________________________________________________
lstm_37 (LSTM)               (None, 24, 128)           131584    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 24, 49419)         6375051   
Total params: 11,565,783
Trainable params: 6,623,883
Non-trainable params: 4,941,900
_________________________________________________________

In [82]:
%%time
autoencoder.fit(data_vec, data_vec ,epochs=3,
            batch_size=64, verbose = 1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Wall time: 19min 32s


<keras.callbacks.History at 0x20c52d90978>

## Model 3

- Lets give it another chance to LSTM Layer based but now we will give the LSTM 2 inputs in order to fit better to the order word changes.

- Take nothe that we are compressing more the encoder layer, instead of LENTGH = 192 in dense approach, now we are compressing to LENGHT = 128 as we did in Model 2

- Also we have less training params so we will train the autoencoder faster than in Model2 Approach


In [14]:
input_i1 = Input(shape=(MAX_SEQUENCE_LENGTH, ))
input_i2 = Input(shape=(MAX_SEQUENCE_LENGTH, ))

text_embedding1 = Embedding(*embedding_matrix.shape, weights = [embedding_matrix], trainable = False)(input_i1)
text_embedding2 = Embedding(*embedding_matrix.shape, weights = [embedding_matrix], trainable = False)(input_i2)

am1 = LSTM(64, return_sequences=False)(text_embedding1)
am2 = LSTM(64, return_sequences=False)(text_embedding2)

decoder = concatenate([am1, am2], name = 'ENCODER')

x2 = RepeatVector(24)(decoder)
x3 = LSTM(24, return_sequences=False)(x2)

autoencoder = Model([input_i1, input_i2], x3)

autoencoder.compile(optimizer = 'adam', loss = 'mse', metrics = ['acc'])
autoencoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 24)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 24)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 24, 100)      4896500     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 24, 100)      4896500     input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

In [15]:
%%time
autoencoder.fit([data_vec, data_vec], data_vec ,epochs=3,
            batch_size=64, verbose = 1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/3
 27136/204812 [==>...........................] - ETA: 3:47 - loss: 4052401.7070 - acc: 0.0273

KeyboardInterrupt: 

In [93]:
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('ENCODER').output)

In [94]:
encoder.save(path_models + 'encoder_text.h5')

In [16]:
encoder = load_model(path_models + 'encoder_text.h5')



In [17]:
shape = encoder.get_layer('ENCODER').output_shape[1] * encoder.get_layer('ENCODER').output_shape[2]
shape

192

In [18]:
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_39 (InputLayer)        (None, 24)                0         
_________________________________________________________________
embedding_39 (Embedding)     (None, 24, 100)           4941900   
_________________________________________________________________
dense_120 (Dense)            (None, 24, 128)           12928     
_________________________________________________________________
dense_121 (Dense)            (None, 24, 64)            8256      
_________________________________________________________________
dense_122 (Dense)            (None, 24, 32)            2080      
_________________________________________________________________
dense_123 (Dense)            (None, 24, 16)            528       
_________________________________________________________________
ENCODER (Dense)              (None, 24, 8)             136       
Total para

### Most similar products

In [36]:
data.loc[70000]

item_id                        A24259977
brand                    El Corte Inglés
text       set taza te buga corte_ingles
Name: 70000, dtype: object

In [37]:
query = data_vec[70000]
query.shape

(24,)

In [38]:
X_test = data_vec.copy()
X_test.shape

(204812, 24)

In [39]:
codes = encoder.predict(X_test)
codes.shape

(204812, 24, 8)

In [40]:
codes.shape

(204812, 24, 8)

In [41]:
query_code = encoder.predict([query.reshape(1, MAX_SEQUENCE_LENGTH)])
query_code.shape

(1, 24, 8)

In [42]:
codes = codes.reshape(-1, shape)
print(codes.shape)
query_code = query_code.reshape(1, shape)
print(query_code.shape)

(204812, 192)
(1, 192)


### Fit the KNN to the test set

In [43]:
n_neigh = 10
nbrs = NearestNeighbors(n_neighbors=n_neigh).fit(codes)

In [44]:
distances, indices = nbrs.kneighbors(np.array(query_code))

In [45]:
closest_sent = X_test[indices]
print(closest_sent.shape)
closest_sent = closest_sent.reshape(n_neigh, MAX_SEQUENCE_LENGTH); 
print(closest_sent.shape)

(1, 10, 24)
(10, 24)


In [46]:
data.loc[58000]

item_id                                       A26136146
brand                                             Miele
text       campana isla da touch control obsidian black
Name: 58000, dtype: object

In [47]:
mis_indices = indices.tolist()[0]
for i in range(n_neigh):
    print (data.loc[mis_indices[i]])
    print('-'*50)

item_id                        A24259977
brand                    El Corte Inglés
text       set taza te buga corte_ingles
Name: 70000, dtype: object
--------------------------------------------------
item_id                                    A13099826
brand                                El Corte Inglés
text       set accesorio venir winegift corte_ingles
Name: 103310, dtype: object
--------------------------------------------------
item_id                              A24346780
brand                          El Corte Inglés
text       set copa helado ancona corte_ingles
Name: 70523, dtype: object
--------------------------------------------------
item_id                                  A27996468
brand                              El Corte Inglés
text       reloj mesa plateado bayona corte_ingles
Name: 66172, dtype: object
--------------------------------------------------
item_id                          A24259995
brand                      El Corte Inglés
text       set taza cafe 