In [24]:
import pandas as pd
import numpy as np
import nltk.data
from nltk import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Flatten, Dense, Conv1D, MaxPooling1D, GlobalMaxPool1D, SpatialDropout1D, \
                          UpSampling1D, LSTM, RepeatVector, TimeDistributed, GRU, Bidirectional, concatenate
from keras.utils import plot_model, to_categorical
from sklearn.neighbors import NearestNeighbors
from keras.utils.vis_utils import plot_model

#import my_functions

pd.set_option('max_colwidth', 250)

In [3]:
path = os.path.join('../../Data/')

In [7]:
data = pd.read_csv(path + 'Texto_PreProcesado_v1.csv', sep = ';', index_col = False)
data.head()

Unnamed: 0,id,brand,text
0,1060651400131,Woman_Limited_El_Corte_Inglés,moda mujer abrigo masculino textura
1,1060651400180,Woman_Limited_El_Corte_Inglés,moda mujer abrigo doble faz cinturon tono
2,1051056400107,Woman_El_Corte_Inglés,moda mujer abrigo largo antelina woman corte_ingles
3,1019350401147,Lloyd's,moda mujer abrigo chaqueta termica efecto cortavientos
4,1019353400229,Lloyd's,moda mujer abrigo parka algodon capucha


## Vectorize Sentences

- Initialize tokenizer with num_words = MAX_NB_WORDS (200K). i.e. The tokenizer will perform a word count, sorted by number of occurences in descending order and pick top N words, 200K in this case 
- Use tokenizer's texts_to_sequences method to convert text to array of integers.
- The arrays obtained from previous step might not be of uniform length, use pad_sequences method to obtain arrays with length equal to MAX_SEQUENCE_LENGTH (30)

In [8]:
MAX_NB_WORDS = 30_000 #decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24 #decided by max words in a product (Script 00)
EMBEDDING_DIM = 100 #Same dim as our W2V embedding

In [9]:
all_text = data['text']
all_text = all_text.drop_duplicates (keep = False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, )
tokenizer.fit_on_texts(all_text)

data_sequences = tokenizer.texts_to_sequences(data['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [10]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 49418 unique tokens.


In [11]:
data_vec[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    1,    3,   94,
       2461, 1432])

#### Lets load our Custom Embedding

In [12]:
modelWV = Word2Vec.load("word2vec_model_v2")

#### Build Keras Embedding 

In [13]:
word_vectors = modelWV.wv
vocabulary_size = len(word_index) + 1
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in modelWV:
        embedding_matrix[i] = modelWV[word]
    else:
        embedding_matrix[i] = np.random.rand(1, EMBEDDING_DIM)[0]
            

del(word_vectors)

embedding_layer = Embedding(input_dim = vocabulary_size,
                            output_dim = EMBEDDING_DIM,
                            input_length = MAX_SEQUENCE_LENGTH,
                            weights=[embedding_matrix],
                            name='w2v_embedding',
                            trainable=False)

  
  import sys


In [14]:
model = Sequential()
model.add(embedding_layer)
model.compile('adam', 'mse')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
w2v_embedding (Embedding)    (None, 24, 100)           4941900   
Total params: 4,941,900
Trainable params: 0
Non-trainable params: 4,941,900
_________________________________________________________________


In [16]:
data_embedded = model.predict(data_vec, verbose = 1)



## Model 1

Dense Layer based, low quantity of params and easy to train

In [None]:
input_i = Input(shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
encoded_h1 = Dense(128, activation='relu')(input_i)
encoded_h2 = Dense(64, activation='relu')(encoded_h1)
encoded_h3 = Dense(32, activation='relu')(encoded_h2)
encoded_h4 = Dense(16, activation='relu')(encoded_h3)
#encoded_h5 = Dense(8, activation='relu')(encoded_h4)

latent = Dense(8, activation='relu', name = 'ENCODER')(encoded_h4)

#decoder_h1 = Dense(8, activation='relu')(latent)
decoder_h2 = Dense(16, activation='relu')(latent)
decoder_h3 = Dense(32, activation='relu')(decoder_h2)
decoder_h4 = Dense(64, activation='relu')(decoder_h3)
decoder_h5 = Dense(128, activation='relu')(decoder_h4)

output = Dense(EMBEDDING_DIM, activation='relu')(decoder_h5)

autoencoder = Model(input_i,output)

autoencoder.compile(optimizer = 'adam', loss = 'mse')
autoencoder.summary()

In [56]:
%%time
autoencoder.fit(data_embedded,data_embedded,epochs=3,
            batch_size=64, verbose = 1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Wall time: 7min 15s


<keras.callbacks.History at 0x2b912402240>

## Model 2

### The more complex the model is the better?

- Basic LSTM Layer based

- Take nothe that we are compressing more the encoder layer, instead of LENTGH = 192 in dense approach, now we are compressing to LENGHT = 128.


In [19]:
input_i = Input(shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))

x1 = LSTM(128, return_sequences=False, name = 'ENCODER')(input_i)
x2 = RepeatVector(24)(x1)
x3 = LSTM (100, return_sequences=True)(x2)

autoencoder = Model(inputs = input_i, outputs = x3)
autoencoder.compile(optimizer = 'adam', loss = 'mse')
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 24, 100)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               117248    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 24, 128)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 24, 100)           91600     
Total params: 208,848
Trainable params: 208,848
Non-trainable params: 0
_________________________________________________________________


In [20]:
%%time
autoencoder.fit(data_embedded, data_embedded ,epochs=3,
            batch_size=32, verbose = 1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Wall time: 22min 25s


<keras.callbacks.History at 0x2ee2118e898>

## Model 3

- Lets give it another chance to LSTM Layer based but now we will give the LSTM 2 inputs in order to fit better to the order word changes.

- Take nothe that we are compressing more the encoder layer, instead of LENTGH = 192 in dense approach, now we are compressing to LENGHT = 128 as we did in Model 2

- Also we have less training params so we will train the autoencoder faster than in Model2 Approach


In [71]:
input_i1 = Input(shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
input_i2 = Input(shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))

am1 = LSTM(64, return_sequences=False)(input_i1)
am2 = LSTM(64, return_sequences=False)(input_i2)

decoder = concatenate([am1, am2], name = 'ENCODER')

x2 = RepeatVector(24)(decoder)
x3 = LSTM(100, return_sequences=True)(x2)

autoencoder = Model([input_i1, input_i2], x3)

autoencoder.compile(optimizer = 'adam', loss = 'mse')
autoencoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_53 (InputLayer)           (None, 24, 100)      0                                            
__________________________________________________________________________________________________
input_54 (InputLayer)           (None, 24, 100)      0                                            
__________________________________________________________________________________________________
lstm_46 (LSTM)                  (None, 64)           42240       input_53[0][0]                   
__________________________________________________________________________________________________
lstm_47 (LSTM)                  (None, 64)           42240       input_54[0][0]                   
__________________________________________________________________________________________________
ENCODER (C

In [72]:
%%time
autoencoder.fit([data_embedded,data_embedded], data_embedded ,epochs=3,
            batch_size=32, verbose = 1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Wall time: 23min 31s


<keras.callbacks.History at 0x2ce1c69c320>

In [58]:
#encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('ENCODER').output)

In [25]:
#encoder.save('encoder_text_V1.h5')
encoder = load_model('encoder_text_V1.h5')



In [41]:
shape = encoder.get_layer('ENCODER').output_shape[1]

In [59]:
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 24, 100)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               117248    
Total params: 117,248
Trainable params: 117,248
Non-trainable params: 0
_________________________________________________________________


### Most similar products

In [43]:
data.loc[58000]

id                                                                                    001004742503321
brand                                                                                           Miele
text     electrodomestico horno placa campana extractoras cocina isla da touch control obsidian black
Name: 58000, dtype: object

In [45]:
query = data_embedded[58000]

In [54]:
X_test = data_embedded.copy()
X_test.shape

(204812, 24, 100)

In [60]:
%%time
codes = encoder.predict(X_test)
codes.shape

KeyboardInterrupt: 

In [None]:
query_code = encoder.predict(query.reshape(1,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM))
query_code.shape

In [None]:
codes = codes.reshape(-1, shape)
print(codes.shape)
query_code = query_code.reshape(1, shape)
print(query_code.shape)

### Fit the KNN to the test set

In [None]:
%%time
n_neigh = 10
nbrs = NearestNeighbors(n_neighbors=n_neigh).fit(codes)

In [None]:
distances, indices = nbrs.kneighbors(np.array(query_code))