In [36]:
#Basic libraries to import:
import numpy as np  #used for scientific computing - not used in this case but you never know when you might need it
import pandas as pd #for data manipulation and analysis - used to upload de DS we are working with.

#NLP
import nltk # Natural Language Toolkit, platform for building Python programs to work with human language data.

#nltk.download('punkt') # tokenizer that divides a text into a list of sentences

from collections import Counter #container that keeps track of how many times equivalent values are added.

from keras.models import Model, load_model   #groups layers into an object with training and inference features

from keras.layers import Dense, Input, Embedding
from keras.layers.recurrent import LSTM

from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, Callback


from sklearn.model_selection import train_test_split



In [37]:
# set default parameters
BATCH_SIZE = 128
NUM_EPOCHS = 200
HIDDEN_UNITS = 100 #256
MAX_INPUT_SEQ_LENGTH = 20
MAX_TARGET_SEQ_LENGTH = 20
MAX_VOCAB_SIZE = 20000 #10-20k  https://coursefinders.com/blog/es/5669/espanol-cuantas-palabras-se-necesitan-para-hablar-con-fluidez-un-idioma

input_counter = Counter()
target_counter = Counter()

# read the data

df = pd.read_csv('SARC_DS.csv')


In [38]:
df.head()

Unnamed: 0,Questions,Answer_1,Answer_2,all
0,I've been searching for the answer for this fo...,Religion must have the answer,It's obviously tracks from a giant water tract...,I've been searching for the answer for this fo...
1,"Michael Phelps Apologizes For ""Regrettable"" Be...",Wow...he smoked pot...oh lord hes such a horri...,"Wow, his girlfriend is uhm... Ah fuck it, he's...","Michael Phelps Apologizes For ""Regrettable"" Be..."
2,Utah wants to create a database to track the i...,I think the government should track every morm...,Another idea from the party that wants to get ...,Utah wants to create a database to track the i...
3,The Six Million Dead Jews of World War ONE!,"Oh right, *both* wars were just jewish conspir...","i know this seems strange but, what if he was ...",The Six Million Dead Jews of World War ONE! Oh...
4,WSJ begins the Jeb Bush campaign for 2016,Good luck with that.,time to get that shack in montana.,WSJ begins the Jeb Bush campaign for 2016 Good...


In [39]:
df['all'][0]

"I've been searching for the answer for this for some time, but I still can't find any answer... Can anyone please explain to me what this is? Religion must have the answer It's obviously tracks from a giant water tractor, farming for giant arctic sea prawn!"

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128540 entries, 0 to 128539
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Questions  128540 non-null  object
 1   Answer_1   128540 non-null  object
 2   Answer_2   128540 non-null  object
 3   all        128540 non-null  object
dtypes: object(4)
memory usage: 3.9+ MB


In [41]:
lines = df['all'][:100]
input_texts = []
target_texts = []

In [42]:
len(lines)
# len(input_texts)
# len(target_texts)

100

In [43]:
prev_words = []

for line in lines:

    next_words = [w.lower() for w in nltk.word_tokenize(line) if w.isalpha()]
    

    if len(next_words) > MAX_TARGET_SEQ_LENGTH:
        next_words = next_words[0:MAX_TARGET_SEQ_LENGTH]

    if len(prev_words) > 0:
        input_texts.append(prev_words)
        for w in prev_words:
            input_counter[w] += 1
        target_words = next_words[:]
        target_words.insert(0, 'START')
        target_words.append('END')
        for w in target_words:
            target_counter[w] += 1
        target_texts.append(target_words)

    prev_words = next_words

In [44]:
len(next_words)

20

In [45]:
# encode the data
input_word2idx = dict()
target_word2idx = dict()
for idx, word in enumerate(input_counter.most_common(MAX_VOCAB_SIZE)):
    input_word2idx[word[0]] = idx + 2
for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)):
    target_word2idx[word[0]] = idx + 1

input_word2idx['PAD'] = 0
input_word2idx['UNK'] = 1
target_word2idx['UNK'] = 0

input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()])
target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])

num_encoder_tokens = len(input_idx2word)
num_decoder_tokens = len(target_idx2word)


encoder_input_data = []

encoder_max_seq_length = 0
decoder_max_seq_length = 0

for input_words, target_words in zip(input_texts, target_texts):
    encoder_input_wids = []
    for w in input_words:
        w2idx = 1
        if w in input_word2idx:
            w2idx = input_word2idx[w]
        encoder_input_wids.append(w2idx)

    encoder_input_data.append(encoder_input_wids)
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_words), decoder_max_seq_length)

context = dict()
context['num_encoder_tokens'] = num_encoder_tokens
context['num_decoder_tokens'] = num_decoder_tokens
context['encoder_max_seq_length'] = encoder_max_seq_length
context['decoder_max_seq_length'] = decoder_max_seq_length

In [46]:
print(len(encoder_input_data))

99


In [47]:
# custom function to generate batches

def generate_batch(input_data, output_text_data):
    '''
    
    input: 
        - input_data:
        - output_text_data:
        
    output:
        - 
        - 
    '''
    
    num_batches = len(input_data) // BATCH_SIZE
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * BATCH_SIZE
            end = (batchIdx + 1) * BATCH_SIZE
            encoder_input_data_batch = pad_sequences(input_data[start:end], encoder_max_seq_length)
            decoder_target_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            decoder_input_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            for lineIdx, target_words in enumerate(output_text_data[start:end]):
                for idx, w in enumerate(target_words):
                    w2idx = 0
                    if w in target_word2idx:
                        w2idx = target_word2idx[w]
                    decoder_input_data_batch[lineIdx, idx, w2idx] = 1
                    if idx > 0:
                        decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

In [48]:
# Compiling and training

encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=HIDDEN_UNITS,
                              input_length=encoder_max_seq_length, name='encoder_embedding')

encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                 initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

In [49]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

keras.Model Arguments <br>
- **inputs**: The input(s) of the model: a `keras.Input` object or list of `keras.Input` objects. <br>
- **outputs**: The output(s) of the model. <br>
- **name**: String, the name of the model. (opt) <br>

In [51]:
#Compiling
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()
#Compiling
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'], sample_weight_mode='temporal')
training_model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 20, 100)      95100       encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 952)    0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 100), (None, 80400       encoder_embedding[0][0]          
___________________________________________________________________________________________

In [None]:
X_train, X_test, y_train, y_test = train_test_split(encoder_input_data, target_texts, test_size=0.2, random_state=42)

train_gen = generate_batch(X_train, y_train)
test_gen = generate_batch(X_test, y_test)

train_num_batches = len(X_train) // BATCH_SIZE
test_num_batches = len(X_test) // BATCH_SIZE

In [None]:
early_stop = EarlyStopping(
                        monitor='loss', 
                        min_delta=0.001, 
                        patience=3, mode='min', 
                        verbose=1
                        )

reduce_lr = ReduceLROnPlateau(
                            monitor='val_loss', 
                            factor=0.1, 
                            patience=4, 
                            verbose=1, 
                            min_delta=1e-4
                            ) 

checkpoint = ModelCheckpoint(
                            'model_best_weights.h5', #name of the document where the checkpoints will be saved
                            monitor='loss', 
                            verbose=1, 
                            save_best_only=True, 
                            mode='min', 
                            period=1
                            )


my_callbacks = [early_stop,reduce_lr, checkpoint]


To train my model with the established vocabulary of 20k words and 500 epochs; and starting with 40k lines of my ds it would take an estimate of 17 days to train in my local machine.

Epoch 1/500
  8/312 [..............................] - ETA: 53:45 - loss: 9.0315
  
 Using a VM the same model with the same parameters takes an estimate of 5 days.
 
 Epoch 1/500
249/249 [==============================] - 923s 4s/step - loss: 6.7226 - val_loss: 6.4012

To avoid unnecessary training I am using the callback attribute with `Earlystop` which will automatically stop the training once it has stopped improving, this means that it might take less time. We will see, today is 31.May.2020 01:19-

-- **`40k lines of training is insufficient to have a properly trained model`**

At the same time, given that it takes days to train, to avoid any technical issues I am using the `ModelCheckpoint` function which saves a copy of the model, in this case it is monitoring the 'loss', which on second though might not be the correct one and it should be monitoring the 'val_loss', but the main thing is that it makes a "Security Copy" of the model so if the Server of the VM were to disconnect or the Kernel were to die I'd have a least part of the model saved.

Because we are working with NLG the training has to be very extense. That's why I'm creating different models, which might not be completed for due date but will be useful to get a general idea and even continue training that model.

In [None]:
model.summary()

In [None]:
# model_tot_500e = model.fit_generator(generator=train_gen,
#                     steps_per_epoch=train_num_batches,
#                     epochs=500, #NUM_EPOCHS,
#                     verbose=1,
#                     validation_data=test_gen,
#                     validation_steps=test_num_batches,
#                     callbacks = my_callbacks
#                    )

In [None]:
# model.save("models/model_e_50k__lines_20kvocab.h5")


In [24]:
#model = load_model('model_55e_lines_20kvocab.h5')# Not trained enough
#model = load_model('model_79e_5klines_1kvocab.h5')# Not trained enough
#model = load_model('model_All[600](vocab10k)_100epochs.h5')# Not trained enough
#model = load_model("model_best_weights.h5")

model = load_model("model_35e_40klines_20kvocab.h5")



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [25]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 20, 100)      2000200     encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 20001)  0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 100), (None, 80400       encoder_embedding[0][0]          
____________________________________________________________________________________________

In [None]:
%matplotlib inline

def plot_train_vs_test (model):
    import matplotlib.pyplot as plt

    """
    **************************************************************
    
    Visualiza en forma de gráfico las curvas del accuracy y del loss
    entre el train y el test.

    ***************************************************************
    """
    
    plt.rcParams['figure.figsize'] = (8, 6) 
    
    plt.figure(0)  
    plt.plot(model.history['acc'],'r')  
    plt.plot(model.history['val_acc'],'g')  
    plt.xticks(np.arange(0, epochs, 2.0))   
    plt.xlabel("Num of Epochs")  
    plt.ylabel("Accuracy")  
    plt.title("Training Accuracy vs Validation Accuracy")  
    plt.legend(['train','validation'])

    plt.figure(1)  
    plt.plot(model.history['loss'],'r')  
    plt.plot(model.history['val_loss'],'g')  
    plt.xticks(np.arange(0, epochs, 2.0))   
    plt.xlabel("Num of Epochs")  
    plt.ylabel("Loss")  
    plt.title("Training Loss vs Validation Loss")  
    plt.legend(['train','validation'])


In [None]:
plot_train_vs_test (model)

In [26]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)

In [30]:
input_text = input()
input_seq = []
input_wids = []
max_encoder_seq_length = 20
max_decoder_seq_length = 20

for word in nltk.word_tokenize(input_text.lower()):
    idx = 1
    if word in input_word2idx:
        idx = input_word2idx[word]
    input_wids.append(idx)
    
input_seq.append(input_wids)
input_seq = pad_sequences(input_seq, max_encoder_seq_length)
states_value = encoder_model.predict(input_seq)
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, target_word2idx['START']] = 1
target_text = ''
target_text_len = 0
terminated = False

while not terminated:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    sample_token_idx = np.argmax(output_tokens[0, -1, :])
    sample_word = target_idx2word[sample_token_idx]
    target_text_len += 1

    if sample_word != 'START' and sample_word != 'END':
        target_text += ' ' + sample_word

    if sample_word == 'END' or target_text_len >= max_decoder_seq_length:
        terminated = True

    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sample_token_idx] = 1

    states_value = [h, c]
    

target_text.strip().replace('UNK', '')

njlkvlakef


'homophobes origen origen testified chances mir furry tipping leek leek teased donaldson featherweight disgusting devalued job teach pulling hawkeye rub'

In [None]:
model.test()

In [None]:
model.summary()