# Abdi Seq2Seq (Input -> Input)

### Seq2Seq_Abdi
 
- Dataset: Movie Lines
- Lines: 25,141
- Sample Size: 20,000 
- Vocab Size: 9,000 
- Epochs: 30
- Acc: 99%
- Loss: 0.0049
    
by Abdi (Sep, 2019)    

## Preparing the Data

In [4]:
### Imports
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd
import warnings
import itertools
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

# `tf.VERSION` only exists in TensorFlow 1.x; `tf.__version__` is available in
# both 1.x and 2.x, so the cell keeps working after an upgrade.
print(tf.__version__)

1.14.0


In [3]:
### Variables Initialization
VOCAB_SIZE = 9000  # tokenizer `num_words`; also the size of the embedding and softmax layers
MAX_LEN = 20  # every token sequence is padded/truncated to this length
SAMPLES_SIZE = 20000  # number of dataset rows kept after rows with missing values are dropped
EPOCHS = 30  # passed to model.fit
BATCH_SIZE = 32  # passed to model.fit
HIDDEN_DIM = 256  # units of the encoder and decoder LSTM layers
EMB_DIM = 256  # output dimension of the embedding layers

class color:
    """ANSI escape sequences used to colour and style text printed to the terminal.

    Concatenate one of the codes in front of a string and append ``END`` to
    reset the terminal back to its default style.
    """

    # Text styles and the reset code.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

    # Foreground colours.
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

In [5]:
### Load the dataset
def load_dataset(path='Dataset'):
    """Load every ``*.csv`` file in ``path`` into a single DataFrame.

    Files are read in sorted file-name order so that the row order of the
    result (and therefore any later ``iloc[:N]`` cut) is reproducible
    instead of depending on the order the filesystem lists them in.

    Parameters
    ----------
    path : str, optional
        Folder containing the CSV files. Defaults to ``'Dataset'``.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If ``path`` contains no ``*.csv`` file.
    """
    import glob
    import os

    all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not all_files:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise FileNotFoundError("No CSV files found in %r" % path)

    frames = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]
    movies = pd.concat(frames, axis=0, ignore_index=True)

    print("Data shape:", movies.shape)
    print("Load completed!")
    print("---------\n")

    return movies

# Read all CSV files of the dataset folder into the DataFrame the next cells work on.
movies = load_dataset()

Data shape: (25141, 4)
Load completed!
---------



In [6]:
### Clean text function
def clean_text(text):
    """Lower-case ``text``, expand English contractions and strip punctuation.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The normalised sentence. Apostrophes that are not part of a handled
        contraction (e.g. possessives such as "john's") are left in place.
    """
    import re

    # Order matters: specific contractions ("won't", "can't") must be expanded
    # before the generic "n't" rule, and "n't" before the dropped-g rule.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),  # was wrongly expanded to "that is"
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        # Dropped-g forms ("goin'" -> "going"). Only match when the apostrophe
        # ends the word, otherwise possessives like "john's" become "johngs".
        (r"n'(?!\w)", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
        (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
    )

    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [7]:
### Pre-processing
# Build the (input, target) pairs. The model is trained to reproduce its input,
# so `target` starts as a copy of the `sentences` column.
# The chain returns a new frame instead of mutating in place, and the final
# `.copy()` makes the column assignments below operate on an independent frame
# rather than on a slice (which triggers pandas' chained-assignment warning).
movies = (
    movies
    .assign(target=movies['sentences'])
    .rename(columns={'sentences': 'input'})
    .dropna(axis=0, how='any')  # drop rows with a missing value in any column
    .iloc[:SAMPLES_SIZE]
    .copy()
)
print(movies.shape)

movies['input'] = movies['input'].apply(clean_text)
movies['target'] = movies['target'].apply(clean_text)
movies.head()

(20000, 5)


Unnamed: 0,input,actors,to_who,movie_name,target
0,please luke please please please,LORELAI,LUKE,"gilmore girls, season : 01, episode : 01",please luke please please please
1,how many cups have you had this morning,LUKE,LORELAI,"gilmore girls, season : 01, episode : 01",how many cups have you had this morning
2,none,LORELAI,LORELAI,"gilmore girls, season : 01, episode : 01",none
3,plus,LUKE,LUKE,"gilmore girls, season : 01, episode : 01",plus
4,five but yours is better,LORELAI,LORELAI,"gilmore girls, season : 01, episode : 01",five but yours is better


## Vectorizing and Tokenizing

In [8]:
### INPUTS
input_lines = list(movies.input)

# Fit a tokenizer on the input sentences and turn each one into a list of word ids.
tokenizer = preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(input_lines)
tokenized_input_lines = tokenizer.texts_to_sequences(input_lines)

# Longest tokenized sentence, reported for information only (padding uses MAX_LEN).
# Computed once after the lengths are collected instead of on every iteration.
length_list = [len(token_seq) for token_seq in tokenized_input_lines]
max_input_length = max(length_list, default=0)
print('Input max length is:', max_input_length)

# Pad with zeros at the end (or truncate) so every row has exactly MAX_LEN ids.
padded_input_lines = preprocessing.sequence.pad_sequences(tokenized_input_lines, maxlen=MAX_LEN, padding='post')
encoder_input_data = np.array(padded_input_lines)
print('Encoder input data shape:', encoder_input_data.shape)

# Keep only the first VOCAB_SIZE-1 entries of the tokenizer's word -> id map.
# NOTE(review): this assumes word_index is ordered most-frequent-first so the
# kept entries match the ids the tokenizer actually emits -- confirm for the
# installed Keras version.
input_word_dict = dict(itertools.islice(tokenizer.word_index.items(), VOCAB_SIZE - 1))
num_input_tokens = len(input_word_dict) + 1  # +1 for the padding id 0
print('Number of Input tokens:', num_input_tokens)

Input max length is: 280
Encoder input data shape: (20000, 20)
Number of Input tokens: 9000


In [9]:
### TARGETS
# Wrap every target sentence in start/end markers for the decoder.
# NOTE(review): the inference cell looks these markers up as 'bos' / 'eos',
# i.e. it relies on the tokenizer lower-casing and stripping '<' and '>' --
# confirm this matches the Tokenizer defaults of the installed version.
target_lines = ['<BOS> ' + line + ' <EOS>' for line in movies.target]

tokenizer = preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(target_lines)
tokenized_target_lines = tokenizer.texts_to_sequences(target_lines)

# Longest tokenized target, reported for information only (padding uses MAX_LEN).
# Computed once after the lengths are collected instead of on every iteration.
length_list = [len(token_seq) for token_seq in tokenized_target_lines]
max_target_length = max(length_list, default=0)
print('Target max length is:', max_target_length)

# Pad with zeros at the end (or truncate) so every row has exactly MAX_LEN ids.
padded_target_lines = preprocessing.sequence.pad_sequences(tokenized_target_lines, maxlen=MAX_LEN, padding='post')
decoder_input_data = np.array(padded_target_lines)
print('Decoder input data shape:', decoder_input_data.shape)


# Keep only the first VOCAB_SIZE-1 entries of the tokenizer's word -> id map.
target_word_dict = dict(itertools.islice(tokenizer.word_index.items(), VOCAB_SIZE - 1))
num_target_tokens = len(target_word_dict) + 1  # +1 for the padding id 0
print('Number of Target tokens:', num_target_tokens)

Target max length is: 282
Decoder input data shape: (20000, 20)
Number of Target tokens: 9000


In [10]:
### Prepare the decoder target data
# Teacher forcing: the target at time step t must be the decoder input at t+1.
# The targets are therefore derived by shifting the *already padded/truncated*
# decoder inputs one step to the left (the freed last position becomes padding).
#
# Truncating `token_seq[1:]` independently, as was done before, does not give
# that alignment for sentences longer than MAX_LEN: pad_sequences drops tokens
# from the front by default, so inputs and targets both ended up as the same
# last MAX_LEN tokens and the decoder could simply copy its input.
padded_target_lines = np.zeros_like(decoder_input_data)
padded_target_lines[:, :-1] = decoder_input_data[:, 1:]

# One-hot encode for the categorical cross-entropy loss.
# Resulting shape: (samples, MAX_LEN, VOCAB_SIZE).
decoder_target_data = utils.to_categorical(padded_target_lines, VOCAB_SIZE)


print('Decoder target data shape:', decoder_target_data.shape )

Decoder target data shape: (20000, 20, 9000)


In [None]:
### Define the embedding matrix
def embedding_matrix_creator(EMB_DIM, embeddings_index, word_index=None, vocab_size=None):
    """Build the initial weight matrix of an embedding layer from pre-trained vectors.

    Parameters
    ----------
    EMB_DIM : int
        Dimension of the embedding vectors (number of matrix columns).
    embeddings_index : dict
        Maps a word to its pre-trained vector.
    word_index : dict, optional
        Maps a word to its row index. Defaults to the notebook's
        ``target_word_dict``.
    vocab_size : int, optional
        Number of matrix rows. Defaults to the notebook's ``VOCAB_SIZE``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, EMB_DIM)``. Rows of words missing from
        ``embeddings_index`` stay all-zero.

    Raises
    ------
    ValueError
        If a pre-trained vector does not have ``EMB_DIM`` components.
    """
    if word_index is None:
        word_index = target_word_dict
    if vocab_size is None:
        vocab_size = VOCAB_SIZE

    embedding_matrix = np.zeros((vocab_size, EMB_DIM))
    for word, i in word_index.items():
        if i >= vocab_size:
            # Index falls outside the matrix; such words have no row.
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue  # words not found in embedding index will be all-zeros
        if len(embedding_vector) != EMB_DIM:
            raise ValueError(
                "Embedding for %r has %d dimensions, expected %d"
                % (word, len(embedding_vector), EMB_DIM)
            )
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

### Load GloVe
def embedding_master(glove_path=r'C:\Users\abdi\Desktop\2Meu\WordEmbeddings\glove.42B.300d.txt'):
    """Load GloVe vectors and return an embedding layer initialised with them.

    Parameters
    ----------
    glove_path : str, optional
        Path of the GloVe text file (one word followed by its coefficients per
        line). Defaults to the location originally hard-coded in this notebook.

    Returns
    -------
    tf.keras.layers.Embedding
        Trainable, zero-masking embedding layer of size (VOCAB_SIZE, EMB_DIM).

    Raises
    ------
    ValueError
        If the vectors in the file do not have ``EMB_DIM`` components
        (e.g. a 300-d file with ``EMB_DIM = 256``).
    """
    embeddings_index = {}

    # The `with` block closes the file; no explicit close() is needed.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print("Glove Loaded!")

    # Fail early with a readable message instead of a broadcasting error when
    # the first vocabulary word is copied into the matrix.
    if embeddings_index:
        glove_dim = len(next(iter(embeddings_index.values())))
        if glove_dim != EMB_DIM:
            raise ValueError(
                "GloVe vectors have %d dimensions but EMB_DIM is %d"
                % (glove_dim, EMB_DIM)
            )

    ### Define the embedding layer
    embedding_matrix = embedding_matrix_creator(EMB_DIM, embeddings_index)
    embed_layer = tf.keras.layers.Embedding(input_dim = VOCAB_SIZE,
                        output_dim = EMB_DIM,
                        input_length = MAX_LEN,
                        weights = [embedding_matrix], trainable =  True, mask_zero = True) # True for dynamic | False for static

    print("Embedded layer completed!")
    print("---------\n")

    return embed_layer

## Training Model

In [38]:
# --- Encoder: token ids -> embeddings -> LSTM; only the final states are kept.
encoder_inputs = layers.Input(shape=(None,))
encoder_embedding = layers.Embedding(VOCAB_SIZE, EMB_DIM, mask_zero=True)(encoder_inputs)
encoder_lstm = layers.LSTM(
    HIDDEN_DIM,
    return_state=True,
    recurrent_dropout=0.1,
    dropout=0.1,
)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# --- Decoder: starts from the encoder states and emits one distribution over
# the vocabulary per time step.
decoder_inputs = layers.Input(shape=(None,))
decoder_embedding = layers.Embedding(VOCAB_SIZE, EMB_DIM, mask_zero=True)(decoder_inputs)
decoder_lstm = layers.LSTM(
    HIDDEN_DIM,
    return_state=True,
    return_sequences=True,
    recurrent_dropout=0.1,
    dropout=0.1,
)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = layers.Dense(VOCAB_SIZE, activation=activations.softmax)
output = decoder_dense(decoder_outputs)

# --- Training model: (encoder tokens, decoder tokens) -> next-token distributions.
model = models.Model([encoder_inputs, decoder_inputs], output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    2304000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    2304000     input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
# Train with teacher forcing; the last 10% of the samples are held out for validation.
training_inputs = [encoder_input_data, decoder_input_data]
history = model.fit(
    training_inputs,
    decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

## Inference Model

In [None]:
# Encoder used at inference time: token ids -> [state_h, state_c].
encoder_model = models.Model(encoder_inputs, encoder_states)

# The decoder is run one step at a time, so its LSTM states are fed in as
# explicit inputs and returned as outputs after every step.
decoder_state_input_h = layers.Input(shape=(HIDDEN_DIM,))
decoder_state_input_c = layers.Input(shape=(HIDDEN_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuses the trained decoder LSTM and dense layers (shared weights).
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding,
    initial_state=decoder_states_inputs,
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = models.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states,
)

# Short aliases used by the inference cell below.
enc_model = encoder_model
dec_model = decoder_model

#enc_model.save( 'enc_model_B.h5' ) 
#dec_model.save( 'dec_model_B.h5' ) 
#model.save( 'model_B.h5' ) 

In [11]:
def encoding_input( sentence : str ):
    """Convert a raw sentence into a padded array of encoder token ids.

    The sentence is normalised with ``clean_text`` and split on whitespace.
    Words that are not in ``input_word_dict`` are skipped instead of raising
    ``KeyError``, so arbitrary user input can be encoded.

    Parameters
    ----------
    sentence : str
        Raw user sentence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, MAX_LEN)``, zero-padded at the end.
    """
    # clean_text already lower-cases, so no extra .lower() is needed.
    words = clean_text(sentence).split()

    tokens_list = [input_word_dict[word] for word in words if word in input_word_dict]
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen = MAX_LEN , padding='post')

In [39]:
### Inference
# Reverse lookup (token id -> word), built once so each decoding step is a
# dictionary access instead of a scan over the whole vocabulary.
index_to_word = {index: word for word, index in target_word_dict.items()}

# Ask for five sentences and greedily decode the model's output for each.
for _ in range(5):
    input_seq = input('Enter a sentence: ')
    states_values = enc_model.predict(encoding_input(input_seq))

    # The decoder is primed with the start-of-sequence token.
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = target_word_dict['bos']
    decoded_translation = ''
    decoded_vector = []

    # Hard cap on the number of decoding steps: previously the loop only
    # stopped when a *known* word was sampled, so a prediction that is not in
    # the vocabulary (e.g. the padding id 0) could keep it running forever.
    # MAX_LEN + 2 is the most steps the original needed when every step
    # produced a word.
    for _step in range(MAX_LEN + 2):
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))  ### Prediction Vector
        sampled_word = index_to_word.get(sampled_word_index)

        if sampled_word is not None:
            if sampled_word == 'eos' or len(decoded_translation.split()) > MAX_LEN:
                break
            decoded_translation += sampled_word + ' '
            decoded_vector.append(sampled_word_index)

        # Feed the sampled token and the updated states into the next step.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    print(color.GREEN + "Input:\n" + color.END, input_seq)
    print(color.BLUE + "Prediction:\n" + color.END, decoded_translation)
    print(color.BLUE + "Vector:\n" + color.END, decoded_vector)
    print("\n")

Enter a sentence: you are very awesome
[92mInput:
[0m you are very awesome
[94mPrediction:
[0m you are very rude 
[94mVector:
[0m [4, 13, 94, 1370]


Enter a sentence: oh you know what actually i am meeting someone
[92mInput:
[0m oh you know what actually i am meeting someone
[94mPrediction:
[0m oh you know what actually i am meeting someone 
[94mVector:
[0m [26, 4, 23, 18, 187, 3, 15, 564, 348]


Enter a sentence: oh you know what actually i am going home today
[92mInput:
[0m oh you know what actually i am going home today
[94mPrediction:
[0m oh you know what actually i am going to go 
[94mVector:
[0m [26, 4, 23, 18, 187, 3, 15, 69, 7, 44]


Enter a sentence: I am thinking about a new world
[92mInput:
[0m I am thinking about a new world
[94mPrediction:
[0m i am thinking about a new world 
[94mVector:
[0m [3, 15, 275, 45, 8, 182, 344]


Enter a sentence: I am very curious about the presents on this christmas
[92mInput:
[0m I am very curious about the presents 

In [21]:
def performance_plot(history):
    """Plot training and validation accuracy and loss per epoch.

    Parameters
    ----------
    history : object
        The object returned by ``model.fit``; its ``history`` attribute is a
        dict mapping metric names to per-epoch values.

    Notes
    -----
    The accuracy metric is looked up as ``'acc'`` and, if absent, as
    ``'accuracy'``, because the key name differs between Keras versions.
    """
    metrics = history.history
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'

    plt.plot(metrics[acc_key])
    plt.plot(metrics['val_' + acc_key])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

    plt.plot(metrics['loss'])
    plt.plot(metrics['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    
#performance_plot(history)   

In [12]:
def load_model(model_path='model_B.h5', enc_model_path='enc_model_B.h5', dec_model_path='dec_model_B.h5'):
    """Load the training, encoder and decoder models from HDF5 files.

    Parameters
    ----------
    model_path : str, optional
        File of the full training model. Defaults to ``'model_B.h5'``.
    enc_model_path : str, optional
        File of the inference encoder. Defaults to ``'enc_model_B.h5'``.
    dec_model_path : str, optional
        File of the inference decoder. Defaults to ``'dec_model_B.h5'``.

    Returns
    -------
    tuple
        ``(model, enc_model, dec_model)``.
    """
    model = tf.keras.models.load_model(model_path)
    enc_model = tf.keras.models.load_model(enc_model_path)
    dec_model = tf.keras.models.load_model(dec_model_path)

    return model, enc_model, dec_model

# Replace the in-memory model, enc_model and dec_model with the versions stored on disk.
model, enc_model, dec_model = load_model()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
