In [1]:
import numpy as np
import random as rd
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from rnn_ae import RNN_AE
from tensorflow.keras.utils import to_categorical

In [2]:
train_data_frame = pd.read_csv("c:\\Users\\1\\Downloads\\archive (20)\\train.csv")

In [3]:
print(train_data_frame.shape[0])

11118


In [4]:
dialogs = []
for question_n in range(0, train_data_frame.shape[0]):

    question = train_data_frame.iloc[question_n]["dialog"]
    dialogs.append(question)



In [5]:
questions = []
answers = []
for dialog in dialogs:

    dialog = dialog.split("\n")
    for (message_n, message) in enumerate(dialog):
        
        message = message.split()
        if message_n % 2 == 0:
            answers.append(message)
        
        else:
            questions.append(message)
        
        

In [6]:
question_tokenizer = Tokenizer()
answer_tokenizer = Tokenizer()

In [7]:
question_tokenizer.fit_on_texts(questions)
answer_tokenizer.fit_on_texts(answers)

In [8]:
questions_tokens = question_tokenizer.texts_to_sequences(questions)
answers_tokens = answer_tokenizer.texts_to_sequences(answers)

In [9]:
def project_shape(array, need_len):
    
    result_array = []

    for row in array:
    
        if len(row) > need_len:
            tmp_array = np.asarray(row[:need_len], dtype="int")
        
        else:
            
            tmp_array = []
            i = 0
            while len(tmp_array) < need_len:
                
                if i < len(row):
                    tmp_array.append(row[i])
                
                else:
                    i = 0
                    tmp_array.append(row[i])

                i += 1
            
        result_array.append(tmp_array)
    
    result_array = np.asarray(result_array, dtype="int")
    return result_array
                

In [10]:
def convert_dim(array, seq_len):
    
    result_tensor = []
    result_labels = []
    for row in array:

        sub_tensor = []
        sub_labels = []
        for i in range(len(row) - seq_len):
            
            
            sub_tensor.append(np.asarray(row[i: i + seq_len], dtype="int"))
            sub_labels.append(row[i + seq_len])

        sub_tensor = np.asarray(sub_tensor, dtype="int") 
        sub_labels = np.asarray(sub_labels, dtype="int")
        
        result_tensor.append(sub_tensor)
        result_labels.append(sub_labels)

    result_tensor = np.asarray(result_tensor, dtype="int")
    result_labels = np.asarray(result_labels, dtype="int")
    return (result_tensor, result_labels)



In [11]:
decoder_array = project_shape(array=answers_tokens, need_len=80)

In [12]:
decoder_train_tensor, decoder_train_labels = convert_dim(array=decoder_array, seq_len=40)
print(decoder_train_tensor.shape, decoder_train_labels.shape)

(40666, 40, 40) (40666, 40)


In [13]:
encoder_train_tensor = project_shape(array=questions_tokens, need_len=40)
print(encoder_train_tensor.shape)

(35450, 40)


In [14]:
params_json = {
        "run_folder": "c:\\Users\\1\\Desktop\\models_save\\RNN_AE_save",
        "embedding_dim": 456,
        
        "encoder_block": {
            
            "total_words_n": len(question_tokenizer.word_index),
            "lstm_block": {
                "LayerType": "lstm",
                "weights_init": {
                    "init_type": "random_normal",
                    "params": {
                        "mean": 0.0,
                        "stddev": 1.0
                    }
                },
                "params": {

                    "units": [251, 251, 251],
                    "activations": ["linear", "linear", "linear"],
                    "bi": [False, False, True],
                    "return_sequences": True
                }
            }
        },
        "decoder_block": {

            "total_words_n": len(answer_tokenizer.word_index),
            "lstm_block": {
                "LayerType": "lstm",
                "weights_init": {
                    "init_type": "random_normal",
                    "params": {
                        "mean": 0.0,
                        "stddev": 1.0
                    }
                },
                "params": {

                    "units": [251, 251, 251],
                    "activations": ["linear", "linear", "linear"],
                    "bi": [False, False, True],
                    "return_sequences": False
                }
            },

            "linear_block": {
                "LayerType": "dense",
                "weights_init": {
                    "init_type": "random_normal",
                    "params": {
                        "mean": 0.0,
                        "stddev": 1.0
                    }
                },
                "params": {
                    "units": [32, 128, len(answer_tokenizer.word_index)],
                    "activations": ["linear", "linear", "softmax"]
                }
            }
        }
    }

In [15]:
rnn_ae = RNN_AE(params_json=params_json)
rnn_ae.load_tokenizers(encoder_tokenizer=question_tokenizer, decoder_tokenizer=answer_tokenizer)

In [16]:
encoder_output = rnn_ae.encoder.predict(encoder_train_tensor[:40])
print(encoder_output.shape)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 579ms/step
(40, 40, 502)


In [17]:
random_idx = np.random.randint(0, min(encoder_train_tensor.shape[0], decoder_train_tensor.shape[0]), 1000)
encoder_train_tensor = encoder_train_tensor[random_idx]
decoder_train_tensor = decoder_train_tensor[random_idx]
decoder_train_labels = decoder_train_labels[random_idx]

In [18]:
random_sample = encoder_train_tensor[np.random.randint(0, 1000)]
input_tokens = random_sample.tolist()
input_text = question_tokenizer.sequences_to_texts([input_tokens])[0]
print(input_text)
generated_text = rnn_ae.generate_sequence(input_question=input_text, sequence_len=100, target_sequence_len=40)

" because i've already proved it . smoking's the easiest thing in the world to give up . i've done it hundreds of times ! "] " because i've already proved it . smoking's the easiest thing in the world
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 562ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 644ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 660ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m

In [19]:
print(generated_text)

laid-back declared flipping sunning you'recommending else's ther're insulted cameras copier fly bounce sensibility 
 whisper deep nandu satellite amounts eligibility gone over-adored yeah transoceanic nervous fit eggnog yo-yo 
"nice tablets club.do jealousy weeping phil would've mustard taylor mislaid learn slow medium-sized guidance sarah reynolds sunbath ends fell scheme 
"in warms checking alpha's intended civilizations conclusion dixon oriental gracious shill ok.take exceptions resting chairs mean.we cape towed


In [20]:
print(encoder_train_tensor.shape, decoder_train_tensor.shape, decoder_train_labels.shape)

(1000, 40) (1000, 40, 40) (1000, 40)


In [21]:
encoder_data = []
decoder_data = []
decoder_labels = []

for (question_seq_number, question_seq) in enumerate(encoder_train_tensor):

    for (answer_sequence, sequence_labels) in zip(decoder_train_tensor[question_seq_number], 
                                                  decoder_train_labels[question_seq_number]):
        
        encoder_data.append(question_seq)
        decoder_data.append(answer_sequence)
        decoder_labels.append(sequence_labels)

encoder_data = np.asarray(encoder_data)
decoder_data = np.asarray(decoder_data)
decoder_labels = np.asarray(decoder_labels)
decoder_labels = to_categorical(decoder_labels, num_classes=params_json["decoder_block"]["total_words_n"])
    

In [22]:
print(encoder_data.shape, decoder_data.shape, decoder_labels.shape)

random_idx = np.random.randint(0, min(encoder_data.shape[0], decoder_data.shape[1]), 2000)
encoder_data = encoder_data[random_idx]
decoder_data = decoder_data[random_idx]
decoder_labels = decoder_labels[random_idx]

(40000, 40) (40000, 40) (40000, 16810)


In [23]:
#rnn_ae.train_model(encoder_data, decoder_data, decoder_labels, batch_size=32, epochs=100)
rnn_ae.load_weights()

  saveable.load_own_variables(weights_store.get(inner_path))


In [24]:
random_sample = encoder_train_tensor[np.random.randint(0, 1000)]
input_tokens = random_sample.tolist()                                                                                                                                                                                                                                                                                                                                                  
input_text = question_tokenizer.sequences_to_texts([input_tokens])[0][:]
input_text = set([worn for worn in input_text.split()])
saved_shape = len(input_text)
input_text = " ".join(word for word in input_text)
print(type(input_text))


generated_text = rnn_ae.generate_sequence(input_question=input_text, sequence_len=100, target_sequence_len=100)



<class 'str'>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 633ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [25]:
sep_text = generated_text.replace("[", "\n")
print(sep_text)


'pardon full-service.and haggle.but high pop tracks easy-going preferably bunch markweed 
 
"mrs code 19 enormously
