In [1]:
import numpy as np
import random as rd
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from rnn_ae import RNN_AE
from tensorflow.keras.utils import to_categorical

In [2]:
train_data_frame = pd.read_csv("c:\\Users\\1\\Downloads\\archive (20)\\train.csv")

In [3]:
print(train_data_frame.shape[0])

11118


In [4]:
dialogs = []
for question_n in range(0, train_data_frame.shape[0]):

    question = train_data_frame.iloc[question_n]["dialog"]
    dialogs.append(question)



In [5]:
questions = []
answers = []
for dialog in dialogs:

    dialog = dialog.split("\n")
    for (message_n, message) in enumerate(dialog):
        
        message = message.split()
        if message_n % 2 == 0:
            answers.append(message)
        
        else:
            questions.append(message)
        
        

In [6]:
question_tokenizer = Tokenizer()
answer_tokenizer = Tokenizer()

In [7]:
question_tokenizer.fit_on_texts(questions)
answer_tokenizer.fit_on_texts(answers)

In [8]:
questions_tokens = question_tokenizer.texts_to_sequences(questions)
answers_tokens = answer_tokenizer.texts_to_sequences(answers)

In [9]:
def project_shape(array, need_len):
    
    result_array = []

    for row in array:
    
        if len(row) > need_len:
            tmp_array = np.asarray(row[:need_len], dtype="int")
        
        else:
            
            tmp_array = []
            i = 0
            while len(tmp_array) < need_len:
                
                if i < len(row):
                    tmp_array.append(row[i])
                
                else:
                    i = 0
                    tmp_array.append(row[i])

                i += 1
            
        result_array.append(tmp_array)
    
    result_array = np.asarray(result_array, dtype="int")
    return result_array
                

In [10]:
def convert_dim(array, seq_len):
    
    result_tensor = []
    result_labels = []
    for row in array:

        sub_tensor = []
        sub_labels = []
        for i in range(len(row) - seq_len):
            
            
            sub_tensor.append(np.asarray(row[i: i + seq_len], dtype="int"))
            sub_labels.append(row[i + seq_len])

        sub_tensor = np.asarray(sub_tensor, dtype="int") 
        sub_labels = np.asarray(sub_labels, dtype="int")
        
        result_tensor.append(sub_tensor)
        result_labels.append(sub_labels)

    result_tensor = np.asarray(result_tensor, dtype="int")
    result_labels = np.asarray(result_labels, dtype="int")
    return (result_tensor, result_labels)



In [11]:
decoder_array = project_shape(array=answers_tokens, need_len=80)

In [12]:
decoder_train_tensor, decoder_train_labels = convert_dim(array=decoder_array, seq_len=40)
print(decoder_train_tensor.shape, decoder_train_labels.shape)

(40666, 40, 40) (40666, 40)


In [13]:
encoder_train_tensor = project_shape(array=questions_tokens, need_len=40)
print(encoder_train_tensor.shape)

(35450, 40)


In [14]:
params_json = {
        "run_folder": "c:\\Users\\1\\Desktop\\models_save\\RNN_AE_save",
        "embedding_dim": 456,
        "weights_init": {
            "mean": 0.0,
            "stddev": 1.0
        },
        "encoder_params": {
            "total_words_n": len(question_tokenizer.word_index),
            "lstm_params": {
                "layers_n": 3,
                "units": 215,
                "dropout_rate": 0.26
            }
        },
        "decoder_params": {
            "total_words_n": len(answer_tokenizer.word_index),
            "lstm_params": {
                "layers_n": 3,
                "units": 215,
                "dropout_rate": 0.26
            },
        }
    }

In [15]:
rnn_ae = RNN_AE(params_json=params_json)
rnn_ae.load_tokenizers(encoder_tokenizer=question_tokenizer, decoder_tokenizer=answer_tokenizer)

In [16]:
encoder_output = rnn_ae.encoder.predict(encoder_train_tensor[:40])
print(encoder_output.shape)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 409ms/step
(40, 40, 215)


In [17]:
random_idx = np.random.randint(0, min(encoder_train_tensor.shape[0], decoder_train_tensor.shape[0]), 1000)
encoder_train_tensor = encoder_train_tensor[random_idx]
decoder_train_tensor = decoder_train_tensor[random_idx]
decoder_train_labels = decoder_train_labels[random_idx]

In [18]:
random_sample = encoder_train_tensor[np.random.randint(0, 1000)]
input_tokens = random_sample.tolist()
input_text = question_tokenizer.sequences_to_texts([input_tokens])[0]
print(input_text)
generated_text = rnn_ae.generate_sequence(input_question=input_text, sequence_len=100, target_sequence_len=40)

' she must have been a part-time worker . they didn ’ t have much training . why didn ’ t you register then ? ' ' she must have been a part-time worker . they didn ’ t have
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 480ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 495ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 506ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━

In [19]:
print(generated_text)

[ hots hots discuss war war cards war 20tha hots war defender compensation hots jess war hots jess war ["tu discuss 20tha cards jess cards ["tu jess war capability jess war ok.let cards hots establishment war capability war discuss routes jess cards cards war war war cards cards jess scent cards discuss cards cards ["tu war cards cards ["tu war ["tu ["tu ["tu jess capability war war war cards ["tu establishment establishment cards ["tu cards succeeded ["tu ["tu cards cards cards jess cards cards defender cards establishment war axe cards establishment war cards ["tu projects ["tu ["tu ["tu war cards md


In [20]:
print(encoder_train_tensor.shape, decoder_train_tensor.shape, decoder_train_labels.shape)

(1000, 40) (1000, 40, 40) (1000, 40)


In [21]:
encoder_data = []
decoder_data = []
decoder_labels = []

for (question_seq_number, question_seq) in enumerate(encoder_train_tensor):

    for (answer_sequence, sequence_labels) in zip(decoder_train_tensor[question_seq_number], 
                                                  decoder_train_labels[question_seq_number]):
        
        encoder_data.append(question_seq)
        decoder_data.append(answer_sequence)
        decoder_labels.append(sequence_labels)

encoder_data = np.asarray(encoder_data)
decoder_data = np.asarray(decoder_data)
decoder_labels = np.asarray(decoder_labels)
decoder_labels = to_categorical(decoder_labels, num_classes=params_json["decoder_params"]["total_words_n"])
    

In [22]:
print(encoder_data.shape, decoder_data.shape, decoder_labels.shape)

random_idx = np.random.randint(0, min(encoder_data.shape[0], decoder_data.shape[1]), 2000)
encoder_data = encoder_data[random_idx]
decoder_data = decoder_data[random_idx]
decoder_labels = decoder_labels[random_idx]

(40000, 40) (40000, 40) (40000, 16810)


In [23]:
rnn_ae.train_model(encoder_data, decoder_data, decoder_labels, batch_size=32, epochs=100)
#rnn_ae.load_weights()

Epoch 1/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 319ms/step - loss: 4.7127
Epoch 2/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 316ms/step - loss: 2.8613
Epoch 3/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 329ms/step - loss: 2.7545
Epoch 4/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 337ms/step - loss: 2.7340
Epoch 5/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 338ms/step - loss: 2.6989
Epoch 6/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 327ms/step - loss: 2.6807
Epoch 7/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 325ms/step - loss: 2.6646
Epoch 8/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 327ms/step - loss: 2.6597
Epoch 9/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 328ms/step - loss: 2.6829
Epoch 10/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s

KeyboardInterrupt: 

In [None]:
random_sample = encoder_train_tensor[np.random.randint(0, 1000)]
input_tokens = random_sample.tolist()
input_text = question_tokenizer.sequences_to_texts([input_tokens])[0][:]
input_text = set([worn for worn in input_text.split()])
saved_shape = len(input_text)
input_text = " ".join(word for word in input_text)
print(type(input_text))


generated_text = rnn_ae.generate_sequence(input_question=input_text, sequence_len=100, target_sequence_len=100)



<class 'str'>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [None]:
print(generated_text)
print(input_text)

[ minutes minutes ' leave ' that in leave , leave minutes minutes about alright that " that way.we minutes , feel " ' , minutes minutes ' in minutes leave minutes leave " minutes feel way.we you " in you that ' minutes , ' in way.we minutes way.we minutes , leave ' " ' ' in minutes you in , ten you ' ' ' minutes in , , ten that minutes about ? ? alright ' minutes way.we ' " minutes " way.we ' " minutes " leave you ' minutes minutes in you leave " minutes that
from this , society the yes . just weekend i ' humane adopted her
