In [1]:
import os
import re
import time
import pickle
import json
import random
from random import seed, randrange

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
import sacrebleu
import bert_score
from tqdm import tqdm

# from models.LSTMBahdanau import Encoder, Decoder, Seq2Seq
# from models.BiLSTMLuong import Encoder, Decoder, Seq2Seq
from utils.tokenizer import Tokenizer, pad_sequences, respond
from utils.tokenizer import respond_only_lstm_attn, respond_only_lstm_no_attn, respond_only_gru_no_attn, respond_only_gru_attn, respond_only_lstm_attn_reg, respond_only_lstm_attn_reg_lm
from utils.preprocess import preprocess_1, preprocess_2
# from trainer import train, loss_function, sort_within_batch
from utils.evaluate import calculate_rouge, calculate_bertscore, calculate_bleu

# Root of the research workspace; the dataset folders below are derived from it.
root_dir = '/home/alfirsafauzulh@student.ub.ac.id/Firsa/Research/Chatbot'

data_dir = f'{root_dir}/Datasets'
dailydialogs_root_dir = f'{data_dir}/dailydialog'
cornell_root_dir = f'{data_dir}/cornell_movie'

# Directory of the run under evaluation (the tokenizer pickle and the model
# weights are read from here further down).
# saved_model_path = '/home/alfirsafauzulh@student.ub.ac.id/Firsa/Research/Chatbot/Code/autogen-chatbot-v2/saved_models/siet/LSTMSA-dailydialog-50-SbertFreeze'
saved_model_path = '/home/alfirsafauzulh/Firsa/autogen-chatbot-v2/saved_models/siet/LSTMBahdanauLM-dailydialog-100'

RANDOM_SEED = 42

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MyData(Dataset):
    """Dataset yielding ``(input, target, input_length)`` triples.

    The length of each input is computed once, at construction time, as the
    number of its entries that differ from 0.
    """

    def __init__(self, X, y):
        """Store the inputs ``X`` and targets ``y`` and pre-compute input lengths."""
        self.data = X
        self.target = y
        # TODO: convert this into torch code if possible
        self.length = [np.sum(np.not_equal(seq, 0)) for seq in X]

    def __getitem__(self, index):
        """Return the input, the target and the input length stored at ``index``."""
        return self.data[index], self.target[index], self.length[index]

    def __len__(self):
        """Return the number of stored inputs."""
        return len(self.data)

def loss_function(real, pred):
    """Masked loss that zeroes out padded target positions.

    Args:
        real: Tensor of target indices. Entries smaller than 1 (i.e. the 0
            used for padding) are masked out.
        pred: Predictions, passed together with ``real`` to the module-level
            ``criterion``.

    Returns:
        The mean of ``criterion(pred, real) * mask`` as a scalar tensor.

    Note:
        Depends on a global ``criterion`` that must be defined before the
        first call. If that criterion already reduces to a scalar, the result
        equals that scalar scaled by the fraction of unmasked targets.
    """
    # 1.0 for real tokens, 0.0 for padding. The mask is created on the device
    # of `real` rather than being forced to torch.cuda.FloatTensor, so the
    # function also works when the notebook falls back to the CPU.
    mask = real.ge(1).to(torch.float32)

    loss_ = criterion(pred, real) * mask
    return torch.mean(loss_)

### Order a batch by sequence length so it can be handed to pack_padded_sequence
def sort_within_batch(X, y, lengths):
    """Reorder a batch so that the longest sequences come first.

    Args:
        X: Batch of inputs, indexed along its first dimension.
        y: Batch of targets, indexed along its first dimension.
        lengths: 1-D tensor with one length per batch element.

    Returns:
        ``(X, y, lengths)`` rearranged by descending length. No transpose is
        performed here; callers do that themselves.
    """
    sorted_lengths, order = torch.sort(lengths, dim=0, descending=True)
    return X[order], y[order], sorted_lengths

In [3]:
# Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(RANDOM_SEED)

# NOTE(review): presumably the maximum utterance length of the corpus variant in use - confirm.
th = 100

# Other corpus files that have been used with this notebook:
# df = pd.read_csv(dailydialogs_root_dir + f'/df_dailydialogs_max_{th}.csv')
# df = pd.read_csv(f'./Datasets/dailydialog/df_dailydialogs_max_{th}.csv')
# df = pd.read_csv(f'./Datasets/cornell_movie/df_cornell_max_{th}.csv')
# Rows containing any missing value are discarded right after loading.
df = pd.read_csv('./Datasets/dailydialog/df_dailydialogs_factory.csv').dropna()

In [4]:
# Restore the tokenizer that was saved next to the model weights.
# NOTE: pickle can execute arbitrary code while loading - only open files from a trusted run directory.
tokenizer_path = saved_model_path + "/tokenizer.pickle"
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [5]:
# Fixed length every sequence is padded to.
# NOTE(review): the +2 presumably leaves room for start/end markers - confirm against the tokenizer.
max_len = th + 2

# Clean both text columns and turn them into fixed-length id sequences.
# preprocess_2 now runs on the output of preprocess_1. Previously it was applied
# to the raw column and overwrote the preprocess_1 result, so that first
# cleaning step had no effect at all.
# NOTE(review): confirm the checkpoint was trained on text cleaned by both steps;
# evaluation must use the same pipeline as training.
for raw_col, encoded_col in (('questions', 'questions_preprocessed'),
                             ('answers', 'answers_preprocessed')):
    cleaned = df[raw_col].apply(preprocess_1).map(preprocess_2)
    token_ids = cleaned.map(tokenizer.text_to_sequence)
    df[encoded_col] = token_ids.map(lambda ids: pad_sequences(ids, max_len))

# Hold out 10% of the rows for testing; the seed keeps the split stable across runs.
df_train, df_test = train_test_split(df, test_size=.1, random_state=RANDOM_SEED)
# df_train, df_val = train_test_split(df_train, test_size=.25, random_state=RANDOM_SEED)

print(f"Train Data \t: {len(df_train)}")
# print(f"Val Data \t: {len(df_val)}")
print(f"Test Data\t: {len(df_test)}\n")

Train Data 	: 80875
Test Data	: 8987



In [6]:
# from models.LSTMBahdanau import Encoder, Decoder, Seq2Seq
# from models.LSTMBahdanauImproved import Encoder, Decoder, Seq2Seq
# from models.LSTMSelfAttn import Encoder, Decoder, Seq2Seq
from models.LSTMBahdanauImprovedLM import Encoder, Decoder, Seq2Seq

In [7]:
# The encoder input, decoder input and output layer all share one vocabulary size.
# NOTE(review): the +4 presumably covers special tokens not counted in tokenizer.vocab - confirm.
vocab_len = len(tokenizer.vocab) + 4
input_size_encoder = vocab_len
input_size_decoder = vocab_len
output_size = vocab_len

# encoder_embedding_size = pretrained_word_embedding_dimensions
# decoder_embedding_size = pretrained_word_embedding_dimensions

encoder_embedding_size = 384
decoder_embedding_size = 384

hidden_size = 768
batch_size = 256
num_layers = 1
enc_dropout = 0.5
dec_dropout = 0.5

input_tensor_test = df_test['questions_preprocessed'].values.tolist()
target_tensor_test = df_test['answers_preprocessed'].values.tolist()

test_data = MyData(input_tensor_test, target_tensor_test)
# Evaluation loader: keep every sample and a fixed order. With drop_last=True the
# final partial batch was silently excluded from the reported test loss, and
# shuffling made the set of dropped samples depend on the RNG state.
test_dataset = DataLoader(test_data, batch_size=batch_size, shuffle=False, drop_last=False)

# Hyper-parameters must match the ones the checkpoint was trained with,
# otherwise load_state_dict below fails on mismatching shapes.
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size,
                      num_layers, enc_dropout, pretrained_word_embedding=False,
                      embedding_matrix=None, freeze=False).to(device)

decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size,
                      output_size, num_layers, dec_dropout, pretrained_word_embedding=False,
                      embedding_matrix=None, freeze=False).to(device)

model = Seq2Seq(encoder_net, decoder_net, vocab_len=vocab_len)
# map_location lets a checkpoint saved on a GPU be restored on whatever device is in use.
# model.load_state_dict(torch.load(saved_model_path + "/best_loss.pth", map_location=device))
model.load_state_dict(torch.load(saved_model_path + "/model.pth", map_location=device))
model.to(device)



RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR

In [None]:
# Targets equal to the padding id are ignored by the criterion.
pad_idx = tokenizer.word2index["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
# NOTE(review): this criterion uses the default (mean) reduction, so loss_function
# multiplies an already-reduced scalar by its padding mask - confirm that this is
# the same loss definition that was used during training.

# Average the masked loss over all test batches, without building autograd graphs.
with torch.no_grad():
    model.eval()
    test_batch_loss, test_num_batch = 0, 0
    for test_batch_idx, (X_test, y_test, input_len) in enumerate(test_dataset):
        # Longest sequences first.
        X, y, input_lengths = sort_within_batch(X_test, y_test, input_len)

        # Swap the two axes of the batch before it goes to the model
        # (presumably batch x seq -> seq x batch - confirm against the model).
        X = X.permute(1, 0)
        y = y.permute(1, 0)

        test_inp_data, test_target = X.to(device), y.to(device)

        # The model returns a pair; only its first element is scored here.
        output, _ = model(test_inp_data, test_target, input_lengths)

        # Drop index 0 along the first axis of predictions and targets, then
        # flatten both so every remaining position becomes one row.
        n_classes = output.shape[2]
        output = output[1:].reshape(-1, n_classes)
        test_target = test_target[1:].reshape(-1)

        test_loss = loss_function(test_target, output)
        test_batch_loss = test_batch_loss + test_loss
        test_num_batch = test_num_batch + 1

    test_loss_ = test_batch_loss / test_num_batch

    print(f"test_loss: {test_loss_}")

In [None]:
test_questions = df_test['questions'].values
test_answers = df_test['answers'].values

# Generate one reply per raw test question. The helper has to match the loaded
# architecture; utils.tokenizer provides sibling helpers for the other model
# variants (respond_only_lstm_attn, respond_only_lstm_no_attn,
# respond_only_lstm_attn_reg, respond_only_gru_attn, respond_only_gru_no_attn),
# which were previously called here with max_length=52.
preds = [
    respond_only_lstm_attn_reg_lm(model, str(question), tokenizer, tokenizer, device, max_length=102)
    for question in test_questions
]

In [None]:
# Score the generated replies with the project's BLEU helper; it receives the
# predictions together with the original questions and the reference answers.
calculate_bleu(preds, test_questions, test_answers)

In [None]:
# Score the generated replies against the reference answers with the project's BERTScore helper.
calculate_bertscore(preds, test_answers)

In [None]:
def respond(sentence, max_length=None):
    """Print one chat turn: the user's sentence followed by the model's reply.

    This definition shadows the ``respond`` helper imported from
    ``utils.tokenizer`` at the top of the notebook.

    Args:
        sentence: User utterance; it is converted with ``str`` before being
            handed to the generation helper.
        max_length: Length limit forwarded to the generation helper. When
            omitted, the notebook-level ``max_len`` is used, so interactive
            replies use the same limit as the test-set predictions instead of
            the previously hard-coded 52.
    """
    if max_length is None:
        max_length = max_len
    answer = respond_only_lstm_attn_reg_lm(model, str(sentence), tokenizer, tokenizer, device,
                                           max_length=max_length)
    print('Me\t:', sentence)
    print('Bot\t:', answer)
    print()

In [None]:
# Hand-picked prompts for a quick qualitative look at the model's replies.
sample_prompts = [
    'hi',
    'good morning',
    'how are you ?',
    'Nice to meet you',
    'Where do you live now ?',
    'Good bye',
    "Do you like football ?",
    "What is your favourite food ?",
    "Do you know john ?",
    "What is the matter ?",
    "See you again",
    "Who created you ?",
    "It’s a pleasure to meet you",
    "How’s it going ?",
    "Can you help me ?",
    "Hey",
    "I need your help",
    "Hello",
    "Are you a bot ?",
    "Hey Bot, how are you doing ?",
    "Bot, can you come and help me ?",
    "Would you please help me with my luggage ?",
    "Do you need anything else ?",
    "Ok, thank you",
    "I am worried",
    "Can we walk there ?",
    "Ok, see you then, bye",
    "I love you",
]

for prompt in sample_prompts:
    respond(prompt)