In [10]:
import os
import re
import time
import pickle
import json
import random
from random import seed, randrange

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
import sacrebleu
import bert_score
from tqdm import tqdm

# from models.LSTMBahdanau import Encoder, Decoder, Seq2Seq
# from models.BiLSTMLuong import Encoder, Decoder, Seq2Seq
from utils.tokenizer import Tokenizer, pad_sequences, respond, MyData
from utils.tokenizer import respond_only_lstm_attn, respond_only_lstm_no_attn, respond_only_gru_no_attn, respond_only_gru_attn
from utils.preprocess import preprocess_1, preprocess_2
from trainer import train, loss_function, sort_within_batch
from utils.evaluate import calculate_rouge, calculate_bertscore, calculate_bleu

# --- Paths -----------------------------------------------------------------
# Every location is derived from root_dir so that moving the project only
# requires editing this single line. Plain strings (not pathlib.Path) are kept
# on purpose: later cells build file names with `saved_model_path + "/..."`.
root_dir = '/home/alfirsafauzulh@student.ub.ac.id/Firsa/Research/Chatbot'

data_dir = f'{root_dir}/Datasets'
dailydialogs_root_dir = f'{data_dir}/dailydialog'
cornell_root_dir = f'{data_dir}/cornell_movie'
# Directory holding the tokenizer pickle and model checkpoints for this run.
saved_model_path = f'{root_dir}/Code/Train/saved_models/BiGRULuong-cornell'

# --- Reproducibility / hardware ---------------------------------------------
RANDOM_SEED = 1111
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
# Seed every random number generator this notebook imports (Python, NumPy and
# PyTorch), not only NumPy, so that a fresh "Restart & Run All" is repeatable.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Length threshold baked into the dataset file name; also drives max_len later.
th = 50

# Cornell Movie dialogue pairs; rows with any missing value are discarded.
# To evaluate on DailyDialog instead, read
# f'./Datasets/dailydialog/df_dailydialogs_max_{th}.csv'.
df = pd.read_csv(f'./Datasets/cornell_movie/df_cornell_max_{th}.csv').dropna()

In [12]:
# Restore the tokenizer object stored alongside the model checkpoints.
# NOTE(security): pickle.load can execute arbitrary code, so this file must
# come from a trusted source (here: the project's own saved-model directory).
tokenizer_file = saved_model_path + "/tokenizer.pickle"
with open(tokenizer_file, 'rb') as tokenizer_fh:
    tokenizer = pickle.load(tokenizer_fh)

In [13]:
# Padded sequence length: the length threshold plus two extra positions
# (presumably the start/end markers added by the tokenizer — TODO confirm).
max_len = th + 2


def _encode_column(texts):
    """Turn a Series of raw utterances into padded token-id sequences.

    The stages are chained so that each one consumes the previous stage's
    output: preprocess_1 -> preprocess_2 -> tokenizer.text_to_sequence ->
    pad_sequences(max_len). Previously preprocess_2 was applied to the raw
    column again, which silently threw away the preprocess_1 result.
    """
    return (
        texts
        .apply(preprocess_1)
        .map(preprocess_2)
        .map(tokenizer.text_to_sequence)
        .map(lambda token_ids: pad_sequences(token_ids, max_len))
    )


df['questions_preprocessed'] = _encode_column(df['questions'])
df['answers_preprocessed'] = _encode_column(df['answers'])

# Fixed random_state keeps the 80/20 split identical across runs.
df_train, df_test = train_test_split(df, test_size=.2, random_state=RANDOM_SEED)
print(f"Train\t: {len(df_train)}")
print(f"Test\t: {len(df_test)}")

Train	: 171350
Test	: 42838


In [14]:
# Architecture classes for this run; the module matches the checkpoint directory
# named in saved_model_path ("BiGRULuong-cornell"). The alternative model
# imports are left commented out in the first cell.
from models.BiGRULuong import Encoder, Decoder, Seq2Seq

In [19]:
# All vocabulary-sized dimensions are the tokenizer vocabulary plus 4 extra ids
# (presumably reserved special tokens — TODO confirm against Tokenizer).
vocab_len = len(tokenizer.vocab) + 4
input_size_encoder = vocab_len
input_size_decoder = vocab_len
output_size = vocab_len

# Hyperparameters. load_state_dict below requires the layer shapes built from
# these values to match the ones stored in the checkpoint.
encoder_embedding_size = 768
decoder_embedding_size = 768

hidden_size = 768
batch_size = 64
num_layers = 1
enc_dropout = 0.5
dec_dropout = 0.5

# Embeddings are created from scratch here (no pretrained matrix); their
# weights are then filled in from the checkpoint.
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size,
    num_layers, enc_dropout,
    pretrained_word_embedding=False, embedding_matrix=None, freeze=False,
).to(device)

decoder_net = Decoder(
    input_size_decoder, decoder_embedding_size, hidden_size,
    output_size, num_layers, dec_dropout,
    pretrained_word_embedding=False, embedding_matrix=None, freeze=False,
).to(device)

model = Seq2Seq(encoder_net, decoder_net, vocab_len=vocab_len)

# Load the best-loss checkpoint (an alternative "/model.pth" lives in the same
# directory). NOTE(security): torch.load unpickles the file, so only load
# checkpoints from a trusted source.
model.load_state_dict(torch.load(saved_model_path + "/best_loss.pth", map_location=device))
model.to(device)

# Switch to inference mode so the Dropout layers are disabled while generating
# responses. eval() returns the module, so the cell still displays its repr.
model.eval()



Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(45809, 768)
    (rnn): GRU(768, 768, dropout=0.5, bidirectional=True)
    (fc_hidden): Linear(in_features=1536, out_features=768, bias=True)
    (fc_encoder_states): Linear(in_features=1536, out_features=768, bias=True)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(45809, 768)
    (rnn): GRU(1536, 768, dropout=0.5)
    (fc): Linear(in_features=768, out_features=45809, bias=True)
    (W1): Linear(in_features=1536, out_features=768, bias=True)
    (W2): Linear(in_features=768, out_features=768, bias=True)
    (V): Linear(in_features=768, out_features=1, bias=True)
    (softmax): Softmax(dim=1)
  )
)

In [20]:
# Raw (un-tokenized) test utterances; the response helper receives plain text.
test_questions = df_test['questions'].values
test_answers = df_test['answers'].values

# Generate one response per test question.
# - torch.no_grad() avoids building autograd graphs during pure inference, in
#   case the helper does not already disable gradient tracking itself.
# - tqdm reports progress over the full test split.
# - max_len replaces the previously hard-coded 52 (th + 2) so the generation
#   limit stays in sync with the padding length.
with torch.no_grad():
    preds = [
        respond_only_gru_attn(model, str(question), tokenizer, tokenizer, device, max_length=max_len)
        for question in tqdm(test_questions, desc="Generating responses")
    ]

In [21]:
# BLEU of the generated responses against the reference answers; the result is
# a dict keyed '1-gram' .. '4-gram'. Kept in a variable so it can be reused,
# and echoed as the cell's last expression so it is still displayed.
bleu_scores = calculate_bleu(preds, test_questions, test_answers)
bleu_scores

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'1-gram': 0.045183853155937564,
 '2-gram': 0.008124208419023492,
 '3-gram': 0.0027972806307226413,
 '4-gram': 0.0010883742640694176}

In [22]:
# BERTScore of the generated responses against the reference answers; the
# result is a dict with precision ('p'), recall ('r') and F-score ('f').
# Kept in a variable for reuse and echoed so the cell still displays it.
bertscore_scores = calculate_bertscore(preds, test_answers)
bertscore_scores

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'p': 0.07786496728658676,
 'r': -0.03902457281947136,
 'f': 0.018437162041664124}