In [1]:
import os
import re
import time
import pickle
import json
import random
from random import seed, randrange

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
import sacrebleu
import bert_score
from tqdm import tqdm

# from models.LSTMBahdanau import Encoder, Decoder, Seq2Seq
from models.BiGRU import Encoder, Decoder, Seq2Seq
from utils.tokenizer import Tokenizer, pad_sequences, respond_only, respond, MyData
from utils.preprocess import preprocess_1, preprocess_2
from trainer import train, loss_function, sort_within_batch

root_dir = '/home/alfirsafauzulh@student.ub.ac.id/Firsa/Research/Chatbot/'

data_dir = root_dir + '/Datasets'
dailydialogs_root_dir = data_dir + '/dailydialog'
cornell_root_dir = data_dir + '/cornell_movie'
RANDOM_SEED = 1111

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
np.random.seed(RANDOM_SEED)

th = 25
df = pd.read_csv(dailydialogs_root_dir + f'/df_dailydialogs_max_{th}.csv')
df = df.dropna()

In [3]:
tokenizer = Tokenizer(pd.concat([df['questions'], df['answers']], axis=0).values, min_freq=1)
print(f"vocabulary length : {len(tokenizer.vocab)}")

vocabulary length : 14814


In [4]:
max_len = th+2

# df['questions_preprocessed'] = df['questions'].map(lambda x: preprocess_2(x, tokenizer=tokenizer, max_len=max_len))
# df['answers_preprocessed'] = df['answers'].map(lambda x: preprocess_2(x, tokenizer=tokenizer, max_len=max_len))

df['questions_preprocessed'] = df['questions'].map(lambda x: preprocess_2(x))
df['answers_preprocessed'] = df['answers'].map(lambda x: preprocess_2(x))

df['questions_preprocessed'] = df['questions_preprocessed'].map(lambda x: tokenizer.text_to_sequence(x))
df['questions_preprocessed'] = df['questions_preprocessed'].map(lambda x: pad_sequences(x, max_len))

df['answers_preprocessed'] = df['answers_preprocessed'].map(lambda x: tokenizer.text_to_sequence(x))
df['answers_preprocessed'] = df['answers_preprocessed'].map(lambda x: pad_sequences(x, max_len))

In [5]:
df_train, df_test = train_test_split(df, test_size=.1, random_state=RANDOM_SEED)
print(f"Train\t: {len(df_train)}")
print(f"Test\t: {len(df_test)}")

Train	: 69529
Test	: 7726


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
input_size_encoder = len(tokenizer.vocab)+4
input_size_decoder = len(tokenizer.vocab)+4
output_size = len(tokenizer.vocab)+4
vocab_len = len(tokenizer.vocab)+4

# encoder_embedding_size = pretrained_word_embedding_dimensions
# decoder_embedding_size = pretrained_word_embedding_dimensions

encoder_embedding_size = 512
decoder_embedding_size = 512

hidden_size = 1280
batch_size = 64
num_layers = 1
enc_dropout = 0.5
dec_dropout = 0.5

input_tensor_train = df_train['questions_preprocessed'].values.tolist()
target_tensor_train = df_train['answers_preprocessed'].values.tolist()

input_tensor_test = df_test['questions_preprocessed'].values.tolist()
target_tensor_test = df_test['answers_preprocessed'].values.tolist()

train_data = MyData(input_tensor_train, target_tensor_train)
test_data = MyData(input_tensor_test, target_tensor_test)

train_dataset = DataLoader(train_data, batch_size = batch_size, drop_last=True, shuffle=True)
test_dataset = DataLoader(test_data, batch_size = batch_size, drop_last=True, shuffle=True)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, 
                  num_layers, enc_dropout, pretrained_word_embedding=False, embedding_matrix=None, freeze=False).to(device)

decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, 
                      output_size, num_layers, dec_dropout, pretrained_word_embedding=False, embedding_matrix=None, freeze=False).to(device)
    
model = Seq2Seq(encoder_net, decoder_net, vocab_len=vocab_len).to(device)



In [8]:
train(model=model, num_epochs=500, lr=0.0001, tokenizer=tokenizer, train_dataset=train_dataset, val_dataset=test_dataset, crit='CEL')

Train Seq2Seq Model [train_loss=2.4829:   4%|▍         | 41/1086 [00:07<03:18,  5.26it/s]


KeyboardInterrupt: 

In [None]:
# model = torch.load('model.pth')
# model = model.to('cuda')

test_questions = df_test['questions'].values
test_answers = df_test['answers'].values

preds = []
for x in test_questions:
    preds.append(respond_only(model, str(x), tokenizer, tokenizer, device='cuda', max_length=27))

In [None]:
from utils.evaluate import calculate_rouge, calculate_bertscore, calculate_bleu

In [None]:
calculate_bleu(preds, test_questions, test_answers)

In [14]:
calculate_bertscore(preds, test_answers)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'p': 0.1674077957868576, 'r': 0.11925370246171951, 'f': 0.143015056848526}