In [159]:
import sys, os
# sys.path.append(os.path.abspath(os.path.join('..', 'config')))
sys.path.append(os.path.abspath(".."))

import re
import time
import pickle
import json
import random
from random import seed, randrange

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
import sacrebleu
import bert_score
from tqdm import tqdm

from utils.tokenizer import Tokenizer, pad_sequences, respond, MyData
from utils.tokenizer import respond_only_lstm_attn, respond_only_lstm_no_attn, respond_only_gru_no_attn, respond_only_gru_attn
from utils.preprocess import preprocess_1, preprocess_2
from trainer import train, loss_function, sort_within_batch
from utils.evaluate import calculate_rouge, calculate_bertscore, calculate_bleu

# Project root directory. Set the CHATBOT_ROOT_DIR environment variable to run
# the notebook on another machine; when it is unset (or empty) the original
# hard-coded location is used, so existing setups keep working unchanged.
root_dir = (
    os.environ.get('CHATBOT_ROOT_DIR')
    or '/home/alfirsafauzulh@student.ub.ac.id/Firsa/Research/Chatbot'
)

# Dataset locations, all derived from root_dir.
data_dir = f'{root_dir}/Datasets'
dailydialogs_root_dir = f'{data_dir}/dailydialog'
cornell_root_dir = f'{data_dir}/cornell_movie'

# Seed value reused wherever the notebook needs reproducible randomness.
RANDOM_SEED = 1111
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [160]:
# Apply the shared seed to every random number generator used in the notebook:
# Python's `random`, NumPy, PyTorch (CPU) and PyTorch (current CUDA device).
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(RANDOM_SEED)

In [161]:
# `th` selects which length-capped CSV is loaded ("..._max_{th}.csv").
th = 15
max_len = th + 2  # presumably leaves room for <sos>/<eos> markers — TODO confirm
text_columns = ('questions', 'answers')

# Load the dialogue pairs and drop rows with missing values.
df = pd.read_csv(f"{dailydialogs_root_dir}/df_dailydialogs_max_{th}.csv").dropna()

# NOTE(review): the preprocess_1 results stored here are overwritten further
# down by preprocess_2 applied to the raw columns — confirm whether this pass
# is still needed or whether preprocess_2 was meant to consume its output.
for column in text_columns:
    df[f'{column}_preprocessed'] = df[column].apply(preprocess_1)

# The vocabulary is built from the raw question and answer texts together.
tokenizer = Tokenizer(pd.concat([df[column] for column in text_columns], axis=0).values, min_freq=1)

for column in text_columns:
    df[f'{column}_preprocessed'] = df[column].map(preprocess_2)

# Run each preprocessed text through the tokenizer, then pad it to max_len.
for column in text_columns:
    target = f'{column}_preprocessed'
    df[target] = df[target].map(tokenizer.text_to_sequence)
    df[target] = df[target].map(lambda seq: pad_sequences(seq, max_len))

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
df_train, df_test = train_test_split(df, test_size=.2, random_state=RANDOM_SEED)
print(f"Train Data \t: {len(df_train)}")
print(f"Test Data\t: {len(df_test)}\n")

Train Data 	: 44164
Test Data	: 11042



In [162]:
from transformers import BertModel, BertTokenizer

# Extra markers registered as special tokens so the tokenizer keeps each one as
# a single token instead of splitting it into word pieces.
custom_special_tokens = ["<sos>", "<eos>", "<PAD>", "<UNK>"]
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_tokenizer.add_special_tokens({"additional_special_tokens": custom_special_tokens})

# output_hidden_states=True makes the model also return the per-layer hidden
# states, which later cells read from `outputs[2]`.
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# Grow the embedding table so the newly added special tokens have rows.
# NOTE(review): those new rows are presumably not pretrained — confirm this is
# acceptable for the embeddings extracted from them.
bert_model.resize_token_embeddings(len(bert_tokenizer))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(30526, 768)

In [163]:
# Wrap one sample question in the custom start/end markers and show how the
# BERT tokenizer splits it, alongside the vocabulary id of each piece.
text = df['questions'].values[333]
marked_text = " ".join(("<sos>", text, "<eos>"))
tokenized_text = bert_tokenizer.tokenize(marked_text)
indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)

for token, token_id in zip(tokenized_text, indexed_tokens):
    print(f'{token:<12} {token_id:>6,}')

<sos>        30,522
can           2,064
you           2,017
call          2,655
an            2,019
ambulance    10,771
<eos>        30,523


In [164]:
# Encode the marked sample sentence as a batch of one and run it through BERT
# in evaluation mode, without tracking gradients.
tokenized_text = bert_tokenizer.tokenize(marked_text)
indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
bert_model.eval()

with torch.no_grad():
    outputs = bert_model(tokens_tensor)
    last_hidden_state, hidden_states = outputs[0], outputs[2]

    # Option 1: the last hidden state used directly as word embeddings.
    word_embed_1 = last_hidden_state

    last_four = hidden_states[-4:]

    # Option 2: element-wise sum of the last four layers.
    word_embed = torch.stack(last_four).sum(0)

    # Option 3: last four layers concatenated along the feature axis,
    # ordered from the final layer backwards.
    word_embed_cat = torch.cat(last_four[::-1], dim=-1)

In [165]:
# Freeze every BERT parameter so that no gradients are tracked for them.
for weight in bert_model.parameters():
    weight.requires_grad_(False)

In [179]:
# Build a (vocab_size, 768) matrix whose row i holds a BERT-derived vector for
# the i-th word of the project tokenizer's vocabulary.
vocab = tokenizer.index2word
vocab_size = len(vocab)

pretrained_word_embedding_dimensions = 768
embedding_matrix = np.zeros((vocab_size, pretrained_word_embedding_dimensions))

# Words for which the BERT tokenizer returns no tokens; their rows stay zero.
words_without_embedding = []

# Pure inference: make eval mode and gradient-free execution explicit here
# instead of relying on earlier cells having set them.
bert_model.eval()
with torch.no_grad():
    for i, word in vocab.items():
        tokenized_text = bert_tokenizer.tokenize(word)
        if not tokenized_text:
            # An empty id list cannot be fed to BERT; keep the zero row.
            words_without_embedding.append(word)
            continue

        indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])

        outputs = bert_model(tokens_tensor)
        hidden_states = outputs[2]
        # Sum the last four layers, drop the batch axis -> (num_pieces, 768).
        word_embed = torch.stack(hidden_states[-4:]).sum(0).squeeze(0)
        # Average over word pieces; for a single piece this is that piece's vector.
        word_embed = word_embed.mean(dim=0)

        try:
            embedding_matrix[i] = word_embed.numpy()
        except (IndexError, ValueError, TypeError) as err:
            # Same best-effort behaviour as before (report and stop), but only
            # for storage errors, so interrupts are no longer swallowed.
            print(f"{word}-{word_embed.shape}: {err!r}")
            break

if words_without_embedding:
    print(f"Embedding not found for {len(words_without_embedding)} word(s): {words_without_embedding[:10]}")

In [221]:
def get_word_embed(word):
    """Return one BERT-derived embedding vector for ``word``.

    The word is split by ``bert_tokenizer`` and passed through ``bert_model``
    as a batch of one. The last four hidden-state layers are summed, and the
    resulting word-piece vectors are averaged into a single vector.

    The forward pass runs under ``torch.no_grad()``, so the result never
    tracks gradients. The model's train/eval mode is not changed here.

    Args:
        word: Text to embed.

    Returns:
        A tensor of shape ``(1, hidden_size)``.

    Raises:
        ValueError: If the tokenizer produces no tokens for ``word``.
    """
    tokenized_text = bert_tokenizer.tokenize(word)
    if not tokenized_text:
        raise ValueError(f"BERT tokenizer produced no tokens for {word!r}")

    indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    with torch.no_grad():
        outputs = bert_model(tokens_tensor)

    hidden_states = outputs[2]
    # Sum the last four layers, drop the batch axis -> (num_pieces, hidden_size).
    piece_embeds = torch.stack(hidden_states[-4:]).sum(0).squeeze(0)
    # Averaging a single piece returns that piece, so one expression covers both
    # single-piece and multi-piece words and always yields (1, hidden_size).
    return piece_embeds.mean(dim=0, keepdim=True)

In [222]:
# Cosine similarity computed along dim 1; eps guards against division by zero.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

In [223]:
# Analogy probe: compare the vector king - man + woman against queen.
king, man, woman = (get_word_embed(w) for w in ("king", "man", "woman"))
input1 = king - man + woman
input2 = get_word_embed("queen")
cos(input1, input2)

tensor([0.5409])

In [225]:
# Similarity probe between two closely related word forms.
input1, input2 = (get_word_embed(w) for w in ("who", "whom"))
cos(input1, input2)

tensor([0.6419])