# Initialization

In [None]:
from google.colab import drive
drive.mount("drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at drive


In [None]:
!pip install fairseq
!pip install transformers

In [None]:
%cd drive/My Drive/Colab Notebooks/chatbot

/content/drive/My Drive/Colab Notebooks/chatbot


# Body

In [None]:
bertmodel_name = "bert-base-multilingual-cased"

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import os
import numpy as np
import argparse
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
import time
import torch.nn.functional as F
import math

In [None]:
import torch
import torch.nn as nn
from transformers import BertConfig
from transformers import BertModel

class Net(nn.Module):
    def __init__(self, device='cpu', pretrain = True):
        super().__init__()
        config = BertConfig.from_pretrained(bertmodel_name)
        if pretrain:
            # roberta just a name, should be self.bert
            self.roberta = BertModel.from_pretrained(bertmodel_name)
        else:
            self.roberta = BertModel(config)

        self.fc = nn.Linear(768, 300)
        self.device = device

    def forward(self, x):
        if self.device == "cuda":
            x = x.to(self.device, non_blocking=True)
        else:
            x = x.to(self.device)
        
        if self.training:
            self.roberta.train()
            enc, _ = self.roberta(x)
        else:
            self.roberta.eval()
            with torch.no_grad():
                enc, _ = self.roberta(x)
        
        # CLS
        enc = enc[:, 0, :]

        result  = self.fc(enc)
        return result


In [None]:
import numpy as np
from tqdm import tqdm
import torch
from torch.utils import data
 
from fairseq.data import Dictionary
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(bertmodel_name)

# tokenizer.add_tokens("[SOC]")
pad_idx = tokenizer.pad_token_id
 
def preprocess(text, split_sep = " "):
    return (
        text.replace(".", " . ")
        .replace("_comma_", ",")
        .replace(". . .", "...")
        .replace(",", " , ")
        .replace(";", " ; ")
        .replace(":", " : ")
        .replace("!", " ! ")
        .replace("'", " ' ")
        .replace("?", " ? ")
        .replace("  ", " ")
        .replace("  ", " ")
        .strip()
        .lower() # adjust
        # .split(split_sep)
    )


def txt2vec(text, max_tokens_len):
   
    text = preprocess(text)
     
    subwords = '[CLS] ' + text + ' [SEP]'
    #text to tensor
    input_ids = torch.tensor(tokenizer.encode(subwords, add_special_tokens=False)).long().tolist()[:max_tokens_len]
    return torch.LongTensor([input_ids])

def batchify(batch):
        
        input_list = list(zip(*batch))
        contexts, next_ = [
            pad(ex, pad_idx) for ex in [input_list[0], input_list[1]]
        ]
        
        return contexts, next_, input_list[2]

def pad(tensors, padding_value=-1, max_len = 256):
    """
    Concatenate and pad the input tensors, which may be 1D or 2D.
    """

    max_len = max(t.size(-1) for t in tensors) 

    if tensors[0].dim() == 1:
        out = torch.LongTensor(len(tensors), max_len).fill_(padding_value)
        for i, t in enumerate(tensors):
            out[i, : t.size(0)] = t
        return out
    elif tensors[0].dim() == 2:
        max_width = max(t.size(0) for t in tensors)
        out = torch.LongTensor(len(tensors), max_width, max_len).fill_(padding_value)
        for i, t in enumerate(tensors):
            out[i, : t.size(0), : t.size(1)] = t
        return out
    else:
        raise ValueError("Input tensors must be either 1D or 2D!")



class EmpDataset(data.Dataset):
    def __init__(
        self,
        splitname,
        maxlen=256, # max number of tokens per sentence # chua su dung
        history_len=3,
    ):
        df = open(f"{splitname}.csv", "r" ,encoding="utf-8").readlines()
         
        self.max_hist_len = history_len
        self.data = []
        self.ids = []
        history = []
        for i in range(1, len(df)):
            cparts = df[i - 1].strip().split(",")
            sparts = df[i].strip().split(",")
            if cparts[0] == sparts[0]:

                history.append(cparts[5])
                
                idx = int(sparts[1])
                if (idx % 2) == 0:
                    
                    # SOC|SEP start of comment
                    sentence1 = " [SEP] ".join(history[-self.max_hist_len :]) 
                    sentence2 =  sparts[5]

                    self.data.append((txt2vec(sentence1, maxlen), txt2vec(sentence2, maxlen), sparts[2]))
                    self.ids.append((sparts[0], sparts[1]))
                    
            else:
                history = []
                    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
        
    def getid(self, index):
        return self.ids[index]
        


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
max_tokens_length = 256
 
class Args():
  def __init__(self):
    self.optimizer = "adamax"
    self.learning_rate = 1e-5
    self.stop_crit_num_epochs = 10
    self.batch_size = 20
    self.max_turn = 4
    self.epochs = 30
    self.hits_at_nb_cands = 100 # p@1,100
    self.display_iter = 100 # help="Frequency of train logging"
    self.log_file = "logs/bert.txt"
    self.model_file = "models/bert.pt"

option = Args()


In [None]:
train_dataset = EmpDataset(
  "ED/train",
  maxlen = max_tokens_length,
  history_len = option.max_turn,
)

dev_dataset = EmpDataset(
  "ED/valid",
  maxlen = max_tokens_length,
  history_len = option.max_turn,
)

train_iter = DataLoader(
  dataset     = train_dataset,
  batch_size  = option.batch_size,
  shuffle     = True,
  num_workers = 0,
  collate_fn  = batchify,
  pin_memory  = True,
)


dev_iter = DataLoader(
  dataset     = dev_dataset,
  batch_size  = option.batch_size,
  shuffle     = True,
  num_workers = 0,
  collate_fn  = batchify,
  pin_memory  = True,
)

In [None]:
torch.save(train_iter,"torch_pre_load/bert_train.pth")
torch.save(dev_iter,"torch_pre_load/bert_dev.pth")

In [None]:
# train_iter = torch.load("torch_pre_load/train_auto.pth")
# dev_iter = torch.load("torch_pre_load/dev_auto.pth")

In [None]:
import logging
import sys
import json

def get_logger(opt):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    fmt = logging.Formatter("%(asctime)s: [ %(message)s ]", "%m/%d/%Y %I:%M:%S %p")
    console = logging.StreamHandler()
    console.setFormatter(fmt)
    logger.handlers = []
    logger.addHandler(console)
    if opt.log_file:
        logfile = logging.FileHandler(opt.log_file, "a")
        logfile.setFormatter(fmt)
        logger.addHandler(logfile)
    command = " ".join(sys.argv)
    # logger.info(f"COMMAND: {command}")
    # logger.info("-" * 100)
    config = json.dumps(vars(opt), indent=4, sort_keys=True)
    logger.info(f"CONFIG:\n{config}")
    return logger
logger = get_logger(option)

09/08/2020 04:19:13 AM: [ CONFIG:
{
    "batch_size": 20,
    "display_iter": 100,
    "epochs": 30,
    "hits_at_nb_cands": 100,
    "learning_rate": 1e-05,
    "log_file": "logs/bert_auto.txt",
    "max_turn": 4,
    "model_file": "models/bert_auto.pt",
    "optimizer": "adamax",
    "stop_crit_num_epochs": 10
} ]


In [None]:
def score_candidates(all_context, all_cands, top_k=20, normalize=False):
    
    dot_products = all_context.mm(all_cands.t())  # [ctx, cand]
    if normalize:
        dot_products /= all_context.norm(2, dim=1).unsqueeze(1)
        dot_products /= all_cands.norm(2, dim=1).unsqueeze(0)
    scores, answers = dot_products.topk(top_k, dim=1)
    # Index of top-k items in decreasing order. Answers is of size [ctx, top_k]
    return scores, answers

def loss_fn(ctx, labels):
     
    batch_size = ctx.size(0)
    dot_products = ctx.mm(labels.t())
   
    log_prob = F.log_softmax(dot_products, dim=1)
    targets = log_prob.new_empty(batch_size).long()
    targets = torch.arange(batch_size, out=targets)
    loss = F.nll_loss(log_prob, targets)
    nb_ok = (log_prob.max(dim=1)[1] == targets).float().sum()
    return loss, nb_ok

def train(epoch, start_time, model, optimizer, opt, data_loader):
    """Run through one epoch of model training with the provided data loader."""
    model.train()
    # Initialize meters + timers
    train_loss = 0
    nb_ok = 0
    nb_exs = 0
    nb_losses = 0
    epoch_start = time.time()
    # Run one epoch
    for idx, ex in enumerate(data_loader, 1):
        params = [
            field
            if field is not None
            else None
            for field in ex
        ]
        ctx = model(params[0][:,0,:])
        cands = model(params[1][:,0,:])
        loss, ok = loss_fn(ctx, cands)
        nb_ok += ok
        nb_exs += ex[0].size(0)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.sum().item()
        nb_losses += 1
        if idx % opt.display_iter == 0 or idx == len(data_loader):
            avg_loss = train_loss / nb_losses
            acc = 100 * nb_ok / nb_exs
            elapsed = time.time() - start_time
            logging.info(
                f"train: Epoch = {epoch} | iter = {idx}/{len(data_loader)} | loss = "
                f"{avg_loss:.3f} | batch P@1 = {acc:.2f} % | elapsed time = "
                f"{elapsed:.2f} (s)"
            )
            train_loss = 0
            nb_losses = 0
    epoch_elapsed = time.time() - epoch_start
    logging.info(
        f"train: Epoch {epoch:d} done. Time for epoch = {epoch_elapsed:.2f} (s)"
    )


def validate(
    epoch,
    model,
    data_loader,   
    is_test=False,
    nb_candidates=100,
    shuffle="shuffled",
):
    model.eval()
    examples = 0
    eval_start = time.time()
    sum_losses = 0
    n_losses = 0
    correct = 0
    all_context = []
    all_cands = []
    n_skipped = 0
   
    for i, ex in enumerate(data_loader):
        batch_size = ex[0].size(0)
        
        params = [
            field
            if field is not None
            else None
            for field in ex
        ]
        # ctx, cands = model(*params)
        ctx = model(params[0][:,0,:])
        cands = model(params[1][:,0,:])
        all_context.append(ctx)
        all_cands.append(cands)
        loss, nb_ok = loss_fn(ctx, cands)
        sum_losses += loss
        correct += nb_ok
        n_losses += 1
        examples += batch_size
        
    n_examples = 0
    if len(all_context) > 0:
        logging.info("Processing candidate top-K")
        all_context = torch.cat(all_context, dim=0)  # [:50000]  # [N, 2h]
        all_cands = torch.cat(all_cands, dim=0)  # [:50000]  # [N, 2h]
        acc_ranges = [1, 3, 10]
        n_correct = {r: 0 for r in acc_ranges}
        for context, cands in list(
            zip(all_context.split(nb_candidates), all_cands.split(nb_candidates))
        )[:-1]:
            _, top_answers = score_candidates(context, cands)
            n_cands = cands.size(0)
            gt_index = torch.arange(n_cands, out=top_answers.new(n_cands, 1))
            for acc_range in acc_ranges:
                n_acc = (top_answers[:, :acc_range] == gt_index).float().sum()
                n_correct[acc_range] += n_acc
            n_examples += n_cands
        accuracies = {r: 100 * n_acc / n_examples for r, n_acc in n_correct.items()}
        avg_loss = sum_losses / (n_losses + 0.00001)
        avg_acc = 100 * correct / (examples + 0.000001)
        valid_time = time.time() - eval_start
        logging.info(
            f"Valid ({shuffle}): Epoch = {epoch:d} | avg loss = {avg_loss:.3f} | "
            f"batch P@1 = {avg_acc:.2f} % | "
            + f" | ".join(
                f"P@{k},{nb_candidates} = {v:.2f}%" for k, v in accuracies.items()
            )
            + f" | valid time = {valid_time:.2f} (s)"
        )
        return avg_loss
    return 10


# Train

In [None]:
net = Net(device)

if device == "cuda":
  torch.cuda.set_device(-1) # get the lastest device (GPU)
  net = torch.nn.DataParallel(net)
  net.cuda()

In [None]:
net = Net(device, False)

if device == "cuda":
  torch.cuda.set_device(-1) # get the lastest device (GPU)
  net = torch.nn.DataParallel(net)
  net.cuda()
  
# ctx_net.to(device)
net.load_state_dict(torch.load(option.model_file), strict = False)
net.eval()
 

In [None]:
###############
### Train #####
###############
    
if option.optimizer == "adamax":
    lr = option.learning_rate
    named_params_to_optimize = filter(
        lambda p: p[1].requires_grad, net.named_parameters()
    )
    params_to_optimize = (p[1] for p in named_params_to_optimize)
    optimizer = optim.Adamax(params_to_optimize, lr=lr)

    
start_time = time.time()
best_loss = float("+inf")

with torch.no_grad():
    validate(
        0,
        net,
        dev_iter,
        shuffle = False,
        nb_candidates=option.hits_at_nb_cands,
    )

 
for epoch in range(0, option.epochs):
    train(epoch, start_time, net, optimizer, option, train_iter)
    with torch.no_grad():
        loss = validate(
            epoch,
            net,
            dev_iter,
            nb_candidates=option.hits_at_nb_cands,
        )
        if loss < best_loss:
            best_loss = loss
            best_loss_epoch = epoch
            logging.info(f"New best loss, saving model to {option.model_file}")
            torch.save(net.state_dict(), f"{option.model_file}")
        # Stop if it's been too many epochs since the loss has decreased
        if option.stop_crit_num_epochs != -1:
            if epoch - best_loss_epoch >= option.stop_crit_num_epochs:
                break

09/08/2020 04:20:01 AM: [ Processing candidate top-K ]
09/08/2020 04:20:01 AM: [ Valid (False): Epoch = 0 | avg loss = 1.505 | batch P@1 = 54.91 % | P@1,100 = 33.65% | P@3,100 = 51.98% | P@10,100 = 72.47% | valid time = 29.49 (s) ]
09/08/2020 04:20:35 AM: [ train: Epoch = 0 | iter = 100/2012 | loss = 1.521 | batch P@1 = 54.50 % | elapsed time = 64.07 (s) ]
09/08/2020 04:21:10 AM: [ train: Epoch = 0 | iter = 200/2012 | loss = 1.518 | batch P@1 = 54.53 % | elapsed time = 98.48 (s) ]
09/08/2020 04:21:46 AM: [ train: Epoch = 0 | iter = 300/2012 | loss = 1.482 | batch P@1 = 54.05 % | elapsed time = 134.67 (s) ]
09/08/2020 04:22:21 AM: [ train: Epoch = 0 | iter = 400/2012 | loss = 1.470 | batch P@1 = 54.10 % | elapsed time = 170.03 (s) ]
09/08/2020 04:22:56 AM: [ train: Epoch = 0 | iter = 500/2012 | loss = 1.539 | batch P@1 = 53.87 % | elapsed time = 205.24 (s) ]
09/08/2020 04:23:32 AM: [ train: Epoch = 0 | iter = 600/2012 | loss = 1.545 | batch P@1 = 53.66 % | elapsed time = 241.25 (s) ]
09

# Demo

In [None]:
import numpy as np
import torch
from torch.utils import data

class TextData(data.Dataset):
    def __init__(
        self,
        splitname,
        maxlen=256, # max number of tokens per sentence # chua su dung
        history_len=3,
    ):
        
        df = open(f"{splitname}.csv", "r", encoding="utf-8").readlines()
         
        self.max_hist_len = history_len
        self.data = []
        self.ids = []
        history = []
        for i in range(1, len(df)):
            cparts = df[i - 1].strip().split(",")
            sparts = df[i].strip().split(",")
            if cparts[0] == sparts[0]:
                
                history.append(cparts[5].replace("_comma_", ","))
                idx = int(sparts[1])
                if ((idx % 2) == 0):
                    sentence1 = " <SOC> ".join(history[-self.max_hist_len :]) # SOC start of comment
                    sentence2 =  (sparts[5].replace("_comma_", ","))
                    
                    self.data.append((sentence1, sentence2, sparts[2]))
                    self.ids.append((sparts[0], sparts[1]))
                    
            else:
                history = []
                
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

    def getid(self, index):
        return self.ids[index]



In [None]:
text_cands = TextData("ED/train")

In [None]:
net = Net(device, False)

if device == "cuda":
  torch.cuda.set_device(-1) # get the lastest device (GPU)
  net = torch.nn.DataParallel(net)
  net.cuda()
  
# ctx_net.to(device)
net.load_state_dict(torch.load(option.model_file), strict = False)
net.eval()
 

DataParallel(
  (module): Net(
    (roberta): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (La

In [None]:
train_iter = torch.load("torch_pre_load/bert_train.pth")

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
import time
start_time = time.time()

net.eval()
all_cands = []

with torch.no_grad():
  for i, ex in enumerate(train_iter):
      batch_size = ex[0].size(0)
      params = [
          field
          if field is not None
          else None
          for field in ex
      ]

      cands = net(params[1][:,0,:])
      all_cands.append(cands)

  all_cands = torch.cat(all_cands, dim = 0)


end_time = time.time()
print("Total time load candidates: ", end_time - start_time)

Total time load candidates:  58.19445323944092


In [None]:
torch.save(all_cands, "torch_pre_load/bert_all_cands.pth")

In [None]:
all_cands = torch.load("torch_pre_load/bert_all_cands.pth")

In [None]:
 
def predict(context, top_n=5, normalize=False):
    """
    returns a list of top_n tuples ("sentence", "score")
    """
    context = txt2vec(context, max_tokens_length)
        
    with torch.no_grad():
        if device == "cuda":
            context = context.cuda(non_blocking=True)
       
        ctx = net(context)
        scores, index = score_candidates(ctx, all_cands, top_n, normalize)
        response = []
        for i, (score, index) in enumerate(zip(scores.squeeze(0), index.squeeze(0)), 1):
            response.append((text_cands[index][1], float(score)))
     
        return response

In [None]:
outs = predict("I am totally out of money ", 10)
for item in outs:
    print("Score: ", item[1], "\nResponse: ", item[0])

Score:  19.94879913330078 
Response:  Awesome! So you'd recommend it if I were looking for a new phone?
Score:  19.527507781982422 
Response:  How long have you been in school?
Score:  19.465566635131836 
Response:  What did you study?
Score:  19.106966018676758 
Response:  Was there a prize you wanted from it? Or just recognition?
Score:  18.963335037231445 
Response:  You brought yourself. I am a glass half full type of lady lol
Score:  18.95335578918457 
Response:  Were you home when it started?
Score:  18.83824920654297 
Response:  I can understand that. Is it better for you this year?
Score:  18.725860595703125 
Response:  Why do you say that?
Score:  18.608997344970703 
Response:  wow how do you do it?
Score:  18.585189819335938 
Response:  I've never even heard of that before!


In [None]:
index = 100
print("Sentence: ", text_cands[index][0])
print("Target: ", text_cands[index][1])
print(10*"*")

outs = predict(text_cands[index][0], 10)
for item in outs:
    print("Score: ", item[1], "\nResponse: ", item[0])

Sentence:  You are never going to believe what I did <SOC> What did you do?  <SOC> Well, I normally do not feel comfortable lending things to my friends, but recently I mustered up the trust to loan my friend my vehicle.
Target:  Ouch... Is it just for a day? Is your friend a safe driver?
**********
Score:  20.011789321899414 
Response:  Money can always come back but your health is the most important thing.
Score:  17.010778427124023 
Response:  I bet! Sounds like it would be a big shock
Score:  16.908485412597656 
Response:  Oh, you don't work with them anymore? Or maybe it was because you decided to be a stay-at-home mother? 
Score:  16.868776321411133 
Response:  What kind of tricks did he do
Score:  16.69251823425293 
Response:  That is disappointing. Did you confront them about it? I am sorry you have to deal with that.
Score:  16.647926330566406 
Response:  Oh man thats scary! What set it off?
Score:  16.316850662231445 
Response:  oh i hope she got better
Score:  16.30805778503

# Dependency

In [None]:
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

--2020-09-07 08:53:21--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2020-09-07 08:53:22 (83.9 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2020-09-07 08:53:22--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Saving to: ‘vi-vocab’


202

In [None]:
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz

--2020-09-07 08:55:21--  https://public.vinai.io/PhoBERT_base_transformers.tar.gz
Resolving public.vinai.io (public.vinai.io)... 13.224.157.37, 13.224.157.83, 13.224.157.54, ...
Connecting to public.vinai.io (public.vinai.io)|13.224.157.37|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322405979 (307M) [application/x-tar]
Saving to: ‘PhoBERT_base_transformers.tar.gz’


2020-09-07 08:55:33 (27.7 MB/s) - ‘PhoBERT_base_transformers.tar.gz’ saved [322405979/322405979]

PhoBERT_base_transformers/
PhoBERT_base_transformers/config.json
PhoBERT_base_transformers/bpe.codes
PhoBERT_base_transformers/model.bin
PhoBERT_base_transformers/dict.txt


# Evaluate for test set

In [None]:
net = Net(device, False)

if device == "cuda":
  torch.cuda.set_device(-1) # get the lastest device (GPU)
  ctx_net = torch.nn.DataParallel(ctx_net)
  ctx_net.cuda()
  
# ctx_net.to(device)
ctx_net.load_state_dict(torch.load(option.model_file), strict = False)
ctx_net.eval()
 

In [None]:
test_dataset = EmpDataset(
  "ED/test",
  maxlen = max_tokens_length,
  history_len = option.max_turn,
)

sh_test_iter = DataLoader(
  dataset     = test_dataset,
  batch_size  = option.batch_size,
  shuffle     = True,
  num_workers = 0,
  collate_fn  = batchify,
  pin_memory  = True,
)

un_test_iter = DataLoader(
  dataset     = test_dataset,
  batch_size  = option.batch_size,
  shuffle     = False,
  num_workers = 0,
  collate_fn  = batchify,
  pin_memory  = True,
)

In [None]:
validate(
  0,
  net,
  sh_test_iter,
  shuffle = True,
  nb_candidates = option.hits_at_nb_cands
)

09/08/2020 08:16:50 AM: [ Processing candidate top-K ]
09/08/2020 08:16:50 AM: [ Valid (True): Epoch = 0 | avg loss = 1.191 | batch P@1 = 64.99 % | P@1,100 = 41.65% | P@3,100 = 61.98% | P@10,100 = 82.37% | valid time = 28.66 (s) ]


tensor(1.1908, device='cuda:0', grad_fn=<DivBackward0>)

In [None]:
validate(
  0,
  net,
  un_test_iter,
  shuffle = False, 
  nb_candidates= option.hits_at_nb_cands,
)

09/08/2020 08:17:17 AM: [ Processing candidate top-K ]
09/08/2020 08:17:17 AM: [ Valid (False): Epoch = 0 | avg loss = 2.349 | batch P@1 = 42.46 % | P@1,100 = 29.54% | P@3,100 = 57.19% | P@10,100 = 80.73% | valid time = 26.90 (s) ]


tensor(2.3493, device='cuda:0', grad_fn=<DivBackward0>)

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()