# A Baseline Implementation for SE125 Project 2

We provide a baseline model for conversation modeling using deep learning.


## 1. Libraries
In this section, we import third-party libraries to be used in this project.
You may need to install them using `pip`:
```
    pip install tqdm
    pip install cython
    pip install tables
    pip install tensorboardX
    ...
```

In [None]:
!pip install tqdm
!pip install cython
!pip install tables
!pip install tensorboardX
!pip install nltk

In [None]:
import numpy as np
import time
import os
import math
import sys
import tables
import json
import random
from tqdm import tqdm

import torch 
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import logging
# Module-level logger; basicConfig sets the root logger to DEBUG and prints
# only the bare message text (the richer format is kept below for reference).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG, format="%(message)s")#,format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")


## 2. Utilities

In this section we maintain utilities for model construction and training. 
Please put your own utility modules/functions in this section.

In [None]:
# Reserved vocabulary ids used throughout this file: padding, start-of-sentence,
# end-of-sentence and the fallback id for words missing from the vocabulary.
PAD_ID, SOS_ID, EOS_ID, UNK_ID = [0, 1, 2, 3]

def asHHMMSS(s):
    """Format a duration in seconds as an unpadded 'hours:minutes:seconds' string.

    Fractional seconds are truncated by the ``%d`` conversion.
    """
    total_minutes = math.floor(s / 60)
    seconds = s - total_minutes * 60
    hours = math.floor(total_minutes / 60)
    minutes = total_minutes - hours * 60
    return '%d:%d:%d' % (hours, minutes, seconds)

def timeSince(since, percent):
    """Return 'elapsed<remaining' for a task started at `since`.

    Args:
        since: start time as returned by time.time().
        percent: fraction of the work completed so far (must be non-zero);
            the total duration is extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s<%s' % (asHHMMSS(elapsed), asHHMMSS(remaining))

#######################################################################
import nltk
# Probe the tokenizer once at import time; if its data files are missing
# (LookupError), download the 'punkt' resource.
try: 
    nltk.word_tokenize("hello world")
except LookupError: 
    nltk.download('punkt')
    
def sent2indexes(sentence, vocab, maxlen):
    """Convert a sentence (or a list of sentences) to fixed-length word-id arrays.

    Args:
        sentence: a string, or a list of strings.
        vocab: dict mapping token -> integer id; tokens missing from it are
            encoded as UNK_ID.
        maxlen: length of every output row. Longer sentences are truncated,
            shorter ones are right-padded with PAD_ID.

    Returns:
        For a list input: ``(ids, lens)`` where ``ids`` is an int64 array of
        shape (n_sentences, maxlen) and ``lens`` has shape (n_sentences, 1)
        and holds the number of real (unpadded) tokens per row.
        For a single string: ``(ids[0], lens[0])`` of that one-element batch.
    """
    def convert_sent(sent, vocab, maxlen):
        idxes = np.full(maxlen, PAD_ID, dtype=np.int64)
        tokens = nltk.word_tokenize(sent.strip())
        idx_len = min(len(tokens), maxlen)
        idxes[:idx_len] = [vocab.get(tok, UNK_ID) for tok in tokens[:idx_len]]
        return idxes, idx_len

    if isinstance(sentence, list):
        if not sentence:
            # np.vstack rejects an empty sequence; return well-shaped empties instead.
            return (np.zeros((0, maxlen), dtype=np.int64),
                    np.zeros((0, 1), dtype=np.int64))
        inds, lens = [], []
        for sent in sentence:
            idxes, idx_len = convert_sent(sent, vocab, maxlen)
            inds.append(idxes)
            lens.append(idx_len)
        return np.vstack(inds), np.vstack(lens)

    inds, lens = sent2indexes([sentence], vocab, maxlen)
    return inds[0], lens[0]

def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
    """Turn word-id arrays back into space-joined sentences.

    Args:
        indexes: numpy array of word ids; 1-D for one sentence, otherwise
            iterated row by row as a batch.
        vocab: dict mapping token -> id (inverted internally).
        ignore_tok: id that is dropped from the output.

    Returns:
        ``(sentence, length)`` for a 1-D input, else ``(sentences, lengths)``
        as two parallel lists. Decoding of a row stops right after the first
        EOS_ID, which is itself included in the sentence and the length.
    """
    id2word = {idx: word for word, idx in vocab.items()}

    def decode_row(row):
        words = []
        for idx in row:
            if idx == ignore_tok:
                continue
            words.append(id2word[idx])
            if idx == EOS_ID:
                break
        return ' '.join(words), len(words)

    if indexes.ndim == 1:  # a single sentence
        return decode_row(indexes)

    # a batch of sentences
    decoded = [decode_row(row) for row in indexes]
    sentences = [text for text, _ in decoded]
    lens = [count for _, count in decoded]
    return sentences, lens
    
def save_model(model, epoch):
    """Save the model's state_dict to ./output/checkpoint_iter{epoch}.pkl.

    The output directory is created on demand so that saving does not fail
    on a fresh working directory.
    """
    ckpt_path = f'./output/checkpoint_iter{epoch}.pkl'
    os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
    torch.save(model.state_dict(), ckpt_path)
        
def load_model(model, epoch):
    """Load parameters from ./output/checkpoint_iter{epoch}.pkl into `model`.

    The checkpoint is deserialized onto the CPU first, so a checkpoint that
    was saved from a GPU can still be loaded on a CPU-only machine;
    load_state_dict then copies the values into the model's own parameters.
    """
    ckpt_path = f'./output/checkpoint_iter{epoch}.pkl'
    state = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(state)

############################################################################
def create_masks(question, reply_input, reply_target):
    """Build the attention/loss masks for one batch of id tensors.

    Id 0 is treated as padding in all three inputs.

    Args:
        question: (batch_size, max_words) source ids.
        reply_input: (batch_size, max_words) decoder input ids.
        reply_target: (batch_size, max_words) decoder target ids.

    Returns:
        question_mask: (batch_size, 1, 1, max_words), true at non-pad source
            positions.
        reply_input_mask: (batch_size, 1, max_words, max_words), true where
            the key position is non-pad AND not after the query position.
        reply_target_mask: (batch_size, max_words), true at non-pad targets.

    Every mask lives on the device of the tensor it was derived from; the
    function no longer relies on a module-level ``device`` variable.
    """
    def subsequent_mask(size):
        # Lower-triangular matrix: position i may attend to positions <= i.
        return torch.tril(torch.ones(size, size)).unsqueeze(0)

    question_mask = (question != 0).unsqueeze(1).unsqueeze(1)   # (batch_size, 1, 1, max_words)

    reply_input_mask = (reply_input != 0).unsqueeze(1)          # (batch_size, 1, max_words)
    # type_as converts the causal mask to the pad mask's dtype and device.
    causal = subsequent_mask(reply_input.size(-1)).type_as(reply_input_mask)
    reply_input_mask = (reply_input_mask & causal).unsqueeze(1)  # (batch_size, 1, max_words, max_words)

    reply_target_mask = reply_target != 0                        # (batch_size, max_words)

    return question_mask, reply_input_mask, reply_target_mask

class LossWithLS(nn.Module):
    """KL-divergence loss against label-smoothed targets, averaged over unmasked tokens.

    Args:
        size: number of classes (vocabulary size); must be > 1.
        smooth: total probability mass spread uniformly over the
            ``size - 1`` non-target classes; the target class receives
            ``1 - smooth``.
    """

    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        # reduction='none' is the non-deprecated spelling of
        # size_average=False, reduce=False: keep the element-wise loss.
        self.criterion = nn.KLDivLoss(reduction='none')
        self.confidence = 1.0 - smooth
        self.smooth = smooth
        self.size = size

    def forward(self, prediction, target, mask):
        """
        prediction of shape: (batch_size, max_words, vocab_size), log-probabilities
        target and mask of shape: (batch_size, max_words)

        Returns a scalar: the summed per-token loss divided by the number of
        unmasked tokens (0 when the mask selects no token at all).
        """
        # reshape (not view) so non-contiguous inputs are accepted as well
        prediction = prediction.reshape(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        target = target.reshape(-1)                                 # (batch_size * max_words)
        mask = mask.float().reshape(-1)                             # (batch_size * max_words)

        # Smoothed target distribution; built without gradient tracking.
        labels = torch.full_like(prediction.detach(), self.smooth / (self.size - 1))
        labels.scatter_(1, target.unsqueeze(1), self.confidence)

        loss = self.criterion(prediction, labels)    # (batch_size * max_words, vocab_size)
        # Guard the denominator so an all-padding batch yields 0 instead of NaN.
        n_tokens = mask.sum().clamp(min=1.0)
        return (loss.sum(1) * mask).sum() / n_tokens



## 3. Configuration
In this section, we configure the hyperparameters for the model.

In [None]:
def get_config():
    """Return a fresh dict holding the data, model and training hyperparameters."""
    # Data windowing
    conf = dict(
        maxlen=40,    # maximum utterance length
        diaglen=10,   # how many utterances are kept in the context window
    )

    # Model arguments
    conf.update(
        emb_size=200,      # size of word embeddings
        rnn_hid_utt=512,   # number of rnn hidden units for the utterance encoder
        rnn_hid_ctx=512,   # number of rnn hidden units for the context encoder
        rnn_hid_dec=512,   # number of rnn hidden units for the decoder
        n_layers=1,        # number of layers
        dropout=0.5,       # dropout applied to layers (0 = no dropout)
        teach_force=0.8,   # teacher-forcing setting for the decoder
    )

    # Training arguments
    conf.update(
        batch_size=64,
        epochs=10,     # maximum number of epochs
        lr=2e-4,       # learning rate
        beta1=0.9,     # beta1 for adam
        init_w=0.05,   # initial w
        clip=5.0,      # gradient clipping, max norm
    )

    # Transformer configuration
    conf.update(
        d_model=512,
        heads=8,
        num_layers=6,
    )
    return conf

## 4. Data Loader
A tool to load batches from the binarized (.h5) dataset

In [None]:
class DialogDataset(data.Dataset):
    """Dialog corpus read from a binarized HDF5 file.

    The file must provide a flat token array at ``/sentences`` and a table at
    ``/indices`` whose rows carry ``pos_utt``, ``ctx_len`` and ``res_len``:
    the response of a sample starts at ``pos_utt`` and spans ``res_len``
    tokens, its context is the ``ctx_len`` tokens right before it.

    Args:
        filepath: path of the .h5 file.
        max_ctx_len: number of context utterances returned per sample.
        max_utt_len: number of tokens per utterance / response.
    """

    def __init__(self, filepath, max_ctx_len=7, max_utt_len=40):
        self.max_ctx_len = max_ctx_len
        self.max_utt_len = max_utt_len

        print("loading data...")
        table = tables.open_file(filepath)
        try:
            # Both nodes are read fully into memory, so the file can be closed
            # right away. np.int64 replaces np.long, which is missing in
            # NumPy 1.24-1.26.
            self.data = table.get_node('/sentences')[:].astype(np.int64)
            self.index = table.get_node('/indices')[:]
        finally:
            table.close()
        self.data_len = self.index.shape[0]
        print("{} entries".format(self.data_len))

    def __getitem__(self, offset):
        """Return ``(context, context_len, utt_lens, floors, response, res_len)``.

        context: (max_ctx_len, max_utt_len) token ids, one utterance per row.
        context_len: number of real (non-padding) utterances in the context.
        utt_lens: (max_ctx_len,) utterance lengths, floor token not counted.
        floors: (max_ctx_len,) first token of every utterance row.
        response: (max_utt_len,) response ids without its first token.
        res_len: number of real tokens in `response`.
        """
        entry = self.index[offset]
        pos_utt, ctx_len, res_len = entry['pos_utt'], entry['ctx_len'], entry['res_len']
        ctx_arr = self.data[pos_utt-ctx_len:pos_utt]
        res_arr = self.data[pos_utt:pos_utt+res_len]

        ## split context array into utterances
        row_len = self.max_utt_len + 1   # +1 for the leading floor token
        context = []
        utt_lens = []
        utt = []
        for tok in ctx_arr:
            utt.append(tok)
            if tok == EOS_ID:
                if len(utt) < row_len:
                    utt_lens.append(len(utt)-1)   # floor is not counted in the utt length
                    utt.extend([PAD_ID]*(row_len-len(utt)))
                else:
                    utt = utt[:row_len]
                    utt[-1] = EOS_ID
                    utt_lens.append(self.max_utt_len)
                context.append(utt)
                utt = []
        if len(context) > self.max_ctx_len:   # truncate a long context, keeping the latest turns
            context = context[-self.max_ctx_len:]
            utt_lens = utt_lens[-self.max_ctx_len:]
        context_len = len(context)

        # pad a short context with dummy rows: [floor, <sos>, <eos>, <pad>, <pad> ...]
        for _ in range(len(context), self.max_ctx_len):
            context.append([0, SOS_ID, EOS_ID]+[PAD_ID]*(self.max_utt_len-2))
            utt_lens.append(2)   # <s> and </s>
        context = np.array(context)
        utt_lens = np.array(utt_lens)
        floors = context[:, 0]
        context = context[:, 1:]

        ## Padding ##
        response = res_arr[1:]
        if len(response) < self.max_utt_len:
            res_len = len(response)
            response = np.append(response, [PAD_ID]*(self.max_utt_len-len(response)))
        else:
            # Copy before writing <eos>: `response` is a view into self.data,
            # and an in-place write would permanently corrupt the corpus.
            response = response[:self.max_utt_len].copy()
            response[-1] = EOS_ID
            res_len = self.max_utt_len

        return context, context_len, utt_lens, floors, response, res_len

    def __len__(self):
        return self.data_len
    

def load_dict(filename):
    """Parse and return the JSON document stored on the first line of `filename`."""
    # Context manager so the file handle is released deterministically.
    with open(filename, "r") as fin:
        return json.loads(fin.readline())

def load_vecs(fin):
    """Read the ``vecs`` node of an HDF5 file into an in-memory numpy array.

    Returns an array with the same shape and dtype as the stored node.
    """
    h5f = tables.open_file(fin)
    try:
        h5vecs = h5f.root.vecs
        vecs = np.zeros(shape=h5vecs.shape, dtype=h5vecs.dtype)
        vecs[:] = h5vecs[:]
    finally:
        # Close the file even when the node is missing or the read fails.
        h5f.close()
    return vecs

## 5. Models
Define your model (including its dependent sub-modules) here. 

In [None]:
import torch.nn.init as weight_init
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F

class Embeddings(nn.Module):
    """
    Implements embeddings of the words and adds their positional encodings.

    Args:
        vocab_size: number of rows of the embedding table.
        d_model: embedding dimension.
        max_len: longest sequence the positional table covers; feeding a
            longer sequence to forward() fails when the encodings are added.
    """
    def __init__(self, vocab_size, d_model, max_len = 50):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
        # Plain attribute (neither parameter nor buffer) so state_dict keys
        # stay unchanged; forward() moves it to the input's device when needed.
        self.pe = self.create_positinal_encoding(max_len, self.d_model)
        self.dropout = nn.Dropout(0.1)

    def create_positinal_encoding(self, max_len, d_model):
        """Return a (1, max_len, d_model) tensor of sinusoidal position codes.

        NOTE(review): the exponents use 2*i with i already stepping by 2,
        which differs from the formula in "Attention Is All You Need". It is
        left untouched because changing it alters what trained models see.
        """
        pe = torch.zeros(max_len, d_model)
        for pos in range(max_len):   # for each position of the word
            for i in range(0, d_model, 2):   # for each pair of dimensions
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                if i + 1 < d_model:   # guard so an odd d_model does not overflow
                    pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        return pe.unsqueeze(0)   # include the batch dimension

    def forward(self, encoded_words):
        """Map (batch_size, seq_len) ids to (batch_size, seq_len, d_model) embeddings."""
        embedding = self.embed(encoded_words) * math.sqrt(self.d_model)
        if self.pe.device != embedding.device:
            # Move (and cache) the table on the device the model runs on
            # instead of depending on a global `device` variable.
            self.pe = self.pe.to(embedding.device)
        # pe broadcasts over the batch dimension; out-of-place add keeps autograd safe
        embedding = embedding + self.pe[:, :embedding.size(1)]
        return self.dropout(embedding)

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention over `heads` parallel heads.

    Args:
        heads: number of attention heads; must divide d_model.
        d_model: model width; every head works on d_model // heads features.
    """

    def __init__(self, heads, d_model):
        super(MultiHeadAttention, self).__init__()
        assert d_model % heads == 0
        self.heads = heads
        self.d_k = d_model // heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """(batch_size, len, d_model) -> (batch_size, heads, len, d_k)."""
        per_head = projected.view(projected.shape[0], -1, self.heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, query, key, value, mask):
        """
        query, key, value of shape: (batch_size, max_len, d_model)
        mask: broadcastable to (batch_size, heads, query_len, key_len);
              positions where it equals 0 receive no attention.
        returns: (batch_size, query_len, d_model)
        """
        # project the inputs and split them into heads
        q = self._split_heads(self.query(query))
        k = self._split_heads(self.key(key))
        v = self._split_heads(self.value(value))

        # attention scores: (batch_size, heads, query_len, key_len)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        scores = scores.masked_fill(mask == 0, -1e9)

        # turn scores into weights and apply dropout to them
        weights = self.dropout(F.softmax(scores, dim=-1))

        # weighted sum of the values, then merge the heads again
        context = torch.matmul(weights, v)              # (batch_size, heads, query_len, d_k)
        merged = context.transpose(1, 2).contiguous()   # (batch_size, query_len, heads, d_k)
        merged = merged.view(context.shape[0], -1, self.heads * self.d_k)

        # final output projection
        return self.concat(merged)

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: d_model -> middle_dim -> d_model."""

    def __init__(self, d_model, middle_dim = 2048):
        super(FeedForward, self).__init__()

        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Apply Linear -> ReLU -> Dropout -> Linear over the last dimension."""
        hidden = self.fc1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalization.

    A single LayerNorm instance is reused for both sub-layers.
    """

    def __init__(self, d_model, heads):
        super(EncoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, mask):
        """Encode `embeddings`; `mask` is handed to the self-attention unchanged."""
        # self-attention sub-layer
        attended = self.self_multihead(embeddings, embeddings, embeddings, mask)
        attended = self.layernorm(self.dropout(attended) + embeddings)

        # feed-forward sub-layer
        transformed = self.dropout(self.feed_forward(attended))
        return self.layernorm(transformed + attended)

class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output, and feed-forward; each followed by dropout, a residual connection
    and layer normalization.

    A single LayerNorm instance is reused for all three sub-layers.
    """

    def __init__(self, d_model, heads):
        super(DecoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, encoded, src_mask, target_mask):
        """Decode `embeddings` while attending to the encoder output `encoded`."""
        # self-attention over the target sequence
        self_attended = self.self_multihead(embeddings, embeddings, embeddings, target_mask)
        query = self.layernorm(self.dropout(self_attended) + embeddings)

        # attention over the encoder output
        cross_attended = self.src_multihead(query, encoded, encoded, src_mask)
        interacted = self.layernorm(self.dropout(cross_attended) + query)

        # feed-forward sub-layer
        transformed = self.dropout(self.feed_forward(interacted))
        return self.layernorm(transformed + interacted)

# Implement transformer
class MyModel(nn.Module):
    """Encoder-decoder transformer emitting log-probabilities over the vocabulary.

    Reads 'd_model', 'heads' and 'num_layers' from `config`. The same
    Embeddings module embeds both source and target words.

    NOTE(review): train() and evaluate() in this file call train_batch(),
    valid() and sample() on the model; none of them is defined here.
    """

    def __init__(self, config, vocab_size):
        super(MyModel, self).__init__()

        self.d_model = config['d_model']
        self.vocab_size = vocab_size
        self.embed = Embeddings(self.vocab_size, self.d_model)
        self.heads = config['heads']
        self.num_layers = config['num_layers']
        self.encoder = nn.ModuleList(
            [EncoderLayer(self.d_model, self.heads) for _ in range(self.num_layers)])
        self.decoder = nn.ModuleList(
            [DecoderLayer(self.d_model, self.heads) for _ in range(self.num_layers)])
        self.logit = nn.Linear(self.d_model, self.vocab_size)

    def encode(self, src_words, src_mask):
        """Embed the source ids and run them through every encoder layer."""
        hidden = self.embed(src_words)
        for enc_layer in self.encoder:
            hidden = enc_layer(hidden, src_mask)
        return hidden

    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        """Embed the target ids and run them through every decoder layer."""
        hidden = self.embed(target_words)
        for dec_layer in self.decoder:
            hidden = dec_layer(hidden, src_embeddings, src_mask, target_mask)
        return hidden

    def forward(self, src_words, src_mask, target_words, target_mask):
        """Return log-softmax scores over the vocabulary (taken over dim 2)."""
        memory = self.encode(src_words, src_mask)
        hidden = self.decode(target_words, target_mask, memory, src_mask)
        scores = self.logit(hidden)
        return F.log_softmax(scores, dim = 2)

## 6. Evaluation
We provide the evaluation script as well as the BLEU score metric. 

**Do not change code in this block**

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from collections import Counter

class Metrics:
    """
    Container for the BLEU-based similarity metric used by evaluate().
    """
    def __init__(self):
        super(Metrics, self).__init__()

    def sim_bleu(self, hyps, ref):
        """
        Score every hypothesis against one reference with smoothed 4-gram BLEU.

        :param hyps - a list of hypotheses, each a list of tokens
        :param ref - a list of tokens of the reference
    
        :return maxbleu - recall bleu (best score over the hypotheses)
        :return avgbleu - precision bleu (mean score over the hypotheses)
        """
        scores = []
        for hyp in hyps:
            try:
                scores.append(sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method7,
                                        weights=[1./4, 1./4, 1./4, 1./4]))
            except:
                # any failure inside sentence_bleu counts as a zero score
                scores.append(0.0)
        return np.max(scores), np.mean(scores)
    
def evaluate(model, metrics, test_loader, vocab, repeat, f_eval):
    """Sample responses from `model` and score them with BLEU.

    For every batch of `test_loader`, `repeat` is passed to model.sample()
    and the returned samples are compared with the reference response of the
    FIRST item of the batch only (the callers in this file use batch_size=1).
    The loop stops once more than 5000 batches have been processed.

    Args:
        model: must provide sample(context, context_lens, utt_lens, repeat)
            returning (sample_words, sample_lens).
        metrics: object with a sim_bleu(hyps, ref) method.
        test_loader: iterable yielding (context, context_lens, utt_lens,
            floors, response, res_lens) batches.
        vocab: dict mapping token -> id.
        repeat: forwarded to model.sample().
        f_eval: writable text file for per-dialog results, or None.

    Returns:
        dict with 'avg_len', 'recall_bleu', 'prec_bleu' and 'f1_bleu'.
    """
    ivocab = {v: k for k, v in vocab.items()}
    # run on whatever device the model's parameters live on
    device = next(model.parameters()).device
    
    recall_bleus, prec_bleus, avg_lens  = [], [], []
        
    dlg_id = 0
    for context, context_lens, utt_lens, floors, response, res_lens in tqdm(test_loader): 
        
        if dlg_id > 5000: break
        
#        max_ctx_len = max(context_lens)
        max_ctx_len = context.size(1)
        context, utt_lens, floors = context[:,:max_ctx_len,1:], utt_lens[:,:max_ctx_len]-1, floors[:,:max_ctx_len] 
                         # remove empty utts and the sos token in the context and reduce the context length
        # keep the pre-transfer tensors for writing the text report below
        ctx, ctx_lens = context, context_lens
        context, context_lens, utt_lens \
            = [tensor.to(device) for tensor in [context, context_lens, utt_lens]]

#################################################
        # clamp lengths to at least 1
        utt_lens[utt_lens<=0]=1
#################################################
        
        with torch.no_grad():
            sample_words, sample_lens = model.sample(context, context_lens, utt_lens, repeat)
        # nparray: [repeat x seq_len]       
        
        pred_sents, _ = indexes2sent(sample_words, vocab)
        pred_tokens = [sent.split(' ') for sent in pred_sents]   
        # the reference is decoded with SOS_ID (not PAD_ID) as the ignored token
        ref_str, _ =indexes2sent(response[0].numpy(), vocab, SOS_ID)
        #ref_str = ref_str.encode('utf-8')
        ref_tokens = ref_str.split(' ')
        
        max_bleu, avg_bleu = metrics.sim_bleu(pred_tokens, ref_tokens)
        recall_bleus.append(max_bleu)
        prec_bleus.append(avg_bleu)
        
        avg_lens.append(np.mean(sample_lens))

        response, res_lens = [tensor.to(device) for tensor in [response, res_lens]]
        
        ## Write concrete results to a text file
        dlg_id += 1 
        if f_eval is not None:
            f_eval.write("Batch {:d} \n".format(dlg_id))
            # print the context (at most the last 5 utterances)
            start = np.maximum(0, ctx_lens[0]-5)
            for t_id in range(start, ctx_lens[0], 1):
                # NOTE(review): for 1-D input indexes2sent returns a
                # (sentence, length) tuple, so a tuple is formatted here.
                context_str = indexes2sent(ctx[0, t_id].numpy(), vocab)
                f_eval.write("Context {:d}-{:d}: {}\n".format(t_id, floors[0, t_id], context_str))
            #print the ground truth response    
            f_eval.write("Target >> {}\n".format(ref_str.replace(" ' ", "'")))
            for res_id, pred_sent in enumerate(pred_sents):
                f_eval.write("Sample {:d} >> {}\n".format(res_id, pred_sent.replace(" ' ", "'")))
            f_eval.write("\n")
    prec_bleu= float(np.mean(prec_bleus))
    recall_bleu = float(np.mean(recall_bleus))
    # the small constant in the f1 denominator avoids division by zero
    result = {'avg_len':float(np.mean(avg_lens)),
              'recall_bleu': recall_bleu, 'prec_bleu': prec_bleu, 
              'f1_bleu': 2*(prec_bleu*recall_bleu) / (prec_bleu+recall_bleu+10e-12),
             }
    
    if f_eval is not None:
        for k, v in result.items():
            f_eval.write(str(k) + ':'+ str(v)+' ')
        f_eval.write('\n')
    print("Done testing")
    print(result)
    
    return result


## 7. Training
The training script is defined here.

In [None]:
import argparse
from datetime import datetime
from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package

def train(args, model=None, pad = 0):
    """Train a dialog model, logging progress and periodically evaluating it.

    Args:
        args: parsed argument namespace; reads seed, gpu_id, visual,
            data_path, dataset, model, reload_from, log_every, valid_every
            and eval_every.
        model: optional pre-built model; when None a MyModel is created from
            get_config(). The model must provide
            train_batch(context, context_lens, utt_lens, response, res_lens)
            returning a dict that contains 'train_loss', plus the sample()
            method that evaluate() calls.
        pad: unused; kept so existing callers keep working.

    Returns:
        The model, moved to the selected device.

    Side effects: appends to ./output/logs.txt, writes evaluation reports to
    ./output/tmp_results/, and saves checkpoint 0 whenever the validation
    recall-BLEU improves on the best value seen so far.
    """
    # Make sure every directory written to below exists.
    os.makedirs('./output/tmp_results', exist_ok=True)

    # LOG #
    fh = logging.FileHandler("./output/logs.txt")   # file handler that also records debug messages
    logger.addHandler(fh)

    timestamp = datetime.now().strftime('%Y%m%d%H%M')
    tb_writer = SummaryWriter(f"./output/logs/{timestamp}") if args.visual else None

    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
    print(device)

    config = get_config()

    if args.visual:
        # save configs
        with open(f'./output/config_{timestamp}.json', 'w') as f_conf:
            json.dump(config, f_conf)

    ###############################################################################
    # Load data
    ###############################################################################
    data_path = args.data_path+args.dataset+'/'
    train_set = DialogDataset(os.path.join(data_path, 'train.h5'), config['diaglen'], config['maxlen'])
    valid_set = DialogDataset(os.path.join(data_path, 'valid.h5'), config['diaglen'], config['maxlen'])
    test_set = DialogDataset(os.path.join(data_path, 'test.h5'), config['diaglen'], config['maxlen'])
    vocab = load_dict(os.path.join(data_path, 'vocab.json'))
    ivocab = {v: k for k, v in vocab.items()}
    n_tokens = len(ivocab)
    metrics = Metrics()
    print("Loaded data!")

    ###############################################################################
    # Define the models
    ###############################################################################
    if model is None:
        model = MyModel(config, n_tokens)

    if args.reload_from >= 0:
        load_model(model, args.reload_from)

    model = model.to(device)

    logger.info("Training...")
    best_perf = -1
    itr_global = 1
    start_epoch = 1 if args.reload_from == -1 else args.reload_from+1
    for epoch in range(start_epoch, config['epochs']+1):
        epoch_start_time = time.time()
        itr_start_time = time.time()

        # shuffle (re-define) data between epochs
        train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
                                                   shuffle=True, num_workers=1, drop_last=True)
        n_iters = len(train_loader)
        itr = 1
        for batch in train_loader:   # loop through all batches in training data
            model.train()
            context, context_lens, utt_lens, floors, response, res_lens = batch

            max_ctx_len = context.size(1)
            # remove the sos token in the context and adjust the utterance lengths
            context, utt_lens = context[:,:max_ctx_len,1:], utt_lens[:,:max_ctx_len]-1
            utt_lens[utt_lens<=0]=1   # clamp lengths to at least 1
            batch_gpu = [tensor.to(device) for tensor in [context, context_lens, utt_lens, response, res_lens]]
            train_results = model.train_batch(*batch_gpu)

            if itr % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                log = '%s|%s@gpu%d epo:[%d/%d] iter:[%d/%d] step_time:%ds elapsed:%s'\
                %(args.model, args.dataset, args.gpu_id, epoch, config['epochs'],
                         itr, n_iters, elapsed, timeSince(epoch_start_time,itr/n_iters))
                logger.info(log)
                logger.info(train_results)
                if args.visual:
                    tb_writer.add_scalar('train_loss', train_results['train_loss'], itr_global)

                itr_start_time = time.time()

            # NOTE: validation-loss pass is deliberately switched off by the
            # trailing `and False`; remove it to re-enable (needs model.valid()).
            if itr % args.valid_every == 0 and False:
                logger.info('Validation ')
                valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=config['batch_size'], shuffle=True, num_workers=1)
                model.eval()
                valid_losses = []
                for context, context_lens, utt_lens, floors, response, res_lens in valid_loader:
                    max_ctx_len = context.size(1)
                    context, utt_lens = context[:,:max_ctx_len,1:], utt_lens[:,:max_ctx_len]-1
                    utt_lens[utt_lens<=0]=1
                    batch = [tensor.to(device) for tensor in [context, context_lens, utt_lens, response, res_lens]]
                    valid_results = model.valid(*batch)
                    valid_losses.append(valid_results['valid_loss'])
                if args.visual: tb_writer.add_scalar('valid_loss', np.mean(valid_losses), itr_global)
                logger.info({'valid_loss':np.mean(valid_losses)})

            itr += 1
            itr_global += 1

            if itr_global % args.eval_every == 0:   # evaluate the model in the validation set
                model.eval()
                logger.info("Evaluating in the validation set..")

                valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=1, shuffle=False, num_workers=1)

                repeat = 10
                # `with` closes (and flushes) the report once evaluation is done
                with open(f"./output/tmp_results/iter{itr_global}.txt", "w") as f_eval:
                    eval_results = evaluate(model, metrics, valid_loader, vocab, repeat, f_eval)
                bleu = eval_results['recall_bleu']
                if bleu > best_perf:
                    # remember the best score so a worse model never overwrites the checkpoint
                    best_perf = bleu
                    save_model(model, 0)
                if args.visual:
                    tb_writer.add_scalar('recall_bleu', bleu, itr_global)

        # end of epoch ----------------------------

    if tb_writer is not None:
        tb_writer.close()   # flush pending tensorboard events

    return model


## 8. Main Function (Training)
You can change the default arguments by setting the `default` attribute.

In [None]:

# Entry point of the training cell: build the default arguments and start training.
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Dialog Pytorch')
    # Path Arguments
    parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
    parser.add_argument('--model', type=str, default='MyModel', help='model name')
    # parser.add_argument('--dataset', type=str, default='weibo', help='name of dataset.')
    parser.add_argument('--dataset', type=str, default='dailydialog', help='name of dataset.')
    # parser.add_argument('-v','--visual', action='store_true', default=False, help='visualize training status in tensorboard')
    parser.add_argument('-v','--visual', action='store_true', default=True, help='visualize training status in tensorboard')
    # -1 means "start from scratch"; see how train() derives start_epoch from it
    parser.add_argument('--reload_from', type=int, default=-1, help='reload from a trained ephoch')
    # parser.add_argument('--gpu_id', type=int, default=1, help='GPU ID')
    parser.add_argument('--gpu_id', type=int, default=0, help='GPU ID')

    # Evaluation Arguments
    parser.add_argument('--log_every', type=int, default=100, help='interval to log autoencoder training results')
    parser.add_argument('--valid_every', type=int, default=1000, help='interval to validation')
    parser.add_argument('--eval_every', type=int, default=2000, help='interval to evaluation to concrete results')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    
    
    
    
    # An explicit empty list is parsed, so the defaults above always apply
    # and the real command line (e.g. the notebook kernel's argv) is ignored.
    args = parser.parse_args(args=[])
    print(vars(args))

    # make output directory if it doesn't already exist
    os.makedirs(f'./output/models', exist_ok=True)
    os.makedirs(f'./output/tmp_results', exist_ok=True)
        
    torch.backends.cudnn.benchmark = True # speed up training by using cudnn
    torch.backends.cudnn.deterministic = True # fix the random seed in cudnn
    
    model = train(args)

## 9. Main Function (Test)

**Please do not change code here except the default arguments**

In [None]:

def test(args):
    """Evaluate the saved checkpoint 0 on the test split.

    Reads seed, data_path, dataset and n_samples from `args`, and writes the
    per-dialog results to ./output/results.txt through evaluate().

    NOTE(review): checkpoint 0 is always loaded, args.reload_from is not
    used; the model is not moved to any device here; the results file is
    never explicitly closed.
    """
    conf = get_config()
    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    else:
        print("Note that our pre-trained models require CUDA to evaluate.")
    
    # Load data
    data_path=args.data_path+args.dataset+'/'
    test_set=DialogDataset(data_path+'test.h5', conf['diaglen'], conf['maxlen'])
    # batch_size=1: evaluate() scores only the first item of every batch
    test_loader=torch.utils.data.DataLoader(dataset=test_set, batch_size=1, shuffle=False, num_workers=1)
    vocab = load_dict(data_path+'vocab.json')
    n_tokens = len(vocab)

    metrics=Metrics()
    
    # Load model checkpoints    
    model = MyModel(conf, n_tokens)
    load_model(model, 0)
    #model=model.to(device)
    model.eval()
    
    f_eval = open("./output/results.txt", "w")
    repeat = args.n_samples
    
    evaluate(model, metrics, test_loader, vocab, repeat, f_eval)

# Entry point of the test cell: build the default arguments and run test().
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='PyTorch DialogGAN for Eval')
    parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
#     parser.add_argument('--dataset', type=str, default='weibo', help='name of dataset, SWDA or DailyDial')
    parser.add_argument('--dataset', type=str, default='dailydialog', help='name of dataset, SWDA or DailyDial')
    parser.add_argument('--model', type=str, default='MyModel', help='model name')
    parser.add_argument('--reload_from', type=int, default=0, 
                        help='directory to load models from')
    
    parser.add_argument('--n_samples', type=int, default=10, help='Number of responses to sampling')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    # An explicit empty list is parsed, so the defaults above always apply.
    args = parser.parse_args(args=[])
    print(vars(args))
    test(args)