## Load Packages

In [102]:
import random
import numpy as np
import torch
import pickle
import pandas as pd
import os
from tqdm import tqdm_notebook
from nhps.models import nhp
from nhps.io import processors
from nhps.miss import factorized

## Functions

In [98]:
## function to convert a collection of event sequences to one df
def convert_dict_to_df(pkl):
    '''
        Stack a collection of event sequences into a single DataFrame.

        pkl : collection indexable by 0..len(pkl)-1. Every pkl[i] must be
              something pd.DataFrame accepts (in this notebook: a list of
              event dicts).

        Returns a DataFrame holding the rows of every sequence in order,
        plus a 'seq_id' column with the position i of the source sequence,
        re-indexed 0..n-1. Empty input yields an empty DataFrame.
    '''
    # `tqdm.tqdm_notebook` is deprecated (it emits a warning on every call);
    # `tqdm.auto` selects the notebook widget or the console bar as appropriate.
    from tqdm.auto import tqdm

    # Collect the per-sequence frames and concatenate once: calling pd.concat
    # inside the loop re-copies all previous rows on every iteration.
    frames = []
    for i in tqdm(range(len(pkl))):
        temp_df = pd.DataFrame(pkl[i])
        temp_df['seq_id'] = i
        frames.append(temp_df)

    # pd.concat raises on an empty list, so keep the "empty in, empty out" result.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)

### Function to load dataset

In [6]:
def load_dataset(**args):
    '''
        Load one pickled data split.

        'PathData': directory where the data is stored
        'Split': name of the split; the file read is <PathData>/<Split>.pkl,
                 so the value must match the file name exactly (e.g. 'dev')
        'Small': optional. When given and > -1, only the first 'Small'
                 sequences are kept (for debug runs). Missing, None or a
                 negative value keeps everything.

        Returns ([seqs_obs, seqs], [total_num]) where the three values are
        read from the 'seqs_obs', 'seqs' and 'total_num' entries of the pickle.
        Other keyword arguments are ignored.
    '''
    data_path = os.path.join(args['PathData'], args['Split'] + '.pkl')
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_path, 'rb') as fp:
        pkl_dev = pickle.load(fp)
    data_dev, data_dev_gold = pkl_dev['seqs_obs'], pkl_dev['seqs']

    # 'Small' is a debugging aid, so callers are not forced to supply it.
    small = args.get('Small', -1)
    if small is not None and small > -1:
        data_dev = data_dev[:small]
        data_dev_gold = data_dev_gold[:small]

    total_event_num = pkl_dev['total_num']

    return [data_dev, data_dev_gold], [total_event_num]


### Function to Propose Particles

In [114]:
def propose_particles(**args):
    '''
        Sample particles (imputed sequences) for every observed sequence of a
        data split and return them together with their weights.

        Keys read from args:
          Model         : 'nhps' or 'nhpf' (anything else -> NotImplementedError)
          Seed          : seed for random, numpy and torch (e.g. 12345)
          UseGPU        : True -> 'cuda', False -> 'cpu'
          PathData      : data directory (e.g. 'data/pilottaxi'); must also
                          contain 'censor.conf'
          Split, Small  : forwarded to load_dataset
          DimLSTM       : hidden size of the model (e.g. 16)
          BackDimLSTM, BackType, BackBeta : only read when Model == 'nhps'
          NumParticle, NumUnobserved, Multiplier, Resampling, EliminateBase :
                          forwarded to agent.sample_particles
          SizeBatch     : number of sequences processed per batch
          PathModel     : optional. Used as the checkpoint when it points to
                          an existing file; otherwise the legacy default
                          './logs/model_nhps_5000' is loaded.

        Returns (all_weights, all_particles, seq_bases, all_log_proposals,
        all_num_unobs). For each sequence, its particles and weights are
        sorted by descending weight. seq_bases is the list of observed
        sequences returned by load_dataset.
    '''
    print("Propose particles for model {}".format(args['Model']))
    random.seed(args['Seed'])
    np.random.seed(args['Seed'])
    torch.manual_seed(args['Seed'])

    [data_dev, data_dev_gold], [total_event_num] = load_dataset(**args)
    # seq_bases are the x's.
    # We may use miss_mec to generate seq_bases in the future.
    seq_bases = data_dev

    # Computed once and shared by every component below.
    device = 'cuda' if args['UseGPU'] else 'cpu'

    sampling = 1

    miss_mec = factorized.FactorizedMissMec(
        device=device,
        config_file=os.path.join(args['PathData'], 'censor.conf')
    )

    proc = processors.DataProcessorNeuralHawkes(
        idx_BOS=total_event_num,
        idx_EOS=total_event_num+1,
        idx_PAD=total_event_num+2,
        miss_mec=miss_mec,
        sampling=sampling,
        device=device
    )

    if args['Model'] not in ('nhps', 'nhpf'):
        raise NotImplementedError(
            "Unknown model {!r}; expected 'nhps' or 'nhpf'".format(args['Model']))

    # Both models share the same forward network; 'nhps' adds the backward machine.
    agent = nhp.NeuralHawkes(
        total_num=total_event_num, hidden_dim=args['DimLSTM'],
        device=device, miss_mec=miss_mec
    )
    if args['Model'] == 'nhps':
        agent.initBackwardMachine(hidden_dim_back=args['BackDimLSTM'], type_back=args['BackType'],
                                  back_beta=args['BackBeta'])

    # Honour args['PathModel'] when it names a checkpoint file. A missing key
    # or a directory (such as './') keeps the previous hard-coded checkpoint,
    # so existing callers are unaffected.
    path_model = args.get('PathModel')
    if not (path_model and os.path.isfile(path_model)):
        path_model = './logs/model_nhps_5000'
    agent.load_state_dict(torch.load(path_model, map_location='cpu'))

    # NOTE: Here I just simply replace neglect_mask with r vector
    # In a more general case mentioned in the paper,
    # r factor should take the whole history into account.
    # But in our experiment, events are censored independently,
    # so the r vector is constant for all the events that're going to be proposed.
    # I set history as None to get the r vector.
    # **Reverse Changes**
    agent.setMaskIntensity(miss_mec.neglect_mask())

    if args['UseGPU']:
        agent.cuda(device)
    agent.eval()

    input_dev = []
    input_dev_with_groundtruth = []
    all_weights = list()
    all_particles = list()
    all_log_proposals = list()
    all_num_unobs = list()

    for i_dev, (one_seq_dev, one_seq_dev_gold) in enumerate(zip(data_dev, data_dev_gold)):

        # The end time of the gold sequence is passed along with the observed sequence.
        one_seq_dev_augmented = proc.orgSeq(
            one_seq_dev, one_seq_dev_gold[-1]['time_since_start']
        )
        input_dev.append(
            proc.augmentLogProbMissing(
                agent.sample_particles(
                    args['NumParticle'], one_seq_dev_augmented, args['NumUnobserved'],
                    args['Multiplier'],
                    resampling=args['Resampling'],
                    need_eliminate_log_base=args['EliminateBase'] )))

        input_dev_with_groundtruth.append(
            proc.processSeq(
                one_seq_dev_gold, n=1, seq_obs=one_seq_dev))

        # Flush when a batch is full, or at the last sequence if a partial
        # batch is still pending.
        if (i_dev+1) % args['SizeBatch'] == 0 or \
                (i_dev == len(data_dev)-1 and (len(input_dev)%args['SizeBatch']) > 0):

            r"""
            this part is computing log q(z | x)
            where z is ground truth (similar to train nhps)
            """
            batch_seqs_with_groundtruth = proc.processBatchSeqsWithParticles(
                input_dev_with_groundtruth)
            log_proposals, num_unobs = agent(batch_seqs_with_groundtruth, mode=9)
            all_log_proposals.append(log_proposals.detach().cpu().numpy())
            all_num_unobs.append(num_unobs.detach().cpu().numpy())
            input_dev_with_groundtruth = list()

            r"""
            this part is computing weights for decoding
            """

            # If resampling is on, we could directly get the weights of particles.
            if args['Resampling']:
                log_weights = np.array([input_[2].cpu().detach().numpy() for input_ in input_dev])
                # Subtract the per-row maximum before exponentiating: the
                # normalised result is mathematically unchanged, but very
                # negative log-weights no longer underflow to 0/0.
                shifted = log_weights - log_weights.max(axis=1, keepdims=True)
                unnormalized_weights = np.exp(shifted)
                weights = unnormalized_weights / unnormalized_weights.sum(axis=1, keepdims=True)
            else:
                # If not, we could use our old method.
                batch_seqs_dev = proc.processBatchSeqsWithParticles(input_dev)
                weights, _ = agent(batch_seqs_dev, mode=4)
                weights = weights.detach().cpu().numpy()

            # Particle indices per sequence, highest weight first.
            weights_orders = np.argsort(-weights, axis=1)

            # Only the event, dtime and sequence-length entries of each tuple are needed here.
            for i_batch, [event_, dtime_, _, _, len_seq_, _, _, _, _, _, _, _] in enumerate(input_dev):
                particles = list()
                all_particles.append(particles)
                for particle_idx in weights_orders[i_batch]:
                    event, dtime, len_seq = event_[particle_idx], \
                                            dtime_[particle_idx], len_seq_[particle_idx]
                    particles.append(proc.getSeq(event.cpu(), dtime.cpu(), len_seq.cpu()))
                all_weights.append(weights[i_batch][weights_orders[i_batch]])

            input_dev = []
            print("Proposing {}-th {} seq".format(i_dev+1, args['Split']))

    all_weights = np.array(all_weights)
    # np.concatenate raises on an empty list (e.g. when 'Small' is 0), so
    # return empty arrays in that case instead of crashing.
    if all_log_proposals:
        all_log_proposals = np.concatenate(all_log_proposals)
        all_num_unobs = np.concatenate(all_num_unobs)
    else:
        all_log_proposals = np.array([], dtype=np.float32)
        all_num_unobs = np.array([], dtype=np.float32)

    # Note that the particles of each sequence are ordered by descending weight.
    return all_weights, all_particles, seq_bases, all_log_proposals, all_num_unobs

## Run Propose_Particles function

In [133]:
# Configuration passed to propose_particles (which forwards it to load_dataset).
# The literal used to contain 'Split' twice ('Train', then 'dev'); Python
# keeps the last value, so only the effective entry 'dev' is kept here.
args = {
    'PathData': 'data/pilottaxi',
    'Split': 'dev',
    'NumUnobserved': 10,
    'Multiplier': 1,
    'NumParticle': 1,
    # Cost, MultiplierCost and NumCost are not read by load_dataset or
    # propose_particles as defined in this notebook.
    'Cost': 1,
    'MultiplierCost': 2,
    'NumCost': 1,
    'SizeBatch': 1,
    'UseGPU': True,
    'Seed': 12345,
    'Small': 5,  # change value to increase the number of output sequences
    # MaxIter, ProcessPerDevice and MultiProcess are likewise not read by the
    # functions defined in this notebook.
    'MaxIter': 5,
    'ProcessPerDevice': 2,
    'MultiProcess': None,
    'Resampling': False,
    'EliminateBase': True,
    'Model': 'nhps',
    'PathModel': './',
    'BackDimLSTM': 16,
    'DimLSTM': 16,
    'BackType': 'add',
    'BackBeta': 1,
}

In [134]:
# Run the proposal step with the configuration above and unpack its five outputs.
all_weights, all_particles, seq_bases, all_log_proposals, all_num_unobs = propose_particles(**args)

Propose particles for model nhps
Proposing 1-th dev seq
Proposing 2-th dev seq
Proposing 3-th dev seq
Proposing 4-th dev seq
Proposing 5-th dev seq


### all_weights

In [135]:
all_weights

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]], dtype=float32)

### all_particles (output sequence)

all_particles[0] holds the output sequence. <br>
<b>Note</b>: The number of imputed events is determined by `args['NumUnobserved']`.

In [140]:
# One list of particles is expected per input sequence. This section is about
# all_particles; seq_bases has its own identical check further down.
len(all_particles)==args['Small']

True

In [137]:
convert_dict_to_df(all_particles[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,type_event,time_since_last_event,time_since_start,seq_id
0,8,0.063277,0.063277,0
1,3,0.313112,0.376389,0
2,8,0.159279,0.535668,0
3,3,0.021277,0.556944,0
4,8,0.273824,0.830769,0
5,8,0.022564,0.853332,0
6,3,0.040557,0.893889,0
7,3,0.461944,1.355833,0
8,3,0.202222,1.558056,0
9,8,0.08014,1.638195,0


In [138]:
convert_dict_to_df(all_particles[1])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,type_event,time_since_last_event,time_since_start,seq_id
0,0,0.098611,0.098611,0
1,8,0.062732,0.161343,0
2,8,0.304507,0.46585,0
3,0,0.108039,0.573889,0
4,0,0.135556,0.709444,0
5,0,0.299167,1.008611,0
6,0,0.193056,1.201667,0
7,0,0.119722,1.321389,0
8,8,0.016404,1.337793,0
9,0,0.259985,1.597778,0


### seq_bases

Base sequences (events without imputation)

In [139]:
len(seq_bases)==args['Small']

True

In [119]:
convert_dict_to_df([seq_bases[0]])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,idx_event,type_event,time_since_start,time_since_last_event,seq_id
0,1,3,0.376389,0.376389,0
1,2,3,0.556944,0.180556,0
2,3,3,0.893889,0.336944,0
3,4,3,1.355833,0.461944,0
4,5,3,1.558056,0.202222,0
5,6,3,1.713889,0.155833,0
6,7,3,2.355,0.641111,0
7,8,3,2.826111,0.471111,0
8,9,3,3.2775,0.451389,0
9,10,3,5.117778,1.840278,0


In [120]:
convert_dict_to_df([seq_bases[1]])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,idx_event,type_event,time_since_start,time_since_last_event,seq_id
0,1,0,0.098611,0.098611,0
1,2,0,0.573889,0.475278,0
2,3,0,0.709444,0.135556,0
3,4,0,1.008611,0.299167,0
4,5,0,1.201667,0.193056,0
5,6,0,1.321389,0.119722,0
6,7,0,1.597778,0.276389,0
7,8,0,3.076667,1.478889,0
8,9,0,3.401111,0.324444,0
9,10,0,3.805556,0.404444,0


### all_log_proposals

An indicator showing how well the new series with imputed events fits the distribution

In [141]:
len(all_log_proposals)==args['Small']

True

In [142]:
all_log_proposals

array([-27.207394, -49.186005, -47.112453, -13.214535, -14.234542],
      dtype=float32)

### all_num_unobs

This array seems to hold the number of events that were originally observed in the corresponding sequence

In [143]:
all_num_unobs

array([11., 17., 15., 14., 19.], dtype=float32)

If the hypothesis is true, then the difference between the length of each output sequence and its corresponding entry in all_num_unobs will be equal to args['NumUnobserved'].

In [154]:
# For every sequence, compare the length of its first (highest-weight)
# particle with the matching all_num_unobs entry; prints True when the
# difference equals the configured number of unobserved events.
expected_gap = args['NumUnobserved']
for seq_idx, count in enumerate(all_num_unobs):
    top_particle = all_particles[seq_idx][0]
    gap = len(top_particle) - count
    print(gap == expected_gap)

True
True
True
True
True


In [152]:
len(all_particles[1][0])

27