# Experiment on Shanghai Telecom Dataset

Dataset discretized into 100 Gaussian-mixture clusters (`mog_100.npy`) at a 30-minute interval. Unfiltered.

## Requirements

- Pre-processed dataset with 100 clusters located at `{ROOT}/data/sh30-c100`
- Pre-processed dataset with 50 clusters located at `{ROOT}/data/sh30-c50`
- Pre-computed 50 clusters located at `{ROOT}/data/exploratory_analysis/mog_50.npy`
- Pre-computed 100 clusters located at `{ROOT}/data/exploratory_analysis/mog_100.npy`

## import and constants

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import tqdm
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from src.path import ROOT
from src.ml.checkpoint import Checkpoint

# number of time steps in one trajectory
SEQ_LENGTH: int = 48

# name prefixes for checkpoint files and cached dataset tensors
CHECKPOINT_PREFIX: str = 'sh30-c100'
CACHE_PREFIX: str = 'sh30-c100'

# request the GPU; downgraded to CPU just below when no CUDA device is present
USE_CUDA: bool = True

if USE_CUDA:
    if not torch.cuda.is_available():
        print('fallback to cpu as CUDA is not available on this device')
        USE_CUDA = False

checkpoint = Checkpoint(
    prefix=CHECKPOINT_PREFIX,
    checkpoint_interval=5,
)

## define dataset

### define path

Change the path variable here if you place your dataset files in a different location.

In [2]:
# Pre-computed cluster centres (100 Gaussian mixture clusters).
# Built with ROOT.joinpath, like dataset_path, so both paths are assembled the
# same way instead of mixing string interpolation and pathlib.
# NOTE(review): the Requirements section lists this file under
# `{ROOT}/data/exploratory_analysis/`, while this path has no `data/`
# component -- confirm which location is the intended one.
cluster_path = str(ROOT.joinpath('exploratory_analysis/mog_100.npy'))

# Directory holding the pre-processed dataset files for 100 clusters.
dataset_path = str(ROOT.joinpath('data/sh30-c100'))

### split dataset

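Split the dataset files into the pre-defined training and test sets: files dated within the 15 days starting 2014-06-18 form the test set, and all other files are used for training.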

In [3]:
import os
from datetime import date

from src.data_preprocess.trajectory import from_dataframe
from src.ml.dataset import get_shanghai_date

# os.listdir returns entries in arbitrary order; sort them so the file order
# handed to the dataset loader is the same on every machine and every run
# (the seeded random_split further down only reproduces the same split if the
# underlying sample order is itself stable).
file_list = sorted(os.listdir(dataset_path))

def is_test(fname: str):
    '''
    Tell whether a dataset file belongs to the test split.

    The file's date is obtained with `get_shanghai_date(fname)`. A file is a
    test file when that date lies in the 15-day window that starts on
    2014-06-18 (start day included, day 15 after it excluded).
    '''
    test_start = date(2014, 6, 18)
    days_since_start = (get_shanghai_date(fname) - test_start).days
    return 0 <= days_since_start < 15


# Partition the directory listing into the two splits in a single pass,
# keeping the listing order inside each split.
test_files, train_files = [], []
for fname in file_list:
    (test_files if is_test(fname) else train_files).append(fname)

### read basestations

In [4]:
from src.ml.dataset import create_point_to_class_map

# Cluster centres from the .npy file, converted to a float32 tensor.
# Column 0 is later used as latitude and column 1 as longitude
# (see the ref_lat / ref_long computation in the pre-process step).
cluster_centres = np.load(cluster_path)
all_candidates = torch.tensor(cluster_centres, dtype=torch.float32)

# presumably maps each candidate point to its class index -- built by the
# project helper from the candidate tensor
point_to_class_map = create_point_to_class_map(all_candidates)

### load dataset

Load dataset files into in-memory tensors.

In [5]:
from torch.utils.data import random_split

from src.ml.dataset import TrajectoryDataset, get_shanghai_date, CACHE_PATH

def read_file(fname: str):
    '''
    Load one dataset CSV from `dataset_path` and return `(date, trajectories)`.

    The date is `get_shanghai_date(fname)`; the trajectories are the values of
    the mapping that `from_dataframe` builds for sequences of `SEQ_LENGTH`,
    collected into a list.
    '''
    frame = pd.read_csv(f'{dataset_path}/{fname}')
    file_date = get_shanghai_date(fname)
    trajectories = list(from_dataframe(frame, SEQ_LENGTH).values())
    return file_date, trajectories
    

def _load_or_build(cache_file: str, files):
    '''
    Return a TrajectoryDataset for `files`, using `cache_file` as a cache.

    When `cache_file` exists the dataset is restored from it; otherwise the
    files are read through `read_file` and the result is saved to `cache_file`.
    '''
    dataset = TrajectoryDataset(sequence_length=SEQ_LENGTH, point_to_class_map=point_to_class_map)

    if not os.path.exists(cache_file):
        dataset.read_files(files, read_file=read_file)
        dataset.save(cache_file)
    else:
        dataset.load(cache_file)

    return dataset


train_set = _load_or_build(f'{CACHE_PATH}/{CACHE_PREFIX}_train_data.pt', train_files)

# 80/20 train/validation split; the generator seed is fixed for reproducibility
split_generator = torch.Generator().manual_seed(123)
train_set, valid_set = random_split(train_set, [0.8, 0.2], split_generator)

test_set = _load_or_build(f'{CACHE_PATH}/{CACHE_PREFIX}_test_data.pt', test_files)

### Define pre-process pipeline

1. Convert to Cartesian coordinates by tangent-plane projection, using the median latitude/longitude of the cluster centres as the centre of the plane (reference point).
2. Normalize to [-1, +1] for better gradients.

In [6]:
from src.ml.utils import create_shanghai_preprocessor, to_cartesian

# Reference point of the projection: per-column median of the cluster centres
# (column 0 -> latitude, column 1 -> longitude).
ref_lat, ref_long = (all_candidates[:, col].median() for col in (0, 1))

# Project the cluster centres once to find the coordinate ranges that the
# preprocessor is configured with.
candidates_cart = to_cartesian(all_candidates, ref_point=(ref_lat, ref_long))
xs, ys = candidates_cart[:, 0], candidates_cart[:, 1]
min_x, max_x = xs.min().item(), xs.max().item()
min_y, max_y = ys.min().item(), ys.max().item()
# the projected copy is only needed for the ranges above
del candidates_cart, xs, ys

preprocess = create_shanghai_preprocessor(
    ref_point=(ref_lat, ref_long),
    x_range=(min_x, max_x),
    y_range=(min_y, max_y),
)

## define model

In [7]:
from src.ml.model import TrajectoryModel
from src.ml.model.modules import TransformerTrajectoryEncoder, BaseStationEmbedding

# Shared width of the embedding output and the encoder input. This is the
# value varied in the "Dimension" column of the results table at the end.
model_dim = 128

# hidden sizes handed to the encoder, derived from model_dim
encoder_hid_dim = (model_dim, 2 * model_dim, 8)

base_station_embedding = BaseStationEmbedding(
    out_dim=model_dim,
    feat_dim=(2, 64),
    context_dim=(31, 48),
    layer_norm=True,
)

trajectory_encoder = TransformerTrajectoryEncoder(
    in_dim=model_dim,
    hid_dim=encoder_hid_dim,
    n_blocks=4,
    do_prob=0.2,
    max_len=SEQ_LENGTH,
    # layer_norm is not passed here (left disabled: layer_norm=True)
)

model = TrajectoryModel(
    trajectory_encoder=trajectory_encoder,
    base_station_embedding=base_station_embedding,
)

# Adam is the optimizer in use.
# Disabled alternative: torch.optim.SGD(model.parameters(), lr=0.01)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# multiplies the learning rate by 0.5 on every scheduler step
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.5)

## train model

### define train config

In [8]:
from src.ml.config import TrainConfig

# Only the train and validation splits are registered with the trainer.
# NOTE(review): the recorded run output shows the [test] phase iterating over
# 0 batches, which matches no 'test' entry being passed here even though
# test_set is loaded earlier -- confirm this is intended.
split_datasets = {'train': train_set, 'valid': valid_set}

config = TrainConfig(
    datasets=split_datasets,
    all_candidates=all_candidates,
    preprocess=preprocess,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    checkpoint=checkpoint,
    n_epoch=5,
    batch_size=64,
    cuda=USE_CUDA,
    verbose=True,
)

In [9]:
from src.ml.train import train

# Move the model to the GPU when CUDA is enabled.
if USE_CUDA:
    model.cuda()

# Run training with the config built in the previous cell. The returned state
# is reused by the sanity check further down (state.valid_loader,
# state.get_tqdm_desc).
state = train(model, config)

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
[train] 1: 100%|██████████| 6382/6382 [08:56<00:00, 11.90it/s]


loss: 0.34502483012673624
elapsed: 536.3130717277527
perplexity: 1.4120249799915892
accuracy: 0.9278285370665228


[valid] 1: 100%|██████████| 1596/1596 [01:21<00:00, 19.60it/s]


mdev: 1459.11754277356
elapsed: 81.43980669975281
perplexity: 1.267627244256125
accuracy: 0.9458875808127243


[test] 1: 0it [00:00, ?it/s]


perplexity: 1.0
accuracy: 0


[train] 2: 100%|██████████| 6382/6382 [09:03<00:00, 11.75it/s]


loss: 0.22957574169054296
elapsed: 543.142297744751
perplexity: 1.2580661516718912
accuracy: 0.9459330160783698


[valid] 2: 100%|██████████| 1596/1596 [01:15<00:00, 21.07it/s]


mdev: 721.1150676564764
elapsed: 75.76587390899658
perplexity: 1.2557706730673222
accuracy: 0.9464186935272432


[test] 2: 0it [00:00, ?it/s]


elapsed: 0.004126310348510742
perplexity: 1.0
accuracy: 0


[train] 3: 100%|██████████| 6382/6382 [08:28<00:00, 12.55it/s]


loss: 0.22201307219155753
elapsed: 508.6900722980499
perplexity: 1.248587699535188
accuracy: 0.9463755957985402


[valid] 3: 100%|██████████| 1596/1596 [01:15<00:00, 21.26it/s]


mdev: 720.6402325355319
elapsed: 75.07411313056946
perplexity: 1.2476887616196553
accuracy: 0.9464558143364755


[test] 3: 0it [00:00, ?it/s]


elapsed: 0.0016241073608398438
perplexity: 1.0
accuracy: 0


[train] 4: 100%|██████████| 6382/6382 [08:55<00:00, 11.92it/s]


loss: 0.21864594555445616
elapsed: 535.2086660861969
perplexity: 1.2443906166534933
accuracy: 0.9465386186580753


[valid] 4: 100%|██████████| 1596/1596 [01:17<00:00, 20.57it/s]


mdev: 720.5645127643021
elapsed: 77.57998180389404
perplexity: 1.2455669457648562
accuracy: 0.9464594855867233


[test] 4: 0it [00:00, ?it/s]


elapsed: 0.0019922256469726562
perplexity: 1.0
accuracy: 0


[train] 5: 100%|██████████| 6382/6382 [08:50<00:00, 12.04it/s]


loss: 0.21696043086673952
elapsed: 530.0546660423279
perplexity: 1.2422949446309977
accuracy: 0.9465394417849576


[valid] 5: 100%|██████████| 1596/1596 [01:19<00:00, 20.13it/s]


mdev: 720.5732124156522
elapsed: 79.28772377967834
perplexity: 1.2434330681908632
accuracy: 0.9464592816387501


[test] 5: 0it [00:00, ?it/s]

elapsed: 0.0005626678466796875
perplexity: 1.0
accuracy: 0





## Sanity check

Check the accuracy obtained by simply predicting the next position to be the last known position.

In [13]:
import tqdm
from src.ml.utils import haversine

# "Last known position" baseline on the validation loader.
# Running totals are weighted by batch size so that a smaller final batch
# does not skew the averages printed at the end.
valid_count = 0
valid_acc = 0
valid_mdev = 0  # accumulates mean haversine deviation (not a squared error)

with torch.no_grad():
    # context and target are not needed by this baseline: only consecutive
    # positions of each trajectory are compared with each other.
    for trajectories, _context, _target in tqdm.tqdm(state.valid_loader, desc=state.get_tqdm_desc('[valid]'), disable=not config.verbose):
        batch_size = trajectories.shape[0]

        if config.cuda:
            trajectories = trajectories.cuda()

        if config.preprocess:
            trajectories = config.preprocess(trajectories)

        # Prepend a copy of the first position so that every original step has
        # a "previous" position to be predicted from.
        # NOTE(review): the first comparison of each trajectory is therefore a
        # position against itself -- always a hit with zero deviation. Confirm
        # that the model's metrics treat the first step comparably.
        trajectories = torch.concat((trajectories[:, :1, :], trajectories), dim=1)
        previous, current = trajectories[:, :-1], trajectories[:, 1:]

        # a step counts as correct only when every coordinate is unchanged
        acc = (previous == current).all(dim=-1).float().mean().item()
        valid_acc += batch_size * acc

        # NOTE(review): at this point the trajectories have gone through
        # config.preprocess, which the pipeline description says projects to
        # Cartesian coordinates and normalizes to [-1, +1]; haversine normally
        # expects latitude/longitude -- confirm the helper's expected input
        # and the unit of the reported value.
        mdev = haversine(current, previous).mean().item()
        valid_mdev += batch_size * mdev

        valid_count += batch_size

print(valid_acc / valid_count)
print(valid_mdev / valid_count)

[valid] 6: 100%|██████████| 1596/1596 [00:27<00:00, 59.08it/s]

0.9667665488976879
143.26925145636451





## Experiment results:
| Dimension | Perplexity | Mean Error (m) | Accuracy |
|-----------|------------|------------|----------|
| 16 | 1.32 | 983 | 0.9459924913363946 |
| 32 | 1.28 | 758 | 0.9464138739912731 |
| 64 | 1.25 | 726 | 0.9464254994738969 |
| 128 | 1.24 | 721 | 0.9464592816387501 |
| last position | - | 143 | 0.9667665488976879 |