**Dataset**: https://www.nature.com/articles/s41597-024-03237-9

**user ID** is the unique identifier of the mobile phone user (type: integer)

**day** is the masked date of the observation. It may take a value between 0 and 74 for both Dataset 1 and Dataset 2 (type: integer).

The location pings are discretized into 500 meters × 500 meters grid cells and the timestamps are rounded up into 30-minute bins. The actual date of the observations is not available either (i.e., timeslot t of day d) to protect privacy. In the second Dataset, the 75 day period is composed of 60 days of business-as-usual and 15 days during an emergency with unusual behavior.

**timeslot** is the timestamp of the observation discretized into 30 minute intervals. 
It may take a value between 0 and 47, where 0 indicates between 0AM and 0:30AM, 
and 13 would indicate the timeslot between 6:30AM and 7:00AM.

**x,y** are the coordinates of the observed location mapped onto the 500 meter discretized grid cell. It may take a value between (1, 1) and (200, 200). Details are shown in Fig. 2.

## Input Data Processing + Tokenization

In [2]:
# import pandas as pd
# import gzip

# Path to the .csv.gz file
# yjmob1 = 'yjmob100k-dataset1.csv.gz'
# yjmob2 = 'yjmob100k-dataset2.csv.gz'
# yjmob_df = pd.concat([pd.read_csv(yjmob1, compression='gzip'),
#                       pd.read_csv(yjmob2, compression='gzip')]).sort_values(by=['uid','d','t'],
#                                                                             ignore_index=True)

In [3]:
import pandas as pd
import gzip

yjmob1 = 'yjmob100k-dataset1.csv.gz' # dataset under normal scenes
yjmob_df = pd.read_csv(yjmob1, compression='gzip').sort_values(by=['uid', 'd', 't'], ignore_index=True)

### Train-Val-Test Split

In [6]:
# Retrieve all ids

uids = yjmob_df['uid'].unique()

In [7]:
from sklearn.model_selection import train_test_split

# 70 : 15 : 15 split

# test-train split
train_val_uids, test_uids = train_test_split(uids, test_size=0.15, random_state=42)

# validation-test split
train_uids, val_uids = train_test_split(train_val_uids, test_size=0.176, random_state=42) # 0.176≈15/85

### Load Test Data

In [14]:
df = yjmob_df[yjmob_df['uid'].isin(test_uids)]

In [20]:
df.head(5)

Unnamed: 0,uid,d,t,x,y
35039,23,0,24,47,183
35040,23,0,25,55,188
35041,23,0,26,55,188
35042,23,0,29,50,187
35043,23,0,30,47,182


### Tokenize Time Data

In [45]:
# df[['t']].min(), df[['t']].max() # [0, 47]
# df[['d']].min(), df[['d']].max() # [0, 74]

In [42]:
df['combined_t'] = df['d']*47+df['t']

## Ignore the SettingWithCopyWarning

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_t'] = df['d']*47+df['t']


In [44]:
df.head(5)

Unnamed: 0,uid,d,t,x,y,combined_t
35039,23,0,24,47,183,24
35040,23,0,25,55,188,25
35041,23,0,26,55,188,26
35042,23,0,29,50,187,29
35043,23,0,30,47,182,30


### Tokenize Spatial Data

In [65]:
# linearization of the 2-dimensional grid, i.e., the original x,y coordinate system
def spatial_token(x, y):
    # x,y are the coordinate location
    # x determines the column order while
    # y determines the row order
    # (x-1) calculates the starting grid-column position
    # (y-1)*200 calculates the start index of the grid-row
    return (x-1)+(y-1)*200

In [79]:
df['combined_xy'] = df.apply(lambda row: spatial_token(row['x'], row['y']), axis=1)

## Ignore the SettingWithCopyWarning

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_xy'] = df.apply(lambda row: spatial_token(row['x'], row['y']), axis=1)


In [80]:
df.head(5)

Unnamed: 0,uid,d,t,x,y,combined_t,combined_xy
35039,23,0,24,47,183,24,36446
35040,23,0,25,55,188,25,37454
35041,23,0,26,55,188,26,37454
35042,23,0,29,50,187,29,37249
35043,23,0,30,47,182,30,36246


In [148]:
# original data

# df['combined_t'].min(), df['combined_t'].max() # (0, 3525)
# df['combined_xy'].min(), df['combined_xy'].max() # (0, 39999)

df = df.sort_values(by=['uid', 'combined_t'])

In [149]:
df.head(5)

Unnamed: 0,uid,d,t,x,y,combined_t,combined_xy
35039,23,0,24,47,183,24,36446
35040,23,0,25,55,188,25,37454
35041,23,0,26,55,188,26,37454
35042,23,0,29,50,187,29,37249
35043,23,0,30,47,182,30,36246


## Batching

In [161]:
# Group data by uid

grouped_data = df[['uid', 'combined_t', 'combined_xy']].groupby('uid')
grouped_data = [group for _, group in df.groupby('uid')]

In [180]:
grouped_data[0]

Unnamed: 0,uid,d,t,x,y,combined_t,combined_xy
35039,23,0,24,47,183,24,36446
35040,23,0,25,55,188,25,37454
35041,23,0,26,55,188,26,37454
35042,23,0,29,50,187,29,37249
35043,23,0,30,47,182,30,36246
...,...,...,...,...,...,...,...
36620,23,74,13,48,182,3491,36247
36621,23,74,14,52,187,3492,37251
36622,23,74,15,66,194,3493,38665
36623,23,74,33,153,192,3511,38352


In [168]:
import torch
from torch.utils.data import Dataset, DataLoader

class TrajectoryDataset(Dataset):
    def __init__(self, grouped_data):
        self.data = grouped_data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Fetch the DataFrame for a specific uid
        # Note idx != uid
        data_for_uid = self.data[idx]
        
        # Extract combined_xy and combined_t from the DataFrame
        combined_xy = torch.tensor(data_for_uid['combined_xy'].values.tolist(), dtype=torch.long)
        combined_t = torch.tensor(data_for_uid['combined_t'].values.tolist(), dtype=torch.long)
        
        return combined_xy, combined_t

test_dataset = TrajectoryDataset(grouped_data)

In [172]:
# Example
test_dataset.__getitem__(0) # uid=23

(tensor([36446, 37454, 37454,  ..., 38665, 38352, 37350]),
 tensor([  24,   25,   26,  ..., 3493, 3511, 3512]))

In [189]:
def collate_fn(batch):
    # Unzip all batch
    combined_xy_batch, combined_t_batch = zip(*batch)
    
    # Pad the sequence with less length in a batch
    # Crucial for pad processing
    combined_xy_padded = torch.nn.utils.rnn.pad_sequence(combined_xy_batch, padding_value=0)
    combined_t_padded = torch.nn.utils.rnn.pad_sequence(combined_t_batch, padding_value=0)
    
    return combined_xy_padded, combined_t_padded

In [None]:
# Dataloader would contain 4 users trajectory data within the same batch
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [194]:
# Example
data_iter = iter(dataloader)
combined_xy_padded, combined_t_padded = next(data_iter)

print("Shape of combined_xy_padded:", combined_xy_padded.shape)
print("Shape of combined_t_padded:", combined_t_padded.shape)
print("First data from combined_xy_padded:", combined_xy_padded[0])
print("First data from combined_t_padded:", combined_t_padded[0])

Shape of combined_xy_padded: torch.Size([4, 1124])
Shape of combined_t_padded: torch.Size([4, 1124])
First data from combined_xy_padded: tensor([16684, 16685, 17079,  ..., 16684, 16685, 16684])
First data from combined_t_padded: tensor([  21,   22,   24,  ..., 3523, 3524, 3525])


## Embedding

https://pytorch.org/tutorials/beginner/transformer_tutorial.html#load-and-batch-data

In [74]:
import numpy as np
import torch.nn as nn

EMBED_DIM = 64

In [75]:
# Space - Input Embedding

class InputEmbedding(nn.Module):
    def __init__(self, num_embeddings=40000, embedding_dim=EMBED_DIM):
        super(InputEmbedding, self).__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)

    def forward(self, x):
        return self.embedding(x)

In [96]:
# Time - Positional Embedding

class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim=EMBED_DIM, dropout=0.1, max_len=3600):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout) # prevent overfitting
        self.embedding_dim = embedding_dim

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2) * (-np.log(10000.0) / embedding_dim))
        pe = torch.zeros(max_len, 1, embedding_dim)
        pe[:, 0, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 0, 1::2] = torch.cos(position.float() * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [199]:
input_embedding = InputEmbedding()
positional_encoding = PositionalEncoding()

In [206]:
# Example
for combined_xy_padded, combined_t_padded in test_dataloader:
    # Shape: [seq_len, batch_size, embedding_dim]
    
    # Input Embedding
    space = input_embedding(combined_xy_padded) 
    
    # Apply Positional Encoding
    time = positional_encoding(embedded_xy)
    
    print("Input Embedding:", space.shape)
    print(space[0, :, :]) # [first timstamp, for all data in a batch, for all element composed]
    print()
    print("Positional Encoding:", time.shape)
    print(time[0, :, :])
    
    break

## Transformer Model