In [39]:
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random

# Data-Processing 
(same process as in Transformer's)

In [4]:
yjmob1 = 'yjmob100k-dataset1.csv.gz' # dataset under normal scenes
yjmob_df = pd.read_csv(yjmob1, compression='gzip').sort_values(by=['uid', 'd', 't'], ignore_index=True)

# Retrieve all ids
uids = yjmob_df['uid'].unique()

# Just to reduce memory space
rand_indicies = [random.randint(0, len(uids)) for _ in range(200)] # only 200 data would be used
selected_uids = [uid for uid in uids[rand_indicies]] # selected_uids = uids[:200]
# selected_uids = uids[:200]

df = yjmob_df[yjmob_df['uid'].isin(selected_uids)] 

# Time
df['combined_t'] = df['d']*47+df['t']

# Location
def spatial_token(x, y):
    return (x-1)+(y-1)*200
df['combined_xy'] = df.apply(lambda row: spatial_token(row['x'], row['y']), axis=1)

# Sort value
df = df.sort_values(by=['uid', 'combined_t'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_t'] = df['d']*47+df['t']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_xy'] = df.apply(lambda row: spatial_token(row['x'], row['y']), axis=1)


# Train-Test Split 
(~Transformer's)

In [5]:
# 7:3 split
train_uids, test_uids = train_test_split(selected_uids, test_size=0.30, random_state=42)

# Load training and testing data
df_train = df[df['uid'].isin(train_uids)]
df_test = df[df['uid'].isin(test_uids)]

# Batching 
(~Transformer's)

In [6]:
BATCH_SIZE = 50
STEP_SIZE = 600

In [22]:
def generate_sequences(data, data_t):
    return torch.tensor(data[:STEP_SIZE]),torch.tensor(data[STEP_SIZE]),\
                torch.tensor(data_t[:STEP_SIZE]),torch.tensor(data_t[STEP_SIZE])

In [23]:
# Group data by uid
grouped_data_train = df_train[['uid', 'combined_t', 'combined_xy']].groupby('uid')
grouped_data_train = [group for _, group in df_train.groupby('uid')]
grouped_data_test = df_test[['uid', 'combined_t', 'combined_xy']].groupby('uid')
grouped_data_test = [group for _, group in df_test.groupby('uid')]

In [24]:
class TrajectoryDataset(Dataset):
    def __init__(self, grouped_data):
        self.data = grouped_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data_for_uid = self.data[idx]
        inputs, labels, positions, label_positions = generate_sequences(
                                                         data_for_uid['combined_xy'].values.tolist(),
                                                         data_for_uid['combined_t'].values.tolist())
        return inputs, labels, positions, label_positions

train_dataset = TrajectoryDataset(grouped_data_train)
test_dataset = TrajectoryDataset(grouped_data_test)

In [33]:
def collate_fn(batch):
    # Unzip all batch
    inputs_batch, labels_batch, positions_batch, label_positions_batch = zip(*batch)
    
    # Pad the sequence with less length in a batch
    inputs_padded = torch.nn.utils.rnn.pad_sequence(inputs_batch, padding_value=0.0, batch_first=True)
    labels_padded = torch.tensor(np.array(labels_batch))
    positions_padded = torch.nn.utils.rnn.pad_sequence(positions_batch, padding_value=0, batch_first=True)
    label_positions_padded = torch.tensor(np.array(label_positions_batch))
    
    # return inputs_padded, labels_padded, positions_padded, label_positions_padded
    # Doing Addition here
    return inputs_padded+positions_padded, labels_padded+label_positions_padded

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [44]:
# Example

for inputs, labels in test_dataloader:
    print("Location Shape:", inputs.shape)
    print("Desired output Location Shape:", labels.shape)
    break

Location Shape: torch.Size([50, 600])
Desired output Location Shape: torch.Size([50])


# LSTM
https://machinelearningmastery.com/lstm-for-time-series-prediction-in-pytorch/

In [35]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, embed_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embed_dim = embed_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, embed_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(embed_dim, output_dim)
    
    def forward(self, x):
        # Initialize hidden and cell states
        h0 = torch.zeros(self.layer_dim, x.size(0), self.embed_dim).to(x.device)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.embed_dim).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))
        
        # Taking the output of the last sequence step
        out = self.fc(out[:, -1, :])
        return out

In [40]:
# Data related param
BATCH_SIZE = 50
STEP_SIZE = 600 # seq_size

# Model related param
EMBED_DIM = 64
INPUT_DIM = 1
LAYER_DIM = 1 # 1-single layer, 2 or 3-multi-layer
NUM_CLASS = 40000 # 200*200 grid loc

model = LSTMModel(input_dim=INPUT_DIM, embed_dim=EMBED_DIM, layer_dim=LAYER_DIM, output_dim=NUM_CLASS)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [42]:
# Training
epochs = 80

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, labels in train_dataloader:
        inputs = inputs.float().unsqueeze(-1)
        labels = labels.long()
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss {total_loss / len(train_dataloader)}")

Epoch 0: Loss 10.032061894734701
Epoch 10: Loss 9.40379778544108
Epoch 20: Loss 8.444165229797363
Epoch 30: Loss 7.350537300109863
Epoch 40: Loss 6.170390446980794
Epoch 50: Loss 5.47230863571167
Epoch 60: Loss 5.1818515459696455
Epoch 70: Loss 4.999857266743978


In [45]:
# Inference

softmax = nn.Softmax(dim=1)
model.eval()
with torch.no_grad():
    for inputs, labels in test_dataloader:
        inputs = inputs.float().unsqueeze(-1)
        logits = model(inputs)
        probabilities = softmax(logits)
        predictions = torch.argmax(probabilities, dim=1)
        print(f"Predicted Locations: {predictions}")
        print(f"Actual Locations: {labels}")
        print()
        break

Predicted Locations: tensor([19180, 17996, 17996, 17996, 22999, 17996, 17996, 22999, 17996, 17996,
        17996, 17996, 17996, 17996, 17996, 17996, 17996, 17996, 17996, 22999,
        17996, 17996, 22999, 22999, 17996, 17996, 17996,  2319, 17996, 18172,
        17996, 17996, 22999, 22999, 17996, 17996, 17996, 22999, 17996, 17996,
        22999, 17996, 17996, 19180, 17996, 17996, 22999, 17996, 17996, 22999])
Actual Locations: tensor([28782, 18563, 34257, 17964, 16302, 15959, 17766,  3871, 19252, 16370,
         9111, 24168, 23442, 12166, 11673, 28947, 12359, 21437, 12859, 10353,
        26152, 40738,  8359,  2521, 27359, 22703, 26538, 21885, 40756, 16606,
        17520, 19472, 16642,  3481, 27266, 35966, 19095, 14692, 26958, 13700,
        22659, 25109, 14152, 29546, 25187, 15066,  8463, 17996, 24747,  3124])

