In [1]:
import pandas as pd
import numpy as np
import torch
import glob
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from scipy.spatial.distance import cdist

In [2]:
def normalize_coordinate(latitude, longitude):
    global min_lat, max_lat, min_long, max_long
    normalized_lat = 2 * (latitude - min_lat) / (max_lat - min_lat) - 1
    normalized_long = 2 * (longitude - min_long) / (max_long - min_long) - 1
    return normalized_lat, normalized_long

def denormalize_coordinate(normalized_lat, normalized_long):
    latitude = ((normalized_lat + 1) / 2) * (max_lat - min_lat) + min_lat
    longitude = ((normalized_long + 1) / 2) * (max_long - min_long) + min_long
    return latitude, longitude

def read_data(file):
    training = []
    df = pd.read_csv(file)
    users = df['uid'].unique()
    X, y = [], []
    for user in users:
        new_df = df[(df['uid'] == user)]
        LOOKBACK = 3
        for i in range(1, 48-LOOKBACK):
            feature = [(normalize_coordinate(lat, long)) for lat, long in zip(new_df[i:i + LOOKBACK]['lat'], new_df[i:i + LOOKBACK]['long'])]
            target = [(normalize_coordinate(lat, long)) for lat, long in zip(new_df[i + LOOKBACK:i + LOOKBACK + 1]['lat'], new_df[i + LOOKBACK:i + LOOKBACK + 1]['long'])]
            X.append(feature)
            y.append(target)
    
    X = np.array([np.array(feature) for feature in X])
    y = np.array([np.array(target) for target in y])
    return torch.tensor(X, dtype=torch.float64), torch.tensor(y, dtype=torch.float64)

In [3]:

def get_training_data(train_folder):
    # training data preparation
    outputs = []
    for file in glob.glob(train_folder):
            outputs.append(read_data(file))
    features_list, targets_list = zip(*outputs)
    train_features = torch.cat(features_list, dim=0)
    train_targets = torch.cat(targets_list, dim=0)
    return train_features, train_targets

def get_testing_data(test_folder):
    # testing data preparation
    outputs = []
    for file in glob.glob(test_folder):
            outputs.append(read_data(file))
    features_list, targets_list = zip(*outputs)
    test_features = torch.cat(features_list, dim=0)
    test_targets = torch.cat(targets_list, dim=0)
    return test_features, test_targets


## Retrieving training and testing data for lstm

In [4]:
# Defining parameters
# Reading data GEOlife
beijing_min_longitude, beijing_max_longitude = 115.416827, 117.508251
beijing_min_latitude, beijing_max_latitude = 39.442078, 41.058964

# Reading data SHANGHAI
shanghai_min_longitude = 120.9
shanghai_max_longitude = 121.9
shanghai_min_latitude = 30.69
shanghai_max_latitude = 31.51

min_long = beijing_min_longitude
max_long = beijing_max_longitude
min_lat = beijing_min_latitude
max_lat = beijing_max_latitude

# Data folder selected from preprocessed data
train_folder = './training_data/*'
test_folder = './testing_data/*'

train_features, train_targets = get_training_data(train_folder)
test_features, test_targets = get_testing_data(test_folder)


## Model definition and training

In [5]:
# model definition
 
class TrajModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=2, hidden_size=40, num_layers=2, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(40, 2)
    def forward(self, x):
        x, _ = self.lstm(x.float())
        x = self.dropout(x)
        x = self.linear(x[:, -1, :])
        return x

def lstm_training(train_features, train_targets, test_features, test_targets):
    model = TrajModel()
    optimizer = optim.Adam(model.parameters())
    loss_fn = nn.MSELoss()
    loader = data.DataLoader(data.TensorDataset(train_features, train_targets), shuffle=True, batch_size=8)
    # Data shape = (8, 3, 2), each batch have 8 3-pairs coordinate
    n_epochs = 100
    for epoch in range(n_epochs):
        model.train()
        for X_batch, y_batch in loader:
            y_pred = model(X_batch).float()
            y_pred = torch.unsqueeze(y_pred, 1)
            loss = loss_fn(y_pred.float(), y_batch.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Validation
        if epoch % 10 != 0:
            continue
        model.eval()
        with torch.no_grad():
            y_pred = model(train_features)
            y_pred = torch.unsqueeze(y_pred, 1)
            train_rmse = np.sqrt(loss_fn(y_pred, train_targets))
            y_pred = model(test_features)
            y_pred = torch.unsqueeze(y_pred, 1)
            test_rmse = np.sqrt(loss_fn(y_pred, test_targets))
        print("Epoch %d: train mse %.4f, test mse %.4f" % (epoch, train_rmse, test_rmse))
    return model

In [6]:
model = lstm_training(train_features, train_targets, test_features, test_targets)

Epoch 0: train mse 0.0224, test mse 0.0281
Epoch 10: train mse 0.0181, test mse 0.0232
Epoch 20: train mse 0.0179, test mse 0.0223
Epoch 30: train mse 0.0175, test mse 0.0205
Epoch 40: train mse 0.0169, test mse 0.0196
Epoch 50: train mse 0.0172, test mse 0.0220
Epoch 60: train mse 0.0171, test mse 0.0203
Epoch 70: train mse 0.0173, test mse 0.0242
Epoch 80: train mse 0.0187, test mse 0.0251
Epoch 90: train mse 0.0160, test mse 0.0185


In [7]:
# Test model performance
for i in range(0, 1000, 100):
    input = test_features[i]
    print("Input:")
    print(input)
    input = torch.reshape(input, (1, 3, 2))
    y = model(input)
    print("LSTM predicted next coordinates:")
    print(y)
    print("Real output:")
    print(test_targets[i])
    print("-------------")
# Save model
torch.save(model, 'lstm_model_40_2.pt')

Input:
tensor([[-0.3098, -0.1296],
        [-0.3098, -0.1296],
        [-0.3098, -0.1296]], dtype=torch.float64)
LSTM predicted next coordinates:
tensor([[-0.3063, -0.1265]], grad_fn=<AddmmBackward0>)
Real output:
tensor([[-0.3098, -0.1296]], dtype=torch.float64)
-------------
Input:
tensor([[-0.3621, -0.1273],
        [-0.3407, -0.1250],
        [-0.3295, -0.1123]], dtype=torch.float64)
LSTM predicted next coordinates:
tensor([[-0.3226, -0.1092]], grad_fn=<AddmmBackward0>)
Real output:
tensor([[-0.3840, -0.0325]], dtype=torch.float64)
-------------
Input:
tensor([[-0.3287, -0.1486],
        [-0.3287, -0.1486],
        [-0.3287, -0.1486]], dtype=torch.float64)
LSTM predicted next coordinates:
tensor([[-0.3174, -0.1481]], grad_fn=<AddmmBackward0>)
Real output:
tensor([[-0.3287, -0.1486]], dtype=torch.float64)
-------------
Input:
tensor([[-0.2221, -0.0453],
        [-0.2221, -0.0453],
        [-0.2221, -0.0453]], dtype=torch.float64)
LSTM predicted next coordinates:
tensor([[-0.2335, -0

## Use model for predictions

### Read from raw data and make predictions using lstm model

In [14]:
def append_traj(current_traj, next_df):
    all_traj = current_traj
    if len(current_traj[0]) != 1: 
        # get the last point to compare and find smallest distance only
        current_traj = [[item[-1]] for item in current_traj]
    next_points = []
    for i in range(len(next_df)):
        next_points.append([next_df['lat'][i], next_df['long'][i]])
    for index, item in enumerate(current_traj):
        distances = cdist(item,next_points)
        min_index = np.argmin(distances)
        all_traj[index].append(next_points[min_index])
        next_points.pop(min_index)
    return all_traj

In [15]:
def recover_traj(csv_file):
    # read from data
    df = pd.read_csv(csv_file)
    time_df = df[(df['t'] == 0)]
    time_df.reset_index(drop=True, inplace=True)
    # append to 3
    all_traj = []
    current_traj = []
    for i in range(len(time_df)):
        current_traj.append([[time_df['lat'][i], time_df['long'][i]]])
    time = 1
    while len(current_traj[0]) < 3:
        next_df = df[(df['t'] == time)]
        next_df.reset_index(drop=True, inplace=True)
        new_traj = append_traj(current_traj, next_df)
        current_traj = new_traj
        time += 1
    # use model to fill point 3-47
    while time < 48:
        for index, traj in enumerate(current_traj):
            input = [(normalize_coordinate(lat, long)) for lat, long in zip([item[0] for item in traj], [item[1] for item in traj])]
            # get the last 3 only
            input = input[-3:]
            input = torch.tensor(input, dtype=torch.float64)
            input = torch.reshape(input, (1, 3, 2))
            y = model(input)
            x, y = denormalize_coordinate(y[0][0],y[0][1])
            new_coor = [x.item(),y.item()]
            current_traj[index].append(new_coor)
        
        next_df = df[(df['t'] == time)]
        next_df.reset_index(drop=True, inplace=True)
        # get closest point for every current trajectories
        current_traj = append_traj(current_traj, next_df)
        # remove intermediate point predicted by model
        for index, item in enumerate(current_traj):
            current_traj[index].pop(-2)
        # update time
        time += 1
    # all final traj with 48 points each
    return current_traj

In [21]:
def export_datafram(df, current_traj):
    uid = (df['uid'].unique())
    data = []

    for index, item in enumerate(current_traj):
        t = 0
        for time in item:
            data.append({'uid': uid[index], 't': t, 'lat': time[0], 'long': time[1]})
            t += 1

    expected_df = pd.DataFrame(data)
    expected_df['t'] = expected_df['t'].astype(int)

    expected_df.to_csv('expected.csv', index=False)
    df.to_csv('actual.csv', index=False)

In [24]:
csv_file = 'testing_data/2009-05-25.csv'
df = pd.read_csv(csv_file)
current_traj = recover_traj(csv_file)
export_datafram(df, current_traj)
