# Exploratory analysis of T-Drive dataset

## Requirements

- T-Drive dataset `.txt` files located in `{ROOT}/data/tdrive`

## Import

In [1]:
import pandas as pd
import numpy as np
import folium
import matplotlib.pyplot as plt
import tqdm
import os

## Read dataset

In [2]:
from collections import OrderedDict
from src.path import default_tdrive_dataset_path

# Holder for the (lat, long) samples; bound to one stacked ndarray once the loading loop below has run.
X = []

def read_txt(
    fname: str,
    lat_range: tuple = (39.75, 40.026),
    long_range: tuple = (116.2, 116.55),
) -> pd.DataFrame:
    """Load one trajectory file and keep only the points inside a bounding box.

    Parameters
    ----------
    fname : str
        Path to a headerless, comma-separated file whose four columns are
        read as ``uid``, ``t``, ``long`` and ``lat``. ``t`` must be formatted
        as ``%Y-%m-%d %H:%M:%S``.
    lat_range : tuple of (min, max)
        Inclusive latitude bounds of the box. Defaults to the values that
        were previously hard-coded in this function.
    long_range : tuple of (min, max)
        Inclusive longitude bounds of the box. Defaults to the values that
        were previously hard-coded in this function.

    Returns
    -------
    pd.DataFrame
        The rows that fall inside the box, with ``t`` parsed to datetimes.
        The original row index is preserved (it is not reset).
    """
    df = pd.read_csv(fname, sep=',', header=None, names=['uid', 't', 'long', 'lat'])
    df['t'] = pd.to_datetime(df['t'], format='%Y-%m-%d %H:%M:%S')

    lat_min, lat_max = lat_range
    long_min, long_max = long_range

    # Series.between is inclusive on both ends, matching the original `<=` comparisons.
    inside_box = df['lat'].between(lat_min, lat_max) & df['long'].between(long_min, long_max)

    return df[inside_box]

# Resolve the dataset directory once instead of on every iteration.
dataset_dir = default_tdrive_dataset_path()

# Only trajectory files are parsed; sorting makes the row order of X independent of
# the arbitrary order in which os.listdir returns directory entries.
txt_files = sorted(name for name in os.listdir(dataset_dir) if name.endswith('.txt'))
if not txt_files:
    raise FileNotFoundError(f'no .txt files found in {dataset_dir}')

# Per-file arrays are gathered in their own list so that X only ever names the
# final stacked array.
coords_per_file = [
    read_txt(os.path.join(dataset_dir, fname))[['lat', 'long']].to_numpy()
    for fname in tqdm.tqdm(txt_files)
]

# Columns of X are (lat, long); one row per retained sample.
X = np.concatenate(coords_per_file, axis=0)


  0%|          | 0/10357 [00:00<?, ?it/s]

100%|██████████| 10357/10357 [00:39<00:00, 259.65it/s]


## Compute clusters

In [30]:
import folium

# Preview a random subset of the samples on an interactive map.
# A seeded generator keeps the preview identical across kernel restarts;
# indices are drawn with replacement, as before.
preview_rng = np.random.default_rng(0)
preview_points = X[preview_rng.integers(0, X.shape[0], size=1000)]

# Named `sample_map` rather than `map` so the built-in `map` is not shadowed.
sample_map = folium.Map(tiles="CartoDB Positron")

for lat, long in preview_points.tolist():
    folium.CircleMarker(location=(lat, long), radius=1, weight=2, color='black').add_to(sample_map)

# The map is created without a location, so zoom it to the plotted points explicitly.
sample_map.fit_bounds([preview_points.min(axis=0).tolist(), preview_points.max(axis=0).tolist()])

sample_map

In [7]:
from sklearn.mixture import GaussianMixture

# Fit a 100-component spherical Gaussian mixture on a random subsample of the points.
# Both the subsample and the estimator are seeded so the resulting clusters can be
# reproduced after a kernel restart.
gmm_rng = np.random.default_rng(0)
fit_rows = gmm_rng.integers(0, X.shape[0], size=500000)  # drawn with replacement, as before

m = GaussianMixture(n_components=100, covariance_type='spherical', verbose=1, random_state=0)
m.fit(X[fit_rows, :])

Initialization 0
  Iteration 10
Initialization converged: True


In [8]:
import math

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two points given in degrees.

    Applies the haversine formula on a sphere of radius 6378.137 km.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance between the two points in meters.
    """
    R = 6378.137  # Radius of earth in KM
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    dLat = phi2 - phi1
    dLon = math.radians(lon2) - math.radians(lon1)

    a = math.sin(dLat / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dLon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal points,
    # which would make sqrt(1 - a) raise ValueError; clamp before taking roots.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c * 1000  # meters

# Overlay the fitted mixture components on a random preview of the raw points.
# Named `cluster_map` rather than `map` so the built-in `map` is not shadowed.
cluster_map = folium.Map(tiles="CartoDB Positron")

# Seeded so the preview is identical across kernel restarts; drawn with replacement, as before.
preview_rng = np.random.default_rng(0)
preview_points = X[preview_rng.integers(0, X.shape[0], size=1000)]

for lat, long in preview_points.tolist():
    folium.CircleMarker(location=(lat, long), radius=1, weight=2, color='black').add_to(cluster_map)

# One disc per component, centred on its mean. The radius is two standard deviations
# (square root of the component's spherical variance) converted to meters by
# measuring that offset along the 39.89 degree parallel.
ref_lat = 39.89
for mean, variance in zip(m.means_, m.covariances_):
    centre_lat, centre_long = mean.tolist()
    two_sigma = 2 * math.sqrt(variance)
    folium.Circle(
        location=(centre_lat, centre_long),
        radius=haversine(ref_lat, centre_long, ref_lat, centre_long + two_sigma),
        fill_color='cornflowerblue',
        color='blue',
        fill=True,
        weight=1,
        fill_opacity=0.6
    ).add_to(cluster_map)

# The map is created without a location, so zoom it to the plotted points explicitly.
cluster_map.fit_bounds([preview_points.min(axis=0).tolist(), preview_points.max(axis=0).tolist()])

cluster_map

## Speed-aware interpolation

In [3]:
from src.data_preprocess.tdrive import read_txt, TDrivePreprocessConfig, average_speed, interpolate
from src.data_preprocess.point import NearestNeighborDiscretizer, CoordinateMap
from src.data_preprocess.interpolate import SpeedAwareLinearInterpolator

from datetime import datetime

# Trajectory of a single user, read with the project's `read_txt`.
uid = 1
df = read_txt(f'{default_tdrive_dataset_path()}/{uid}.txt')

# NOTE(review): tdrive_mog100.npy is not written by any cell visible in this notebook;
# presumably it holds the 100 mixture centres fitted above — confirm and document how
# the file is produced so the notebook survives Restart & Run All.
discretizer = NearestNeighborDiscretizer(np.load('tdrive_mog100.npy'))

config = TDrivePreprocessConfig(
    delta_min=30,
    start_date=datetime(2008, 2, 2),
    n_day=7,
    verbose=True,
    interp_trajectory=True,
    discretizer=discretizer
)

# Keep the config in sync with the trajectory actually loaded above instead of
# repeating the literal.
config.uid = uid

coord_map = CoordinateMap(ref=(39.915, 116.395))

# calculate expected velocity
expected_vel = average_speed(df)

# average sampling interval is around 177 seconds
# NOTE(review): the unit of sample_interval is not visible here; 3 looks like
# minutes (about 177 s) — confirm against SpeedAwareLinearInterpolator.
sample_interval = 3

interpolator = SpeedAwareLinearInterpolator(
    expected_speed=expected_vel,
    sample_interval=sample_interval
)

# NOTE(review): `interpolate` receives only `df` and `config`; `coord_map` and
# `interpolator` are not passed to it in this cell.
df1 = interpolate(df, config)


continuous interpolation: 100%|██████████| 588/588 [00:00<00:00, 13715.57it/s]


In [4]:
import folium

def plot_trajectory(df: pd.DataFrame):
    """Draw a trajectory on a folium map.

    Every row of ``df`` (columns ``lat`` and ``long``, taken in row order) is
    drawn as a small black dot. Consecutive rows at different positions are
    joined by a line; a row that repeats the previous position gets an
    additional, larger marker.

    Parameters
    ----------
    df : pd.DataFrame
        Trajectory points with ``lat`` and ``long`` columns.

    Returns
    -------
    folium.Map
        The map with all markers and segments added. When ``df`` is not empty
        the view is fitted to the plotted points.
    """
    # Local name avoids shadowing the built-in `map`.
    trajectory_map = folium.Map(tiles="CartoDB Positron")

    # Pull the two columns once instead of materialising a Series per row with iterrows.
    points = list(zip(df['lat'].tolist(), df['long'].tolist()))
    prev_loc = None

    for loc in points:
        folium.CircleMarker(location=loc, radius=1, weight=2, color='black').add_to(trajectory_map)

        if prev_loc is not None and prev_loc != loc:
            folium.PolyLine(locations=[prev_loc, loc], weight=2).add_to(trajectory_map)

        # Same position as the previous row: highlight it with a larger marker.
        if prev_loc == loc:
            folium.CircleMarker(location=loc, radius=3, weight=2).add_to(trajectory_map)

        prev_loc = loc

    # The map is created without a location, so zoom it to the trajectory explicitly.
    if points:
        lats, longs = zip(*points)
        trajectory_map.fit_bounds([(min(lats), min(longs)), (max(lats), max(longs))])

    return trajectory_map

# Raw samples recorded between 12:00 and 18:45 (both inclusive) on 2008-02-03.
raw_window_start = datetime(2008, 2, 3, 12, 0)
raw_window_end = datetime(2008, 2, 3, 18, 45)
raw_in_window = (df['t'] >= raw_window_start) & (df['t'] <= raw_window_end)
plot_trajectory(df[raw_in_window])

In [6]:
# Interpolated samples for the same 12:00-18:45 window (both inclusive) on 2008-02-03.
interp_window_start = datetime(2008, 2, 3, 12, 0)
interp_window_end = datetime(2008, 2, 3, 18, 45)
interp_in_window = (df1['t'] >= interp_window_start) & (df1['t'] <= interp_window_end)
plot_trajectory(df1[interp_in_window])