# Identifying tourism activities

Load the necessary libraries.

In [1]:
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import datablox_od
import geopandas as gpd
import pandas as pd
import pyarrow.parquet as pq

# Keep displayed data frames short: pandas truncates anything longer than 5 rows
pd.set_option("display.max_rows", 5)

# Reload imported modules automatically before running each cell
%load_ext autoreload
%autoreload 2

Folder names

In [2]:
# Root folders of the tutorial's input data and of everything it writes
SAMPLE_DATA_DIRECTORY = os.path.join("..", "sample_data")
SAMPLE_OUTPUT_DIRECTORY = os.path.join("..", "sample_output")

# One output folder per stage of the tourism pipeline
STAYPOINTS_DIRECTORY = os.path.join(SAMPLE_OUTPUT_DIRECTORY, "tourism", "staypoints")
TRIPS_DIRECTORY = os.path.join(SAMPLE_OUTPUT_DIRECTORY, "tourism", "trips")
TOURISM_TRIPS_DIRECTORY = os.path.join(
    SAMPLE_OUTPUT_DIRECTORY, "tourism", "tourism_trips"
)

# Create the output folders up front so that later cells can write into them
for output_directory in (
    STAYPOINTS_DIRECTORY,
    TRIPS_DIRECTORY,
    TOURISM_TRIPS_DIRECTORY,
):
    os.makedirs(output_directory, exist_ok=True)

<hr>

## I. Detecting staypoints

We start by identifying staypoints (locations where a person stayed for some period of time). Examples of staypoints include one's place of residence, workplace, and visited tourist attractions.

In [3]:
# Input folder: each parquet file holds the pings of one device
PINGS_FOR_TOURISM_DIRECTORY = os.path.join(
    SAMPLE_DATA_DIRECTORY, "tourism", "pings_per_device"
)

# Lookup table that is later passed to map_pings_to_areas()
h3_to_area_file = os.path.join(SAMPLE_OUTPUT_DIRECTORY, "h3_to_area.parquet")
h3_to_area = pd.read_parquet(h3_to_area_file)

We write utility functions for:
- Identifying staypoints using DataBlox-OD's `datablox_od.trajectory.detect_staypoints()`
- Mapping these staypoints to Thailand's administrative boundaries using DataBlox-OD's `datablox_od.preprocessing.map_pings_to_areas()`

In [4]:
def detect_staypoints_and_map_to_administrative_regions(
    pings_file, verbose, skip_if_staypoints_file_exists=True
):
    """
    Detect the staypoints of one device and map them to administrative regions.

    `pings_file` is the name of a parquet file inside PINGS_FOR_TOURISM_DIRECTORY;
    its stem is stored as the device ID of the staypoints. The result is written
    under the same file name inside STAYPOINTS_DIRECTORY and returned.

    Nothing is computed or returned if `skip_if_staypoints_file_exists` is set and
    the output file already exists. If no staypoints are detected, no output file
    is written and nothing is returned.
    """
    output_path = os.path.join(STAYPOINTS_DIRECTORY, pings_file)
    if skip_if_staypoints_file_exists and os.path.exists(output_path):
        return None

    detected = datablox_od.trajectory.detect_staypoints(
        gpd.read_parquet(os.path.join(PINGS_FOR_TOURISM_DIRECTORY, pings_file)),
        coordinates_column="geometry",
        timestamp_column="timestamp",
        max_distance_between_pings_in_km=1,
        min_length_of_stay_in_minutes=30,
        max_days_between_two_consecutive_pings=1,
        verbose=verbose,
    )
    if detected.empty:
        return None

    # The device ID is not stored with the staypoints; recover it from the file name
    detected["device_id"] = Path(pings_file).stem
    mapped = map_to_administrative_regions(detected, verbose)
    mapped.to_parquet(output_path)

    return mapped


def map_to_administrative_regions(staypoints, verbose):
    """
    Map staypoints to administrative areas using the `h3_to_area` lookup table.

    After the mapping, the area names are rewritten as `#`-joined paths:
    `ADM2_EN` becomes `<ADM1_EN>#<ADM2_EN>` and `ADM3_EN` becomes
    `<ADM1_EN>#<ADM2_EN>#<ADM3_EN>`.
    """
    mapped = datablox_od.preprocessing.map_pings_to_areas(
        staypoints,
        h3_to_area,
        spatial_index_column="h3_cell",
        device_id_column="device_id",
        coordinates_column="geometry",
        timestamp_column="time_arrive",
        spatial_indexing_system="h3",
        spatial_index_resolution=8,
        verbose=verbose,
    )

    # Order matters: ADM2_EN must be prefixed first because ADM3_EN is built
    # from the already-prefixed ADM2_EN
    for parent_level, child_level in (("ADM1_EN", "ADM2_EN"), ("ADM2_EN", "ADM3_EN")):
        mapped[child_level] = mapped[parent_level] + "#" + mapped[child_level]

    return mapped

Just for illustrative purposes, consider the pings from the device with ID `B79C7F07-A0EC-4568-A5E3-1FFB061AA679`.

In [5]:
# Pings of the sample device used for illustration throughout this notebook
sample_pings_file = os.path.join(
    PINGS_FOR_TOURISM_DIRECTORY, "B79C7F07-A0EC-4568-A5E3-1FFB061AA679.parquet"
)
pings = gpd.read_parquet(sample_pings_file)
pings

Unnamed: 0,device_id,timestamp,h3,ADM3_EN,ADM2_EN,ADM1_EN,geometry
491316,B79C7F07-A0EC-4568-A5E3-1FFB061AA679,2024-05-01 07:20:41,88658aaad1fffff,Rayong#Pluak Daeng#Ta Sit,Rayong#Pluak Daeng,Rayong,POINT (101.19907 13.05344)
410932,B79C7F07-A0EC-4568-A5E3-1FFB061AA679,2024-05-01 07:20:46,88658aaad1fffff,Rayong#Pluak Daeng#Ta Sit,Rayong#Pluak Daeng,Rayong,POINT (101.19908 13.05342)
...,...,...,...,...,...,...,...
2110360,B79C7F07-A0EC-4568-A5E3-1FFB061AA679,2024-12-01 06:23:20,88658aaad1fffff,Rayong#Pluak Daeng#Ta Sit,Rayong#Pluak Daeng,Rayong,POINT (101.19767 13.05416)
95195,B79C7F07-A0EC-4568-A5E3-1FFB061AA679,2024-12-01 06:43:18,88658aaad1fffff,Rayong#Pluak Daeng#Ta Sit,Rayong#Pluak Daeng,Rayong,POINT (101.19766 13.05416)


From these pings, we identify the staypoints.

In [6]:
# skip_if_staypoints_file_exists=False forces recomputation, so the staypoints are
# returned (and displayed) even if an output file from a previous run exists
detect_staypoints_and_map_to_administrative_regions(
    "B79C7F07-A0EC-4568-A5E3-1FFB061AA679.parquet",
    verbose=False,
    skip_if_staypoints_file_exists=False,
)

Unnamed: 0,time_arrive,time_depart,num_pings_in_staypoint,num_minutes_stayed,geometry,device_id,h3_cell,ADM1_EN,ADM2_EN,ADM3_EN
0,2024-05-01 07:20:41,2024-05-01 17:16:14,108,595.550000,POINT (101.19328 13.04791),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aaadbfffff,Rayong,Rayong#Pluak Daeng,Rayong#Pluak Daeng#Ta Sit
1,2024-05-01 17:27:25,2024-05-01 19:25:08,40,117.716667,POINT (101.20227 13.01753),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aa113fffff,Rayong,Rayong#Pluak Daeng,Rayong#Pluak Daeng#Pluak Daeng
...,...,...,...,...,...,...,...,...,...,...
704,2024-11-30 15:26:48,2024-11-30 16:11:41,26,44.883333,POINT (100.61187 14.31917),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,8864a408ddfffff,Phra Nakhon Si Ayutthaya,Phra Nakhon Si Ayutthaya#Bang Pa-In,Phra Nakhon Si Ayutthaya#Bang Pa-In#Ban Krot
705,2024-11-30 20:04:50,2024-11-30 22:49:16,10,164.433333,POINT (100.47945 13.94505),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,8864a4ab5dfffff,Nonthaburi,Nonthaburi#Pak Kret,Nonthaburi#Pak Kret#Khlong Khoi


For the actual tutorial, we are going to identify the staypoints of all the devices in our dataset. To speed this up, we employ multithreading using Python's built-in [`ThreadPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html).

In [7]:
# Adjust max_workers depending on your machine's available resources
with ThreadPoolExecutor(max_workers=2048) as executor:
    # Remember which file each future belongs to so that failures can be reported
    futures = {
        executor.submit(
            detect_staypoints_and_map_to_administrative_regions, file, verbose=False
        ): file
        for file in os.listdir(PINGS_FOR_TOURISM_DIRECTORY)
    }

# An exception raised inside a worker is only stored on its future; unless it is
# retrieved, the failure goes unnoticed and the device is silently missing from
# the output. Leaving the `with` block waits for all futures, so they are done here.
failures = {}
for future, file in futures.items():
    error = future.exception()
    if error is not None:
        failures[file] = error
        print(f"Failed to detect staypoints for {file}: {error!r}")

<hr>

## II. Identifying staypoints in the districts of residence

Staypoints become useful once they carry semantic labels: a staypoint can be a person's place of residence, a workplace, or a visited tourist attraction, among others.

For now, we are interested in identifying which staypoints fall inside each device's district of residence. This information will be used later on to identify trips.

In [8]:
# Input folder of residence files; they are read below as `<year>-<month>.parquet`
# files with `device_id` and `residence` columns
RESIDENCE_FOR_TOURISM_DIRECTORY = os.path.join(
    SAMPLE_DATA_DIRECTORY, "tourism", "residence"
)


def identify_staypoints_in_residence(staypoints_file, skip_if_staypoints_labeled=True):
    """
    Flag the staypoints of one device that fall inside its district of residence.

    Every file in RESIDENCE_FOR_TOURISM_DIRECTORY is expected to be named
    `<year>-<month>.parquet`. For each of them, the device's `residence` value is
    looked up, and the staypoints that arrived in that month with a matching
    `ADM2_EN` are flagged in the column `is_staypoint_in_district_of_residence`.
    The staypoints file is overwritten with the labeled staypoints, which are
    also returned.

    Nothing is done or returned if `skip_if_staypoints_labeled` is set and the
    staypoints file already contains the column.
    """
    label_column = "is_staypoint_in_district_of_residence"
    staypoints_path = os.path.join(STAYPOINTS_DIRECTORY, staypoints_file)

    if skip_if_staypoints_labeled:
        # Only the parquet schema is inspected here, not the data itself
        if label_column in pq.ParquetFile(staypoints_path).schema.names:
            return None

    staypoints = gpd.read_parquet(staypoints_path)
    staypoints[label_column] = False

    # The device ID is the file name without its extension
    device = Path(staypoints_file).stem

    for residence_file in os.listdir(RESIDENCE_FOR_TOURISM_DIRECTORY):
        year, month = map(int, Path(residence_file).stem.split("-"))
        monthly_residence = pd.read_parquet(
            os.path.join(RESIDENCE_FOR_TOURISM_DIRECTORY, residence_file),
            filters=[("device_id", "==", device)],
        )
        if monthly_residence.empty:
            # No residence is known for this device in this month
            continue

        district = monthly_residence["residence"].to_numpy()[0]
        arrived_in_district_that_month = (
            (staypoints["time_arrive"].dt.year == year)
            & (staypoints["time_arrive"].dt.month == month)
            & (staypoints["ADM2_EN"] == district)
        )
        staypoints.loc[arrived_in_district_that_month, label_column] = True

    staypoints.to_parquet(staypoints_path)
    return staypoints

Just for illustrative purposes, we identify which staypoints of the device with ID `B79C7F07-A0EC-4568-A5E3-1FFB061AA679` fall inside its district of residence.

In [9]:
# skip_if_staypoints_labeled=False relabels the staypoints even if the file already
# has the column, so the labeled staypoints are always returned (and displayed)
identify_staypoints_in_residence(
    "B79C7F07-A0EC-4568-A5E3-1FFB061AA679.parquet",
    skip_if_staypoints_labeled=False,
)

Unnamed: 0,time_arrive,time_depart,num_pings_in_staypoint,num_minutes_stayed,geometry,device_id,h3_cell,ADM1_EN,ADM2_EN,ADM3_EN,is_staypoint_in_district_of_residence
0,2024-05-01 07:20:41,2024-05-01 17:16:14,108,595.550000,POINT (101.19328 13.04791),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aaadbfffff,Rayong,Rayong#Pluak Daeng,Rayong#Pluak Daeng#Ta Sit,True
1,2024-05-01 17:27:25,2024-05-01 19:25:08,40,117.716667,POINT (101.20227 13.01753),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aa113fffff,Rayong,Rayong#Pluak Daeng,Rayong#Pluak Daeng#Pluak Daeng,True
...,...,...,...,...,...,...,...,...,...,...,...
704,2024-11-30 15:26:48,2024-11-30 16:11:41,26,44.883333,POINT (100.61187 14.31917),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,8864a408ddfffff,Phra Nakhon Si Ayutthaya,Phra Nakhon Si Ayutthaya#Bang Pa-In,Phra Nakhon Si Ayutthaya#Bang Pa-In#Ban Krot,False
705,2024-11-30 20:04:50,2024-11-30 22:49:16,10,164.433333,POINT (100.47945 13.94505),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,8864a4ab5dfffff,Nonthaburi,Nonthaburi#Pak Kret,Nonthaburi#Pak Kret#Khlong Khoi,False


For the actual tutorial, we are going to perform this step for all the devices in our dataset. To speed this up, we again employ multithreading using Python's built-in [`ThreadPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html).

In [10]:
# Adjust max_workers depending on your machine's available resources
with ThreadPoolExecutor(max_workers=2048) as executor:
    # Remember which file each future belongs to so that failures can be reported
    futures = {
        executor.submit(identify_staypoints_in_residence, file): file
        for file in os.listdir(STAYPOINTS_DIRECTORY)
    }

# An exception raised inside a worker is only stored on its future; unless it is
# retrieved, the failure goes unnoticed and the device's staypoints stay unlabeled.
# Leaving the `with` block waits for all futures, so they are done here.
failures = {}
for future, file in futures.items():
    error = future.exception()
    if error is not None:
        failures[file] = error
        print(f"Failed to label staypoints for {file}: {error!r}")

<hr>

## III. Identifying trips

The United Nations' [_International Recommendation for Tourism Statistics 2008_](https://unstats.un.org/unsd/publication/seriesm/seriesm_83rev1e.pdf) defines a trip as a "travel by a person from the time of departure from his usual residence until he/she returns" (paragraph 2.7).

In this tutorial, we equate a device's usual residence to its district of residence.

We write a utility function using DataBlox-OD's `datablox_od.tourism.identify_trips()`.

In [11]:
def identify_trips(staypoints_file, verbose, skip_if_trips_file_exists=True):
    """
    Identify the trips of one device from its labeled staypoints.

    The staypoints are read from STAYPOINTS_DIRECTORY and must already contain the
    `is_staypoint_in_district_of_residence` column. The trips are written under
    the same file name inside TRIPS_DIRECTORY and returned.

    Nothing is computed or returned if `skip_if_trips_file_exists` is set and the
    output file already exists. If no trips are identified, no output file is
    written and nothing is returned.
    """
    trips_path = os.path.join(TRIPS_DIRECTORY, staypoints_file)
    if skip_if_trips_file_exists and os.path.exists(trips_path):
        return None

    labeled_staypoints = gpd.read_parquet(
        os.path.join(STAYPOINTS_DIRECTORY, staypoints_file)
    )
    trips = datablox_od.tourism.identify_trips(
        labeled_staypoints,
        arrival_time_column="time_arrive",
        departure_time_column="time_depart",
        residence_column="is_staypoint_in_district_of_residence",
        max_days_between_two_consecutive_staypoints=2,
        verbose=verbose,
    )
    if trips.empty:
        return None

    trips.to_parquet(trips_path)
    return trips

Just for illustrative purposes, we identify the trips taken by the device with ID `B79C7F07-A0EC-4568-A5E3-1FFB061AA679`.

In [12]:
# skip_if_trips_file_exists=False forces recomputation, so the trips are returned
# (and displayed) even if an output file from a previous run exists
identify_trips(
    "B79C7F07-A0EC-4568-A5E3-1FFB061AA679.parquet",
    verbose=False,
    skip_if_trips_file_exists=False,
)

Unnamed: 0_level_0,time_arrive,time_depart,num_pings_in_staypoint,num_minutes_stayed,geometry,device_id,h3_cell,ADM1_EN,ADM2_EN,ADM3_EN,is_staypoint_in_district_of_residence
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2024-05-01 17:27:25,2024-05-01 19:25:08,40,117.716667,POINT (101.20227 13.01753),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aa113fffff,Rayong,Rayong#Pluak Daeng,Rayong#Pluak Daeng#Pluak Daeng,True
0,2024-05-01 19:28:46,2024-05-02 11:54:36,118,985.833333,POINT (101.19361 13.05194),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aaad3fffff,Chon Buri,Chon Buri#Si Racha,Chon Buri#Si Racha#Khao Khan Song,False
...,...,...,...,...,...,...,...,...,...,...,...
117,2024-11-29 14:41:42,2024-11-29 15:28:03,10,46.350000,POINT (100.67193 14.35126),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,8864a4082bfffff,Phra Nakhon Si Ayutthaya,Phra Nakhon Si Ayutthaya#Uthai,Phra Nakhon Si Ayutthaya#Uthai#Uthai,False
117,2024-11-29 19:46:17,2024-11-29 23:07:14,28,200.950000,POINT (101.26586 12.96887),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aa305fffff,Rayong,Rayong#Pluak Daeng,Rayong#Pluak Daeng#Pluak Daeng,True


For the actual tutorial, we are going to identify the trips taken by all the devices in our dataset. To speed this up, we again employ multithreading using Python's built-in [`ThreadPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html).

In [13]:
# Adjust max_workers depending on your machine's available resources
with ThreadPoolExecutor(max_workers=2048) as executor:
    # Remember which file each future belongs to so that failures can be reported
    futures = {
        executor.submit(identify_trips, file, verbose=False): file
        for file in os.listdir(STAYPOINTS_DIRECTORY)
    }

# An exception raised inside a worker is only stored on its future; unless it is
# retrieved, the failure goes unnoticed and the device is silently missing from
# the output. Leaving the `with` block waits for all futures, so they are done here.
failures = {}
for future, file in futures.items():
    error = future.exception()
    if error is not None:
        failures[file] = error
        print(f"Failed to identify trips for {file}: {error!r}")

<hr>

## IV. Identifying tourism trips

Finally, we now identify which among our trips are tourism trips.

We write a utility function using DataBlox-OD's `datablox_od.tourism.identify_tourism_trips()`. Note that this function returns two data frames: the trips and the statistics on each of them (for example, trip duration).

In [14]:
def identify_tourism_trips(trips_file, verbose, skip_if_tourism_trips_file_exists=True):
    """
    Identify which trips of one device are tourism trips.

    The trips are read from TRIPS_DIRECTORY. Two files are written to
    TOURISM_TRIPS_DIRECTORY: the tourism trips (same file name as the input) and
    their statistics (`<stem>-statistics.parquet`). Both data frames are returned
    as the tuple `(tourism_trips, statistics)`.

    Nothing is computed or returned if `skip_if_tourism_trips_file_exists` is set
    and both output files already exist. If no tourism trips are identified, no
    output file is written and nothing is returned.
    """
    tourism_trips_path = os.path.join(TOURISM_TRIPS_DIRECTORY, trips_file)
    statistics_path = os.path.join(
        TOURISM_TRIPS_DIRECTORY, f"{Path(trips_file).stem}-statistics.parquet"
    )

    # Skip only when both outputs of a previous run are present
    if (
        skip_if_tourism_trips_file_exists
        and os.path.exists(tourism_trips_path)
        and os.path.exists(statistics_path)
    ):
        return None

    tourism_trips, statistics = datablox_od.tourism.identify_tourism_trips(
        gpd.read_parquet(os.path.join(TRIPS_DIRECTORY, trips_file)),
        coordinates_column="geometry",
        arrival_time_column="time_arrive",
        departure_time_column="time_depart",
        # Tourism trips should involve visiting another province
        administrative_region_column="ADM1_EN",
        trip_history_length_in_months=1,
        max_frequency_in_trip_history=1,
        max_trip_duration_in_days=30,
        min_distance_from_origin_in_km=30,
        verbose=verbose,
    )
    if tourism_trips.empty:
        return None

    tourism_trips.to_parquet(tourism_trips_path)
    statistics.to_parquet(statistics_path)
    return tourism_trips, statistics

Just for illustrative purposes, we identify the tourism trips taken by the device with ID `B79C7F07-A0EC-4568-A5E3-1FFB061AA679`.

In [15]:
# skip_if_tourism_trips_file_exists=False forces recomputation even if the output
# files of a previous run exist.
# NOTE: identify_tourism_trips() returns None when no tourism trips are identified,
# in which case this unpacking raises a TypeError.
tourism_trips, tourism_trip_statistics = identify_tourism_trips(
    "B79C7F07-A0EC-4568-A5E3-1FFB061AA679.parquet",
    verbose=False,
    skip_if_tourism_trips_file_exists=False,
)

In [16]:
# First data frame returned by identify_tourism_trips(): the tourism trips
tourism_trips

Unnamed: 0_level_0,time_arrive,time_depart,num_pings_in_staypoint,num_minutes_stayed,geometry,device_id,h3_cell,ADM1_EN,ADM2_EN,ADM3_EN,is_staypoint_in_district_of_residence,distance_from_origin_in_km
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5,2024-05-07 14:38:10,2024-05-07 19:14:35,27,276.416667,POINT (101.19222 13.04789),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aaadbfffff,Rayong,Rayong#Pluak Daeng,Rayong#Pluak Daeng#Ta Sit,True,0.000000
5,2024-05-07 19:40:13,2024-05-07 21:28:49,12,108.600000,POINT (101.18386 13.0519),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aa1a5fffff,Chon Buri,Chon Buri#Si Racha,Chon Buri#Si Racha#Khao Khan Song,False,1.008752
...,...,...,...,...,...,...,...,...,...,...,...,...
117,2024-11-29 14:41:42,2024-11-29 15:28:03,10,46.350000,POINT (100.67193 14.35126),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,8864a4082bfffff,Phra Nakhon Si Ayutthaya,Phra Nakhon Si Ayutthaya#Uthai,Phra Nakhon Si Ayutthaya#Uthai#Uthai,False,154.362889
117,2024-11-29 19:46:17,2024-11-29 23:07:14,28,200.950000,POINT (101.26586 12.96887),B79C7F07-A0EC-4568-A5E3-1FFB061AA679,88658aa305fffff,Rayong,Rayong#Pluak Daeng,Rayong#Pluak Daeng#Pluak Daeng,True,11.990752


In [17]:
# Second data frame returned by identify_tourism_trips(): statistics on the trips
tourism_trip_statistics

Unnamed: 0_level_0,num_administrative_regions_visited,trip_duration_in_days,farthest_distance_from_origin_in_km,trip_frequency
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,2,1.326979,155.872766,0
6,3,0.471389,153.863722,0
...,...,...,...,...
111,2,0.606065,153.861290,1
117,4,6.537083,156.356792,1


For the actual tutorial, we are going to identify the tourism trips taken by all the devices in our dataset. To speed this up, we again employ multithreading using Python's built-in [`ThreadPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html).

In [18]:
# Adjust max_workers depending on your machine's available resources
with ThreadPoolExecutor(max_workers=2048) as executor:
    # Remember which file each future belongs to so that failures can be reported
    futures = {
        executor.submit(identify_tourism_trips, file, verbose=False): file
        for file in os.listdir(TRIPS_DIRECTORY)
    }

# An exception raised inside a worker is only stored on its future; unless it is
# retrieved, the failure goes unnoticed and the device is silently missing from
# the output. Leaving the `with` block waits for all futures, so they are done here.
failures = {}
for future, file in futures.items():
    error = future.exception()
    if error is not None:
        failures[file] = error
        print(f"Failed to identify tourism trips for {file}: {error!r}")

In summary, identifying tourism trips from raw GPS pings involves four steps:
1. Detect staypoints (locations where a device stayed for some period of time)
1. Identify which staypoints are associated with the device's usual residence
1. Identify trips (travels that start and end at the device's usual residence)
1. Identify which trips are tourism trips