In [1]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import numpy as np
import glob

In [2]:
# 1 - Determine the list of TSVs to convert to a dataframe.
files = sorted(
    glob.glob("./AgentStateTable-*.tsv"),
    key=lambda fn: int(fn.split("AgentStateTable-")[1].split(".tsv")[0])
)
files

['./AgentStateTable-1.tsv', './AgentStateTable-2.tsv']

In [3]:
# 2 - Parse the TSVs: retrieve the positions of the agents.
list_df = []
for f in files :
    print(f'Processing file {f}...')
    
    df = pd.read_csv(
        f,
        sep="\t",
        usecols=[1, 2, 3],
        dtype={2: str, 3: np.uint32}
    )
    # rename to friendlier column names
    df.columns = ["timestamp", "geometry", "ID"]

    # Convert the dates.
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # 2. Convert the WKT POINT strings into actual geometry objects
    df["geometry"] = df["geometry"].apply(wkt.loads)

    # Append to list.
    list_df.append(df)


# Concatenate everything.
df = pd.concat(list_df)
del list_df

Processing file ./AgentStateTable-1.tsv...
Processing file ./AgentStateTable-2.tsv...


In [4]:
# 3 - Create a GeoDataFrame, with the CRS set to the one used by the authors of 'pol' for all the maps, i.e., EPSG:26916.
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:26916")

# Project the coordinates to WGS84.
gdf.to_crs(epsg=4326, inplace=True)

display(gdf)
display(gdf.info())

Unnamed: 0,timestamp,geometry,ID
0,2019-07-01 00:00:00,POINT (-84.40031 33.75035),0
1,2019-07-01 00:00:00,POINT (-84.3743 33.73444),1
2,2019-07-01 00:00:00,POINT (-84.37928 33.734),2
3,2019-07-01 00:00:00,POINT (-84.36595 33.73923),3
4,2019-07-01 00:00:00,POINT (-84.38093 33.75587),4
...,...,...,...
1196279,2019-07-03 00:01:00,POINT (-84.41069 33.75307),995
1196280,2019-07-03 00:01:00,POINT (-84.37174 33.74393),996
1196281,2019-07-03 00:01:00,POINT (-84.3659 33.75888),997
1196282,2019-07-03 00:01:00,POINT (-84.41085 33.75312),998


<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 2881999 entries, 0 to 1196283
Data columns (total 3 columns):
 #   Column     Dtype         
---  ------     -----         
 0   timestamp  datetime64[ns]
 1   geometry   geometry      
 2   ID         uint32        
dtypes: datetime64[ns](1), geometry(1), uint32(1)
memory usage: 77.0 MB


None

In [5]:
gdf.to_parquet('./trajs.parquet')