In [None]:
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
from functools import partial
import utils.data_preprocess_utils as dpu
import geopandas as gpd
from utils.data_preprocess_utils import get_config, get_all_files
from easydict import EasyDict

In [4]:
def process_file(file_path, cfg):
    """Extract the trajectories running between two areas from one raw file.

    Only paths ending in ``csv.gz`` are processed; for any other path the
    function returns ``None`` without touching the file.

    Args:
        file_path: Path of the file to process.
        cfg: Config object providing ``columns_to_extract`` (columns read from
            the CSV) and ``location_1`` / ``location_2`` (paths passed to
            ``gpd.read_file``).

    Returns:
        The ``(df_to, df_from)`` pair produced by
        ``dpu.filter_trajs_between``, or ``None`` when ``file_path`` does not
        end in ``csv.gz``.
    """
    if not file_path.endswith('csv.gz'):
        return None

    records = pd.read_csv(file_path, usecols=cfg.columns_to_extract, compression='gzip')

    # Rows lacking a position, speed or heading are dropped up front.
    required_columns = ['longitude', 'latitude', 'speed', 'heading']
    records = records.dropna(subset=required_columns).reset_index(drop=True)

    # The timestamp column is converted in place (same source and target name).
    records = dpu.convert_time_format(records, time_column_ori='timestamp', time_column_new='timestamp')

    # Keep a single row per (mmsi, timestamp) pair.
    records = records.drop_duplicates(subset=['mmsi', 'timestamp'])

    area_1 = gpd.read_file(cfg.location_1)
    area_2 = gpd.read_file(cfg.location_2)
    trajs_to, trajs_from = dpu.filter_trajs_between(
        records, area_1, area_2, group_column='mmsi', time_column='timestamp'
    )
    return (trajs_to, trajs_from)


def process_file_wrapper(args):
    """Call ``process_file`` with the positional arguments packed in ``args``.

    Args:
        args: Iterable that is star-unpacked into ``process_file``'s
            positional parameters (``file_path``, ``cfg``).

    Returns:
        Whatever ``process_file`` returns for those arguments.
    """
    return process_file(*args)


# Load the extraction settings and list the raw input files.
cfg = EasyDict(get_config('./cfg/route_extract_cfg.yaml'))
file_paths = get_all_files(cfg.raw_data_dir)

# Output file names are derived from the base names of the two location files.
location_1_name = cfg.location_1.split('/')[-1].split('.')[0]
location_2_name = cfg.location_2.split('/')[-1].split('.')[0]

# Per-file results are collected in lists and concatenated once at the end.
# Calling pd.concat inside the loop re-copies the accumulated frame for every
# file, which is quadratic in the total number of rows.
# Each list starts with an empty template frame so the final result keeps the
# configured column order even when no file yields any trajectory.
traj_from_parts = [pd.DataFrame(columns=cfg.columns_to_extract)]
traj_to_parts = [pd.DataFrame(columns=cfg.columns_to_extract)]

with Pool(processes=16) as pool:
    # Use tqdm to display progress
    with tqdm(total=len(file_paths), desc="Processing files", unit="file") as pbar:
        # Bind cfg so that the pool only has to supply the file path.
        partial_process_file = partial(process_file, cfg=cfg)

        # imap_unordered yields each result as soon as its worker finishes,
        # so results arrive in completion order rather than input order.
        results = pool.imap_unordered(partial_process_file, file_paths)

        for result in results:
            pbar.update(1)
            # process_file returns None for paths that do not end in 'csv.gz';
            # skip those instead of failing on tuple unpacking.
            if result is None:
                continue
            df_to, df_from = result
            traj_from_parts.append(df_from)
            traj_to_parts.append(df_to)

traj_from = pd.concat(traj_from_parts, ignore_index=True)
traj_to = pd.concat(traj_to_parts, ignore_index=True)

# NOTE(review): traj_from is written to the '<location_1>to<location_2>' file and
# traj_to to the '<location_2>to<location_1>' file — confirm this matches the
# direction semantics of dpu.filter_trajs_between.
traj_from.to_csv(f'{cfg.save_dir}{location_1_name}to{location_2_name}.csv.gz', index=False, compression='gzip')
traj_to.to_csv(f'{cfg.save_dir}{location_2_name}to{location_1_name}.csv.gz', index=False, compression='gzip')


Processing files: 100%|██████████| 402/402 [16:12<00:00,  2.42s/file]


In [5]:
import folium
import utils.visualize_utils as vis

# Reload the config and recompute the file names so this cell can be run
# without re-running the extraction cell.
cfg = EasyDict(get_config('./cfg/route_extract_cfg.yaml'))
location_1_name = cfg.location_1.split('/')[-1].split('.')[0]
location_2_name = cfg.location_2.split('/')[-1].split('.')[0]

# Read back the two trajectory sets written by the extraction step.
traj_from = pd.read_csv(f'{cfg.save_dir}{location_1_name}to{location_2_name}.csv.gz', compression='gzip')
traj_to = pd.read_csv(f'{cfg.save_dir}{location_2_name}to{location_1_name}.csv.gz', compression='gzip')

# One map per direction. m_from must be built from traj_from; it was previously
# built from traj_to, which made both maps show the same trajectories.
m_to = vis.visulize_trajs(traj_to, mode='lines', heatmap=False, save=False)
m_from = vis.visulize_trajs(traj_from, mode='lines', heatmap=False, save=False)

location_1_polygon = gpd.read_file(cfg.location_1)
location_2_polygon = gpd.read_file(cfg.location_2)

# Overlay both location polygons on both maps. Each map gets its own GeoJson
# element: add_to() returns the element itself, so chaining
# `.add_to(m_to).add_to(m_from)` re-parents one element and leaves it bound
# to the last map only.
for polygon in (location_1_polygon, location_2_polygon):
    for route_map in (m_to, m_from):
        folium.GeoJson(polygon).add_to(route_map)

<folium.features.GeoJson at 0x7fe9403d1ff0>