In [1]:
import time
import json
import ast
import os
import datetime
import io
from collections import defaultdict

import imageio
import boto3
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
import seaborn as sns

from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from timezonefinder import TimezoneFinderL
import pytz

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
from aletheia_dataset_creator.config.dataset_config import LEFT_CAMERAS, ALL_CAMERA_PAIRS_LIST
%matplotlib inline

In [2]:
pd.set_option('display.max_rows', 500)

In [3]:
athena = AthenaClient()
s3 = boto3.resource('s3')
tf = TimezoneFinderL()
home = os.path.expanduser(path='~')
data_path = '/data/jupiter/alex.li/datasets/'

[31;1m2024-01-24 21:32:40,520 - APIRequestor - ERROR - API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[0mERROR:APIRequestor:API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}


# Selecting data

In [4]:
HALO_LEFT_CAMERAS = ['T01', 'T02', 'T05', 'T06', 'T09', 'T10', 'T13', 'T14', 'I01', 'I02']
allpath = data_path + "/halo_all.parquet"
if os.path.exists(allpath):
    df_all = pd.read_parquet(path=allpath)
else:
    query = f"""
        SELECT collected_on, id, robot_name, geohash, camera_location, operation_time, latitude,
            longitude, gps_can_data__json
        FROM image_jupiter
        WHERE sensor_type = 'VD6763'
        AND camera_location IN {tuple(HALO_LEFT_CAMERAS)}
        AND geohash IS NOT NULL
        AND geohash NOT LIKE '7zzzz%'
        AND gps_can_data__json IS NOT NULL
        ORDER BY RAND()
        LIMIT 1000000
    """
    df_all = athena.get_df(query)
    df_all.to_parquet(allpath)
orangepath = data_path + "/halo_orange_implement.parquet"
if os.path.exists(orangepath):
    df_orange = pd.read_parquet(orangepath)
else:
    print('cache failed')
    query1 = f"""
    SELECT id, robot_name, collected_on, operation_time,
        camera_location, gps_can_data__json, group_id, geohash
    FROM image_jupiter
    WHERE sensor_type = 'VD6763'
    AND camera_location IN {tuple(HALO_LEFT_CAMERAS)}
    AND geohash IS NOT NULL
    AND geohash NOT LIKE '7zzzz%'
    AND gps_can_data__json IS NOT NULL
    AND image_jupiter.robot_name IN ('halohitchhiker_182')
    ORDER BY RAND()
    LIMIT 30000
    """
    df_orange = athena.get_df(query1)
    df_orange.to_parquet(orangepath)

In [5]:
puddlepath = data_path + "/halo_puddle.parquet"
if os.path.exists(puddlepath):
    df_puddle = pd.read_parquet(puddlepath)
else:
    df_puddle = Dataset.retrieve(name='labelbox_import_puddle_slice').to_dataframe()
    df_puddle.to_parquet(puddlepath)
dustpath = data_path + "/halo_dust.parquet"
if os.path.exists(dustpath):
    df_dust = pd.read_parquet(dustpath)
else:
    df_dust = Dataset.retrieve(name='labelbox_import_dust_slice').to_dataframe()
    df_dust.to_parquet(dustpath)

In [7]:
geohash_df = pd.read_csv(filepath_or_buffer='/data/jupiter/alex.li/20231213_geohash_table_v6.csv', index_col="Unnamed: 0")
geohash_train_df = geohash_df[geohash_df['bucket'] == 'train']
new_geohashes = set()
def filter_df(df_orig):
    global new_geohashes
    df_orig["geohash_short"] = df_orig["geohash"].apply(lambda x: x[:6])
    if 'speed' not in df_orig.columns:
        if 'gps_can_data__json' in df_orig.columns:
            df_orig["speed"] = df_orig["gps_can_data__json"].apply(lambda x: json.loads(x).get('speed', np.nan))
        elif 'gps_can_data' in df_orig.columns:
            df_orig["speed"] = df_orig["gps_can_data"].apply(lambda x: x.get('speed', np.nan))
    df_atspeed = df_orig[(1 < df_orig["speed"]) & (df_orig["speed"] < 30)]

    new_geohashes = new_geohashes.union([geohash for geohash in set(df_atspeed["geohash_short"]) if geohash not in geohash_df.index])
    df_train = df_atspeed[df_atspeed['geohash_short'].isin(geohash_train_df.index)]
    return df_train

In [18]:
df_filt_all = filter_df(df_all)[['id', 'camera_location', 'robot_name','collected_on', 'speed', 'geohash_short']]
df_filt_orange = filter_df(df_orange)[['id', 'camera_location', 'robot_name','collected_on', 'speed', 'geohash_short']]
df_filt_puddle = filter_df(df_puddle)[['id', 'camera_location', 'robot_name','collected_on', 'speed', 'geohash_short']]
df_filt_dust = filter_df(df_dust)[['id', 'camera_location', 'robot_name','collected_on', 'speed', 'geohash_short']]
print(len(new_geohashes))   

159


In [19]:
print(len(df_filt_all))
df_filt_all = df_filt_all.sample(30000, replace=False)
print(len(df_filt_orange))
df_filt_all = df_filt_orange.sample(5000, replace=False)
print(len(df_filt_puddle))
df_filt_puddle = df_filt_puddle.sample(10000, replace=False)
print(len(df_filt_dust))
df_filt_dust = df_filt_dust.sample(10000, replace=False)

528674
17489
96491
46919


In [24]:
df = pd.concat([df_filt_all, df_filt_orange, df_filt_puddle, df_filt_dust])
df = df[df['camera_location'].isin(HALO_LEFT_CAMERAS)]
df['collected_on']  = pd.to_datetime(df['collected_on'])
print(len(df))

31487


In [25]:
df.groupby('camera_location').count()

Unnamed: 0_level_0,id,robot_name,collected_on,speed,geohash_short
camera_location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I01,1623,1623,1623,1623,1623
I02,1818,1818,1818,1818,1818
T01,735,735,735,735,735
T02,641,641,641,641,641
T05,1407,1407,1407,1407,1407
T06,860,860,860,860,860
T09,6214,6214,6214,6214,6214
T10,6171,6171,6171,6171,6171
T13,5958,5958,5958,5958,5958
T14,6060,6060,6060,6060,6060


In [26]:
# df = pd.read_csv('/data/jupiter/alex.li/wrong_label.csv')
# Dataset.create(name='halo_v61_to_relabel', description='images with incorrect label from v61 train set', kind=Dataset.KIND_IMAGE, image_ids=list(df['id']))

In [27]:
def make_dataset_slow(from_df, name, description) -> None:
    imids = list(from_df['id'])
    desc = f"{description} ({len(from_df['id'])} images)"
    print(len(imids))
    from_df.to_parquet(data_path + f'/{name}.parquet', index=False)
    imageids_to_dataset(imids, name, dataset_kind='image',
                            dataset_description=desc)
make_dataset_slow(df, 'halo_images_for_train_implement_dust_puddle_small', 'training images for halo, choosen based on recent fps. Needs to be filtered further...')

[31;1m2024-01-24 21:36:25,628 - APIRequestor - ERROR - API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[0mERROR:APIRequestor:API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}


31487
Running create dataset with many images, it will take some time, consider using the Dataset.create API directlyor the imageids_to_dataset_fast function




Preparing stereo dataframe for {'T01': 'T03', 'T02': 'T04', 'T05': 'T07', 'T06': 'T08', 'T09': 'T11', 'T10': 'T12', 'T13': 'T15', 'T14': 'T16', 'I01': 'I03', 'I02': 'I04'}...
Size of left dataframe: 31261


[31;1m2024-01-24 21:38:03,741 - APIRequestor - ERROR - API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[0mERROR:APIRequestor:API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[31;1m2024-01-24 21:38:13,187 - APIRequestor - ERROR - API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[0mERROR:APIRequestor:API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[31;1m2024-01-24 21:38:22,467 - APIRequestor - ERROR - API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_e

Size of stereo dataframe: 25773
Preparing stereo dataframe for {'T02': 'T03', 'T06': 'T07', 'T10': 'T11', 'T14': 'T15', 'I02': 'I03'}...
Size of left dataframe: 15436


[31;1m2024-01-24 21:40:22,691 - APIRequestor - ERROR - API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[0mERROR:APIRequestor:API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[31;1m2024-01-24 21:40:32,668 - APIRequestor - ERROR - API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[0mERROR:APIRequestor:API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_exchange is required to make this API call'}
[31;1m2024-01-24 21:40:41,564 - APIRequestor - ERROR - API error received | error_code : 403, error_message : {'extra': {'role_name': 'token_exchange'}, 'message': 'Role token_e

Size of stereo dataframe: 12942
Sending 64634 image ids for creating dataset
Time taken to prepare data for dataset creation job: 5.10 mins
