In [1]:
import time
import json
import ast
import os
import datetime
import io
from collections import defaultdict

import imageio
import boto3
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
import seaborn as sns

from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from timezonefinder import TimezoneFinderL
import pytz

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
from aletheia_dataset_creator.config.dataset_config import LEFT_CAMERAS, ALL_CAMERA_PAIRS_LIST
%matplotlib inline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Let pandas show up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

In [3]:
# Local directory that the cells below use for cached query results.
home = os.path.expanduser('~')
data_path = home + '/data/get_dust_data'

# Shared clients / lookup helpers used by the cells below.
athena = AthenaClient()
s3 = boto3.resource('s3')
tf = TimezoneFinderL()

In [4]:
import os
# Display the AWS profile selected by this kernel's environment (None when unset).
os.environ.get('AWS_PROFILE')

'jupiter_prod'

In [5]:
def get_calibration(x):
    """Parse text holding a Python literal, falling back to an empty dict.

    Args:
        x: Text to hand to ``ast.literal_eval`` (e.g. the repr of a dict).

    Returns:
        The evaluated literal, or ``{}`` when ``x`` cannot be parsed.
    """
    try:
        return ast.literal_eval(x)
    # Only the errors literal_eval raises for malformed input are swallowed; a
    # bare ``except`` would also hide KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return {}
def get_adjusted_timezone(timestamp, latitude, longitude):
    """Convert a naive UTC timestamp to local time at the given coordinates.

    Args:
        timestamp: Naive (tz-unaware) UTC time as ``np.datetime64``,
            ``pd.Timestamp`` or ``datetime.datetime``.
        latitude: Value passed to ``tf.timezone_at`` as ``lat``.
        longitude: Value passed to ``tf.timezone_at`` as ``lng``.

    Returns:
        A timezone-aware ``datetime.datetime`` in the zone found for the
        coordinates, or ``np.nan`` when either coordinate equals 0 or no zone
        could be resolved.

    Raises:
        ValueError: If ``timestamp`` already carries timezone information
            (raised by ``pytz.utc.localize``).
    """
    # A coordinate of exactly 0 is rejected up front
    # (presumably a placeholder for a missing GPS fix -- TODO confirm).
    if (latitude == 0) or (longitude == 0):
        return np.nan

    # Always work on a pandas Timestamp: its astimezone() accepts the zone *name*
    # returned by timezone_at() and it provides to_pydatetime(); a plain
    # datetime.datetime supports neither, so it used to fail here.
    timestamp = pd.Timestamp(timestamp)

    # Localize and adjust UTC timestamps to local timezone
    utc = pytz.utc.localize(timestamp)
    tz = tf.timezone_at(lat=latitude, lng=longitude)
    if tz is None:
        # Converting with tz=None would strip the zone and silently hand back
        # naive UTC as if it were local time.
        return np.nan

    return utc.astimezone(tz).to_pydatetime()


# Selecting data

In [13]:
# Load the cached hitchhiker image metadata; on a cache miss, query Athena and
# store the result so later runs can skip the query.
halo_cache_path = data_path + '/hh_df.parquet'
try:
    halo_df = pd.read_parquet(halo_cache_path)
except FileNotFoundError:
    print("file not found")
    # Create the cache directory before the slow query so the result cannot be
    # lost to a missing-directory error when it is written.
    os.makedirs(data_path, exist_ok=True)
    query = """
    SELECT id, hard_drive_name, robot_name, collected_on,
        bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, sensor_type, created_at, 
        bundle, gps_can_data__json, weather_summary__json, group_id
    FROM image_jupiter 
    WHERE LENGTH(robot_name) = 14 AND robot_name LIKE 'hitchhiker_1%'
    """
    # Disabled extra filter: --AND camera_location IN {left_tractor_cameras}
    start = time.time()
    halo_df = athena.get_df(query)
    end = time.time()
    print(end - start)  # query duration in seconds
    halo_df.to_parquet(halo_cache_path, index=False)

file not found


KeyboardInterrupt: 

In [6]:
# Pull image metadata for three specific hard drives straight from Athena
# (no local cache is used for this query).
query = """SELECT id, hard_drive_name, robot_name, collected_on,
    bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, sensor_type, created_at, 
    bundle, gps_can_data__json, weather_summary__json, group_id
FROM image_jupiter 
WHERE "hard_drive_name" IN ('JUPD-172_2023-6-28', 'JUPD-173_2023-6-28', 'JUPD-174_2023-6-28')
"""
# Time the query so the printed value shows its cost in seconds.
start = time.time()
df = athena.get_df(query)
end = time.time()
print(end - start)

29.028175592422485


In [None]:
# Load the cached rev1 image metadata; on a cache miss, query Athena and store
# the result at the same path the cache is read from.
rev1_cache_path = home + '/data/all_hitchiker_images/rev1_df.parquet'
try:
    rev1_df = pd.read_parquet(rev1_cache_path)
except FileNotFoundError:
    print("file not found")
    # Create the cache directory before the slow query so the result cannot be
    # lost to a missing-directory error when it is written.
    os.makedirs(os.path.dirname(rev1_cache_path), exist_ok=True)
    # NOTE(review): SUBSTR(robot_name, 7, 9) asks for up to 9 characters from
    # position 7; on a 9-character name that is the last three characters.
    query = """
    SELECT id, hard_drive_name, robot_name, collected_on,
        bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, sensor_type, created_at, 
        bundle, gps_can_data__json, weather_summary__json, group_id
    FROM image_jupiter 
    WHERE LENGTH(robot_name) = 9 AND SUBSTR(robot_name, 7, 9) IN ('619', '646', '708', '710', '733', '735', '750', '756', '768', '799', '812', '817', '842', '869', '872', '909') AND "collected_on" BETWEEN TIMESTAMP'2023-03-15 0:00:00' AND TIMESTAMP'2023-05-15 0:00:00'
    """
    # Disabled extra filter: --AND camera_location IN {left_tractor_cameras}
    start = time.time()
    rev1_df = athena.get_df(query)
    end = time.time()
    print(end - start)  # query duration in seconds
    # The original wrote to `data_dir + ...`, but `data_dir` is never defined,
    # so the cell raised NameError only after the query had finished.
    rev1_df.to_parquet(rev1_cache_path, index=False)

In [7]:
# Disabled: merge rev1_df and halo_df, then drop rows carrying the
# '7zzzzzzzzzzz' geohash (labelled "invalid GPS" below).
# df = pd.concat([rev1_df, halo_df], ignore_index=True)
# print(set(df.camera_location))
# # drop invalid GPS
# df = df[(df.geohash != '7zzzzzzzzzzz')].copy()
# print(set(df.camera_location))
# Frame size before the speed filter is applied.
print(df.shape)
# Drop rows that have no usable speed data.
def valid_speed(x):
    """Tell whether a GPS/CAN JSON payload carries a plausible speed.

    Args:
        x: JSON text expected to decode to an object with a ``speed`` entry.

    Returns:
        bool: True only when ``speed`` is present, numeric and strictly between
        -200 and 200. Missing cells, undecodable JSON, non-object payloads,
        null or non-numeric speeds, and NaN all yield False instead of raising.
    """
    try:
        load = json.loads(x)
    except (TypeError, ValueError):
        # None/NaN cells raise TypeError; malformed JSON raises
        # json.JSONDecodeError, which is a ValueError subclass.
        return False
    if not isinstance(load, dict):
        return False
    speed = load.get('speed')
    if not isinstance(speed, (int, float)):
        return False
    # NaN fails both comparisons, so it is rejected along with impossible values.
    return 200 > speed > -200
# Keep only the rows whose GPS/CAN payload passes valid_speed, then show the new size.
has_valid_speed = df['gps_can_data__json'].apply(valid_speed)
df = df[has_valid_speed]
print(df.shape)

(602995, 17)
(602007, 17)


In [8]:
def get_day(collect_str):
    """Format a collection timestamp as zero-padded ``MM/DD`` (no year)."""
    return pd.Timestamp(collect_str).strftime("%m/%d")
def get_minute(collect_str):
    """Format a collection timestamp as ``MM/DD HH:MM`` (seconds dropped, no year)."""
    return pd.Timestamp(collect_str).strftime("%m/%d %H:%M")

# Derive the day / minute grouping keys from the collection timestamp.
collected_on = df['collected_on']
df['day'] = collected_on.map(get_day)
df['minute'] = collected_on.map(get_minute)
# Pull the 'speed' entry out of each GPS/CAN JSON payload.
df['speed_kph'] = df['gps_can_data__json'].map(lambda payload: json.loads(payload)['speed'])
# df.to_parquet('/home/alexli/data/all_hitchiker_images/full_df.parquet', index=False)

In [11]:
# Persist the filtered, annotated frame next to the other cached query results.
df.to_parquet(data_path + '/hhh.parquet', index=False)

In [9]:
# Flatten every pair mapping into a single lookup that works in both directions.
# NOTE(review): a camera that appears in more than one pair mapping keeps only
# the partner assigned last, because later entries overwrite earlier ones.
bidirectional_dict = {}
for pair_dict in ALL_CAMERA_PAIRS_LIST:
    for camera, partner in pair_dict.items():
        bidirectional_dict.update({camera: partner, partner: camera})

def make_dataset(from_df, name, description, pairs=None) -> None:
    """Write ``from_df`` to ``<data_path>/<name>.parquet``.

    Despite the name, no dataset is registered: the dataset-creation calls at
    the bottom are commented out, so ``description`` and ``pairs`` are accepted
    but currently unused.

    Args:
        from_df: DataFrame with an ``id`` column.
        name: File stem of the parquet file to write.
        description: Dataset description (only used by the disabled code).
        pairs: List of camera-pair mappings (only used by the disabled code).
            Defaults to ``[bidirectional_dict]``.
    """
    if pairs is None:
        # Resolved per call: a list default in the signature is built once at
        # definition time and would keep pointing at the old dict after the
        # bidirectional_dict cell is re-run.
        pairs = [bidirectional_dict]
    # Only consumed by the disabled code below; also fails fast if 'id' is missing.
    imids = list(from_df['id'])
    # print(len(imids))
    from_df.to_parquet(data_path + f'/{name}.parquet', index=False)
    # desc = f"{description} ({len(from_df['id'])} images)"
    # imageids_to_dataset_fast(from_df, name, desc,
    #                          camera_pairs_list=pairs, camera_pair_df=df)
    # Dataset.create(
    #     name=name,
    #     description=desc,
    #     kind=Dataset.KIND_IMAGE,
    #     image_ids=imids,
    # )

def make_dataset_slow(from_df, name, description) -> None:
    """Save ``from_df`` as parquet and register its image ids as an image dataset.

    Args:
        from_df: DataFrame with an ``id`` column holding the image ids.
        name: Dataset name; also the file stem of the parquet copy.
        description: Dataset description; the image count is appended to it.
    """
    imids = list(from_df['id'])
    desc = f"{description} ({len(imids)} images)"
    print(len(imids))
    # Resolve the output directory against the current user's home instead of
    # the hard-coded /home/alexli, and make sure it exists before writing.
    out_dir = os.path.join(os.path.expanduser('~'), 'data', 'all_hitchiker_images')
    os.makedirs(out_dir, exist_ok=True)
    from_df.to_parquet(os.path.join(out_dir, f'{name}.parquet'), index=False)
    imageids_to_dataset(imids, name, dataset_kind='image',
                        dataset_description=desc)


SyntaxError: unterminated string literal (detected at line 10) (1765102717.py, line 10)

# Sample a bunch of random images

In [None]:
# Shuffle all rows. The fixed seed keeps the shuffle, and the per-group head()
# sample taken from it, reproducible after a kernel restart.
df = df.sample(frac=1, random_state=42)

In [None]:
# Keep at most four rows per (robot, camera, minute) group.
group_keys = ['robot_name', 'camera_location', 'minute']
stratified_df = df.groupby(group_keys).head(4)
print(len(stratified_df))

70486


In [None]:
# Distinct camera locations present in the stratified sample.
set(stratified_df['camera_location'])

{'I01',
 'I02',
 'I03',
 'I04',
 'I05',
 'I06',
 'I07',
 'I08',
 'T01',
 'T02',
 'T03',
 'T04',
 'T05',
 'T06',
 'T07',
 'T08',
 'T09',
 'T10',
 'T11',
 'T12',
 'T13',
 'T14',
 'T15',
 'T16',
 'front-center-left',
 'front-center-right',
 'front-left-left',
 'front-left-right',
 'front-right-left',
 'front-right-right'}

In [20]:
# stratified_df_tiny = stratified_df.groupby(['camera_location']).head(1)
# make_dataset(stratified_df, "hhh_field_data_stratified", description="first 3 hard drives from the field")

In [None]:
# stratified_df_tiny = stratified_df.groupby(['camera_location']).head(1)
# make_dataset(stratified_df, "Spring hitchhiker random", description="Random hitchhiker images from the spring", pairs=ALL_CAMERA_PAIRS_LIST)
# make_dataset_slow(stratified_df, "Stratefied ", description="test only the left cameras imageids")
# NOTE(review): every dataset-creation call above is commented out, so this
# message prints even though no dataset is created by this cell.
print("DONE MADE DATASET")

In [None]:
# from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataframe
# from aletheia_dataset_creator.config.dataset_config import ALL_CAMERA_PAIRS_LIST, IMAGE_DATASET_COLS, LEFT_CAMERAS

# images = imageids_to_dataframe(
#                 stratified_df_tiny, fields=IMAGE_DATASET_COLS, camera_location=LEFT_CAMERAS
# )

In [37]:
# Display the camera locations that the imported dataset config lists in LEFT_CAMERAS.
LEFT_CAMERAS

['front-center-left',
 'front-left-left',
 'front-right-left',
 'side-left-left',
 'side-right-left',
 'rear-left',
 'T01',
 'T02',
 'T05',
 'T06',
 'T09',
 'T10',
 'T13',
 'T14',
 'I01',
 'I03',
 'I05',
 'I07']

In [23]:
# Display the configured camera-pair mappings (a list of dicts).
# Some cameras (e.g. 'T02') appear as keys in more than one mapping.
ALL_CAMERA_PAIRS_LIST

[{'front-center-left': 'front-center-right',
  'front-left-left': 'front-left-right',
  'front-right-left': 'front-right-right',
  'side-left-left': 'side-left-right',
  'side-right-left': 'side-right-right',
  'rear-left': 'rear-right',
  'front-center-right': 'front-center-left',
  'front-left-right': 'front-left-left',
  'front-right-right': 'front-right-left',
  'side-left-right': 'side-left-left',
  'side-right-right': 'side-right-left',
  'rear-right': 'rear-left'},
 {'T01': 'T03',
  'T02': 'T04',
  'T05': 'T07',
  'T06': 'T08',
  'T09': 'T11',
  'T10': 'T12',
  'T13': 'T15',
  'T14': 'T16'},
 {'T02': 'T03', 'T06': 'T07', 'T10': 'T11', 'T14': 'T15'},
 {'I01': 'I02', 'I03': 'I04', 'I05': 'I06', 'I07': 'I08'}]