In [1]:
import time
import json
import ast
import os
import datetime
import io
from collections import defaultdict

import imageio
import boto3
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
import seaborn as sns

from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from timezonefinder import TimezoneFinderL
import pytz

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
from aletheia_dataset_creator.config.dataset_config import LEFT_CAMERAS, ALL_CAMERA_PAIRS_LIST
%matplotlib inline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Allow up to 500 rows before pandas truncates a displayed frame/series.
pd.options.display.max_rows = 500

In [3]:
# Shared service handles used by later cells.
athena = AthenaClient()    # queried by the cached data-loading cell
s3 = boto3.resource('s3')
tf = TimezoneFinderL()     # looked up by get_adjusted_timezone

# Local directory for cached parquet files: <home>/data
home = os.path.expanduser('~')
data_path = f"{home}/data"

In [4]:
import os
# Display the AWS profile name the kernel's environment is configured with.
os.environ.get('AWS_PROFILE')

'jupiter_prod'

In [5]:
def get_calibration(x):
    """Parse a Python-literal string, falling back to an empty dict.

    Args:
        x: Text containing a Python literal (dict, list, number, ...).

    Returns:
        The evaluated literal, or ``{}`` when ``x`` cannot be parsed
        (malformed text, ``None``, NaN, ...).
    """
    try:
        return ast.literal_eval(x)
    # Only the failures literal_eval documents for malformed input are
    # swallowed; a bare ``except`` would also hide KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {}
def get_adjusted_timezone(timestamp, latitude, longitude):
    """Convert a UTC timestamp to the local time at the given coordinates.

    Args:
        timestamp: A ``datetime``, ``np.datetime64``, ``pd.Timestamp`` or any
            other scalar ``pd.to_datetime`` accepts. Naive values are taken
            to be UTC; tz-aware values are converted to UTC first.
        latitude: Latitude passed to the timezone finder.
        longitude: Longitude passed to the timezone finder.

    Returns:
        A tz-aware ``datetime.datetime`` in the local timezone, or ``np.nan``
        when either coordinate is exactly 0 or no timezone name is found.
    """
    # A coordinate of exactly 0 is treated as "no position available".
    if (latitude == 0) or (longitude == 0):
        return np.nan

    # Always work on a pandas Timestamp: its tz_convert accepts the timezone
    # *name* returned by the finder, which datetime.astimezone does not.
    ts = pd.to_datetime(timestamp)
    ts = ts.tz_localize("UTC") if ts.tzinfo is None else ts.tz_convert("UTC")

    tz_name = tf.timezone_at(lat=latitude, lng=longitude)
    if tz_name is None:
        # Converting with tz=None would silently hand back a naive UTC value.
        return np.nan

    return ts.tz_convert(tz_name).to_pydatetime()
def valid_speed(x):
    """Return True when ``x`` is JSON holding a plausible ``speed`` value.

    Args:
        x: JSON text expected to decode to an object with a ``speed`` key.

    Returns:
        True only if the text decodes to a dict whose ``speed`` is a number
        strictly between -200 and 200. Missing input, malformed JSON,
        non-dict payloads, and null/non-numeric/NaN speeds all give False.
    """
    try:
        load = json.loads(x)
    except (TypeError, ValueError):
        # TypeError: x is None/NaN; ValueError covers json.JSONDecodeError.
        return False

    if not isinstance(load, dict):
        return False

    speed = load.get('speed')
    # NaN fails both comparisons, so it is rejected along with impossible values.
    return isinstance(speed, (int, float)) and 200 > speed > -200
def get_day(collect_str):
    """Render a timestamp-like value as ``YYYY/MM/DD``."""
    day_format = "%Y/%m/%d"
    return pd.Timestamp(collect_str).strftime(day_format)
def get_minute(collect_str) -> str:
    """Render a timestamp-like value as ``YYYY/MM/DD HH:MM`` (seconds dropped)."""
    minute_format = "%Y/%m/%d %H:%M"
    return pd.Timestamp(collect_str).strftime(minute_format)
def get_hour(collect_str) -> str:
    """Render a timestamp-like value as ``YYYY/MM/DD HH`` (minutes dropped)."""
    hour_format = "%Y/%m/%d %H"
    return pd.Timestamp(collect_str).strftime(hour_format)
def speed_discrete(speed):
    """Map a non-negative speed onto one of five bucket labels.

    Buckets (upper bounds inclusive):
        "A": 0 - 5, "B": 5 - 7.2, "C": 7.2 - 9.6, "D": 9.6 - 25, "E": above 25.

    Args:
        speed: Numeric speed; must be >= 0 and not NaN.

    Returns:
        The bucket label as a one-letter string.

    Raises:
        ValueError: If ``speed`` is negative or NaN. An explicit raise is used
            because ``assert`` statements are removed under ``python -O``,
            which would make the function silently return None.
    """
    # ``not speed >= 0`` is also True for NaN, which compares False to everything.
    if not speed >= 0:
        raise ValueError(f"speed must be a non-negative number, got {speed!r}")
    if speed <= 5:
        return "A"
    if speed <= 7.2:
        return "B"
    if speed <= 9.6:
        return "C"
    if speed <= 25:
        return "D"
    return "E"


# Selecting data

In [6]:
# try:
#     halo_df = pd.read_parquet(data_path + '/hh_df.parquet')
# except FileNotFoundError:
#     print("file not found")
#     query = f"""
#     SELECT id, hard_drive_name, robot_name, collected_on,
#         bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, sensor_type, created_at, 
#         bundle, gps_can_data__json, weather_summary__json, group_id
#     FROM image_jupiter 
#     # WHERE LENGTH(robot_name) = 14 AND robot_name LIKE 'hitchhiker_1%'
#     """
# #--AND camera_location IN {left_tractor_cameras}
#     start = time.time()
#     halo_df = athena.get_df(query)
#     end = time.time()
#     print(end - start)
#     halo_df.to_parquet(data_path + '/hh_df.parquet', index=False)
# try:
#     rev1_df = pd.read_parquet(home + '/data/all_hitchiker_images/rev1_df.parquet')
# except FileNotFoundError:
#     print("file not found")
#     query = f"""
#     SELECT id, hard_drive_name, robot_name, collected_on,
#         bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, sensor_type, created_at, 
#         bundle, gps_can_data__json, weather_summary__json, group_id
#     FROM image_jupiter 
#     WHERE LENGTH(robot_name) = 9 AND SUBSTR(robot_name, 7, 9) IN ('619', '646', '708', '710', '733', '735', '750', '756', '768', '799', '812', '817', '842', '869', '872', '909') AND "collected_on" BETWEEN TIMESTAMP'2023-03-15 0:00:00' AND TIMESTAMP'2023-05-15 0:00:00'
#     """
# #--AND camera_location IN {left_tractor_cameras}
#     start = time.time()
#     rev1_df = athena.get_df(query)
#     end = time.time()
#     print(end - start)
#     rev1_df.to_parquet(data_dir + 'all_hitchiker_images/rev1_df.parquet', index=False)

In [7]:
# query = """SELECT id, hard_drive_name, robot_name, collected_on,
#     bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, sensor_type, created_at, 
#     bundle, gps_can_data__json, weather_summary__json, group_id
# FROM image_jupiter 
# WHERE "hard_drive_name" IN ('JUPD-172_2023-6-28', 'JUPD-173_2023-6-28', 'JUPD-174_2023-6-28')
# """
# start = time.time()
# df = athena.get_df(query)
# end = time.time()
# print(end - start)
# Load the sampled image metadata from the local parquet cache, or rebuild it
# from Athena (random sample of up to 10M rows) and derive the helper columns.
ppath = data_path + "/lotta_data.parquet"
if os.path.exists(ppath):
    df = pd.read_parquet(ppath)
else:
    print('cache failed')
    # Make sure the cache directory exists *before* running the expensive
    # query, so the result cannot be lost to a failing to_parquet at the end.
    os.makedirs(data_path, exist_ok=True)
    query1 = """
    SELECT id, robot_name, collected_on, operation_time,
        camera_location, gps_can_data__json, group_id
    FROM image_jupiter
    ORDER BY RAND()
    LIMIT 10000000
    """
    df = athena.get_df(query1)
    # .copy() detaches the filtered rows so the column assignments below act
    # on an independent frame instead of a slice (no SettingWithCopyWarning).
    df = df[df['gps_can_data__json'].apply(valid_speed)].copy()
    df['day'] = df['collected_on'].map(get_day)
    df['minute'] = df['collected_on'].map(get_minute)
    df['hour'] = df['collected_on'].map(get_hour)
    df['speed_kph'] = df['gps_can_data__json'].map(lambda x:(json.loads(x)['speed']))
    df['speed_d'] = df['speed_kph'].apply(speed_discrete)
    df.to_parquet(ppath)

In [8]:
# Distinct operation_time values present in the loaded frame.
set(df['operation_time'])

{<NA>, 'dawn_dusk', 'daytime', 'error', 'nightime', 'unknown'}

In [1]:
# Flatten every camera-pair mapping into a single lookup usable in both
# directions (camera -> partner and partner -> camera). Entries are written in
# list order, so a camera that appears in more than one mapping keeps the
# pairing that was assigned last.
bidirectional_dict = {}
for camera_pairs in ALL_CAMERA_PAIRS_LIST:
    for cam_a, cam_b in camera_pairs.items():
        bidirectional_dict.update({cam_a: cam_b, cam_b: cam_a})

def make_dataset(from_df, name, description, pairs=None) -> None:
    """Save ``from_df`` locally and register its image ids as an image dataset.

    Args:
        from_df: DataFrame with an ``id`` column of image ids.
        name: Dataset name; also the stem of the parquet file written to
            ``data_path``.
        description: Free-text description; the image count is appended.
        pairs: Currently unused. Kept for backward compatibility; the default
            is ``None`` rather than a mutable list bound to a module global
            when the function is defined.
    """
    imids = list(from_df['id'])
    from_df.to_parquet(data_path + f'/{name}.parquet', index=False)
    desc = f"{description} ({len(imids)} images)"
    # Disabled alternative that also resolved camera pairs:
    # imageids_to_dataset_fast(from_df, name, desc,
    #                         camera_pairs_list=pairs, camera_pair_df=df)
    Dataset.create(
        name=name,
        description=desc,
        kind=Dataset.KIND_IMAGE,
        image_ids=imids,
    )

def make_dataset_slow(from_df, name, description) -> None:
    """Print the row count of ``from_df`` and save it as ``<data_path>/<name>.parquet``.

    No dataset is registered here: the ``imageids_to_dataset`` call is
    commented out, so ``description`` only feeds the unused ``desc`` string.
    """
    image_ids = from_df['id'].tolist()
    image_count = len(image_ids)
    desc = f"{description} ({image_count} images)"  # only needed by the disabled call below
    print(image_count)
    from_df.to_parquet(f'{data_path}/{name}.parquet', index=False)
    # imageids_to_dataset(image_ids, name, dataset_kind='image',
    #                          dataset_description=desc)


NameError: name 'ALL_CAMERA_PAIRS_LIST' is not defined

# Sample a bunch of random images

In [13]:
# Drop rows whose discretised speed label equals 'D:25+'.
# NOTE(review): speed_discrete in this notebook emits 'A'..'E', so this only
# removes rows if the cached parquet was written with a different labelling
# scheme -- confirm which label is meant.
df = df.loc[df['speed_d'].ne('D:25+')]

In [14]:
def is_ok(row):
    """Classify a row by the prefix of its ``camera_location``.

    Args:
        row: Mapping/Series exposing a ``camera_location`` entry.

    Returns:
        ``False`` when the location is missing, empty, or starts with ``'V'``
        or ``'fi'``; ``'halo'`` when it starts with ``'I'`` or ``'T'``;
        ``'rev1'`` for every other location.
    """
    location = row['camera_location']
    # Missing or empty locations cannot be classified; indexing [0] on them
    # would raise instead of excluding the row.
    if not isinstance(location, str) or not location:
        return False
    if location.startswith(('V', 'fi')):
        return False
    return 'halo' if location.startswith(('I', 'T')) else 'rev1'
# Row-wise classification; each entry is whatever is_ok returns for that row.
df['ok'] = df.apply(is_ok, axis='columns')

In [15]:
# Preview the per-row classification stored in the 'ok' column.
df['ok']

0          rev1
2          rev1
3          rev1
4          rev1
6          rev1
           ... 
9999993    rev1
9999994    rev1
9999995    rev1
9999996    rev1
9999999    rev1
Name: ok, Length: 9073936, dtype: object

In [16]:
# Keep rows whose operation_time is one of the three listed labels and whose
# robot_name is present.
known_times = ['nightime', 'daytime', 'dawn_dusk']
df = df[df['operation_time'].isin(known_times) & df['robot_name'].notna()]

In [17]:
# Rows classified as 'rev1', shuffled. The shuffle order decides which rows a
# later groupby(...).head(n) keeps, so the seed is fixed to make the selection
# reproducible across kernel restarts.
df_rev1 = df[df['ok'] == 'rev1'].sample(frac=1, random_state=0)

In [18]:
# Rows classified as 'halo' (kept under the rev2 name), shuffled. The shuffle
# order decides which rows a later groupby(...).head(n) keeps, so the seed is
# fixed to make the selection reproducible across kernel restarts.
df_rev2 = df[df['ok'] == 'halo'].sample(frac=1, random_state=0)

In [19]:
# NOTE: importing rich's print rebinds ``print`` for every cell run after this one.
from rich import pretty, print

pretty.install()

# Row counts of both subsets.
for frame in (df_rev1, df_rev2):
    print(len(frame))

# Distinct robots, then distinct cameras, for each subset.
for column in ('robot_name', 'camera_location'):
    for frame in (df_rev1, df_rev2):
        print(set(frame[column]))

# print(set(df.day))
# Distinct speed buckets and operation times seen in the rev1 subset.
for column in ('speed_d', 'operation_time'):
    print(set(df_rev1[column]))

In [20]:
# Keep the first row of every (robot, camera, day, speed bucket, operation time) group.
strata_keys = ['robot_name', 'camera_location', 'day', 'speed_d', 'operation_time']
stratified_df = df.groupby(strata_keys).head(1)
print(f"{len(stratified_df) / 1000}k")

In [55]:
# rev1: first row of every (robot, camera, day, speed bucket, operation time) group.
stratified_df_rev1 = df_rev1.groupby(['robot_name', 'camera_location', 'day', 'speed_d', 'operation_time']).head(1)
print(f"{len(stratified_df_rev1) / 1000}k")
# rev2: grouped per hour instead of per day, keeping up to 7 rows per group.
stratified_df_rev2 = df_rev2.groupby(['robot_name', 'camera_location', 'hour', 'speed_d', 'operation_time']).head(7)
print(f"{len(stratified_df_rev2) / 1000}k")
# Down-sample each subset to at most 50,000 rows. Capping n avoids the
# ValueError sample() raises when fewer rows are available, and the fixed seed
# makes the selection reproducible.
stratified_df_rev1 = stratified_df_rev1.sample(n=min(50000, len(stratified_df_rev1)), random_state=0)
stratified_df_rev2 = stratified_df_rev2.sample(n=min(50000, len(stratified_df_rev2)), random_state=0)

In [57]:
# stratified_df_tiny = stratified_df.groupby(['camera_location']).head(1)
# Persist both stratified samples via make_dataset_slow.
dataset_specs = [
    (stratified_df_rev1, "rev1_data_stratified", "Randomly selected data from rev1"),
    (stratified_df_rev2, "rev2_data_stratified", "Randomly selected data from rev2"),
]
for frame, dataset_name, dataset_description in dataset_specs:
    make_dataset_slow(frame, dataset_name, description=dataset_description)
print("DONE MADE DATASET")

In [None]:
# from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataframe
# from aletheia_dataset_creator.config.dataset_config import ALL_CAMERA_PAIRS_LIST, IMAGE_DATASET_COLS, LEFT_CAMERAS

# images = imageids_to_dataframe(
#                 stratified_df_tiny, fields=IMAGE_DATASET_COLS, camera_location=LEFT_CAMERAS
# )

In [None]:
# Show the LEFT_CAMERAS list imported from aletheia_dataset_creator's dataset_config.
LEFT_CAMERAS

['front-center-left',
 'front-left-left',
 'front-right-left',
 'side-left-left',
 'side-right-left',
 'rear-left',
 'T01',
 'T02',
 'T05',
 'T06',
 'T09',
 'T10',
 'T13',
 'T14',
 'I01',
 'I03',
 'I05',
 'I07']

In [None]:
# Show the ALL_CAMERA_PAIRS_LIST mappings imported from aletheia_dataset_creator's dataset_config.
ALL_CAMERA_PAIRS_LIST

[{'front-center-left': 'front-center-right',
  'front-left-left': 'front-left-right',
  'front-right-left': 'front-right-right',
  'side-left-left': 'side-left-right',
  'side-right-left': 'side-right-right',
  'rear-left': 'rear-right',
  'front-center-right': 'front-center-left',
  'front-left-right': 'front-left-left',
  'front-right-right': 'front-right-left',
  'side-left-right': 'side-left-left',
  'side-right-right': 'side-right-left',
  'rear-right': 'rear-left'},
 {'T01': 'T03',
  'T02': 'T04',
  'T05': 'T07',
  'T06': 'T08',
  'T09': 'T11',
  'T10': 'T12',
  'T13': 'T15',
  'T14': 'T16'},
 {'T02': 'T03', 'T06': 'T07', 'T10': 'T11', 'T14': 'T15'},
 {'I01': 'I02', 'I03': 'I04', 'I05': 'I06', 'I07': 'I08'}]