In [1]:
import time
import json
import ast
import os
import datetime
import io
from collections import defaultdict
from tqdm import tqdm

import imageio
import boto3
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
import seaborn as sns

from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from timezonefinder import TimezoneFinderL
import pytz
import cv2
from brtdevkit.util.aws.s3 import S3
client = S3()

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
from aletheia_dataset_creator.config.dataset_config import LEFT_CAMERAS, ALL_CAMERA_PAIRS_LIST
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide configuration: pandas display options, service clients and the
# local directory used for parquet caches, downloaded images and videos.
pd.set_option('display.max_rows', 500)
athena = AthenaClient()
s3 = boto3.resource('s3')
tf = TimezoneFinderL()
from pathlib import Path
home = Path(os.path.expanduser('~'))
data_path = home / 'data'
# Later cells write into data_path (e.g. `to_parquet(data_path / ...)`); create it
# up front so a fresh machine does not fail on a missing directory.
data_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the debayered-RGB image rows for the three sequence hard drives, restricted to
# the rear-left / side-left-left / side-right-left cameras. The Athena result is cached
# as parquet so that re-running the cell does not query again.
sequences_cache = data_path / 'df_sequences.parquet'
if not sequences_cache.exists():
    print("Cache miss")
    query = """SELECT ij.id, hard_drive_name, robot_name, collected_on,
        bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, 
        bundle, group_id, s3_bucket, s3_key, special_notes, image_artifact_jupiter.kind
    FROM image_jupiter AS ij
    JOIN "image_artifact_jupiter" ON ij."id" = "image_artifact_jupiter"."image"
    WHERE "hard_drive_name" IN ('JUPD-004_2023-7-19', 'JUPD-006_2023-7-19', 'JUPD-007_2023-7-11') AND image_artifact_jupiter.kind = 'debayeredrgb' AND camera_location IN ('rear-left', 'side-left-left', 'side-right-left')
    """
    df_sequences: pd.DataFrame = athena.get_df(query) # type: ignore
    df_sequences.to_parquet(sequences_cache)
else:
    df_sequences = pd.read_parquet(sequences_cache)
# Keep the image id both as a regular column and as the index.
df_sequences = df_sequences.assign(image_id=df_sequences['id']).set_index('id')

In [4]:
# Load the debayered-RGB image rows of hard drive JUPD-054_2023-6-13 for every camera
# whose location ends in 'left'. The Athena result is cached as parquet.
dusty_cache = data_path / 'df_dusty.parquet'
if not dusty_cache.exists():
    print("Cache miss")
    query = """SELECT ij.id, hard_drive_name, robot_name, collected_on,
        bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, 
        bundle, group_id, s3_bucket, s3_key, special_notes
    FROM image_jupiter AS ij
    JOIN "image_artifact_jupiter" ON ij."id" = "image_artifact_jupiter"."image"
    WHERE "hard_drive_name" IN ('JUPD-054_2023-6-13') AND image_artifact_jupiter.kind = 'debayeredrgb' AND camera_location LIKE '%left'
    """
    df_dusty: pd.DataFrame = athena.get_df(query) # type: ignore
    df_dusty.to_parquet(dusty_cache)
else:
    df_dusty = pd.read_parquet(dusty_cache)
# Keep the image id both as a regular column and as the index.
df_dusty = df_dusty.assign(image_id=df_dusty['id']).set_index('id')

# Look through sequences

In [5]:
from typing import Hashable
# Map every distinct special_notes value to the row labels that carry it.
df_groups_orig: dict[Hashable, list[Hashable]] = df_sequences.groupby('special_notes').groups
df_index_orig = set(df_groups_orig.keys())

# Rows with these notes are collected into one frame (in this note order) so that a
# later cell can split them into sequences by timestamp.
notes_to_merge = [
    'vehicle in dust time dawn/dusk',
    'vehicle in dust day time ',
    'vehicle in dust Day',
    'vehicle dust dusk',
]
merged_runs = pd.concat(
    [df_sequences[df_sequences['special_notes'] == note] for note in notes_to_merge]
)

# These notes are dropped from the set of valid notes; `remove` raises KeyError if a
# note is not present in the data.
notes_to_exclude = [
    '6508 IQ-test-1',
    '6524 IQ-Test-1',
    '6524 IQ-Test-2',
    'IQ-image to bright',
    'dust right side',
]
for note in notes_to_exclude:
    df_index_orig.remove(note)

valid_mask = df_sequences['special_notes'].isin(df_index_orig)
df_sequences_valid = df_sequences[valid_mask].copy()
# rebuild the index and groups
df_groups = df_sequences_valid.groupby('special_notes').groups
df_index = set(df_groups.keys())

In [6]:
def get_run_id(df_row):
    """Parse the run number from the last space-separated token of ``special_notes``.

    Args:
        df_row: mapping-like row providing a string under the 'special_notes' key.

    Returns:
        The trailing token converted to ``int``, or ``pd.NA`` when that token is
        not an integer literal.
    """
    last_token = df_row['special_notes'].split(" ")[-1]
    try:
        run_number = int(last_token)
    except ValueError:
        run_number = pd.NA
    return run_number
# Derive a run id for every valid row from its notes; notes that do not end in an
# integer yield <NA>.
df_sequences_valid['run_id'] = df_sequences_valid.apply(get_run_id, axis=1)
# Print the distinct ids to see which numbers are already taken.
print(set(df_sequences_valid['run_id']))

{28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, <NA>}


In [8]:
# Split `merged_runs` into sequences wherever two consecutive timestamps are more than
# `interval` seconds apart, and give each sequence a run id that is not already used
# in df_sequences_valid.
# NOTE: this import rebinds the name `datetime` from the module (imported in the first
# cell) to the class.
from datetime import datetime, timedelta
interval = 5

# Ids already parsed from the notes (the set also contains <NA>).
known_ids = set(df_sequences_valid['run_id'])
total_sequences = 0
merged_runs = merged_runs.sort_values('collected_on')
# merged_runs['c'] = merged_runs.collected_on.apply(datetime.fromisoformat)
delta = timedelta(seconds=interval)
# Start of the sequence currently being accumulated.
start_t = merged_runs.iloc[0].collected_on
for i in range(1, len(merged_runs)):
    end_t = merged_runs.iloc[i - 1].collected_on
    next_t = merged_runs.iloc[i].collected_on
    # Close the current sequence on a time gap, or when the last row is reached.
    if next_t - end_t > delta or i == len(merged_runs) - 1:
        if i == len(merged_runs) - 1:
            # Nudge the exclusive upper bound so the final row falls inside the range.
            # NOTE(review): if the final row is itself separated from the previous row
            # by a gap, it is still folded into the previous sequence -- confirm this
            # is acceptable.
            next_t += timedelta(microseconds=1)
        # Advance the counter to the next id that is neither known nor handed out.
        known_ids.add(total_sequences)
        while total_sequences in known_ids:
            total_sequences += 1
        merged_runs.loc[(start_t <= merged_runs['collected_on']) & (merged_runs['collected_on'] < next_t), 'run_id'] = total_sequences
        start_t = next_t

In [9]:
# Final run id per valid row: the id assigned in `merged_runs` when the row is part of
# it, otherwise the integer at the end of the row's notes.
resolved_run_ids = []
for image_id, sequence_row in df_sequences_valid.iterrows():
    try:
        resolved = merged_runs.loc[image_id]['run_id']
    except KeyError:
        resolved = int(sequence_row['special_notes'].split(" ")[-1])
    resolved_run_ids.append(resolved)

df_sequences_valid['run_id'] = resolved_run_ids

In [10]:
# Should be 72?
# NOTE: the expression below is not the last statement of the cell, so its result
# (the rows whose run_id is still missing) is computed but never displayed.
df_sequences_valid[df_sequences_valid['run_id'].isna()]
# Value of the run-id counter after the sequence-splitting cell.
print(total_sequences)

87


In [11]:
def get_image(df_row, collected_on: str, folder_name: str):
    """Return the image for one camera of a frame group, downloading it if needed.

    Args:
        df_row: the row(s) selected for one camera. An empty selection produces a
            placeholder; a one-row DataFrame is unwrapped to its single row;
            otherwise it is used as a row with `image_id`, `s3_bucket` and `s3_key`.
        collected_on: text drawn onto the placeholder frame.
        folder_name: sub-folder of `data_path` in which the PNG is stored.

    Returns:
        The array returned by `cv2.imread`, or a white 604x964x3 uint8 frame with
        `collected_on` written on it when `df_row` is empty.

    Raises:
        AssertionError: if `df_row` is a DataFrame with more than one row.
        OSError: if the file cannot be decoded by `cv2.imread`.
    """
    if len(df_row) == 0:
        # No image for this camera in the group: white frame labelled with the time.
        placeholder = 255 * np.ones((604, 964, 3), np.uint8)
        return cv2.putText(placeholder, collected_on, (50, 400), cv2.FONT_HERSHEY_PLAIN, 5, (0, 0, 0), 5)
    if isinstance(df_row, pd.DataFrame):
        assert len(df_row) == 1
        df_row = df_row.iloc[0]
    file_name = Path(data_path) / folder_name / (str(df_row.image_id) + '.png')
    if not file_name.exists():
        # The target folder may not exist yet when this is the first download into it.
        file_name.parent.mkdir(parents=True, exist_ok=True)
        client.download_file(df_row['s3_bucket'], df_row['s3_key'], file_name)
    im = cv2.imread(str(file_name))
    if im is None:
        # cv2.imread signals failure by returning None; fail here with the path
        # instead of letting a later concatenate raise an unrelated error.
        raise OSError(f"Could not read image file {file_name}")
    return im
    

In [12]:
def create_video_frames(file_prefix: str, base_df: pd.DataFrame, folder_name: str):
    """
    Render one mosaic frame per image group of `base_df` and encode them as a video.

    For every distinct `group_id` (visited in `collected_on` order) a 2x3 mosaic is
    built: front-left-left / front-center-left / front-right-left on top and
    side-left-left / rear-left / side-right-left below. The mosaic is subsampled by 4
    in both image dimensions, its channel order is reversed, it is saved as a PNG
    under `<data_path>/videos/<file_prefix>/` and appended to `video.mp4` (1 fps).

    Does nothing when `video.mp4` already exists.

    Args:
        file_prefix: name of the output folder below `<data_path>/videos`.
        base_df: rows with `collected_on`, `group_id` and `camera_location` columns
            plus whatever `get_image` reads.
        folder_name: sub-folder of `data_path` that holds the source images; passed
            through to `get_image`.
    """
    video_dir = Path(data_path) / 'videos' / str(file_prefix)
    os.makedirs(video_dir, exist_ok=True)
    video_name = video_dir / "video.mp4"
    if os.path.exists(video_name):
        return
    k_df = base_df.sort_values('collected_on')
    k_groups = base_df.groupby('group_id').groups
    seen = set()
    print(len(k_df))
    writer = imageio.get_writer(video_name, fps=1)
    try:
        for _, row in tqdm(k_df.iterrows(), total=len(k_df)):
            gid = row['group_id']
            if gid in seen:
                # Every row of a group yields the same mosaic; render it only once.
                continue
            seen.add(gid)
            group = k_df.loc[k_groups[gid]]
            # Slice [11:19] of the timestamp's string form (HH:MM:SS for an
            # ISO-like 'YYYY-MM-DD HH:MM:SS...' representation).
            collected_on_str = str(group.iloc[0].collected_on)[11:19]
            front_pod = _concat_camera_row(
                group,
                ('front-left-left', 'front-center-left', 'front-right-left'),
                collected_on_str,
                folder_name,
            )
            rear_pod = _concat_camera_row(
                group,
                ('side-left-left', 'rear-left', 'side-right-left'),
                collected_on_str,
                folder_name,
            )
            # Stack the two rows, subsample by 4 and reverse the channel order.
            all_cameras = np.concatenate((front_pod, rear_pod), axis=0)[::4, ::4, ::-1]
            # Save the mosaic and feed the saved file to the video writer.
            file_path = os.path.join(video_dir, f"{collected_on_str}.png")
            plt.imsave(file_path, all_cameras)
            writer.append_data(imageio.imread(file_path))
    finally:
        # Always release the writer, even when a frame fails, so the file handle
        # is not leaked.
        writer.close()


def _concat_camera_row(group, camera_locations, collected_on_str: str, folder_name: str):
    """Horizontally concatenate the images of `camera_locations` from one frame group."""
    return np.concatenate(
        [
            get_image(group[group['camera_location'] == location], collected_on_str, folder_name)
            for location in camera_locations
        ],
        axis=1,
    )


In [13]:
# For every run id: 1) download the run's images into <data_path>/<run id>/,
# 2) render the run's video from those images.
from tqdm import tqdm
for i in range(1, 1 + total_sequences):
    run_folder = str(int(i))
    folder_name = Path(data_path) / run_folder
    os.makedirs(folder_name, exist_ok=True)
    base_df = df_sequences_valid[df_sequences_valid['run_id'] == i]
    if base_df.empty:
        # No rows carry this id, so there is nothing to download or render.
        continue
    for image_id, df_row in base_df.iterrows():
        file_name = folder_name / str(image_id + '.png')
        if not os.path.exists(file_name):
            client.download_file(df_row['s3_bucket'], df_row['s3_key'], file_name)
    # Use the loop's own run id for the image folder. Reading it from the last
    # `df_row` would reuse the previous run's folder when a run has no rows.
    create_video_frames(run_folder, base_df, run_folder)

TokenRetrievalError: Error when retrieving token from sso: Token has expired and refresh failed

In [17]:
# Number of image rows that remain after dropping the excluded notes.
print(len(df_sequences_valid))

41545


In [18]:
# Map each run id to the index labels of the rows that belong to it.
runs_orig = df_sequences_valid.groupby('run_id').groups

In [19]:
# time cutoffs such that the video shows one pass of the vehicle moving forwards.
# Remove any time that the vehicle is lingering/ not moving
# All found by hand via observation of the downloaded videos
#
# Keys are run ids; each value is a list of (start, end) wall-clock times 'H:MM:SS'
# that `filter_movie` turns into an exclusive time window on the run's first day.
# An empty list means that no frame of the run is kept. Hours without a leading
# zero (e.g. '1:08:31') are fine because the fields are parsed with int().
from typing import Dict, List, Tuple
cutoff_dict: Dict[int, List[Tuple[str, str]]] = {
    6: [], # just lingers, then stops
    18: [],
    28: [],
    51: [],
    53: [],
    56: [],
    59: [], # car does not move forward the whole time, stays in dustcloud
    72: [], # yo its a human!
    85: [], # sequence is a bit cut off at the start

    # White SUV
    1: [('21:28:50', '21:29:03')],
    2: [('21:29:49', '21:30:04')],
    3: [('21:31:53', '21:32:16')],
    4: [('21:59:20', '21:59:41')], # hard
    5: [('22:00:10', '22:00:30')],
    7: [('22:15:30', '22:15:55')], # hard, lingers at the start in huge dust
    8: [('22:16:10', '22:16:25')],
    9: [('22:16:47', '22:17:00'), ('22:17:08', '22:17:25')], # does 2 runs, one each side
    10: [('22:21:13', '22:21:29')],
    11: [('22:21:47', '22:22:06')],
    12: [('22:22:44', '22:23:00')],
    13: [('22:23:32', '22:23:41')],
    14: [('22:29:12', '22:29:27')],
    15: [('22:29:57', '22:30:10')],
    16: [('22:30:35', '22:30:54')],
    17: [('22:31:38', '22:31:48')],
    19: [('11:28:30', '11:28:48')],
    20: [('11:33:18', '11:33:33')],
    21: [('21:53:00', '21:53:11')],
    22: [('21:53:56', '21:54:09')],
    23: [('21:55:00','21:55:15'), ('21:55:56', '21:56:07')],
    24: [('21:56:56','21:57:08')],
    25: [('22:05:29','22:05:40')],
    26: [('22:17:52','22:18:00')],
    27: [('22:19:07','22:19:19')],
    36: [('22:20:22', '22:20:36')],
    46: [('22:21:29', '22:21:42')],
    47: [('22:27:58', '22:28:07'), ('22:29:00', '22:29:15')],
    48: [('22:29:55', '22:30:20')],
    49: [('22:35:10', '22:35:30')],
    50: [('22:36:17', '22:36:24')],
    52: [('22:42:00', '22:42:18'), ('22:43:08', '22:43:25')],
    54: [('22:49:04', '22:49:13')],
    55: [('22:50:40', '22:50:53')],

    29: [('01:40:00', '01:41:13'), ('01:42:02', '01:42:12')],
    30: [('01:49:28', '01:49:35')],
    31: [('01:50:12', '01:50:22')],
    32: [('01:56:52', '01:57:25')], # added some extra frames at the start since the tractor was turning which seems interesting
    33: [('01:58:08', '01:58:19')],
    34: [('01:58:55', '01:58:03')], # NOTE(review): end is before start, so this window selects nothing -- probably a typo in the end time; confirm against the video
    35: [('01:59:35', '01:59:49')],

    # Begin black SUV
    37: [('03:12:05', '03:12:22')],
    38: [('03:13:22', '03:13:35')],
    39: [('03:20:20', '03:20:50')],
    40: [('03:21:35', '03:21:54')],
    41: [('03:22:40', '03:22:54')],
    42: [('03:27:35', '03:28:03')],
    43: [('03:28:35', '03:28:53')],
    44: [('03:29:28', '03:29:40')],
    45: [('03:33:49', '03:34:44')],

    # white again
    57: [('01:01:57','01:02:14'), ('01:02:44','01:03:01')],
    58: [('01:04:00','01:04:22'), ('01:04:48','01:05:05')],
    60: [('01:06:28', '01:06:43'), ('01:07:16', '01:07:30')],
    61: [('01:08:16', '1:08:31')],
    62: [('01:19:52', '1:19:59')],
    63: [],
    64: [('01:22:10', '1:22:17'), ('1:23:34', '1:23:42')],
    65: [('01:24:32', '1:24:45')],
    66: [('01:25:17', '01:25:27')],
    67: [('01:25:59', '01:26:13')],
    68: [('01:27:00', '01:27:09')],
    69: [('01:27:42', '01:27:56')],
    70: [('01:42:32', '01:42:38')], # NOTE(review): identical window to run 71 -- confirm this is not a copy/paste slip
    71: [('01:42:32', '01:42:38')], # NOTE(review): identical window to run 70 -- confirm
    # black suv again :)
    73: [('01:00:55', '01:01:14')],
    74: [('01:02:00', '01:02:14')],
    75: [('01:03:00', '01:03:10')],
    76: [('01:08:45', '01:08:55')],
    77: [('01:09:30', '01:09:45')],
    78: [('01:10:30', '01:10:45')],
    79: [('01:16:30', '01:16:44')],
    80: [('01:17:21', '01:17:34')],
    81: [('01:18:08', '01:18:17')],
    82: [('01:18:49', '01:18:59')],
    83: [('01:24:32', '01:24:50')],
    84: [('01:25:15', '01:25:25')],
    86: [('01:26:45', '01:26:55')],
    87: [('01:32:08', '01:32:26')],
}

In [20]:
from datetime import datetime
def filter_movie(movie, start_t, end_t):
    """Keep the rows of `movie` collected strictly between two wall-clock times.

    The times are combined with the calendar date and tzinfo of the first row's
    `collected_on`, so the window always lies on that day.

    Args:
        movie: DataFrame with a datetime-like `collected_on` column.
        start_t: exclusive lower bound as 'H:M:S' (fields are parsed with int()).
        end_t: exclusive upper bound in the same format.

    Returns:
        The rows with start < collected_on < end. An empty `movie` is returned
        unchanged; a window whose end is not after its start selects nothing.
    """
    if len(movie) == 0:
        # There is no first row to take the date from, and nothing to filter.
        return movie
    timestamp = movie['collected_on'].iloc[0]
    s_hour, s_minute, s_second = map(int, start_t.split(':'))
    e_hour, e_minute, e_second = map(int, end_t.split(':'))
    y, m, d = timestamp.year, timestamp.month, timestamp.day
    tzinfo = timestamp.tzinfo
    start_dt = datetime(y, m, d, s_hour, s_minute, s_second, tzinfo=tzinfo)
    end_dt = datetime(y, m, d, e_hour, e_minute, e_second, tzinfo=tzinfo)
    collected_on = movie['collected_on']
    return movie[(start_dt < collected_on) & (collected_on < end_dt)]
img_cache = {}

In [21]:
# Cut every run down to its hand-picked time windows; a run with several windows
# contributes one frame per window, a run with none contributes nothing.
cleaned_runs = []
for run_id in range(1, 88):
    sequence = df_sequences_valid.loc[runs_orig[run_id]]
    times = cutoff_dict[run_id]
    cleaned_runs.extend(
        filter_movie(sequence, window_start, window_end)
        for window_start, window_end in times
    )

In [22]:
# Stack all kept windows; `keys` adds an outer index level holding each window's
# position in `cleaned_runs`.
cleaned_df = pd.concat(cleaned_runs, keys=list(range(len(cleaned_runs))))

In [29]:
def guess_missing_operation_time_from_special_notes(row: pd.Series) -> str:
    """Return the row's operation time, guessing it from the notes when unknown.

    Some data (suv_driving_through_rear_dust_anno) didn't have operation time
    in the rows (or latitude/longitude), so the time of day cannot be inferred
    from them. For these particular bags it can be guessed from the special notes.

    Args:
        row: should have 'special_notes' and 'operation_time' entries.

    Returns:
        str: `row['operation_time']` unchanged when it is not 'unknown'. Otherwise
        the first match, case-insensitively, of 'night' -> 'nightime',
        'dusk' -> 'dawn_dusk', 'day' -> 'daytime'; 'unknown' when nothing matches
        or when the notes are not a string (e.g. None / NaN).
    """
    if row['operation_time'] != 'unknown':
        return row['operation_time']
    notes = row['special_notes']
    if not isinstance(notes, str):
        # Missing notes give no hint; without this guard `.lower()` would raise.
        return 'unknown'
    notes = notes.lower()
    keyword_to_operation_time = (
        ('night', 'nightime'),  # that's how it's spelled in the database
        ('dusk', 'dawn_dusk'),
        ('day', 'daytime'),
    )
    for keyword, operation_time in keyword_to_operation_time:
        if keyword in notes:
            return operation_time
    return 'unknown'

def fill_missing_operation_time(df: pd.DataFrame) -> pd.Series:
    """Fill 'unknown' operation times of `df` from the special notes, row by row.

    The 'operation_time' column of `df` is overwritten in place with the result of
    `guess_missing_operation_time_from_special_notes`, and that column is returned.
    """
    guessed_times = df.apply(guess_missing_operation_time_from_special_notes, axis=1)
    df['operation_time'] = guessed_times
    return df['operation_time']


# The helper already writes the column into cleaned_df; the assignment stores the
# same column again.
cleaned_df['operation_time'] = fill_missing_operation_time(cleaned_df)

In [35]:
# Image counts per time-of-day label, printed in the order night / dawn-dusk / day.
for operation_time_label in ('nightime', 'dawn_dusk', 'daytime'):
    print(sum(cleaned_df['operation_time'] == operation_time_label))

1246
5370
4464


In [76]:
# Plain list of the ids of all kept images.
image_ids = list(cleaned_df.image_id)

In [105]:
def make_dataset_slow(from_df, name, description, kind='image') -> None:
    """Save `from_df` as `<data_path>/<name>.parquet` and create a dataset from its ids.

    Args:
        from_df: frame with an 'image_id' column.
        name: dataset name; also used as the parquet file stem.
        description: dataset description; the image count is appended to it.
        kind: forwarded as `dataset_kind`.
    """
    imids = list(from_df['image_id'])
    desc = f"{description} ({len(imids)} images)"
    print(len(imids))
    # f-string, so the file is named after the dataset rather than literally
    # '{name}.parquet'.
    from_df.to_parquet(data_path / f'{name}.parquet', index=False)
    # Positional arguments must come before keyword arguments in a call.
    # NOTE(review): assumes the helper's first two positional parameters are the
    # image ids and the dataset name -- confirm against imageids_to_dataset.
    imageids_to_dataset(imids, name, dataset_description=desc, dataset_kind=kind, production_dataset=False)
# make_dataset_slow(cleaned_df, "suv_driving_through_rear_dust", "87 sequences of rear+rear side data where a (white/black) suv drives through dust, starting from behind the tractor and ending up on the side of it. Collected 2023 July 12-14.")

In [78]:
# Dataset.create(
#     name='suv_driving_through_rear_dust_left_cam',
#     description="87 sequences of rear+rear side data where a (white/black) suv drives through dust, starting from behind the tractor and ending up on the side of it. Collected 2023 July 12-14. Left cameras only (11080 images)",
#     kind=Dataset.KIND_IMAGE,
#     image_ids=list(cleaned_df['image_id']),
# )

# Look through dusty human data

In [79]:
from rich import pretty
pretty.install()
# Only rows whose notes are one of these values are kept.
valid_notes = [
    'Dust-test-1-pos-1',
    'Dust-test-1-pos-1-atmp-2',
    'Dust-test-1-pos-2-atmp-1',
    'Dust-test-1-pos-3-atmp-1',
    'Dust-test-1-pos-3-atmp-2',
    'Dust-test-1-pos-4-atmp-1',
    'Dust-test-1-pos-5-atmp-1',
    'Dust-test-1-pos-5-atmp-2',
]
# Drop rows without notes and rows whose notes are not listed above.
dusty_notes = df_dusty['special_notes']
df_dusty = df_dusty[dusty_notes.notna() & dusty_notes.isin(valid_notes)]

In [80]:
# Order the rows chronologically.
df_dusty = df_dusty.sort_values('collected_on')

In [81]:
# Row count of the filtered frame (sorting does not change the count).
len(df_dusty.sort_values('collected_on'))

5625

In [82]:
# Per note value: download the missing images into humans_in_dust/<note>/ and render
# one video from them.
for key in valid_notes:
    print(key)
    folder_name = Path(data_path) / 'humans_in_dust' / key
    folder_name.mkdir(parents=True, exist_ok=True)
    base_df = df_dusty[df_dusty['special_notes'] == key]
    for image_id, image_row in tqdm(base_df.iterrows(), total=len(base_df)):
        file_name = folder_name / (image_id + '.png')
        if file_name.exists():
            continue
        client.download_file(image_row['s3_bucket'], image_row['s3_key'], file_name)
    create_video_frames(key, base_df=base_df, folder_name=f'humans_in_dust/{key}')

Dust-test-1-pos-1


100%|██████████| 564/564 [00:00<00:00, 24359.12it/s]


Dust-test-1-pos-1-atmp-2


100%|██████████| 959/959 [00:00<00:00, 25773.15it/s]


Dust-test-1-pos-2-atmp-1


100%|██████████| 577/577 [00:00<00:00, 25090.08it/s]


Dust-test-1-pos-3-atmp-1


100%|██████████| 544/544 [00:00<00:00, 25852.92it/s]


Dust-test-1-pos-3-atmp-2


100%|██████████| 663/663 [00:00<00:00, 25846.25it/s]


Dust-test-1-pos-4-atmp-1


100%|██████████| 90/90 [00:00<00:00, 23766.75it/s]


Dust-test-1-pos-5-atmp-1


100%|██████████| 1000/1000 [00:00<00:00, 26216.37it/s]


Dust-test-1-pos-5-atmp-2


100%|██████████| 1228/1228 [00:00<00:00, 26030.38it/s]


In [83]:
# Distinct camera locations present in the filtered frame.
set(df_dusty['camera_location'])

{'front-center-left', 'front-left-left', 'front-right-left'}

In [84]:
# Per note value and camera location: the (start, end) wall-clock windows that are
# kept for that camera. An empty list keeps nothing. The windows are applied with
# `filter_movie`, which treats both bounds as exclusive.
only_human_images = {
    'Dust-test-1-pos-1': { # very light dust
        'front-left-left': [('01:12:27', '01:13:26')],
        'front-center-left': [],
        'front-right-left': []
    },
    'Dust-test-1-pos-1-atmp-2': { # medium dust
        'front-left-left': [('01:18:42', '01:20:20')],
        'front-center-left': [],
        'front-right-left': []
    },
    'Dust-test-1-pos-2-atmp-1': { # medium dust
        'front-left-left': [('01:20:28', '01:20:36')],
        'front-center-left': [('01:22:46', '01:23:28')],
        'front-right-left': []
    },
    'Dust-test-1-pos-3-atmp-1': { # light dust
        'front-left-left': [],
        'front-center-left': [('01:24:15', '01:25:20')],
        'front-right-left': []
    },
    'Dust-test-1-pos-3-atmp-2': { # heavy dust
        'front-left-left': [],
        'front-center-left': [('01:34:19', '01:35:15')],
        'front-right-left': []
    },
    'Dust-test-1-pos-4-atmp-1': { # medium dust
        'front-left-left': [],
        'front-center-left': [('01:43:03', '1:43:10')],
        'front-right-left': []
    },
    'Dust-test-1-pos-5-atmp-1': { # heavy dust
        'front-left-left': [],
        'front-center-left': [('01:37:07', '01:39:05')],
        'front-right-left': []
    },
    'Dust-test-1-pos-5-atmp-2': { # heavy dust
        'front-left-left': [],
        'front-center-left': [('01:39:07', '01:43:00')],
        'front-right-left': []
    },
}

In [85]:
# Apply the hand-picked windows per note value and camera, then stack the results.
cleaned_human_runs = []
for notes in valid_notes:
    for camera_location in ['front-left-left', 'front-center-left', 'front-right-left']:
        times = only_human_images[notes][camera_location]
        if not times:
            continue
        # The selection does not depend on the window, so compute it once per camera.
        sequence = df_dusty.loc[(df_dusty['special_notes'] == notes) & (df_dusty['camera_location'] == camera_location)]
        for start_t, end_t in times:
            filtered = filter_movie(sequence, start_t, end_t)
            if not len(filtered):
                # Report the window that selected nothing, identified by this
                # loop's own variables.
                print(notes, camera_location, start_t, end_t) # oops
            cleaned_human_runs.append(filtered)
human_dusty_df = pd.concat(cleaned_human_runs)

In [86]:
# Number of images kept across all note values and cameras.
len(human_dusty_df)

1650

In [87]:
# Download (if missing) every kept image into humans_in_dust/final/ and render a
# single video from all of them in chronological order.
human_dusty_df = human_dusty_df.sort_values('collected_on')
folder_name = Path(data_path) / 'humans_in_dust' / 'final'
folder_name.mkdir(parents=True, exist_ok=True)
for image_id, image_row in tqdm(human_dusty_df.iterrows(), total=len(human_dusty_df)):
    file_name = folder_name / (image_id + '.png')
    if file_name.exists():
        continue
    client.download_file(image_row['s3_bucket'], image_row['s3_key'], file_name)
create_video_frames('final', base_df=human_dusty_df, folder_name='humans_in_dust/final')

100%|██████████| 1650/1650 [00:00<00:00, 25084.00it/s]


In [104]:
# Dataset.create(
#     name='mannequin_in_dust',
#     description="8 sequences of a mannequin in front of the tractor with dust blowing into it. All images contain a mannequin. Collected 2023 July 7. Left cameras only (1650 images)",
#     kind=Dataset.KIND_IMAGE,
#     image_ids=list(human_dusty_df['image_id']),
# )

# Create annotated datasets

In [89]:
# Look up the existing dataset by name (the result is not used by the cells shown here).
mannequin_dset = Dataset.retrieve(name='mannequin_in_dust')

In [90]:
# Look up the existing dataset by name (the result is not used by the cells shown here).
suv_dset = Dataset.retrieve(name='suv_driving_through_rear_dust_left_cam')

In [91]:
# Number of image ids in the cleaned SUV frame.
len(list(cleaned_df['image_id']))

11080

In [93]:
# Load image rows joined with their annotations for hard drive JUPD-054_2023-6-13,
# cached as parquet. Unlike the image queries above there is no camera or artifact
# filter here.
# NOTE(review): the join presumably yields one row per annotation, so an image id may
# appear more than once (a later cell de-duplicates on image_id) -- confirm.
if os.path.exists(data_path / 'df_dusty_anno.parquet'):
    df_dusty_anno = pd.read_parquet(data_path / 'df_dusty_anno.parquet')
else:
    print("Cache miss")
    query = """SELECT ij.id, hard_drive_name, robot_name, collected_on,
        bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, 
        bundle, group_id, s3_bucket, s3_key, special_notes, label_map__json, vendor_metadata__json, annotation_jupiter.updated_at
    FROM image_jupiter AS ij
    JOIN "annotation_jupiter" ON ij."id" = "annotation_jupiter"."image"
    WHERE "hard_drive_name" IN ('JUPD-054_2023-6-13')
    """
    df_dusty_anno: pd.DataFrame = athena.get_df(query) # type: ignore
    df_dusty_anno.to_parquet(data_path / 'df_dusty_anno.parquet')
# Keep the image id both as a column and as the index.
df_dusty_anno['image_id'] = df_dusty_anno['id']
df_dusty_anno = df_dusty_anno.set_index('id')
# Load image rows joined with their annotations for the three sequence hard drives,
# cached as parquet. This query does not select annotation_jupiter.updated_at.
if os.path.exists(data_path / 'df_sequences_anno.parquet'):
    df_sequences_anno = pd.read_parquet(data_path / 'df_sequences_anno.parquet')
else:
    print("Cache miss")
    query = """SELECT ij.id, hard_drive_name, robot_name, collected_on,
        bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, 
        bundle, group_id, s3_bucket, s3_key, special_notes, label_map__json, vendor_metadata__json
    FROM image_jupiter AS ij
    JOIN "annotation_jupiter" ON ij."id" = "annotation_jupiter"."image"
    WHERE "hard_drive_name" IN ('JUPD-004_2023-7-19', 'JUPD-006_2023-7-19', 'JUPD-007_2023-7-11')
    """
    df_sequences_anno: pd.DataFrame = athena.get_df(query) # type: ignore
    df_sequences_anno.to_parquet(data_path / 'df_sequences_anno.parquet')
# Keep the image id both as a column and as the index.
df_sequences_anno['image_id'] = df_sequences_anno['id']
df_sequences_anno = df_sequences_anno.set_index('id')

Cache miss


In [94]:
# Keep, per image, the row with the latest `updated_at`.
# NOTE(review): this frame is not referenced by the later cells shown here.
most_recently_annotated = df_dusty_anno.sort_values('updated_at').drop_duplicates('image_id', keep='last')

In [106]:
# Create the annotated SUV dataset.
# NOTE(review): the whole annotation frame is passed, while the description quotes
# 11080 images, which is the size of cleaned_df -- confirm whether the frame should
# first be restricted to cleaned_df's image ids.
make_dataset_slow(df_sequences_anno, 
    name='suv_driving_through_rear_dust_anno',
    description="87 sequences of rear+rear side data where a (white/black) suv drives through dust, starting from behind the tractor and ending up on the side of it. Collected 2023 July 12-14. Left cameras only (11080 images)",
    kind=Dataset.KIND_ANNOTATION,
)

11080
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Preparing stereo dataframe...
Size of left dataframe: 10658
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Size of stereo dataframe: 10658
Sending 10658 annotated_ids for creating dataset
Time taken to prepare data for dataset creation job: 1.78 mins


In [107]:
# Create the annotated mannequin dataset.
# NOTE(review): the whole annotation frame is passed, while the description quotes
# 1650 images, which is the size of human_dusty_df -- confirm whether the frame should
# first be restricted to human_dusty_df's image ids (and de-duplicated per image).
make_dataset_slow(df_dusty_anno, 
    name='mannequin_in_dust_anno',
    description="8 sequences of a mannequin in front of the tractor with dust blowing into it. All images contain a mannequin. Collected 2023 July 7. Left cameras only (1650 images)",
    kind=Dataset.KIND_ANNOTATION,
)

1650
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Preparing stereo dataframe...
Size of left dataframe: 1441
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Size of stereo dataframe: 1441
Sending 1441 annotated_ids for creating dataset
Time taken to prepare data for dataset creation job: 0.77 mins


In [4]:
# Load the archive from the notebook's data directory instead of one user's
# absolute home path.
# NOTE: allow_pickle=True unpickles object arrays, which can execute arbitrary
# code -- only load archives from a trusted source.
seq_12 = np.load(data_path / "seq12.npz", allow_pickle=True)
list(seq_12.keys())

['Day', 'Dusk', 'Night', 'dtype']

In [7]:
# Display the 'Dusk' entry of the archive (the whole array is printed).
seq_12['Dusk']

array([list(['6425d56432a7ef3fc7dd19e1', '6425d4bf88ab036228b7e08e', '6425d56e87f5eb64dd52fc43', '6425d4db88ab036228b7e124', '6425d45587f5eb64dd52f93e', '6425d4f5ecdb4875e4916b70', '6425d46288ab036228b7dde3', '6425d45887f5eb64dd52f953', '6425d4ed87f5eb64dd52fb1f', '6425d4b088ab036228b7e034', '6425d50aafb870a21b528f18', '6425d48f87f5eb64dd52f9e6', '6425d56732a7ef3fc7dd19f1', '6425d506afb870a21b528f0c', '6425d50587f5eb64dd52fba9', '6425d4d81c7b215efdb3559b', '6425d4d5ecdb4875e4916ada', '6425d4d0ecdb4875e4916ab3', '6425d4ce32a7ef3fc7dd16fa', '6425d4cc87f5eb64dd52fa81', '6425d4cbafb870a21b528e83', '6425d45287f5eb64dd52f92c', '6425d450ecdb4875e49168be', '6425d49becdb4875e49169cb', '6425d49decdb4875e49169d7', '6425d49932a7ef3fc7dd15f7', '6425d49888ab036228b7df76', '6425d47788ab036228b7de44', '6425d47588ab036228b7de3b', '6425d518ecdb4875e4916c0a', '6425d51688ab036228b7e2dd', '6425d4e632a7ef3fc7dd175e', '6425d4e488ab036228b7e160', '6425d4a832a7ef3fc7dd1634', '6425d4aa32a7ef3fc7dd1641', '6425d5

In [8]:
# Display the 'Day' entry of the archive (the whole array is printed).
seq_12['Day']

array([list(['6425a758422702cc9df4111b', '6425a759422702cc9df41121', '6425a829422702cc9df41279', '6425a828422702cc9df41276', '6425a78388ab036228b75109', '6425a78488ab036228b7510c', '6425a801afb870a21b52158f', '6425a802873be0f80a6f4b07', '6425a72d873be0f80a6f49a7', '6425a72b873be0f80a6f49a4', '6425a7ac87f5eb64dd52a892', '6425a7ab05b0d2deaf43b891', '6425a7cf88ab036228b75191', '6425a7d0422702cc9df411bc', '6425a728afb870a21b521444', '6425a72988ab036228b75046', '6425a750873be0f80a6f49f5', '6425a751afb870a21b5214ef', '6425a774873be0f80a6f4a3a', '6425a776873be0f80a6f4a3d', '6425a76e422702cc9df4115a', '6425a76f88ab036228b750d8', '6425a7bb05b0d2deaf43b8d4', '6425a7bc88ab036228b7517b', '6425a818afb870a21b5215d9', '6425a81905b0d2deaf43b9fe', '6425a82288ab036228b75284', '6425a82387f5eb64dd52a993', '6425a792422702cc9df4119a', '6425a79188ab036228b7512a', '6425a75487f5eb64dd52a83b', '6425a735873be0f80a6f49b6', '6425a736afb870a21b521480', '6425a73f873be0f80a6f49d8', '6425a740422702cc9df410f0', '6425a7

In [14]:
# Fetch the metadata row of a single image by its id.
query = """SELECT ij.id, hard_drive_name, robot_name, collected_on,
    bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, 
    bundle, group_id, special_notes
FROM image_jupiter AS ij
WHERE ij.id = '6425a78488ab036228b7510c'
"""
img: pd.DataFrame = athena.get_df(query) # type: ignore

In [15]:
# Display the metadata row fetched above.
img

Unnamed: 0,id,hard_drive_name,robot_name,collected_on,bag_name,operating_field_name,operation_time,latitude,longitude,geohash,camera_location,bundle,group_id,special_notes
0,6425a78488ab036228b7510c,loamy_812_1,loamy_812,2023-03-29 22:24:05.665,03_29_2023-22_24_04,,unknown,0.0,0.0,7zzzzzzzzzzz,side-left-left,5769,bb9531ca1bd94532a3402ea0bfc45d13,
