In [1]:
import time
import json
import ast
import os
import datetime
import io
from collections import defaultdict
from tqdm import tqdm

import imageio
import boto3
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from timezonefinder import TimezoneFinderL
import pytz
import cv2
from brtdevkit.util.aws.s3 import S3
client = S3()

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
from aletheia_dataset_creator.config.dataset_config import LEFT_CAMERAS, ALL_CAMERA_PAIRS_LIST
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option('display.max_rows', 500)
athena = AthenaClient()
s3 = boto3.resource('s3')
tf = TimezoneFinderL()
home = Path(os.path.expanduser('~'))
data_path = Path(os.environ['OUTPUT_PATH']) / "manny_2"

In [3]:
hard_drive_names = ['JUPD-061_2023-8-21']

In [4]:
#https://bluerivertechnology.atlassian.net/wiki/spaces/JUPT/pages/3029106724/JQA-408+Post+Test+Report
if os.path.exists(data_path / 'df_dusty_man.parquet'):
    df_dusty = pd.read_parquet(data_path / 'df_dusty_man.parquet')
else:
    print("Cache miss")
    query = """SELECT ij.id, hard_drive_name, robot_name, collected_on,
        bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, 
        bundle, group_id, s3_bucket, s3_key, special_notes
    FROM image_jupiter AS ij
    JOIN "image_artifact_jupiter" ON ij."id" = "image_artifact_jupiter"."image"
    WHERE "hard_drive_name" IN ('JUPD-061_2023-8-21') AND image_artifact_jupiter.kind = 'debayeredrgb' AND camera_location LIKE '%left'
    """
    df_dusty: pd.DataFrame = athena.get_df(query) # type: ignore
    df_dusty.to_parquet(data_path / 'df_dusty_man.parquet')
df_dusty['image_id'] = df_dusty['id']
df_dusty = df_dusty.set_index('id')

Cache miss


FileNotFoundError: [Errno 2] No such file or directory: '/home/alexli/results/manny_2/df_dusty_man.parquet'

# Look through sequences

In [None]:
def get_run_id(df_row):
    try:
        return int(df_row['special_notes'].split(" ")[-1])
    except ValueError:
        return pd.NA

In [30]:
def get_image(df_row, collected_on: str, folder_name: str):
    if len(df_row) == 0:
        whiteFrame = 255 * np.ones((604, 964, 3), np.uint8)
        font = cv2.FONT_HERSHEY_PLAIN
        whiteFrame = cv2.putText(whiteFrame, collected_on, (50, 400), font, 5, (0,0,0), 5)
        return whiteFrame
    elif isinstance(df_row, pd.DataFrame):
        assert len(df_row) == 1
        df_row = df_row.iloc[0]
    file_name = Path(data_path) / folder_name / (str(df_row.image_id) + '.png')
    if not os.path.exists(file_name):
        client.download_file(df_row['s3_bucket'], df_row['s3_key'], file_name)
    im = cv2.imread(str(file_name))
    return im
    

In [31]:
def create_video_frames(file_prefix: str, base_df: pd.DataFrame, folder_name: str):
    """
    Given dictionary with image paths creates concatenated image and video and saves to output_dir.
    """
    video_dir = Path(data_path) / 'videos' / str(file_prefix) 
    os.makedirs(video_dir, exist_ok=True)
    video_name = video_dir / "video.mp4"
    if os.path.exists(video_name):
        return
    writer = imageio.get_writer(video_name, fps=1)
    k_df = base_df.sort_values('collected_on')
    k_groups = base_df.groupby('group_id').groups
    seen = set()
    print(len(k_df))
    for row in tqdm(k_df.iterrows()):
        gid = row[1]['group_id']
        if gid in seen:
            continue
        seen.add(gid)
        values = k_groups[gid]
        group = k_df.loc[values]
        collected_on_str = str(group.iloc[0].collected_on)[11:19]
        # try:
        # concatenate image Horizontally
        front_pod = np.concatenate(
            (
                get_image(group[group['camera_location'] == 'front-left-left'], collected_on_str, folder_name),
                get_image(group[group['camera_location'] == 'front-center-left'], collected_on_str, folder_name),
                get_image(group[group['camera_location'] == 'front-right-left'], collected_on_str, folder_name),
            ),
            axis=1,
        )
        rear_pod = np.concatenate(
            (
                get_image(group[group['camera_location'] == 'side-left-left'], collected_on_str, folder_name),
                get_image(group[group['camera_location'] == 'rear-left'], collected_on_str, folder_name),
                get_image(group[group['camera_location'] == 'side-right-left'], collected_on_str, folder_name),
            ),
            axis=1,
        )
        # concatenate image vertically
        all_cameras = np.concatenate((front_pod, rear_pod), axis=0)[::4, ::4, ::-1]
        # save concatenated image file
        full_img_name = f"{collected_on_str}.png"
        file_path = os.path.join(video_dir, full_img_name)
        plt.imsave(file_path, all_cameras)
        plt.clf()
        plt.close()
        writer.append_data(imageio.imread(file_path))
        # except Exception as e:
        #     print(f"Skipping frame. Exception occurred: {e}")
    writer.close()


# Look through dusty human data

In [32]:
print(set(df_dusty['special_notes']))

{'MF-blind-spot-b/w-front-rear--wheel-t', 'dust-Night-pos-6', 'Bundle 6777 object stop', 'test2-dust', 'MF-FC--FL/overlap-1', 'dust_pos7', 'dust-Night-pos-9', 'dust-dusk-pos1', 'MF---SL-rear/overlap-3', 'dust-dusk-pos3', 'CAT-sup-5', 'dust-Night-pos-10', 'dust_pos10-atmp-2', 'test-front-camera-2', 'MF-walk-across-camera-overlap-zone', 'aruco-3-dust-2', 'MF-walk-across-camera-overlap-zone-1', 'Test recording after network failure DELETE', 'dust-Night-pos-7', 'dust_pos8', 'MF-walk-across-camera-overlap-zone-2', 'dust_pos9', 'MF-walk-in-out-decision/over-lap-zone-1', 'MF-walk-across-camera-overlap-zone-4', 'dust_pos6', 'dust-dusk-pos5', 'MF-Run-into-blind-spot-1', 'dust-Night-pos-2', 'dust_pos6-atmp-2', 'apriltag-night-high-light', 'dust-Night-pos-1', 'dust-dusk-pos4', 'MF-FC--FL/overlap', 'dust-Night-pos-5', 'Human stop test 6738', 'MF-walk-in-out-decision/over-lap-zone', 'test-test-front-camera', 'MF-Run-into-blind-spot', 'dust_pos10', 'dust-Night-pos-8', 'dust-dusk-pos2', 'MF---SL-rear

In [33]:
from rich import pretty
pretty.install()
df_dusty = df_dusty[df_dusty['special_notes'].notna()]
# https://bluerivertechnology.atlassian.net/wiki/spaces/JUPT/pages/3029106724/JQA-408+Post+Test+Report
valid_notes = [
    'dust-Night-pos-1',
    'dust-Night-pos-2',
    'dust-Night-pos-3',
    'dust-Night-pos-4',
    'dust-Night-pos-5',
    'dust-Night-pos-6',
    'dust-Night-pos-7',
    'dust-Night-pos-8',
    'dust-Night-pos-9',
    'dust-Night-pos-10',
    'dust-dusk-pos1',
    'dust-dusk-pos2',
    'dust-dusk-pos3',
    'dust-dusk-pos4',
    'dust-dusk-pos5',
    'dust_pos6',
    'dust_pos6-atmp-2',
    'dust_pos7',
    'dust_pos8',
    'dust_pos9',
    'dust_pos9-atmp-2',
    'dust_pos10',
    'dust_pos10-atmp-2'
]
print(len(valid_notes))
df_dusty = df_dusty[df_dusty['special_notes'].isin(valid_notes)]

23


In [8]:
df_dusty = df_dusty.sort_values('collected_on')

In [9]:
print(len(df_dusty.sort_values('collected_on')))
print(set(df_dusty['camera_location']))

205509
{'side-right-left', 'rear-left', 'front-right-left', 'front-left-left', 'front-center-left', 'side-left-left'}


In [10]:
def get_second(pddatetime):
    return pddatetime.strftime('%Y/%m/%d %H:%M:%S')
df_dusty['collected_second'] = df_dusty['collected_on'].apply(get_second)

In [11]:
ok_groups = df_dusty.groupby(['collected_second', 'camera_location']).first()['group_id']
df_dusty_subset = df_dusty[df_dusty['group_id'].isin(ok_groups)]

In [12]:
df_dusty_subset.groupby(['collected_second', 'camera_location']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,hard_drive_name,robot_name,collected_on,bag_name,operating_field_name,operation_time,latitude,longitude,geohash,bundle,group_id,s3_bucket,s3_key,special_notes
collected_second,camera_location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2023/08/09 22:58:44,rear-left,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2023/08/09 22:58:44,side-left-left,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2023/08/09 22:58:44,side-right-left,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2023/08/09 22:58:45,rear-left,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2023/08/09 22:58:45,side-left-left,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023/08/25 23:08:42,front-left-left,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2023/08/25 23:08:42,front-right-left,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2023/08/25 23:08:42,rear-left,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
2023/08/25 23:08:42,side-left-left,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [20]:
print(len(df_dusty))
print(len(df_dusty_subset))
df_dusty = df_dusty_subset

84387
84387


In [21]:
for key in valid_notes:
    folder_name = Path(data_path) / 'humans_in_dust' / key
    if os.path.exists(data_path / 'videos' / key / 'video.mp4'):
        continue
    os.makedirs(folder_name, exist_ok=True)
    base_df = df_dusty[df_dusty['special_notes'] == key]
    for id, df_row in tqdm(base_df.iterrows(), total=len(base_df)):
        file_name = folder_name / str(id + '.png')
        if not os.path.exists(file_name):
            client.download_file(df_row['s3_bucket'], df_row['s3_key'], file_name)
    create_video_frames(key, base_df=base_df, folder_name=f'humans_in_dust/{key}')

  0%|          | 0/2480 [00:00<?, ?it/s]


TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [22]:
valid_notes = [
    'dust-Night-pos-1',
    'dust-Night-pos-2',
    'dust-Night-pos-3',
    'dust-Night-pos-4',
    'dust-Night-pos-5',
    'dust-Night-pos-6',
    'dust-Night-pos-7',
    'dust-Night-pos-8',
    'dust-Night-pos-9',
    'dust-Night-pos-10',
    'dust-dusk-pos1',
    'dust-dusk-pos2',
    'dust-dusk-pos3',
    'dust-dusk-pos4',
    'dust-dusk-pos5',
    'dust_pos6',
    'dust_pos6-atmp-2',
    'dust_pos7',
    'dust_pos8',
    'dust_pos9',
    'dust_pos9-atmp-2',
    'dust_pos10',
    'dust_pos10-atmp-2'
]

only_human_images = {
    'dust-Night-pos-1': {
        'front-left-left': [('05:02:00', '05:02:30'), ('05:06:00', '05:07:00'), ('05:07:20', '05:07:30')],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [('05:02:00', '05:02:30')],
        'side-right-left': [],
        'rear-left': []
    },
    'dust-Night-pos-2': {
        'front-left-left': [('05:07:30', '05:08:30')],
        'front-center-left': [('05:11:30', '05:11:40'),('05:12:40', '05:13:10') ],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust-Night-pos-3': {
        'front-left-left': [],
        'front-center-left': [('05:13:15', '05:13:40'), ('05:17:00', '05:18:40')],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust-Night-pos-4': {
        'front-left-left': [],
        'front-center-left': [('05:18:00', '05:19:05'), ('05:22:30', '05:22:50')],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust-Night-pos-5': {
        'front-left-left': [],
        'front-center-left': [('05:23:45', '05:27:00')],
        'front-right-left': [],
        'side-left-left': [('05:29:10', '05:31:10')],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust-Night-pos-6': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [('05:31:11', '05:32:30'), ('05:33:05', '05:34:50')],
        'rear-left': [('05:33:05', '05:34:50')],
        'side-right-left': [],
    },
    'dust-Night-pos-7': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('05:49:32', '05:51:30')],
        'side-right-left': [],
    },
    'dust-Night-pos-8': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('05:51:30', '05:51:40'), ('05:53:10', '05:54:20'), ('05:59:00', '05:59:59')],
        'side-right-left': [],
    },
    'dust-Night-pos-9': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('05:59:20', '06:03:00'), ('06:04:50', '06:06:35')],
        'side-right-left': [],
    },
    'dust-Night-pos-10': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('06:07:50', '06:09:40')],
        'side-right-left': [],
    },
    'dust-dusk-pos1': {
        'front-left-left': [('02:09:00','02:17:30')],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [('02:09:00','02:12:30')],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust-dusk-pos2': {
        'front-left-left': [('02:17:28','02:20:30')],
        'front-center-left': [('02:20:21', '02:23:45')],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust-dusk-pos3': {
        'front-left-left': [],
        'front-center-left': [('02:23:40', '02:26:00'), ('02:26:00', '02:35:00')],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust-dusk-pos4': {
        'front-left-left': [],
        'front-center-left': [('02:34:00', '02:40:45')],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust-dusk-pos5': {
        'front-left-left': [],
        'front-center-left': [('02:40:45', '02:42:25')],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust_pos6': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [('22:57:03', '22:58:59')],
        'rear-left': [],
        'side-right-left': [],
    },
    'dust_pos6-atmp-2': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('23:04:00', '23:06:00')],
        'side-right-left': [],
    },
    'dust_pos7': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [('23:18:02', '23:20:00')],
        'rear-left': [('23:18:02','23:27:00')],
        'side-right-left': [],
    },
    'dust_pos8': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('23:27:00', '23:31:00')],
        'side-right-left': [],
    },
    'dust_pos9': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('23:31:00', '23:34:43')],
        'side-right-left': [],
    },
    'dust_pos9-atmp-2': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('23:34:43', '23:44:00')],
        'side-right-left': [],
    },
    'dust_pos10': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('23:44:00', '23:59:59')],
        'side-right-left': [],
    },
    'dust_pos10-atmp-2': {
        'front-left-left': [],
        'front-center-left': [],
        'front-right-left': [],
        'side-left-left': [],
        'rear-left': [('23:58:00', '23:59:59')],
        'side-right-left': [],
    },
}

In [23]:
from datetime import datetime
def filter_movie(movie, start_t, end_t):
    timestamp = movie.iloc[0]['collected_on']
    s_hour, s_minute, s_second = map(int, start_t.split(':'))
    e_hour, e_minute, e_second = map(int, end_t.split(':'))
    y, m, d = timestamp.year, timestamp.month, timestamp.day
    tzinfo = movie['collected_on'].iloc[0].tzinfo
    start_dt = datetime(y, m, d, s_hour, s_minute, s_second, tzinfo=tzinfo)
    end_dt = datetime(y, m, d, e_hour, e_minute, e_second, tzinfo=tzinfo)
    return movie[(start_dt < movie['collected_on']) & (movie['collected_on'] < end_dt)]

cleaned_human_runs = []
for notes in valid_notes:
    for camera_location in ['front-left-left', 'front-center-left', 'front-right-left', 'side-left-left', 'rear-left', 'side-right-left']:
        times = only_human_images[notes][camera_location]
        for start_t, end_t in times:
            sequence = df_dusty.loc[(df_dusty['special_notes'] == notes) & (df_dusty['camera_location'] == camera_location)]
            filtered = filter_movie(sequence, start_t, end_t)
            # print(len(filtered))
            if not len(filtered):
                print(notes, start_t, end_t) # oops
            cleaned_human_runs.append(filtered)
human_dusty_df = pd.concat(cleaned_human_runs)

In [70]:
Dataset.create(
    name='mannequin_in_dust_night_dawn_10pos',
    description="sequences of a mannequin with dust blowing around it from 10 positions at night and dusk. All images contain a mannequin. Collected Aug 22 2023. Left cameras only (7375 images)",
    kind=Dataset.KIND_IMAGE,
    image_ids=list(human_dusty_df['image_id']),
)

{}

# Create annotated datasets

In [3]:
aletheia_ds = Dataset.retrieve(name='mannequin_in_dust')
aletheia_df_1 = aletheia_ds.to_dataframe()
print(aletheia_df_1.shape)
# retrieve dataset from aletheia
aletheia_ds = Dataset.retrieve(name='mannequin_in_dust_night_dawn_10pos')
aletheia_df_2 = aletheia_ds.to_dataframe()
print(aletheia_df_2.shape)
aletheia_df = pd.concat([aletheia_df_1, aletheia_df_2])
print(aletheia_df.shape)

(1650, 108)
(7375, 107)
(9025, 109)


In [6]:
aletheia_df['image_id'] = aletheia_df['id']

In [7]:
def make_dataset_slow(from_df, name, description, kind='image') -> None:
    imids = list(from_df['image_id'])
    desc = f"{description} ({len(from_df['image_id'])} images)"
    print(len(imids))
    from_df.to_parquet(data_path / '{name}.parquet', index=False)
    imageids_to_dataset(imids, name, dataset_description=desc, dataset_kind=kind, production_dataset=False)

In [8]:
make_dataset_slow(from_df=aletheia_df, 
    name='mannequin_in_dust_v0',
    description="A mannequin with dust billowing around. All images contain a mannequin.",
    kind=Dataset.KIND_ANNOTATION,
)

9025
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Preparing stereo dataframe...
Size of left dataframe: 8714
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Size of stereo dataframe: 8714
Sending 8714 annotated_ids for creating dataset
Time taken to prepare data for dataset creation job: 1.80 mins
