In [10]:
import os
import brtdevkit
# Point the session at the prod environment in the us-west-2 AWS region.
# NOTE(review): these are set before the brtdevkit/aletheia submodule imports
# that follow; presumably those read the environment at import time, so keep
# this ordering unless confirmed otherwise.
os.environ.update({
    "BRT_ENV": 'prod',
    'AWS_DEFAULT_REGION': 'us-west-2',
})
# Sets brtdevkit's `log` attribute; presumably the log verbosity (confirm).
brtdevkit.log = 'info'

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import *
from aletheia_dataset_creator.config.dataset_config import *
from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from pathlib import Path

import pandas as pd
import numpy as np

# A value of None lifts the display limit, so dataframes are shown with every
# column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [24]:
from brtdevkit.core.api.error import InvalidRequestError

dataset_name = "dust_analysis_random_1000"

# Local directory under which the dataset is stored. The root can be overridden
# with the DATASET_ROOT environment variable so the notebook is not tied to one
# user's home directory; the default is the original location.
dataset_root = Path(os.environ.get("DATASET_ROOT", "/home/alexli/data"))
dataset_path = dataset_root / dataset_name

# Look the dataset up by name. `dataset` stays None when the lookup raises
# InvalidRequestError, which the later cells use as the signal to create it.
dataset = None
try:
    dataset = Dataset.retrieve(name=dataset_name)
except InvalidRequestError as e:
    print(e)
else:
    print(dataset)

{
  "created_at": "2023-06-30T15:30:34.787000",
  "created_by": "62856584a5aafa5bdcf6321e",
  "created_by_email": "alex.li@bluerivertech.com",
  "definition": null,
  "description": "1000 random images from May 6",
  "id": "649ef51a15639b3bbe271a8f",
  "is_deleted": false,
  "kind": "image",
  "machine_log_group_id": null,
  "metadata_s3_bucket": "brt-mesa-jupiter-datasets-eng",
  "metadata_s3_key": "image/2023/06/30/649ef51a15639b3bbe271a8f.jsonl",
  "name": "dust_analysis_random_1000",
  "project_name": "jupiter",
  "provenance_url": null,
  "request_s3_location": {
    "s3_bucket": "brt-mesa-tartarus-queue-payloads-prod-green",
    "s3_key": "dataset-requests/image/2023/06/30/62856584a5aafa5bdcf6321e_1fbbed8d1cc24790a4549e76d7429f0f.json"
  },
  "state": "ready",
  "updated_at": "2023-06-30T15:30:34.787000",
  "updated_by": null,
  "webdatasets": []
}


In [12]:
from datetime import datetime
# Collection window for the sample. Both datetimes are naive (no tzinfo) and are
# rendered into the SQL below through str(), e.g. '2022-05-06 23:00:00'.
start_datetime  = datetime(2022, 5, 6, hour=23, minute=0)   #, tzinfo=timezone.utc)
end_datetime    = datetime(2022, 5, 6, hour=23, minute=59)  # this is EXclusive

database = "mesa-data-catalog-prod"
table = "image_jupiter"

# Select up to 1000 rows collected inside [start_datetime, end_datetime).
# ORDER BY RAND() is unseeded, so every execution draws a different sample.
stmt = f"""
SELECT *
FROM {table} T
WHERE 
    T.collected_on >= cast('{start_datetime}' as timestamp)
    AND T.collected_on < cast('{end_datetime}' as timestamp)
ORDER BY RAND()
LIMIT 1000
"""
# Query Athena only when no dataset object is available yet. Identity
# comparison (`is None`) is used so the check cannot be affected by a custom
# __eq__ on the dataset object.
if dataset is None:
    athena = AthenaClient()
    df_hh = athena.get_df(stmt, database=database)

In [13]:
# Create the dataset from the sampled image ids only when no dataset object is
# available yet, then retrieve it by name so `dataset` is populated either way.
# `is None` (identity) is used instead of `== None` so the check cannot be
# affected by a custom __eq__ on the dataset object.
if dataset is None:
    image_ids = df_hh['id'].tolist()
    imageids_to_dataset(
        image_ids, dataset_name, 
        '1000 random images from May 6', 'image')
    dataset = Dataset.retrieve(name=dataset_name)
# Display the dataset record as the cell output.
dataset

{'created_at': '2023-06-30T15:30:34.787000',
 'created_by': '62856584a5aafa5bdcf6321e',
 'created_by_email': 'alex.li@bluerivertech.com',
 'definition': None,
 'description': '1000 random images from May 6',
 'id': '649ef51a15639b3bbe271a8f',
 'is_deleted': False,
 'kind': 'image',
 'machine_log_group_id': None,
 'metadata_s3_bucket': 'brt-mesa-jupiter-datasets-eng',
 'metadata_s3_key': 'image/2023/06/30/649ef51a15639b3bbe271a8f.jsonl',
 'name': 'dust_analysis_random_1000',
 'project_name': 'jupiter',
 'provenance_url': None,
 'request_s3_location': {'s3_bucket': 'brt-mesa-tartarus-queue-payloads-prod-green',
  's3_key': 'dataset-requests/image/2023/06/30/62856584a5aafa5bdcf6321e_1fbbed8d1cc24790a4549e76d7429f0f.json'},
 'state': 'ready',
 'updated_at': '2023-06-30T15:30:34.787000',
 'updated_by': None,
 'webdatasets': []}

In [14]:
# Manual step: run pack perception in kf on this dataset, and don't forget the --image-only flag.

In [28]:
from dl.dataset.fetch_pp_artifacts import prefetch_from_s3

# Download the dataset and the pack-perception artifacts only when the
# corresponding local paths are missing, so re-running the cell is cheap.
images_dir = dataset_path / 'images'
processed_dir = dataset_path / 'processed'

if not os.path.exists(images_dir):
    dataset.download(dataset_path)

if not os.path.exists(processed_dir):
    # Master annotations CSV for this dataset id in the pack_perception bucket.
    master_csv_s3_uri = (
        "s3://blueriver-jupiter-data/pack_perception/ml/"
        "48fe80193177bc671b32ffe6443142c9_e5de20d6782cffae96fcffd7f974f1ce/"
        "649ef51a15639b3bbe271a8f_master_annotations.csv"
    )
    prefetch_from_s3(master_csv_s3_uri, dataset_path)

In [31]:
from cv.core.image_quality_server_side import ImageQuality

# The annotations CSV is named after the dataset id and is read from the
# dataset's local directory.
master_annotation_file = '649ef51a15639b3bbe271a8f_master_annotations.csv'
master_annotation_path = dataset_path.joinpath(master_annotation_file)
stereo_df = pd.read_csv(master_annotation_path)

# Run the image-quality labelling over the annotations with progress reporting on.
iq = ImageQuality()
labeled = iq.from_df(stereo_df, dataset_path, use_progress=True)

In [34]:
# Show the column labels of the frame returned by iq.from_df.
labeled.columns

Index(['artifact_debayeredrgb_0__id', 'artifact_debayeredrgb_0_content_hash',
       'artifact_debayeredrgb_0_created_at',
       'artifact_debayeredrgb_0_data_category', 'artifact_debayeredrgb_0_id',
       'artifact_debayeredrgb_0_image', 'artifact_debayeredrgb_0_is_archived',
       'artifact_debayeredrgb_0_kind', 'artifact_debayeredrgb_0_project_name',
       'artifact_debayeredrgb_0_s3_bucket',
       ...
       'dataset_parent_dir', 'image_id', 'image_quality_label',
       'median_depth_check', 'script_id', 'stereo_left_image',
       'stereo_pipeline_npz_save_path', 'stereo_right_image', 'terrain_type',
       'image_quality'],
      dtype='object', length=114)

In [41]:
# Gather the image-quality algorithm output for the first 484 rows of `labeled`.
# NOTE(review): 484 is hard-coded; presumably it is the number of rows in
# `labeled` -- confirm, and consider len(labeled) instead.
iqs = [
    labeled.iloc[row_idx].image_quality.algorithm_output
    for row_idx in range(484)
]

In [43]:
# Distinct algorithm outputs among the collected values.
set(iqs)

{'bad_depth', 'bright', 'good'}