In [None]:
import time
import json
import ast
import os
import datetime
import io
from collections import defaultdict
from tqdm import tqdm

import imageio
import boto3
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
from dl.config.label_map_helper import LabelMapHelper, LabelConversion
import json
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=12, progress_bar=True)
import seaborn as sns

from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from timezonefinder import TimezoneFinderL
import pytz
import cv2
from brtdevkit.util.aws.s3 import S3
client = S3()

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
from aletheia_dataset_creator.config.dataset_config import LEFT_CAMERAS, ALL_CAMERA_PAIRS_LIST
%matplotlib inline

In [None]:
# Notebook-wide configuration and service handles.
# Show up to 500 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)
athena = AthenaClient()
s3 = boto3.resource('s3')
# NOTE: `tf` is a TimezoneFinderL instance here, not TensorFlow.
tf = TimezoneFinderL()
from pathlib import Path
# Per-user data directory: ~/data
home = Path(os.path.expanduser('~'))
data_path = home / 'data' 

In [None]:
# Read the image ids stored as a single comma-separated line.
with open('/mnt/sandbox1/alex.li/set3_image_ids.csv', 'r') as f:
    # readline() keeps the trailing newline, which would otherwise end up
    # glued to the last id; strip it before splitting.
    imids = f.readline().strip().split(',')
print(len(imids))

In [None]:
# Presumably the number of frames containing each class in this dataset
# (compare with the counts printed after `has_thing` is applied) - TODO confirm:
# 1245 human
# 12515 vehicle
# Dataset root; `has_thing` resolves per-row label paths relative to it.
data_dir = "/data/jupiter/datasets/vehicles_driving_in_dust/"
# Master annotation table for the dataset, one row per annotated image.
df = pd.read_csv(data_dir + '/64dfb36ebe1e14d37b7287d8_master_annotations.csv')

In [None]:
# # 716
# # 5893
# data_dir="/data/jupiter/li.yu/data/Jupiter_2023_may_loamy731_vehicle_dust_human_stereo/"
# path =data_dir + "/master_annotations.csv"
# df_unlabled_1 = pd.read_csv(path)

# # 533
# # 6622
# data_dir="/data/jupiter/li.yu/data/Jupiter_2023_may_loamy731_vehicle_dust_human_stereo_part2/"
# path =data_dir + "/master_annotations.csv"
# df_unlabled_2 = pd.read_csv(path)
# df_unlabled = pd.concat([df_unlabled_1, df_unlabled_2])

In [None]:
# Label-map helper built from the four-class training label map, plus the
# converter that `has_thing` uses to remap raw label arrays.
helper = LabelMapHelper("/home/alex.li/git/JupiterCVML/europa/base/src/europa/dl/config/label_maps/four_class_train.csv")
lc = LabelConversion(helper)

In [None]:
def has_thing(row):
    """Report whether a row's label image contains a human and/or a vehicle.

    The label archive referenced by ``row.rectified_label_save_path``
    (relative to the notebook-level ``data_dir``) is loaded, its ``'left'``
    entry is converted with ``lc.convert_label_for_driveable_terrain`` using
    the row's JSON ``label_map``, and the pixels of each class are counted.

    A class counts as present when it covers more than 30 pixels for cameras
    whose ``camera_location`` contains ``'rear'``, and more than 100 pixels
    for every other camera.

    Args:
        row: one row of the annotation frame; must provide
            ``rectified_label_save_path``, ``label_map`` and
            ``camera_location``.

    Returns:
        ``[has_human, has_vehicle]`` as a two-element list of booleans.
    """
    label_path = os.path.join(data_dir, row.rectified_label_save_path)
    # np.load returns a lazily-read archive that keeps the file open; close
    # it as soon as the array has been read so that applying this function to
    # every row does not accumulate open file handles.
    with np.load(label_path) as label_file:
        label = label_file['left']
    label = lc.convert_label_for_driveable_terrain(
        label,
        json.loads(row['label_map']))
    # Rear cameras use a lower pixel threshold than the other cameras.
    min_pixels = 30 if 'rear' in row['camera_location'] else 100
    has_vehicle = np.sum(label == helper.get_vehicle_label()) > min_pixels
    has_human = np.sum(label == helper.get_human_label()) > min_pixels
    return [has_human, has_vehicle]

In [None]:
# Run `has_thing` over every row in parallel (pandarallel). With
# result_type='expand' the returned two-element lists become columns:
# column 0 = has_human, column 1 = has_vehicle.
# df[['has_human', 'has_vehicle']] 
result= df.parallel_apply(has_thing, axis=1, result_type='expand')
# df.loc[:, ['has_human', 'has_vehicle']] = df[['has_human']]

In [None]:
# Number of rows flagged for each class by `has_thing`.
print(sum(result[0])) # has human
print(sum(result[1])) # has vehicle

In [None]:
from typing import List
import logging
from datetime import timedelta

def get_runs(df: pd.DataFrame, trim: bool=True) -> List[pd.DataFrame]:
    """Split a dataframe into runs of temporally consecutive images.

    Rows are ordered by capture time and a new run starts wherever two
    neighbouring anchor rows are more than 5 seconds apart. Each run is
    returned as the slice of ``df`` whose timestamps lie between the first and
    the last anchor row of that run (both inclusive).

    Args:
        df: frame with a ``collected_on`` column that ``pd.to_datetime`` can
            parse. A ``collected_on_dt`` column is added to it in place.
        trim: when True, only rows selected by the notebook-level boolean mask
            ``result[1]`` (the "has vehicle" column) act as anchors, so rows
            before the first / after the last anchor of a run are left out.
            When False, every row is an anchor.

    Returns:
        The runs ordered by start time, or an empty list when there are no
        anchor rows.
    """
    df['collected_on_dt'] = pd.to_datetime(df.collected_on)
    df = df.sort_values('collected_on_dt')
    if trim:
        # `result` is the notebook-level output of applying `has_thing`;
        # the mask is matched to the rows of `df` by index label.
        merged_runs = df[result[1]]
    else:
        merged_runs = df
    if len(merged_runs) == 0:
        logging.error("Did not find any runs with any labeled stop class")
        return []
    merged_runs = merged_runs.sort_values('collected_on_dt')

    delta = timedelta(seconds=5)
    anchor_times = merged_runs['collected_on_dt']
    start_t = anchor_times.iloc[0]
    runs = []
    for i in range(1, len(anchor_times)):
        end_t = anchor_times.iloc[i - 1]
        next_t = anchor_times.iloc[i]
        if next_t - end_t > delta:
            # Gap found: close the current run at the last anchor before it.
            runs.append(df.loc[(start_t <= df['collected_on_dt']) & (df['collected_on_dt'] <= end_t)])
            start_t = next_t
    # Always close the final run at the very last anchor, so that neither the
    # last image nor a trailing single-image run is dropped.
    last_t = anchor_times.iloc[-1]
    runs.append(df.loc[(start_t <= df['collected_on_dt']) & (df['collected_on_dt'] <= last_t)])
    return runs

In [None]:
def get_n_images(runs):
    """Return the total number of rows (images) across all runs."""
    total = 0
    for run in runs:
        total += len(run)
    return total

# Split the annotations into runs (trimmed to the has-vehicle mask) and
# report: total rows, number of runs, rows kept inside the runs.
print(len(df))
trimmed_runs = get_runs(df, True)
print(len(trimmed_runs))
print(get_n_images(trimmed_runs))

In [None]:
# Keep only runs that last strictly more than 5 seconds and strictly less
# than 5 minutes, then list the survivors as (index, image count, duration)
# ordered by duration.
min_duration = timedelta(seconds=5)
max_duration = timedelta(minutes=5)
run_len_time = [
    (idx, len(seq), max(seq['collected_on_dt']) - min(seq['collected_on_dt']))
    for idx, seq in enumerate(trimmed_runs)
]
trimmed_runs = [
    trimmed_runs[idx]
    for idx, _, duration in run_len_time
    if min_duration < duration < max_duration
]
# Recompute the summary so the indices refer to the filtered list.
run_len_time = [
    (idx, len(seq), max(seq['collected_on_dt']) - min(seq['collected_on_dt']))
    for idx, seq in enumerate(trimmed_runs)
]
run_len_time.sort(key=lambda entry: entry[2])
print(run_len_time)
print(len(trimmed_runs))

In [None]:
# Load the previously saved operation-time buckets and print each entry.
# NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which can
# execute arbitrary code - only load this file from a trusted location.
op_times = np.load('/home/alex.li/logs/operation_time_for_2023_April_sequence_data.npz', allow_pickle=True)
for k, v in op_times.items():
    print(k, v)

In [None]:
# Names of the arrays stored in the archive (the cell below reads 'Day', 'Dusk' and 'Night').
list(op_times.keys())

In [None]:
# Start from the runs already stored per operation time, then append the image
# ids of every new run to the bucket matching its `operation_time`.
run_times_dict = {
    'daytime': op_times['Day'].tolist(),
    'dawn_dusk': op_times['Dusk'].tolist(),
    'nightime': op_times['Night'].tolist(),
}
for i, run in enumerate(trimmed_runs):
    # Distinct operation times seen in this run, ignoring 'unknown' rows.
    times = set(run[run['operation_time'] != 'unknown']['operation_time'])
    if len(times) != 1:
        # Ambiguous (or entirely unknown) run: fall back to 'daytime' and
        # print it so it can be inspected by hand.
        operation_time = 'daytime'
        print(i, times)
    else:
        operation_time = list(times)[0]
    # Named `operation_time` rather than `time` so the imported `time` module
    # is not rebound at notebook scope.
    print(operation_time)
    print(min(run['collected_on_dt']))
    run_times_dict[operation_time].append(list(run['id']))
for k, v in run_times_dict.items():
    print(k, len(v))

In [None]:
# Convert every bucket (a list of id lists) to an object array so it can be
# saved with np.savez, then print the resulting shapes.
# NOTE(review): if all runs in a bucket happen to have the same number of ids,
# numpy builds a 2-D object array instead of a 1-D array of lists - confirm
# that consumers of the saved file handle both shapes.
for k, v in run_times_dict.items():
    run_times_dict[k] = np.array(v, dtype=object)
for k, v in run_times_dict.items():
    print(k, v.shape)

In [None]:
# Persist the extended buckets; each dict key becomes one array in the archive.
np.savez('/home/alex.li/logs/operation_time_for_2023_April_sequence_data_with_seq3.npz', **run_times_dict)

In [None]:
# Register the collected image ids as a non-production annotation dataset.
# NOTE(review): `all_image_ids` is not assigned in any cell visible here, so
# this cell raises NameError on a fresh kernel unless it is defined elsewhere -
# confirm where it comes from and define it explicitly before this cell.
print(len(all_image_ids))
imageids_to_dataset(all_image_ids,
    dataset_name='vehicles_driving_through_dust_1_2',
    dataset_description=f"Sequences of vehicles driving through dust ({len(all_image_ids)} images)",
    dataset_kind=Dataset.KIND_ANNOTATION,
    production_dataset=False
)

In [None]:
# Master annotations of the "suv driving through rear dust" dataset.
set5_anno = pd.read_csv('/data/jupiter/datasets/suv_driving_through_rear_dust_anno/64cd53a3748e0a51e1a72774_master_annotations.csv')

In [None]:
# Append the set-5 ids in place. Re-running this cell appends them again; the
# duplicates are removed by the de-duplication cell that follows.
all_image_ids.extend(list(set5_anno['id']))

In [None]:
# De-duplicate the ids and print the count before and after.
# Going through a set does not preserve the original order of the ids.
print(len(all_image_ids))
all_image_ids = list(set(all_image_ids))
print(len(all_image_ids))

In [None]:
# Register the combined, de-duplicated ids (including set 5) as a second
# non-production annotation dataset.
imageids_to_dataset(all_image_ids,
    dataset_name='vehicles_driving_through_dust_1_2_5',
    dataset_description=f"Sequences of vehicles driving through dust. ({len(all_image_ids)} images)",
    dataset_kind=Dataset.KIND_ANNOTATION,
    production_dataset=False
)

# Analysis for dust threshold

In [None]:
# Join the per-image dust ratios with the annotation table on `id`, parse the
# capture time and order the rows chronologically.
dust_df = (
    pd.read_csv('/data/jupiter/alex.li/results/vehicles_driving_in_dust/results_4class/dust_ratio.csv')
    .merge(df, on='id')
)
dust_df = dust_df.assign(
    collected_on=pd.to_datetime(dust_df['collected_on'])
).sort_values('collected_on')

In [None]:
def plot_seq(seq):
    """Scatter ``total_averaged_dust_conf`` over time, one series per camera.

    Draws on the current matplotlib axes. Cameras are plotted in sorted order
    so the colour assigned to each camera and the legend order are the same
    on every run.

    Args:
        seq: frame with ``camera_location``, ``collected_on`` and
            ``total_averaged_dust_conf`` columns.
    """
    # Sorted unique values instead of a set: set iteration order is not
    # stable between runs, which shuffled colours and legend entries.
    camera_locations = sorted(seq['camera_location'].dropna().unique())
    for camera_location in camera_locations:
        cam_seq = seq[seq['camera_location'] == camera_location]
        plt.scatter(cam_seq['collected_on'], cam_seq['total_averaged_dust_conf'], s=1, label=camera_location)
    plt.xlabel('collected_on')
    plt.ylabel('total_averaged_dust_conf')
    plt.legend()

In [None]:
# Overview: dust confidence for the whole merged table.
plot_seq(dust_df)

In [None]:
# Zoom: 2023-04-01 00:00 up to 2023-04-27 13:05 (both bounds exclusive).
lo = datetime.datetime(year=2023, month=4, day=1)
hi = datetime.datetime(year=2023, month=4, day=27, hour=13, minute=5)
in_window = (dust_df['collected_on'] > lo) & (dust_df['collected_on'] < hi)
plot_seq(dust_df[in_window])

In [None]:
# Zoom: 2023-05-04 01:00 up to 01:20 (both bounds exclusive).
lo = datetime.datetime(year=2023, month=5, day=4, hour=1)
hi = datetime.datetime(year=2023, month=5, day=4, hour=1, minute=20)
in_window = (dust_df['collected_on'] > lo) & (dust_df['collected_on'] < hi)
plot_seq(dust_df[in_window])

In [None]:
# Zoom: 2023-05-04 01:55 up to 2023-05-05 00:00 (both bounds exclusive).
lo = datetime.datetime(year=2023, month=5, day=4, hour=1, minute=55)
hi = datetime.datetime(year=2023, month=5, day=5)
in_window = (dust_df['collected_on'] > lo) & (dust_df['collected_on'] < hi)
plot_seq(dust_df[in_window])