In [1]:
import time
import json
import ast
import datetime
import io
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import cv2
import matplotlib.dates as mdates
import imageio
import boto3
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
import seaborn as sns

from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from timezonefinder import TimezoneFinderL
import pytz

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset#, imageids_to_dataset_fast
from aletheia_dataset_creator.config.dataset_config import LEFT_CAMERAS, ALL_CAMERA_PAIRS_LIST
%matplotlib inline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Allow up to 500 rows to be shown when a frame is displayed.
pd.options.display.max_rows = 500

In [3]:
# Shared handles used by later cells.
tf = TimezoneFinderL()        # timezone lookup from latitude/longitude
s3 = boto3.resource('s3')     # S3 resource handle
athena = AthenaClient()       # runs the image metadata queries

In [4]:
def get_calibration(x):
    """Parse text holding a Python literal into the object it describes.

    Args:
        x: Text containing a Python literal (for example the repr of a dict).

    Returns:
        The evaluated literal, or an empty dict when ``x`` cannot be parsed.
    """
    try:
        return ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures ast.literal_eval documents count as "unparseable".
        # A bare `except:` would also swallow KeyboardInterrupt / SystemExit.
        return {}
def get_adjusted_timezone(timestamp, latitude, longitude):
    """Convert a naive UTC timestamp to the local time at a GPS position.

    Note: a later cell redefines ``get_adjusted_timezone`` with a single
    DataFrame-row argument, which replaces this version once it runs.

    Args:
        timestamp: Naive timestamp interpreted as UTC. Anything accepted by
            ``pd.to_datetime`` works (``np.datetime64``, ``datetime``,
            ``pd.Timestamp``).
        latitude: Latitude passed to ``tf.timezone_at``.
        longitude: Longitude passed to ``tf.timezone_at``.

    Returns:
        A ``datetime`` converted to the timezone found by ``tf``, or
        ``np.nan`` when either coordinate is missing or equal to 0.
    """
    # Missing coordinates cannot be looked up; treat them like the zero sentinel.
    if pd.isna(latitude) or pd.isna(longitude):
        return np.nan
    # NOTE(review): a single zero coordinate is rejected, not only (0, 0) — confirm intent.
    if (latitude == 0) or (longitude == 0):
        return np.nan

    # Always coerce to a pandas Timestamp: the string-timezone astimezone() and
    # to_pydatetime() calls below exist on Timestamp, not on a plain datetime.
    timestamp = pd.to_datetime(timestamp)

    # Localize and adjust UTC timestamps to local timezone
    utc = pytz.utc.localize(timestamp)
    # NOTE(review): if no zone is found `tz` may be None; the conversion below is then
    # presumably a plain UTC value — confirm whether that is acceptable.
    tz = tf.timezone_at(lat=latitude, lng=longitude)
    adjusted_timestamp = utc.astimezone(tz).to_pydatetime()

    return adjusted_timestamp


# Selecting data

In [5]:
# Load the hitchhiker image metadata from a local parquet cache, falling back to an
# Athena query (and refreshing the cache) when the file does not exist.
# NOTE(review): `home` was not assigned in any cell above, so this cell raised NameError
# on a fresh kernel. The user's home directory is assumed here — confirm the location.
home = str(Path.home())
hh_cache_path = home + '/workspace/hh_df.parquet'

try:
    halo_df = pd.read_parquet(hh_cache_path)
except FileNotFoundError:
    print("file not found")
    query = f"""
    SELECT id, hard_drive_name, robot_name, collected_on,
        bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, sensor_type, created_at, 
        bundle, gps_can_data__json, weather_summary__json, group_id
    FROM image_jupiter 
    WHERE LENGTH(robot_name) = 14 AND robot_name LIKE 'hitchhiker_1%'
    """
    # Disabled extra filter: --AND camera_location IN {left_tractor_cameras}
    start = time.time()
    halo_df = athena.get_df(query)
    end = time.time()
    # Elapsed query time in seconds.
    print(end - start)
    # Create the cache directory first so the query result is not lost
    # to a missing-folder error when writing.
    Path(hh_cache_path).parent.mkdir(parents=True, exist_ok=True)
    halo_df.to_parquet(hh_cache_path, index=False)

In [6]:
# Number of rows loaded.
print(halo_df.shape[0])

4524864


In [7]:
# try:
#     rev1_df = pd.read_parquet('/home/alexli/data/all_hitchiker_images/rev1_df.parquet')
# except FileNotFoundError:
#     print("file not found")
#     query = f"""
#     SELECT id, hard_drive_name, robot_name, collected_on,
#         bag_name, operating_field_name, operation_time, latitude, longitude, geohash, camera_location, sensor_type, created_at, 
#         bundle, gps_can_data__json, weather_summary__json, group_id
#     FROM image_jupiter 
#     WHERE LENGTH(robot_name) = 9 AND SUBSTR(robot_name, 7, 9) IN ('619', '646', '708', '710', '733', '735', '750', '756', '768', '799', '812', '817', '842', '869', '872', '909') AND "collected_on" BETWEEN TIMESTAMP'2023-03-15 0:00:00' AND TIMESTAMP'2023-05-15 0:00:00'
#     """
# #--AND camera_location IN {left_tractor_cameras}
#     start = time.time()
#     rev1_df = athena.get_df(query)
#     end = time.time()
#     print(end - start)
#     rev1_df.to_parquet('/home/alexli/data/all_hitchiker_images/rev1_df.parquet', index=False)

In [8]:
# full_df = pd.concat([rev1_df, halo_df], ignore_index=True)
# Work on an independent copy of halo_df with the invalid-GPS geohash rows dropped.
full_df = halo_df[halo_df['geohash'] != '7zzzzzzzzzzz'].copy()
print(full_df.shape)
# # drop no speed data
def valid_speed(x):
    """Tell whether a GPS/CAN JSON payload carries a plausible speed reading.

    Args:
        x: JSON text expected to decode to an object with a ``speed`` entry.

    Returns:
        True only when the payload decodes to a dict whose ``speed`` is a
        number strictly between -200 and 200. Missing, null, NaN, non-numeric
        or unparseable payloads yield False instead of raising.
    """
    try:
        load = json.loads(x)
    except (TypeError, ValueError):
        # TypeError: x is not str/bytes (e.g. None); ValueError: malformed JSON.
        return False
    if not isinstance(load, dict):
        return False
    speed = load.get('speed')
    if not isinstance(speed, (int, float)):
        return False
    # NaN fails both comparisons, so it is rejected here as well.
    return 200 > speed > -200
# Drop rows without a usable speed, report the new shape, then expose the speed as a column.
full_df = full_df[full_df['gps_can_data__json'].apply(valid_speed)]
print(full_df.shape)
full_df['speed_kph'] = full_df['gps_can_data__json'].map(
    lambda raw: json.loads(raw)['speed']
)

(4480035, 17)
(4479965, 17)


In [9]:
# Re-creates the timezone lookup object that an earlier cell already assigned to `tf`.
tf = TimezoneFinderL()
def get_adjusted_timezone(df_row):
    """Convert a row's UTC ``collected_on`` value to local time at its GPS position.

    This redefinition takes a whole row and replaces the three-argument
    function of the same name defined in an earlier cell.

    Args:
        df_row: Mapping/row providing ``collected_on``, ``latitude`` and
            ``longitude``.

    Returns:
        A ``datetime`` converted to the timezone found by ``tf``, or
        ``np.nan`` when either coordinate is missing or equal to 0.
    """
    latitude, longitude = df_row['latitude'], df_row['longitude']
    # Missing coordinates cannot be looked up; treat them like the zero sentinel.
    if pd.isna(latitude) or pd.isna(longitude):
        return np.nan
    # NOTE(review): a single zero coordinate is rejected, not only (0, 0) — confirm intent.
    if (latitude == 0) or (longitude == 0):
        return np.nan

    # pd.to_datetime already yields a pandas Timestamp for a scalar, so the former
    # np.datetime64 re-check could never trigger and has been removed.
    timestamp = pd.to_datetime(df_row['collected_on'])

    # Localize and adjust UTC timestamps to local timezone
    utc = pytz.utc.localize(timestamp)
    # NOTE(review): if no zone is found `tz` may be None; the conversion below is then
    # presumably a plain UTC value — confirm whether that is acceptable.
    tz = tf.timezone_at(lat=latitude, lng=longitude)
    adjusted_timestamp = utc.astimezone(tz).to_pydatetime()

    return adjusted_timestamp
# Row-wise pass (axis=1): calls get_adjusted_timezone once per row and stores the
# returned value in a new 'collected_on_localtime' column.
full_df['collected_on_localtime'] = full_df.apply(get_adjusted_timezone, axis=1)
def get_day(t):
    """Format a datetime as a zero-padded ``MM/DD`` string.

    Args:
        t: Object with ``strftime`` (e.g. ``datetime``), or a missing value.

    Returns:
        The ``"%m/%d"`` string, or ``np.nan`` when ``t`` is missing. The
        timezone conversion returns NaN for rows with invalid GPS, which
        would otherwise raise AttributeError here.
    """
    if pd.isna(t):
        return np.nan
    return t.strftime("%m/%d")
def get_second(t):
    """Format a datetime as a zero-padded ``HH:MM:SS`` string.

    Args:
        t: Object with ``strftime`` (e.g. ``datetime``), or a missing value.

    Returns:
        The ``"%H:%M:%S"`` string, or ``np.nan`` when ``t`` is missing. The
        timezone conversion returns NaN for rows with invalid GPS, which
        would otherwise raise AttributeError here.
    """
    if pd.isna(t):
        return np.nan
    return t.strftime("%H:%M:%S")
# Per-row display strings taken from the localized timestamp: month/day and clock time.
full_df['daystr'] = full_df['collected_on_localtime'].map(get_day)
full_df['secstr'] = full_df['collected_on_localtime'].map(get_second)

In [10]:
# Two-way lookup: every key/value pair found in ALL_CAMERA_PAIRS_LIST is registered
# in both directions (key -> value and value -> key).
bidirectional_dict = {}
for pair_dict in ALL_CAMERA_PAIRS_LIST:
    for k, v in pair_dict.items():
        bidirectional_dict.update([(k, v), (v, k)])

def make_dataset(from_df, name, description, pairs=[bidirectional_dict],
                 output_dir='/home/alexli/data/all_hitchiker_images') -> None:
    """Save a frame as parquet and register its image ids as an image Dataset.

    Args:
        from_df: DataFrame with an ``id`` column holding the image ids.
        name: Dataset name; also used as the parquet file stem.
        description: Free text; the image count is appended to it.
        pairs: Currently unused. Only the disabled ``imageids_to_dataset_fast``
            call below refers to it; kept so existing callers keep working.
        output_dir: Directory that receives ``{name}.parquet``. Defaults to
            the location that used to be hard-coded.
    """
    imids = list(from_df['id'])
    from_df.to_parquet(f'{output_dir}/{name}.parquet', index=False)
    desc = f"{description} ({len(imids)} images)"
    # imageids_to_dataset_fast(from_df, name, desc,
    #                          camera_pairs_list=pairs, camera_pair_df=df)
    Dataset.create(
        name=name,
        description=desc,
        kind=Dataset.KIND_IMAGE,
        image_ids=imids,
    )

def make_dataset_slow(from_df, name, description,
                      output_dir='/home/alexli/data/all_hitchiker_images') -> None:
    """Save a frame as parquet and build an image dataset via ``imageids_to_dataset``.

    Args:
        from_df: DataFrame with an ``id`` column holding the image ids.
        name: Dataset name; also used as the parquet file stem.
        description: Free text; the image count is appended to it.
        output_dir: Directory that receives ``{name}.parquet``. Defaults to
            the location that used to be hard-coded.
    """
    imids = list(from_df['id'])
    desc = f"{description} ({len(imids)} images)"
    print(len(imids))
    from_df.to_parquet(f'{output_dir}/{name}.parquet', index=False)
    imageids_to_dataset(imids, name, dataset_kind='image',
                        dataset_description=desc)


In [11]:
# Keeps at most the first 6 rows of every (robot_name, camera_location, minute) group.
# NOTE(review): `df` is not assigned in any cell above and the recorded output of this
# cell is "NameError: name 'df' is not defined". No visible cell creates a 'minute'
# column either. Presumably this was meant to run on `full_df` — confirm before re-running.
stratified_df = df.groupby(['robot_name', 'camera_location', 'minute']).head(6)

NameError: name 'df' is not defined

In [None]:
# Mapping from each (robot_name, camera_location, daystr) key to the index labels of its rows.
# NOTE(review): `df` is not assigned in any cell above; the recorded output of this cell
# is "NameError: name 'df' is not defined". Presumably `full_df` was intended — confirm.
df_groups = df.groupby(['robot_name', 'camera_location', 'daystr']).groups

NameError: name 'df' is not defined

In [None]:
# Collects the image ids of `smudge_df` and prints how many there are.
# NOTE(review): `smudge_df` is not created in any visible cell, so this cell depends on
# kernel state from elsewhere — confirm where it is built.
imids = list(smudge_df['id'])
print(len(imids))
# imageids_to_dataset(imids, "hitchhiker_smudge", "Hitchhiker images on smudgy days.", dataset_kind='image', mode='stereo')

70489


In [None]:
# Rows of the first SMUDGE_DAYS group, in chronological (local time) order.
# GroupBy.groups stores index *labels*, so the rows are selected with .loc; .iloc would
# treat the labels as positions and pick wrong rows whenever the index is not 0..n-1.
# NOTE(review): `df`, `df_groups` and `SMUDGE_DAYS` are not defined in any visible cell
# (recorded output: NameError) — confirm where they come from.
movie = df.loc[df_groups[SMUDGE_DAYS[0]]].sort_values('collected_on_localtime')

NameError: name 'df_groups' is not defined

In [None]:
# Contact sheet of one group's frames: five images per row, in chronological order,
# each titled with its local clock time.
# NOTE(review): `df`, `df_groups`, `SMUDGE_DAYS` and `data_dir` are not defined in any
# visible cell, and 'artifact_debayeredrgb_0_save_path' is not among the columns selected
# by the query above — confirm where they come from.

# GroupBy.groups stores index labels, so select with .loc (not positional .iloc).
movie = df.loc[df_groups[SMUDGE_DAYS[0]]].sort_values('collected_on_localtime')
ncols = len(movie)  # number of frames to draw (name kept from the original cell)
panels_per_row = 5
nrows = max(1, (ncols + panels_per_row - 1) // panels_per_row)
# squeeze=False keeps `ax` two-dimensional even when there is only one row.
fig, ax = plt.subplots(nrows, panels_per_row, figsize=(16, nrows * 4), squeeze=False)
for j, panel in enumerate(ax.flat):
    if j >= ncols:
        # Unused trailing panels stay blank.
        panel.axis('off')
        continue
    # Frame j goes in panel j (the former `j - 1` index put the last frame first).
    df_row = movie.iloc[j]
    im = cv2.imread(str(Path(data_dir) / df_row['artifact_debayeredrgb_0_save_path']))
    if im is None:
        # cv2.imread returns None when the file is missing or unreadable.
        panel.set_title(f"{df_row['secstr']} (image not readable)")
        panel.axis('off')
        continue
    # OpenCV loads channels as BGR; matplotlib expects RGB.
    panel.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
    panel.set_title(df_row['secstr'])