In [1]:
import time
import json
import ast
import os
import datetime
import io
from collections import defaultdict
from tqdm import tqdm

import imageio
import boto3
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
from pathlib import Path

from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset
from timezonefinder import TimezoneFinderL
import pytz
import cv2
from brtdevkit.util.aws.s3 import S3
client = S3()

from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
from aletheia_dataset_creator.config.dataset_config import LEFT_CAMERAS, ALL_CAMERA_PAIRS_LIST
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option('display.max_rows', 500)
athena = AthenaClient()
s3 = boto3.resource('s3')
tf = TimezoneFinderL()
home = Path(os.path.expanduser('~'))
data_path = Path(os.environ['OUTPUT_PATH']) / "manny_2"

In [3]:
import os
from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
import random
from sklearn.cluster import KMeans
from brtdevkit.data import Dataset
from email.mime import image
import itertools
from pathlib import Path
import torch
import clip
from PIL import Image
from torch.utils.data import Dataset as torchDataset, DataLoader
import torchvision.transforms as T
import torch.nn as nn
from skimage.metrics import structural_similarity as ssim
from tqdm import tqdm
from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import yaml
from pathlib import Path 
from sklearn.decomposition import PCA
torch.cuda.empty_cache()

import imageio

class Datasetpreparer(torchDataset):    
        def __init__(self, data, preprocess=None, transform=None, device="cuda"):
            self.data = data
            self.transform =  transform 
            self.preprocess = preprocess
            self.device = device 
        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            img_path = self.data.loc[idx, 'image_path']
            try:
                img_pil = self.transform(Image.fromarray(imageio.imread(img_path)))
                # Download and open the image        
                img_pil = self.preprocess(img_pil).to(self.device)
            except Exception as e:
                print(e)
                img_path = self.data.loc[0, 'image_path']
                img_pil = self.transform(Image.fromarray(imageio.imread(img_path)))
                img_pil = self.preprocess(img_pil).to(self.device)
                return img_pil, img_path
            return img_pil, img_path

class ImageSimilarity: 
    def __init__(self, images_full_path, data_base_path, dataset_name='image_list', overwrite=False):
        self.data_base_path = data_base_path
        os.makedirs(self.data_base_path, exist_ok=True)
        self.images_base_path = data_base_path + "/images"
        self.images_full_path = images_full_path
        
        self.image_path_df = None 
        self.transform = T.Compose([
            T.Resize((224,224))])
        
        self.inference_set = None 
        self.embeddings_np = None
        self.image_paths = None 
        self.image_paths_df = None
        self.embeddings_save_name = f"{dataset_name}_embeddings.npz"
        self.embeddings_image_paths = Path(self.data_base_path) / f"{dataset_name}_embeddings_image_paths.csv"
        self.embeddings_save_loc = Path(self.data_base_path) / self.embeddings_save_name
        self.overwrite = overwrite 
        self.model = None 
        self.preprocessor = None 
        self.sorted_scores = None 
        self.sorted_scores_save_loc = data_base_path 
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_model()
        
    def prepare_images_path_df(self): 
        self.image_path_df = pd.DataFrame(data=self.images_full_path, columns=['image_path'])
        return self.image_path_df
    
    def prepare_dataloader(self): 
        self.inference_set = Datasetpreparer(data=self.image_path_df, preprocess=self.preprocessor, transform=self.transform, device=self.device)
        inference_loader = DataLoader(self.inference_set, batch_size=64, shuffle=False, num_workers=0, drop_last=True)
        self.total = len(self.image_path_df) / 64
        return inference_loader
 
    def load_model(self): 
        self.model, self.preprocessor = clip.load("ViT-B/32", device=self.device)
        return self.model, self.preprocessor

    def build_embeddings(self): 
        outputs = []
        image_paths = []
        if (os.path.isfile(self.embeddings_save_loc)==True) and (self.overwrite==False): 
            self.embeddings_np, self.image_paths = self.load_embeddings()
            return self.embeddings_np, self.image_paths 
        else:
            inference_loader = self.prepare_dataloader()
            for idx, batch in tqdm(enumerate(inference_loader), total=self.total): 
                batch, paths = batch[0], batch[1]
                with torch.no_grad():
                        batch = batch.to(self.device)
                        image_features = self.model.encode_image(batch)
                        outputs.append(image_features)  
                        image_paths.extend(paths)
            outputs = torch.cat(outputs, dim=0)
            self.image_paths = image_paths 
            self.embeddings_np = outputs.detach().cpu().numpy()
            return self.embeddings_np, self.image_paths
            
    def save_embeddings(self): 
            np.savez(self.embeddings_save_loc, self.embeddings_np) 
            image_paths_df = pd.DataFrame(data=self.image_paths, columns=['image_path'])
            image_paths_df.to_csv(self.embeddings_image_paths, index=False)
            return None 
        
    def load_embeddings(self): 
        embeddings_save_loc = str(self.embeddings_save_loc)
        embeddings = np.load(embeddings_save_loc)
        self.embeddings_np = embeddings[embeddings.files[0]]       
        self.image_paths_df= pd.read_csv(self.embeddings_image_paths )
        return self.embeddings_np, self.image_paths_df 

    def get_image_embedding(self, image_path): 
        transformed_image = self.transform(Image.open(image_path))
        img = self.preprocessor(transformed_image).unsqueeze(0).to(self.device)
        embedding = self.model.encode_image(img)
        embeddings_np = embedding.detach().cpu().numpy()
        return embeddings_np
    
    def get_similar_images(self, image_path, calcuate_embeddings=False, images_base_path=None): 
        if images_base_path==None:
            images_base_path = self.images_base_path
        full_image_path = os.path.join(images_base_path, image_path)
        embeddings_np, image_paths_df = self.load_embeddings()
        image_paths = image_paths_df['image_path'].tolist()
        if calcuate_embeddings==True:
            reference_embedding = self.get_image_embedding(full_image_path)
        else :
            embedding_index = image_paths_df[image_paths_df['image_path'] == full_image_path].index[0]
            reference_embedding = embeddings_np[embedding_index, :]
        
        similarity_score = np.dot(reference_embedding, embeddings_np.T)/(np.linalg.norm(reference_embedding)*np.linalg.norm( embeddings_np, axis=1))
        score_df = pd.DataFrame(data=list(similarity_score.T), columns=['similarity_score'])
        score_df['image_path'] = image_paths 
        self.sorted_scores = score_df.sort_values(by='similarity_score', ascending=False)
        image_name = image_path.split("/")[-1].replace(".png","")
        save_loc_sorted_scores = Path(self.sorted_scores_save_loc) / f"{image_name}.csv"
        self.sorted_scores.to_csv(save_loc_sorted_scores, index=False)
        return self.sorted_scores
    
    def similar_images_with_text(self, text_prompt):
        
        embeddings_np, image_paths_df = self.load_embeddings()
        image_paths = image_paths_df['image_path'].tolist()
        text = clip.tokenize(text_prompt).to(self.device)
        text_features = self.model.encode_text(text)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        text_features = text_features.detach().cpu().numpy()
        reference_embedding = text_features
            
        similarity_score = np.dot(reference_embedding, embeddings_np.T)/(np.linalg.norm(reference_embedding)*np.linalg.norm( embeddings_np, axis=1))
        score_df = pd.DataFrame(data=list(similarity_score.T), columns=['similarity_score'])
        score_df['image_path'] = image_paths 
        self.sorted_scores = score_df.sort_values(by='similarity_score', ascending=False)
        image_name = text_prompt.replace(" ","_")
        save_loc_sorted_scores = Path(self.sorted_scores_save_loc) / f"{image_name}.csv"
        self.sorted_scores.to_csv(save_loc_sorted_scores, index=False)
        return self.sorted_scores
        
        
    def plot_image_grid(self, image_paths_df, tail=0  ,nrows=5, ncols=2):
        
        # Assume image_paths is your list of image file paths
        if tail > 0:
            image_paths_df = image_paths_df.tail(tail)
        image_paths = image_paths_df['image_path'].tolist()
        n_rows = min(nrows, int(image_paths_df.shape[0] // 2 ))
        fig, axes = plt.subplots(n_rows, ncols, figsize=(10, nrows*5))
        for i, ax in enumerate(axes.flatten()):
            if i < len(image_paths)-1:
                image_path = image_paths[i]
                title = image_path.split("/")[-1]
                img = Image.open(image_path)
                ax.imshow(img)
                ax.set_title(f'{title}  indx: {i}', fontsize=6)
            ax.axis('off')

        plt.tight_layout()
        plt.show()


    def extract_embeddings(self): 
        self.prepare_images_path_df()
        self.prepare_dataloader()
        self.build_embeddings()
        self.save_embeddings()
        return None
    
def get_images(save_dir, df):
    save_dir = Path(save_dir)
    dirs = [save_dir / d for d in df.id]
    image_paths = []
    for i, dir in tqdm(enumerate(dirs), total=len(dirs)):
        if i % 1500 == 0:
            print(100 * i / len(dirs))
        for fname in os.listdir(dir):
            if 'debayeredrgb' in fname and fname.endswith('.png'):
                image_paths.append(str(object=dir / fname))
    return image_paths

In [5]:
dsetname = "dynamic_manny_in_dust_raw"
aletheia_ds = Dataset.retrieve(name=dsetname)
aletheia_df = aletheia_ds.to_dataframe()
n_images_final = 12000
kind = Dataset.KIND_IMAGE
label_data = Dataset.retrieve(name="dynamic_manny_in_dust_raw_diverse_4000").to_dataframe()
label_data_had_human = Dataset.retrieve(name="mannequin_in_dust_v1").to_dataframe()
label_data_no_human = list(set(label_data['id']) - set(label_data_had_human['id']))
dataset_save_dir = os.environ['DATASET_PATH'] + "/" + dsetname
save_file = Path(dataset_save_dir) / "image_ids.npy"
image_paths = np.load(save_file).tolist()

In [6]:
label_data

Unnamed: 0,artifact_debayeredrgb_0__id,artifact_debayeredrgb_0_content_hash,artifact_debayeredrgb_0_created_at,artifact_debayeredrgb_0_data_category,artifact_debayeredrgb_0_id,artifact_debayeredrgb_0_image,artifact_debayeredrgb_0_kind,artifact_debayeredrgb_0_project_name,artifact_debayeredrgb_0_s3_bucket,artifact_debayeredrgb_0_s3_key,...,teleop_response,terrain_type,updated_at,vadc_estimated_curvature,vehicle_bearing,vpu_position,was_compressed,water_management,weather,weather_summary
0,64ee42a42e655e79fbf5d0b8,353bf77d01bd75641976a1e34c3faa76,2023-08-29T19:10:28.343000,eng,64ee42a42e655e79fbf5d0b8,64ee42a42e655e79fbf5d0b7,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/08/29/e118a5b5c1cc42d5852f26fedaee6a73-da...,...,,openfield,2023-08-29T19:10:29.036000,,,1,False,none,sunny,"{'datetime': '21:00:00', 'datetimeEpoch': 1693..."
1,64ee42a956cb49f3dcc2f1a9,1c95feb853be38ed833ad64c89ad7de6,2023-08-29T19:10:33.277000,eng,64ee42a956cb49f3dcc2f1a9,64ee42a856cb49f3dcc2f1a7,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/08/29/9274de7c81d74fa68d750b8bdedf6da2-da...,...,,openfield,2023-08-29T19:10:33.604000,,,1,False,none,sunny,"{'datetime': '21:00:00', 'datetimeEpoch': 1693..."
2,64ee42af2fea9c788d7152e1,087846d7872b4cb4d3e5e0f0c5ff59e7,2023-08-29T19:10:39.361000,eng,64ee42af2fea9c788d7152e1,64ee42af2fea9c788d7152e0,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/08/29/d7b0cca48dbf46c2b11aad22bf625ee9-da...,...,,openfield,2023-08-29T19:10:40.042000,,,1,False,none,sunny,"{'datetime': '21:00:00', 'datetimeEpoch': 1693..."
3,64ee42b0e2b5d7468e2750b5,84ff622a07182cc4fc6330c55cc95856,2023-08-29T19:10:40.930000,eng,64ee42b0e2b5d7468e2750b5,64ee42b0e2b5d7468e2750b4,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/08/29/4baaa95715e446368ed5fc90b8711127-da...,...,,openfield,2023-08-29T19:10:41.778000,,,1,False,none,sunny,"{'datetime': '21:00:00', 'datetimeEpoch': 1693..."
4,64ee42d4f7f4625589492dbe,7789b942e6e2bf29c2c350eff359d95f,2023-08-29T19:11:16.708000,eng,64ee42d4f7f4625589492dbe,64ee42d4f7f4625589492dbd,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/08/29/7818d7faee014ce189102471c107bf96-da...,...,,openfield,2023-08-29T19:11:17.567000,,,1,False,none,sunny,"{'datetime': '21:00:00', 'datetimeEpoch': 1693..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6423,64f8ba9dd07ef104045d93a9,87c97498b5742869b50c38f1f787ca8a,2023-09-06T17:45:01.098000,eng,64f8ba9dd07ef104045d93a9,64f8ba9dd07ef104045d93a8,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/09/06/257e14e05111454ca5e2196498bc55ae-da...,...,,openfield,2023-09-06T17:45:01.933000,{'bag_name': 'JUPD-001_2023-8-31__08_31_2023-0...,112.820312,0,False,none,sunny,"{'datetime': '04:00:00', 'datetimeEpoch': 1693..."
6424,64f8bac3db61ee654227d22f,adbe88d2c9ab623b86f6004f4efed019,2023-09-06T17:45:39.990000,eng,64f8bac3db61ee654227d22f,64f8bac3db61ee654227d22e,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/09/06/6065f06334b0442d862e14279907ddf4-da...,...,,openfield,2023-09-06T17:45:40.953000,{'bag_name': 'JUPD-001_2023-8-31__08_31_2023-0...,111.804688,0,False,none,sunny,"{'datetime': '04:00:00', 'datetimeEpoch': 1693..."
6425,64f8bb06e13fe93adb6e738c,1e8fc7a855cf59b0245b1d1fb55c6ddf,2023-09-06T17:46:46.352000,eng,64f8bb06e13fe93adb6e738c,64f8bb06e13fe93adb6e738b,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/09/06/8c11cadfeeaa40dba5b79412ef8b9383-da...,...,,openfield,2023-09-06T17:46:47.204000,{'bag_name': 'JUPD-001_2023-8-31__08_31_2023-0...,121.960938,0,False,none,sunny,"{'datetime': '04:00:00', 'datetimeEpoch': 1693..."
6426,64f8c07738f9481299e68808,08c595a6009ae9ce6dc703ddca3f7d83,2023-09-06T18:09:59.544000,eng,64f8c07738f9481299e68808,64f8c07738f9481299e68807,debayeredrgb,jupiter,brt-mesa-jupiter-images-eng,2023/09/06/349a4164f4874f68818c892a64e05db6-da...,...,,openfield,2023-09-06T18:10:00.119000,{'bag_name': 'JUPD-001_2023-8-31__08_31_2023-0...,136.218750,0,False,none,sunny,"{'datetime': '04:00:00', 'datetimeEpoch': 1693..."


In [17]:
# aletheia_df = aletheia_df[aletheia_df['camera_location'].str.endswith('left')]
# image_paths = list(get_images(save_dir=dataset_save_dir + '/images', df=aletheia_df))
# image_paths = np.load(save_file).tolist()
# image_paths = [p for p in image_paths if p.endswith('.png')]
# np.save(save_file, image_paths)

  0%|          | 76/75706 [00:00<01:40, 752.09it/s]

0.0


  2%|▏         | 1658/75706 [00:01<01:22, 902.79it/s]

1.9813489023327082


  4%|▍         | 3105/75706 [00:03<01:20, 904.06it/s]

3.9626978046654164


  6%|▌         | 4529/75706 [00:04<01:08, 1045.82it/s]

5.944046706998124


  8%|▊         | 6142/75706 [00:06<01:34, 737.81it/s] 

7.925395609330833


 10%|█         | 7609/75706 [00:08<01:08, 991.25it/s]

9.90674451166354


 12%|█▏        | 9123/75706 [00:10<01:06, 1004.00it/s]

11.888093413996248


 14%|█▍        | 10598/75706 [00:11<01:08, 954.13it/s]

13.869442316328957


 16%|█▌        | 12178/75706 [00:14<01:06, 951.54it/s]

15.850791218661666


 18%|█▊        | 13616/75706 [00:15<01:07, 913.34it/s]

17.832140120994374


 20%|█▉        | 15124/75706 [00:17<01:12, 832.66it/s]

19.81348902332708


 22%|██▏       | 16652/75706 [00:19<01:04, 912.65it/s]

21.794837925659788


 24%|██▍       | 18107/75706 [00:21<01:18, 738.38it/s]

23.776186827992497


 26%|██▌       | 19609/75706 [00:22<01:06, 839.06it/s]

25.757535730325205


 28%|██▊       | 21151/75706 [00:24<01:03, 857.58it/s]

27.738884632657914


 30%|██▉       | 22639/75706 [00:26<01:00, 876.50it/s]

29.720233534990623


 32%|███▏      | 24180/75706 [00:28<00:51, 1006.69it/s]

31.70158243732333


 34%|███▍      | 25654/75706 [00:30<00:58, 849.10it/s] 

33.68293133965604


 36%|███▌      | 27169/75706 [00:32<00:52, 926.09it/s]

35.66428024198875


 38%|███▊      | 28682/75706 [00:33<00:50, 934.76it/s]

37.645629144321454


 40%|███▉      | 30131/75706 [00:35<00:51, 891.43it/s] 

39.62697804665416


 42%|████▏     | 31566/75706 [00:37<00:51, 862.32it/s]

41.60832694898687


 44%|████▍     | 33125/75706 [00:39<00:44, 960.86it/s]

43.589675851319576


 46%|████▌     | 34511/75706 [00:40<01:04, 640.34it/s] 

45.57102475365229


 48%|████▊     | 36137/75706 [00:43<00:45, 865.26it/s]

47.552373655984994


 50%|████▉     | 37632/75706 [00:44<00:44, 862.53it/s]

49.533722558317706


 52%|█████▏    | 39194/75706 [00:46<00:42, 863.38it/s]

51.51507146065041


 54%|█████▎    | 40657/75706 [00:48<00:48, 715.91it/s]

53.496420362983116


 56%|█████▌    | 42110/75706 [00:50<00:38, 876.67it/s]

55.47776926531583


 58%|█████▊    | 43625/75706 [00:52<00:33, 968.76it/s]

57.45911816764853


 60%|█████▉    | 45113/75706 [00:54<00:42, 724.76it/s] 

59.440467069981246


 62%|██████▏   | 46661/75706 [00:56<00:35, 811.93it/s]

61.42181597231395


 64%|██████▎   | 48136/75706 [00:58<00:29, 944.15it/s]

63.40316487464666


 66%|██████▌   | 49621/75706 [00:59<00:28, 923.70it/s] 

65.38451377697936


 68%|██████▊   | 51201/75706 [01:01<00:25, 980.02it/s]

67.36586267931207


 69%|██████▉   | 52610/75706 [01:03<00:23, 992.66it/s] 

69.34721158164479


 72%|███████▏  | 54135/75706 [01:04<00:25, 840.70it/s] 

71.3285604839775


 74%|███████▎  | 55654/75706 [01:07<00:24, 828.50it/s]

73.3099093863102


 76%|███████▌  | 57169/75706 [01:08<00:19, 952.94it/s]

75.29125828864291


 77%|███████▋  | 58655/75706 [01:10<00:18, 938.50it/s]

77.27260719097562


 79%|███████▉  | 60010/75706 [01:12<00:25, 625.51it/s]

79.25395609330832


 81%|████████▏ | 61627/75706 [01:14<00:15, 924.32it/s]

81.23530499564103


 83%|████████▎ | 63173/75706 [01:15<00:13, 959.51it/s]

83.21665389797374


 85%|████████▌ | 64632/75706 [01:17<00:12, 876.11it/s]

85.19800280030645


 87%|████████▋ | 66119/75706 [01:19<00:10, 884.44it/s]

87.17935170263915


 89%|████████▉ | 67611/75706 [01:21<00:09, 841.33it/s]

89.16070060497186


 91%|█████████▏| 69099/75706 [01:23<00:08, 817.05it/s] 

91.14204950730458


 93%|█████████▎| 70499/75706 [01:25<00:09, 560.87it/s]

93.12339840963728


 95%|█████████▌| 72117/75706 [01:27<00:04, 848.56it/s]

95.10474731196999


 97%|█████████▋| 73689/75706 [01:29<00:02, 985.15it/s]

97.0860962143027


 99%|█████████▉| 75137/75706 [01:30<00:00, 909.40it/s]

99.06744511663541


100%|██████████| 75706/75706 [01:31<00:00, 824.78it/s]


In [20]:
sim = ImageSimilarity(images_full_path=image_paths, data_base_path=dataset_save_dir, dataset_name=dsetname, overwrite=True)
sim.extract_embeddings()
embeddings_np, paths_df = sim.load_embeddings()


  img_pil = self.transform(Image.fromarray(imageio.imread(img_path)))
  0%|          | 5/2365.671875 [00:20<2:39:02,  4.04s/it]


KeyboardInterrupt: 

In [10]:
def get_id(p):
    return p.split('/')[-2]
paths_df['id'] = paths_df['image_path'].apply(get_id)
paths_df['known_no_human'] = paths_df['id'].isin(label_data_no_human)
invalid_distances = []
for i in range(len(paths_df)):
    if paths_df['known_no_human'].iloc[i]:
        invalid_distances.append(embeddings_np[i])
invalid_distances_np = np.stack(arrays=invalid_distances,axis=0)

In [None]:
kmeans = KMeans(n_clusters=n_images_final, random_state=0, n_init="auto", max_iter=200)
kmeans.fit(embeddings_np)
final_paths = [None for _ in range(n_images_final)]

In [13]:
print("Choosing one random image from each cluster")
order = list(enumerate(kmeans.labels_))
random.shuffle(order)
for i, l in order:
    if final_paths[l] == None:
        if np.min(np.linalg.norm(embeddings_np[i] - invalid_distances_np, axis=1)) > 1.25:
            final_paths[l] = paths_df.image_path.iloc[i]
imids = [p.split('_')[-1][:-4] for p in final_paths if p is not None]
print(len(imids))
# Average distance from each mean, if this value is higher, than the images probably quite different than each other
# Hard to interpret without context, I selected 4k groups from 150k images collected in one bag and got about 1.6
score = kmeans.inertia_ / len(embeddings_np)
print(f"KMEANS SCORE: {score}")

Choosing one random image from each cluster


AttributeError: 'KMeans' object has no attribute 'labels_'

In [None]:
from aletheia_dataset_creator.dataset_tools.aletheia_dataset_helpers import imageids_to_dataset
imageids_to_dataset(image_ids=imids, dataset_name=f"{dsetname}_diverse_{n_images_final}_2", dataset_description=desc, dataset_kind=kind, production_dataset=False)

In [7]:
new_data = Dataset.retrieve(name="dynamic_manny_in_dust_raw_diverse_4000")
old_data= Dataset.retrieve(name="mannequin_in_dust_v0_diverse_1550")
new_df = new_data.to_dataframe()
old_df = old_data.to_dataframe()
new_df = new_df[new_df['camera_location'].str.endswith('left')]
nonhuman_ids = []

imageids_to_dataset(list(new_df['id']), "mannequin_in_dust_v1", "mannequins standing in dusty conditions (2k images)", Dataset.KIND_ANNOTATION, production_dataset=False)


    SELECT
        image_jupiter.id AS id, image_jupiter.camera_location AS camera_location, image_jupiter.group_id AS group_id, annotation_jupiter.id AS annotation_id, annotation_jupiter.updated_at AS annotation_updated_at, annotation_jupiter.label_map__json AS annotation_label_map__json, annotation_jupiter.vendor_metadata__json AS annotation_vendor_metadata__json
    FROM image_jupiter
    JOIN annotation_jupiter
        ON image_jupiter.id =
            annotation_jupiter.image
    WHERE
        image_jupiter.camera_location IN ('front-center-left', 'front-left-left', 'front-right-left', 'side-left-left', 'side-right-left', 'rear-left', 'T01', 'T02', 'T05', 'T06', 'T09', 'T10', 'T13', 'T14', 'I01', 'I03', 'I05', 'I07')
        AND annotation_jupiter.s3_key IS NOT NULL
        AND annotation_jupiter.state IN ('ok', 'review')
        AND
     image_jupiter.id IN ('64ee42a42e655e79fbf5d0b7', '64ee42b0e2b5d7468e2750b4', '64ee42dd8db25bf27d074f47', '64ee42f72e655e79fbf5d1b6', '64ee43433

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Total annotations retrieved from athena 716
Removed annotataions in state SKIP, 716 left
Removed duplicate annotations for the same image (choosing latest), 711 left
Preparing stereo dataframe...
Size of left dataframe: 711
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

        SELECT image_jupiter.id AS id, image_jupiter.camera_location AS camera_location, image_jupiter.group_id AS group_id
        FROM image_jupiter
        WHERE
            image_jupiter.camera_location =
                'front-center-right' AND
         image_jupiter.group_id IN ('459105fc713d4a56badf8ed536c63bcf', 'ef992c1308394b3c97611e4

KeyboardInterrupt: 

In [5]:
df = new_data.to_dataframe()

In [10]:
old_id = old_data.to_dataframe()['id']
print(len(old_id))

3100


In [12]:
all_id = list(pd.concat([new_id, old_id]))

In [13]:
imageids_to_dataset(list(all_id), "mannequin_in_dust_v1", "mannequins standing in dusty conditions (2k images)", Dataset.KIND_ANNOTATION, production_dataset=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Total annotations retrieved from athena 2302
Removed annotataions in state SKIP, 2302 left
Removed duplicate annotations for the same image (choosing latest), 2261 left
Preparing stereo dataframe...
Size of left dataframe: 2261
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Size of stereo dataframe: 2261
Sending 2261 annotated_ids for creating dataset
Time taken to prepare data for dataset creation job: 2.01 mins
