In [1]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

import os
import numpy as np
from progressbar import progressbar as pbar

from omegaconf import OmegaConf
import hydra

%load_ext autoreload
%autoreload 2

Check that the neighbor files (file names without their extension) are a subset of the original chip IDs.

In [2]:
# Read the chips parquet file and keep its `original_chip_id` column as the
# reference set of IDs that the neighbor files are checked against.
chips_parquet_path = '/opt/data/california-naip-chips/california-naip-chips-100k.parquet'
df = pd.read_parquet(chips_parquet_path)
original_chip_IDs = df['original_chip_id']

In [3]:
# Collect the names of the regular files in the neighbours folder, strip the
# `.parquet` suffix, and verify that every resulting name is a known chip ID.
folder = "/opt/data/california-naip-chips/california-naip-chips-100k-neighbours"
chip_ids_on_disk = []
for entry_name in os.listdir(folder):
    # skip anything that is not a regular file (e.g. the `npy` sub-folder)
    if not os.path.isfile(os.path.join(folder, entry_name)):
        continue
    chip_ids_on_disk.append(entry_name.removesuffix('.parquet'))
files = pd.Series(chip_ids_on_disk)
files.isin(original_chip_IDs).all()

True

In [4]:
# Same subset check as for the parquet files, this time on the `.npy` files.
folder = "/opt/data/california-naip-chips/california-naip-chips-100k-neighbours/npy"
npy_file_names = [
    name
    for name in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, name))  # regular files only
]
files = pd.Series([name.removesuffix('.npy') for name in npy_file_names])
files.isin(original_chip_IDs).all()

True

---

Load the configuration and instantiate the dataloader.

In [5]:
# Load the experiment configuration, override the dataloader settings needed
# for this check, and build the dataloader object from the config node.
conf = OmegaConf.load("../../configs/naip-multilabel.yaml")
dataloader_conf = conf.dataloader  # same node as conf.dataloader, so edits land in `conf`
dataloader_conf.neighbor_embeddings_folder = "/opt/data/california-naip-chips/california-naip-chips-100k-neighbours/npy"
dataloader_conf.batch_size = 16
dl = hydra.utils.instantiate(dataloader_conf)

[32m2024-06-04 17:10:23.099[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m100[0m - [1musing embeddings found in metadata file[0m
[32m2024-06-04 17:10:23.101[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m113[0m - [1mread train split with 72268 chip files (out of 72268)[0m
[32m2024-06-04 17:10:23.101[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m117[0m - [1mremoving chip IDs with no associated neighbors .npy files[0m
[32m2024-06-04 17:10:23.523[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m125[0m - [1mmax cache size is -1[0m
[32m2024-06-04 17:10:24.827[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m100[0m - [1musing embeddings found in metadata file[0m
[32m2024-06-04 17:10:24.83

In [6]:
# Shape of the `embedding` entry of the first training sample.
dl.train_dataset[0]['embedding'].shape

(17, 17, 768)

---

test the dataloader

In [7]:
# Get the training dataloader from `dl` (the object instantiated from conf.dataloader).
dltrain = dl.train_dataloader()

In [8]:
# Pull a single batch from the training dataloader.
batch = next(iter(dltrain))

In [9]:
# Shape of the batched embeddings; the leading dimension should equal the
# configured batch_size (16).
batch['embedding'].shape

torch.Size([16, 17, 17, 768])

---

Repeat the checks with a smaller neighborhood radius.

In [10]:
# Re-instantiate the dataloader after setting the neighborhood radius to 5.
dataloader_conf = conf.dataloader  # same node as conf.dataloader, so the edit lands in `conf`
dataloader_conf.neighborhood_radius = 5
dl = hydra.utils.instantiate(dataloader_conf)

[32m2024-06-04 17:11:50.681[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m100[0m - [1musing embeddings found in metadata file[0m
[32m2024-06-04 17:11:50.684[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m113[0m - [1mread train split with 72268 chip files (out of 72268)[0m
[32m2024-06-04 17:11:50.684[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m117[0m - [1mremoving chip IDs with no associated neighbors .npy files[0m
[32m2024-06-04 17:11:51.103[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m125[0m - [1mmax cache size is -1[0m
[32m2024-06-04 17:11:52.474[0m | [1mINFO    [0m | [36mearthtext.datamodules.components.chipmultilabel[0m:[36m__init__[0m:[36m100[0m - [1musing embeddings found in metadata file[0m
[32m2024-06-04 17:11:52.47

In [11]:
# Shape of the first training sample's `embedding` now that neighborhood_radius is 5.
dl.train_dataset[0]['embedding'].shape

(11, 11, 768)

In [12]:
# Fetch one batch from the re-instantiated dataloader and show the shape of
# its batched embeddings.
dltrain = dl.train_dataloader()
train_batches = iter(dltrain)
batch = next(train_batches)
batch['embedding'].shape

torch.Size([16, 11, 11, 768])