In [1]:
# Notebook setup: nb_black auto-formats each executed cell with Black.
%load_ext nb_black
# autoreload mode 2 re-imports all modules before each cell runs, so edits to
# the local `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

In [13]:
import pandas as pd
from einops import asnumpy
from torch import no_grad
from tqdm import tqdm

from src.config import system_config, torch_config
from src.nets.define_net import define_net
from src.train.classificator.train_utils import create_dataloader


<IPython.core.display.Javascript object>

In [47]:
@no_grad()
def get_embeddings(
    csv_path: str = "splits/downloaded.csv",
    model_path: str = "rexnet_adamw_redefine_scheduler/model_best.pth",
    phase: str = "test",
) -> None:
    """Run the network over every dataloader split and save the features to HDF5.

    The result is a DataFrame with the columns ``uid``, ``features`` and
    ``split`` written to
    ``<data_dir>/embeddings/<model dir name>_<csv file name>/features_test.h5``
    under the key ``"features"`` (any existing file is overwritten).

    Args:
        csv_path: Split CSV, relative to ``system_config.data_dir``. Its file
            name becomes part of the output directory name.
        model_path: Checkpoint, relative to ``system_config.model_dir``. It
            must contain at least one ``/``: the name of the directory holding
            the checkpoint becomes part of the output directory name.
        phase: When equal to ``"test"`` the dataloaders are created with
            ``inference=True``; any other value passes ``inference=False``.

    Note:
        ``features`` holds one numpy array per row, so pandas stores the
        column as pickled objects (PyTables emits a PerformanceWarning).
    """
    outputs_save_path = (
        system_config.data_dir
        / f"embeddings/{model_path.split('/')[-2]}_{csv_path.split('/')[-1]}"
    )
    outputs_save_path.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the model is never moved explicitly; this assumes
    # define_net already places it on torch_config.device -- confirm.
    model = define_net("rexnet-100", weights=system_config.model_dir / model_path)

    dataloader = create_dataloader(
        system_config.data_dir / "arrays/more_arrays_fixed",
        system_config.data_dir / csv_path,
        inference=phase == "test",
        save_preprocessed=None,
    )
    model.eval()
    # Presumably strips the classification head so the forward pass returns
    # embeddings instead of class scores -- confirm against define_net.
    model.reset_classifier(0)

    output = {"uid": [], "features": [], "split": []}

    # `split` (not `phase`) so the loop does not shadow the function argument.
    for split in dataloader.keys():
        for batch in tqdm(dataloader[split]):
            features = model(batch["image"].to(torch_config.device))
            output["uid"].extend(batch["uid"])
            # Extending with the 2-D array appends one feature vector per sample.
            output["features"].extend(asnumpy(features))
            output["split"].extend([split.value] * batch["image"].shape[0])

    print(len(output["uid"]), len(output["features"]), len(output["split"]))
    pd.DataFrame(output).to_hdf(
        outputs_save_path / "features_test.h5", key="features", mode="w"
    )

<IPython.core.display.Javascript object>

In [48]:
# Extract embeddings with the default split CSV, checkpoint and phase; the
# result is written to an HDF5 file rather than returned.
get_embeddings()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data["filepath"] = self.data.loc[:, "uid"].map(self.images_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data["origin"] = self.data.loc[:, "uid"].map(self.origin)
100%|██████████| 200/200 [01:10<00:00,  2.82it/s]
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['uid', 'features', 'split'], dtype='object')]

  pd.DataFrame(output).to_hdf(outputs_save_path / "features.h5", key="features_

6381 6381 6381


<IPython.core.display.Javascript object>

In [50]:
# Load the embeddings saved by get_embeddings() with its default arguments.
# The file path must be joined onto data_dir with `/`; passing it as the
# second positional argument would make pandas treat it as the HDF key.
# The file name and key match what get_embeddings() writes.
f = pd.read_hdf(
    system_config.data_dir
    / "embeddings/rexnet_adamw_redefine_scheduler_downloaded.csv/features_test.h5",
    key="features",
)
f

Unnamed: 0,uid,features,split
0,aabn,"[-0.05148753, -0.28469357, 0.060479403, 0.1216...",test
1,aair,"[-0.059007075, -0.31480384, -0.0015645854, 0.0...",test
2,aajw,"[-0.019622073, -0.06280427, 0.0, -0.015245821,...",test
3,aalr,"[-0.059091806, 0.0, 0.17071946, 0.07721321, -0...",test
4,aalw,"[-0.012919307, -0.21879324, 0.013832152, -0.02...",test
...,...,...,...
6376,zzpn,"[-0.01615512, -0.08934799, 0.004488887, 0.0065...",test
6377,zzrv,"[-0.052115697, 0.0, 0.09748986, 0.00917091, -0...",test
6378,zzsx,"[0.0, -0.12665486, 0.00061047927, -0.014877672...",test
6379,zzvv,"[-0.016086448, -0.2605043, 0.09118736, 0.04008...",test


<IPython.core.display.Javascript object>