In [None]:
1. Import images from a folder into an initial table
2. Add an embeddings column using an embedding model
3. Reduce the dimensionality of the embeddings using UMAP/PaCMAP
4. Add local / global image metrics
5. Split the resulting table into train/val/test splits


In [5]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `tools` package used below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import tlc
from tools.embeddings import add_embeddings_to_table

In [4]:
# Names identifying the 3LC table this notebook starts from.
dataset_name = "cats-and-dogs"
project_name = "image-classification-table"
table_name = "initial"

# Look the table up by name. All three identifiers come from the variables
# above, so changing one of them actually changes which table is loaded
# (previously the table name was a duplicated string literal and the
# `table_name` variable was unused).
table = tlc.Table.from_names(
    table_name=table_name,
    dataset_name=dataset_name,
    project_name=project_name,
)

In [None]:
# Run the project helper that attaches embeddings to the loaded table.
# NOTE(review): presumably the result carries an "embedding" column, since
# that is the column name passed to the reduction step -- confirm in
# tools/embeddings.py.
table_with_embeddings = add_embeddings_to_table(table)


In [None]:
# Options for the dimensionality reduction: use UMAP on the "embedding"
# column and do not delete the source tables.
umap_reduction_kwargs = {
    "method": "umap",
    "source_embedding_column": "embedding",
    "delete_source_tables": False,
}

# The reduced embeddings are read back later from the "embedding_umap" column.
reduced_table = tlc.reduce_embeddings(table_with_embeddings, **umap_reduction_kwargs)

In [16]:
from tools.add_columns_to_table import add_columns_to_table
from tools.common import get_column_from_table
from tools.metrics import uniqueness, diversity, traversal_index

# Pull the reduced embeddings and the labels out of the reduced table.
embedding_column = get_column_from_table(reduced_table, "embedding_umap")
label_column = get_column_from_table(reduced_table, "label")

# Metrics derived from the reduced embeddings; uniqueness and diversity also
# take the labels, traversal index only takes the embeddings.
uniqueness_score = uniqueness(embeddings=embedding_column, labels=label_column)
diversity_score = diversity(embeddings=embedding_column, labels=label_column)
traversal_index_score = traversal_index(embedding_column)

# Mapping of new column name -> values to add to the table.
metric_columns = {
    "uniqueness": uniqueness_score,
    "diversity": diversity_score,
    "traversal_index": traversal_index_score,
}

# Write the metrics as extra columns into a new table.
added_metrics_table = add_columns_to_table(
    table=reduced_table,
    columns=metric_columns,
    output_table_name="with_metrics_and_embeddings",
)


In [None]:
from tools.split import split_table

# Fraction of the metrics table assigned to each named split.
split_fractions = {"train": 0.7, "val": 0.2, "test": 0.1}
splits = split_table(table=added_metrics_table, splits=split_fractions)

# Bind each split to its own variable, looked up in train / val / test order.
train_table, val_table, test_table = (
    splits[split_name] for split_name in ("train", "val", "test")
)

In [None]:
# Print the length of each split, one per line, in train / val / test order.
for subset in (train_table, val_table, test_table):
    print(len(subset))

In [None]:
# Show the `url` attribute of the train split as the cell's output.
train_table.url