# UNICOM

## Dependencies

In [None]:
!pip install torch torchvision tqdm timm pillow faiss-gpu

## K-Means Clustering

### Generate embeddings

In [None]:
!pip install git+https://github.com/openai/CLIP.git

In [None]:
import Clustering.generate_embeddings as embed

In [None]:
# Generate and save embeddings for the 'train' cohort of the captions CSV.
# The clustering section below reads them back from an embeddings directory
# whose name depends on the model used by this step.
embed.process_and_save_embeddings(
    csv_path="../Image-Captioning/vlm4bio_captions_with_split.csv",
    base_path="../Image-Captioning",
    batch_size=32,
    save_every=1000,
    cohort='train',
)

### Generate Clusters from embeddings

In [1]:
import Clustering.generate_clusters as cluster

In [None]:
# NOTE -- this directory name depends on the model used in the
# embed.process_and_save_embeddings step (see code).
embeddings_dir = "embeddings_train_ViT-H-14-378-quickgelu"

# Cluster the same embeddings once per cluster count, smallest first.
for n_clusters in (100, 500, 1000, 2000, 3000):
    cluster.combine_and_cluster(embeddings_dir=embeddings_dir, n_clusters=n_clusters)

## Training with Unicom

### Setup training parameters

In [None]:
# Shared training setup: the per-cluster-count cells below reuse this `config`
# and only override `cluster_results_path` and `save_dir` before calling main(config).
import os
# Restrict CUDA to device 0.
# NOTE(review): presumably this has to be set before `retrieval` (and any CUDA
# library it loads) is imported, which is why it precedes the import -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Create the config
from retrieval import Config, main

config = Config()
config.batch_size = 128
config.epochs = 32
config.lr = 1e-5 # Match Roman  -- NOTE(review): "Roman" is not defined in this notebook; presumably a reference run -- cite it.
config.model_name = "ViT-B/32"

## Training Clusters n = 100

In [None]:

# Point the shared config at the 100-cluster results.
config.cluster_results_path = "embeddings_train_ViT-H-14-378-quickgelu/clustering_results_100_clusters.csv"
# Custom save directory, so this run's weights can be used for the later perf eval.
config.save_dir = "checkpoints_ViT-H-14-378-quickgelu_cluster_100"

# Flag to indicate we're not using distributed training (left disabled):
# config.use_distributed = False

# Run training; main() hands back what is reported below as the loss log.
log_file = main(config)
print("Training loss data printed to: ", log_file)


## Training Clusters n = 500, 1000, 2000, 3000

### n = 500

In [None]:
# Point the shared config at the 500-cluster results and give this run its own
# save directory, so its weights can be used for the later perf eval.
config.cluster_results_path = "embeddings_train_ViT-H-14-378-quickgelu/clustering_results_500_clusters.csv"
config.save_dir = "checkpoints_ViT-H-14-378-quickgelu_cluster_500"

# Run training and report where the loss log went, as the n = 100 cell does
# (previously the returned value was assigned and never shown).
log_file = main(config)
print("Training loss data printed to: ", log_file)

### n = 1000

In [None]:
# Point the shared config at the 1000-cluster results and give this run its own
# save directory, so its weights can be used for the later perf eval.
config.cluster_results_path = "embeddings_train_ViT-H-14-378-quickgelu/clustering_results_1000_clusters.csv"
config.save_dir = "checkpoints_ViT-H-14-378-quickgelu_cluster_1000"

# Run training and report where the loss log went, as the n = 100 cell does
# (previously the returned value was assigned and never shown).
log_file = main(config)
print("Training loss data printed to: ", log_file)

### n = 2000

In [None]:

# Point the shared config at the 2000-cluster results and give this run its own
# save directory, so its weights can be used for the later perf eval.
config.cluster_results_path = "embeddings_train_ViT-H-14-378-quickgelu/clustering_results_2000_clusters.csv"
config.save_dir = "checkpoints_ViT-H-14-378-quickgelu_cluster_2000"

# Run training and report where the loss log went, as the n = 100 cell does
# (previously the returned value was assigned and never shown).
log_file = main(config)
print("Training loss data printed to: ", log_file)

### n = 3000

In [None]:



# Point the shared config at the 3000-cluster results and give this run its own
# save directory, so its weights can be used for the later perf eval.
config.cluster_results_path = "embeddings_train_ViT-H-14-378-quickgelu/clustering_results_3000_clusters.csv"
config.save_dir = "checkpoints_ViT-H-14-378-quickgelu_cluster_3000"

# Run training and report where the loss log went, as the n = 100 cell does
# (previously the returned value was assigned and never shown).
log_file = main(config)
print("Training loss data printed to: ", log_file)


## Performance Evaluation

### Bar chart comparing trained models against base for species and genus recall

In [None]:
import perf_evaluation_bar_charts as perf_eval_bar

In [None]:
# Run the entry point of perf_evaluation_bar_charts. Per this section's heading it
# produces the bar chart comparing the trained models against the base model.
# NOTE(review): it takes no arguments here, so which checkpoints it reads is decided
# inside the module -- presumably the save_dir folders written by the training cells; confirm.
perf_eval_bar.main()

### Generate Embeddings of Test Rows

In [None]:
# Same embedding run as for training, but over the 'test' cohort, so the
# benchmarks below have held-out embeddings to work with.
embed.process_and_save_embeddings(
    csv_path="../Image-Captioning/vlm4bio_captions_with_split.csv",
    base_path="../Image-Captioning",
    batch_size=32,
    save_every=1000,
    cohort='test',  # TEST!
)

### Benchmarks

In [None]:
!pip install seaborn

In [1]:
import perf_evaluations

In [None]:
# NOTE(review): this call is identical to the one in the next cell, which is labelled
# as the base-model run. Whatever switches between trained and base weights is not
# visible in this notebook -- presumably a setting inside perf_evaluations. Confirm,
# and ideally pass it as an argument so Restart & Run All reproduces both results.
perf_evaluations.main() # Trained Model Results

In [None]:
# NOTE(review): this call is identical to the one in the previous cell, which is
# labelled as the trained-model run. Whatever selects the base model here is not
# visible in this notebook -- presumably a setting inside perf_evaluations; confirm.
perf_evaluations.main() # ViT-B/32 (base) Model Results