# Visualize Embeddings using Pytorch version of Tensorboard

Things to remember when using Tensorboard in Jupyter Notebook:
- `%tensorboard --logdir=<dir-name>` requires `<dir-name>` to be the literal directory name (without quotes), not a variable; use the highest directory level (e.g., use **test** in **test/exp1/run1** as `<dir-name>`)
- Assuming same `log_dir`
    - Calling writer functions AFTER restarting the notebook will overwrite the existing proj-config file
    - Calling writer functions BEFORE restarting the notebook will append to the existing proj-config file (a restart is still required for the changes to show up)
- Not specifying the `tag` or `global_step` argument when calling `add_embedding` multiple times will still work as intended, but for some reason only one sprite and one tsv file appear
- Always completely kill and delete Tensorboard processes before exiting

***Go directly to the visualization section if the required files are already available and written to TensorBoard***

In [None]:
# Install the necessary modules if they are not already installed (uncomment the lines below)
# pip install tensorflow
# pip install tensorboard

In [None]:
# # Check tensorboard versions (Duplicated versions can result in errors)
# import pkg_resources

# for entry_point in pkg_resources.iter_entry_points('tensorboard_plugins'):
#     print(entry_point.dist)

In [1]:
# Import libraries
import numpy as np
import os
import torch
from torch.utils.tensorboard import SummaryWriter
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
from PIL import Image

# Fixes tensorboard error with tf
import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

In [2]:
def train_embed_load(data_path, embed_path):
    """
    Description: 
        Loads the training data and embedding
    
    Args: 
        data_path: Path of training data file (read with torch.load)
        embed_path: Path of embedding file (read with torch.load)
        
    Outputs:
        imgs: Loaded training data moved to CPU, axes permuted with (0, 3, 1, 2) 
              and divided by 255, i.e. (N, W, H, C) -> (N, C, W, H)
        embed: Loaded embedding (N, embed_dim); moved to CPU and detached unless 
               it is a numpy array, which is returned as loaded

    """
    # Load saved training file
    print('Loading training data...')
    train_data = torch.load(data_path) # (N, W, H, C)
    
    # One unshuffled batch as large as the dataset collates everything into a single tensor
    loader = torch.utils.data.DataLoader(train_data, 
                                         batch_size = len(train_data), 
                                         shuffle = False)
    
    # Use the builtin next(): DataLoader iterators in recent PyTorch releases 
    # no longer provide a .next() method
    imgs = next(iter(loader))
    
    # (N, C, W, H); the division assumes stored pixel values are in [0, 255] so the result is in [0, 1]
    imgs = imgs.cpu().detach().permute((0, 3, 1, 2)) / 255
    print('Done')
    print('Training data shape, type, and dtype: ', imgs.shape, type(imgs), imgs.dtype)
    print('\n')
    
    # Load saved embedding file
    print('Loading embeddings...')
    embed = torch.load(embed_path) # (N, embed_dim)
    
    # isinstance also accepts ndarray subclasses, which have no .cpu()/.detach()
    if not isinstance(embed, np.ndarray):
        embed = embed.cpu().detach()
    print('Done')
    print('Embedding shape, type, and dtype: ', embed.shape, type(embed), embed.dtype)
    print('\n')
    
    return imgs, embed

In [3]:
def create_cluster_list(embed, K_clusters):
    """
    Description: 
        Creates metadata required to visualize data with associated cluster class 
        by running K-Means (fixed random_state = 1) on the embedding
    
    Args: 
        embed: Loaded embedding (N, embed_dim); either a numpy array or an object 
               exposing .cpu().detach().numpy() (e.g. a torch tensor)
        K_clusters: The number of cluster classes (int)
        
    Outputs:
        pred: A list of length N containing the cluster class of each element (tile)

    """
    print('Creating cluster metadata with {} classes...'.format(K_clusters))
    
    # Convert embedding to numpy array
    # isinstance (rather than an exact type check) keeps ndarray subclasses 
    # out of the tensor branch, where .cpu() would fail
    if not isinstance(embed, np.ndarray):
        embed = embed.cpu().detach().numpy() # (N, embed_dim)
    
    # Cluster the embedding; the fixed random_state keeps the labels reproducible between runs
    kmeans = KMeans(n_clusters = K_clusters, random_state = 1)
    pred = kmeans.fit_predict(embed) # (N,)
    pred = pred.tolist() # type list required 
    
    print('Done')
    print('Metadata length and type: ', len(pred), type(pred))
    print('\n')
    
    return pred

In [4]:
def tb_add_embed(t_dir, e_dir, dim, log, K_clusters = None, all_classes = False):
    """
    Description: 
        Adds data to Tensorboard. Training tensor files and embedding files are 
        discovered by name, paired up in sorted order, and each pair is written 
        to the embedding projector (optionally with K-Means cluster labels as metadata)
    
    Args: 
        t_dir: Main directory containing the training tensors 
               (files whose name contains 'train_tensors' and str(dim))
        e_dir: Main directory containing the embeddings 
               (files named 'embed_<x>_dim<dim>_...' that are not .tsv files)
        dim: Dimension of each image tile
        log: Path of Tensorboard log directory    
        K_clusters: The number of cluster classes (int). If None, no metadata is written
        all_classes: Whether to create multiple K_clusters metadata all at once (bool). 
                     If truthy, one embedding is written for every K in [2, K_clusters], 
                     using K as the global_step
        
    Outputs:
        data_path: Path to the training tensor files
        embed_path: Path to the embedding files
        embed_name: Name of the embedding files

    """    
        
    data_path, embed_path, embed_name = [], [], []
    dim_str = str(dim)
    
    # Get name of training tensors and its directory
    # sorted(): os.listdir returns entries in arbitrary order, and the two lists 
    # are paired purely by position below, so the order must be deterministic
    for f in sorted(os.listdir(t_dir)):
        if 'train_tensors' in f and dim_str in f:
            data_path.append(os.path.join(t_dir, f))
    
    # Get name of embedding and its directory
    for f in sorted(os.listdir(e_dir)):
        parts = f.split("_")
        # The length guard avoids an IndexError on names such as 'embed_x.pt'
        if (len(parts) > 2 
                and parts[0] == 'embed' 
                and parts[2] == 'dim' + dim_str 
                and f.split(".")[-1] != 'tsv'):
            embed_path.append(os.path.join(e_dir, f))
            embed_name.append(f.split(".")[0])
    
    print('Adding embedding to Tensorboard...\n')
    
    # Decide once which cluster counts get written for every embedding; 
    # None means "write the embedding without cluster metadata"
    if K_clusters is None:
        cluster_counts = [None]
    elif all_classes:
        cluster_counts = list(range(2, K_clusters + 1))
    else:
        cluster_counts = [K_clusters]
    
    # TensorBoard documents a maximum sprite size of 8192 x 8192 px; as in the 
    # original check, anything with a side above 8191 px is skipped
    max_sprite_side = 8191
    
    added = 0
    # The with-block closes the writer even if loading or clustering raises
    with SummaryWriter(log_dir = log) as writer:
        for dpath, epath, ename in zip(data_path, embed_path, embed_name):
            # Load data
            imgs, embed = train_embed_load(dpath, epath) 
            
            # The sprite is a square grid with ceil(sqrt(N)) tiles per side, so 
            # the row count must be rounded up BEFORE multiplying by the tile size
            tiles_per_side = int(np.ceil(np.sqrt(imgs.shape[0])))
            if tiles_per_side * imgs.shape[-1] > max_sprite_side:
                print('Skipping because the number of images is too large\n')
                continue          
            
            # Create metadata containing cluster classes and write everything to projector
            for K in cluster_counts:
                pred = None if K is None else create_cluster_list(embed, K)
                writer.add_embedding(mat = embed, 
                                     metadata = pred, 
                                     label_img = imgs, 
                                     global_step = K, 
                                     tag = ename)
            
            added += 1
            print('Added {} embedding \n'.format(added))
    
    print('Embeddings all added')
    
    return data_path, embed_path, embed_name

In [1]:
# User settings -- edit these for your own machine / experiment

# Tile dimension, maximum number of K-Means classes, and whether to write every K from 2 up to K
dim, K, write_all = 64, 7, True

# Root folder of the dataset; the embeddings live in a sub-folder below it
train_dir = r'D:\MASC Big Files\Data Processing\Deep Learning\Unsupervised Learning\Bald Mountain\Top Pit\2021_BM_Top'
embed_dir = os.path.join(
    train_dir,
    r'Model_MT_Outputs\Embeddings\s5555 first run',
)

# TensorBoard log directory, one sub-folder per tile dimension
log_dir = 'tb_embed/MT/2021_BM_Top/' + str(dim)

NameError: name 'os' is not defined

In [6]:
# Write every matching (training tensors, embedding) pair to the TensorBoard log directory.
# d and e receive the returned lists of training-tensor and embedding file paths;
# the third return value (the embedding names) is discarded.
d, e, _ = tb_add_embed(t_dir = train_dir,
                       e_dir = embed_dir,
                       dim = dim, 
                       log = log_dir, 
                       K_clusters = K, 
                       all_classes = write_all)

Adding embedding to Tensorboard...

Loading training data...
Done
Training data shape, type, and dtype:  torch.Size([10736, 3, 64, 64]) <class 'torch.Tensor'> torch.float32


Loading embeddings...
Done
Embedding shape, type, and dtype:  torch.Size([10736, 128]) <class 'torch.Tensor'> torch.float32


Creating cluster metadata with 2 classes...
Done
Metadata length and type:  10736 <class 'list'>


Creating cluster metadata with 3 classes...
Done
Metadata length and type:  10736 <class 'list'>


Creating cluster metadata with 4 classes...
Done
Metadata length and type:  10736 <class 'list'>


Creating cluster metadata with 5 classes...
Done
Metadata length and type:  10736 <class 'list'>


Creating cluster metadata with 6 classes...
Done
Metadata length and type:  10736 <class 'list'>


Creating cluster metadata with 7 classes...
Done
Metadata length and type:  10736 <class 'list'>


Added 1 embedding 

Embeddings all added


### Attempt: Visualize training tensors directly

In [3]:
# Visualize training tensors directly
# Import training data
pth = r'D:\MASC Big Files\Data Processing\Deep Learning\Unsupervised Learning\Bald Mountain\Top Pit\2021_BM_Top'
train_data = torch.load(os.path.join(pth, 'train_tensors_128_sample.pt')) # torch tensor (N, W, H, C)

# One unshuffled batch as large as the dataset collates everything into a single tensor
loader = torch.utils.data.DataLoader(train_data, 
                                     batch_size = len(train_data), 
                                     shuffle = False)
dataiter = iter(loader)
# Use the builtin next(): DataLoader iterators in recent PyTorch releases no longer provide .next()
imgs = next(dataiter)
imgs = imgs.cpu().detach().permute((0, 3, 1, 2)) # torch tensor (N, C, W, H)

# Create label images for thumbnails
label_imgs = imgs / 255 # torch tensor (N, C, W, H); assumes pixel values in [0, 255] so the result is in [0, 1]

# Flatten each (unscaled) image into one row so the raw pixels themselves act as the embedding
imgs = imgs.reshape(label_imgs.shape[0], -1).numpy() # np array (N, C * W * H)

print('Thumbnail images shape: ', label_imgs.shape)
print('Projected images shape: ', imgs.shape)

Thumbnail images shape:  torch.Size([2729, 3, 128, 128])
Projected images shape:  (2729, 49152)


In [4]:
# Write to projector
# The with-block closes the writer even if add_embedding raises
with SummaryWriter(log_dir = 'tb_training/2021_BM_Top/128') as writer:
    # No metadata/tag/global_step: the flattened raw pixels are projected with the tiles as thumbnails
    writer.add_embedding(mat = imgs,
                         label_img = label_imgs
                        )
    # K = 7
    # for K in range(3, K + 1):
    #     pred = create_cluster_list(imgs, K)
    #     writer.add_embedding(mat = imgs, 
    #                  metadata = pred, 
    #                  label_img = label_imgs, 
    #                  global_step = K, 
    #                  tag = str(K)) # tag needs to be a string

## Visualize

Click http://localhost:6006/#projector for a better viewing experience

In [2]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic used below)
%load_ext tensorboard
# Alternative kept for convenience: reload the extension instead of loading it
# %reload_ext tensorboard

In [3]:
# Launch TensorBoard on the top-level log directory (tb_embed is the parent of the log_dir written above)
# Remember to completely kill the process when done
%tensorboard --logdir=tb_embed

In [4]:
# Kills tensorboard and removes its info files (Windows shell commands: taskkill / rmdir on %temp%)
# Comment these lines out if the process should keep running
!taskkill /IM "tensorboard.exe" /F
!rmdir /S /Q %temp%\.tensorboard-info

SUCCESS: The process "tensorboard.exe" with PID 8992 has been terminated.


=========================================================================================================================================
=========================================================================================================================================

In [None]:
# # Change paths accordingly
# # Data loading
# fpath = r'D:\MASC Big Files\Code\Deep Learning\Gold Bar\Pick Pit\2019_BM_Top'
# data_file = r'\train_tensors_2_128.pt'

# # Project config file
# embed_file = '\Model_MT_Outputs\embed_MT_dim128_emb128_2.tsv'
# sprite_file = '\Model_MT_Outputs\sprite.jpg'
# im_dim = 128

# # Create files
# sprite = False
# config = False

In [None]:
# def train_data_load(fpath, data_file):
#     print('Loading training data')
#     # Load saved training tensors file
#     train_data = torch.load(fpath + data_file) # (N, W, H, C)

#     # Use the training data for sprite creation (Note that this is NOT the entire orthomosaic)
#     loader = torch.utils.data.DataLoader(train_data, 
#                                          batch_size = len(train_data), 
#                                          shuffle = False)
#     dataiter = iter(loader)
#     imgs = dataiter.next()

#     print('Loading training data complete \n')
    
#     return imgs

In [None]:
# def create_sprite(fpath, data_file):
#     print('Creating sprite...')
#     imgs = train_data_load(fpath, data_file)
    
#     # Convert training data torch tensors to numpy (uint8)
#     imgs = imgs.cpu().detach().numpy().astype(np.uint8) # (N, W, H, C)
    
#     num_tiles = imgs.shape[0]
#     tile_size = imgs.shape[1]
#     sprite_size = int(np.ceil(np.sqrt(num_tiles) * tile_size)) # Tensorflow embedding projector requires square sprite
#     print('Total number of tiles: ', num_tiles)
#     print('Tile size: {}'.format(tile_size))
#     print('Sprite size: {}'.format(sprite_size))

#     sprite = Image.new(mode = 'RGB', 
#                        size = (sprite_size, sprite_size), 
#                        color = (255, 255, 255)) # White background

#     for idx in range(num_tiles):
#         h_increment, w_increment = divmod(idx, sprite_size // tile_size) # Returns (0, 0) for idx = 0                                   
#         h_loc = tile_size * h_increment
#         w_loc = tile_size * w_increment
#         img = imgs[idx] # (H, W, C)
#         sprite.paste(Image.fromarray(img, mode = 'RGB'), 
#                      (w_loc, h_loc)) # Loc is upper lefthand corner

#         if (idx + 1) % 500 == 0:
#             print('Number of tiles processed: ', idx + 1)

#     sprite.save(fpath + '\sprite.jpg') 
#     print('Creating sprite complete \n')                  

In [None]:
# def write_proj_config(fpath, embed_file, sprite_file, im_dim):
#     print('Creating projector config file...')
#     with open(fpath + r'\Model_MT_Outputs\projector_config.pbtxt', 'w') as f:
#         f.write('embeddings {')
#         f.write('\n')
#         f.write('\ttensor_path: "{}"'.format(embed_file))
#         f.write('\n')
#         f.write('\tsprite {')
#         f.write('\n')
#         f.write('\t\timage_path: "{}"'.format(sprite_file))
#         f.write('\n')
#         f.write('\t\tsingle_image_dim: {}'.format(im_dim))
#         f.write('\n')
#         f.write('\t\tsingle_image_dim: {}'.format(im_dim))
#         f.write('\n')
#         f.write('\t}')
#         f.write('\n')
#         f.write('}')
    
#     print('Creating projector config file complete')

In [None]:
# def vis_embed(fpath, data_file, embed_file, sprite_file, im_dim = 128, sprite = False, config = False):    
#     if sprite:
#         create_sprite(fpath, data_file)
#     if config:
#         write_proj_config(fpath, embed_file, sprite_file, im_dim)
#     return spr

In [None]:
# vis_embed(fpath, data_file, embed_file, sprite_file, im_dim, sprite, config)