# Setting up the Collection Space Navigator
### configuration and data files for custom collections and research needs
In this How-To guide you will produce all necessary files for the Collection Space Navigator (CSN).:

## 1) Load & Prepare Collection Data

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import json, math, os, io
from tqdm import tqdm
import ipywidgets as widgets
from ipywidgets import interactive,HBox,VBox,Label
from IPython.display import display

### define INPUT

In [9]:
def mount_gdrive(v):
    try:
      from google.colab import drive
      # drive.mount(drive_path,force_remount=False)
      buttonGDrive.description="mounting..."
      buttonGDrive.disabled=True
      drive.mount('/content/gdrive',force_remount=True)
    except:
      print("...error mounting drive")
      buttonGDrive.description="mounting failed!"
    else:
      buttonGDrive.description="successfully mounted"
      
def getOptions(USAGE, BUILD, EXIST, EXAMPLE):
  if BUILD == 2:
    uploadFile.disabled=False
  else:
    uploadFile.disabled=True
  if EXIST == 1:
    embeddingsLocation.disabled=False
  else:
    embeddingsLocation.disabled=True
  if USAGE == 3:
    buttonGDrive.disabled = False
    imageWebLocation.disabled = True
    imageLocation.value = "/content/gdrive/MyDrive/YOUR-IMAGE-PATH"
    embeddingsLocation.value = "/content/gdrive/MyDrive/YOUR-EMBEDDINGS-PATH"
    metadataLocation.value = "/content/gdrive/MyDrive/YOUR-METADATA-PATH"   
  else:
    buttonGDrive.disabled = True
    imageWebLocation.disabled=False
    imageLocation.value = ''
    embeddingsLocation.value = ''
    metadataLocation.value = ''   
  if USAGE == 1:
    imageWebLocation.value = "http://localhost:3000/"
  if EXAMPLE == True:
    buttonGDrive.disabled = True
    imageWebLocation.disabled = False
    datasetTitle.value = "Demo Dataset"
    imageLocation.value = "EXAMPLE-PATH"
    embeddingsLocation.value = "EXAMPLE-PATH"
    metadataLocation.value = "EXAMPLE-PATH"   

style = {'description_width': '250px'}
layout = {'width': '600px', 'justify-content': 'lex-satrt'}
layoutButtons = {'width': '210px'}
usage = widgets.Dropdown(options=[('locally on machine (offline)',1), ('as web tool (for production)',2), ('in Colab (for testing)',3)],value=3,description='Usage:',style=style,layout=layout)
existingEmbeddings = widgets.RadioButtons(options=[('use own file',1), ('extract features',2)],value=1,description='Embeddings:',style=style, layout=layout)
useExample = widgets.Checkbox(value=False,description='use demo data',indent=True, style=style, layout=layout)
datasetTitle = widgets.Text(placeholder='title of the dataset', description='Title:', style=style, layout=layout)
description = widgets.Textarea(placeholder='Short description of the dataset and method(s)', description='Description (optional):', style=style, layout=layout)
embeddingsLocation = widgets.Text(placeholder='path to embeddings file (.csv)', description='Embeddings Filepath:', style=style, layout=layout)
metadataLocation = widgets.Text(placeholder='path to metadata file (.csv)', description='Metadata Filepath:', style=style, layout=layout)
imageLocation = widgets.Text(placeholder='path to image collection folder', description='Image Folder:', style=style, layout=layout)
imageWebLocation = widgets.Text(placeholder='URL to images (available online)', description='Image URL:', value = '',style=style, layout=layout)
buildTool = widgets.RadioButtons(options=[('create new tool and dataset',1), ('add new dataset to existing tool',2)],value=1,description='Building:',style=style, layout=layout)
uploadFile = widgets.FileUpload(accept='.json',multiple=False,description="upload 'datasets_config.json'", layout=layoutButtons)
buttonGDrive = widgets.Button(description='mount Google Drive',icon='check',layout=layoutButtons)
buttonGDrive.on_click(mount_gdrive)
i = interactive(getOptions, USAGE = usage, BUILD = buildTool, EXIST = existingEmbeddings, EXAMPLE = useExample)
left = VBox([i, datasetTitle, description, embeddingsLocation, metadataLocation, imageLocation, imageWebLocation])
right = VBox([buttonGDrive,uploadFile])
display(HBox([left,right]))

HBox(children=(VBox(children=(interactive(children=(Dropdown(description='Usage:', index=2, layout=Layout(widt…

Mounted at /content/gdrive


#### load INPUT files

In [85]:
if buildTool.value == 2:
  try:
    uploaded_file = uploadFile.value["datasets_config.json"]
  except:
     print("ERROR: uploaded file must be'datasets_config.json'!")
  else:
    datasetsJSON = json.load(io.BytesIO(uploaded_file['content']))
    if datasetsJSON['data']:
      print("'datasets_config.json' looks ok.")
    else:
      print("ERROR: 'datasets_config.json' seems to be broken!")
imagNumb = len(os.listdir(imageLocation.value))
print(f'found {imagNumb} files in {imageLocation.value}')
if existingEmbeddings.value == 1:
  embeddings = pd.read_csv(embeddingsLocation.value, skipinitialspace=True)
  embeddings = embeddings.loc[:, embeddings.columns!='id'] # for testing, delete later!!!!
  vecNumb = len(embeddings)
  print(f'found {vecNumb} entries in {embeddingsLocation.value}')
metadata = pd.read_csv(metadataLocation.value, skipinitialspace=True)
metaNumb = len(metadata)
print(f'found {metaNumb} entries in {metadataLocation.value}')
if existingEmbeddings.value == 1:
  if metaNumb == vecNumb:
    if vecNumb <= imagNumb:
      print("Looks ok.")
      print()
      print(f'Embedding file contains {vecNumb} vectors in {len(embeddings.columns)} dimensions.')
      print("Metadata Head:")
      print(metadata.head())
    else:
      print()
      print("ERROR: number of images is smaller than number of vectors")
else:
  if metaNumb <= imagNumb:
    print("Looks ok.")
    print("Metadata Head:")
    print(metadata.head())
  else:
    print()
    print("ERROR: number of images and metadata elements don't match!")
foldername = datasetTitle.value.lower().replace(" ","_")
print()
print(f'Creating new dataset directory: {foldername}...')
if not os.path.exists(foldername):
    os.makedirs(foldername)
    print("... success")
else:
    print("... folder already exists (might overwrite existing files)")

found 4800 files in /content/gdrive/MyDrive/data/images
found 4800 entries in /content/gdrive/MyDrive/data/basic_concepts_restnet50 (1).csv
found 4800 entries in /content/gdrive/MyDrive/data/metadata_basic_concepts.csv
Looks ok.

Embedding file contains 4800 vectors in 2048 dimensions.
Metadata Head:
   index             id                                                URL  \
0      0    nose_63.png  http://localhost:3000/CulturalDataTools/CSN/da...   
1      1      red_3.png  http://localhost:3000/CulturalDataTools/CSN/da...   
2      2    tree_95.png  http://localhost:3000/CulturalDataTools/CSN/da...   
3      3  yellow_58.png  http://localhost:3000/CulturalDataTools/CSN/da...   
4      4   liver_72.png  http://localhost:3000/CulturalDataTools/CSN/da...   

   Prompt  Class  Colorfulness  Colors   Contrast  File Size  
0    nose     28         30.89   20051  52.061315      14071  
1     red     35         68.66   25534  38.299577      13396  
2    tree     41         23.57   34112  

FileNotFoundError: ignored

#### Assign metadata fields
Select which field names in the metadata file should be used. Multiple values can be selected with shift and/or ctrl (or command) pressed and mouse clicks or arrow keys.

In [86]:
filenameColumn = widgets.Dropdown(description="Image filenames:",options=list(metadata.columns), style=style, layout=layout)
classColumns = widgets.SelectMultiple(options=list(metadata.columns),description='optional: Cluster data (integers):', style=style, layout=layout)
infoColumns = widgets.SelectMultiple(options=list(metadata.columns),description='Info fields (display in preview):', style=style, layout=layout)
sliderColumns = widgets.SelectMultiple(options=list(metadata.columns),description='Slider data (floats or integers):', style=style, layout=layout)
filterColumns = widgets.SelectMultiple(options=list(metadata.columns),description='optional: Filter & Search fields:', style=style, layout=layout)
if useExample.value == True:
  print() ### preselect columns
left = VBox([filenameColumn, infoColumns, sliderColumns])
right = VBox([filterColumns, classColumns])
HBox([left,right])

HBox(children=(VBox(children=(Dropdown(description='Image filenames:', layout=Layout(width='600px'), options=(…

#### check if data types are correct

In [91]:
error = False
print("checking Image Filenames field...")
if pd.api.types.is_string_dtype(metadata[filenameColumn.value]) and metadata[filenameColumn.value].str.endswith((".jpg",".JPEG","JPG",".jpeg",".png",".PNG")).all():
  print(f"...'{filenameColumn.value}' looks ok.")
else:
  print(f"ERROR: metadata field '{filenameColumn.value}' doesn't seem right. All values need to be jpeg or png image filenames!")
  error = True
print("checking Info fields...")
if len(infoColumns.value) > 0:
  print(f"... looks ok.")
  print(f"CSN will provide info on: {infoColumns.value}")
else:
  print(f"...no Info fields selected!")
print("checking Sliders fields...")
for col in sliderColumns.value:
  if not pd.api.types.is_numeric_dtype(metadata[col]):
    print(f"ERROR: metadata field '{col}' doesn't seem right. Data type for Sliders must be integer or float!")
    error = True
  else:
    print(f"...'{col}' looks ok.")
print("checking Filter & Search fields...")
if len(filterColumns.value) > 0:
  print(f"... looks ok.")
  print(f"CSN will use the follwing fields for queries: {filterColumns.value}")
else:
  print(f"...no Filter & Search fields selected!")
print("checking Cluster fields...")
if len(classColumns.value) > 0:
  for col in classColumns.value:
    if not pd.api.types.is_integer_dtype(metadata[col]):
      print(f"ERROR: metadata field '{col}' doesn't seem right. Data type for Cluster must be integer!")
      error = True
    else:
      print(f"...'{col}' looks ok.")
else:
  print(f"...no Cluster fields selected!")
if error == False:
  print("\nEverything looks good!")
else:
  print("\nFound some errors! Please fix before you continue.")
  
sliderCols = list(sliderColumns.value)

checking Image Filenames field...
...'id' looks ok.
checking Info fields...
... looks ok.
CSN will provide info on: ('id',)
checking Sliders fields...
...'Colorfulness' looks ok.
...'Colors' looks ok.
...'Contrast' looks ok.
...'File Size' looks ok.
checking Filter & Search fields...
...no Filter & Search fields selected!
checking Cluster fields...
...no Cluster fields selected!

Everything looks good!


#### Process metadata and save file

In [7]:
# modify image paths
if usage.value == 2:
  imageFolder = ''
else:
  imageFolder = f'public/datasets/{foldername}/images/'
metadata["URL"] = imageWebLocation.value + imageFolder + metadata[filenameColumn.value]
# save metadata file
result = metadata.to_json(orient="records")
with open(f'{foldername}/metadata.json', "w") as f:
    f.write(result)
print("saved metadata.json")

saved metadata.json


## 2) Generate Image Tiles  
To handle large amounts of images efficiently, the CSN loads the files in form of optimized tiles. This step generates them.  

In [97]:
# parameters for tiles
tileSize = 2048  # size of tile
tileRows = 32  # rows per tile
columns = tileRows  # columns per tile
squareSize = int(tileSize/tileRows)
imgPerTile = tileRows*columns
numbTiles = math.ceil(len(metadata)/imgPerTile)

> Note: Only needed for new datasets or to update existing tiles.  
Skip this part if you already generated them.

In [98]:
from PIL import Image

def generateTiles(ImgPaths,foldername,IMAGE_FOLDER):
    tileNumb = 0
    currentIDX = 0
    if not IMAGE_FOLDER.endswith("/"):
      IMAGE_FOLDER += "/"
    result = Image.new("RGBA", (tileSize, tileSize), (255, 0, 0, 0))
    for entry in tqdm(ImgPaths, desc = "Generating tiles"):
        if currentIDX > imgPerTile:
            result = result.resize((tileSize, tileSize), Image.ANTIALIAS)
            result.save(f'{foldername}/tile_{tileNumb}.png', "PNG")
            # new tile
            currentIDX = 0
            tileNumb += 1
            result = Image.new("RGBA", (tileSize, tileSize), (0, 0, 0, 0))
        else:
            try:
                image = Image.open(IMAGE_FOLDER + entry) 
            except Exception as e: 
                print(e)
            else:
                (w,h) = image.size
                # portrait format
                if (h > w):
                    w = int(w/h*squareSize)
                    h = squareSize
                    x_dif = int((squareSize - w) / 2)
                    y_dif = 0
                # landscape or square format
                else:
                    h = int(h/w*squareSize)
                    w = squareSize
                    x_dif = 0
                    y_dif = int((squareSize - h) / 2)
                resizedImage = image.resize((w-8, h-8), Image.ANTIALIAS)       
                r_result = Image.new("RGBA", (w, h), (1, 1, 1, 1))   # produces an almost transparent border to indicate clusters in the tool
                r_result.paste(resizedImage, (4,4))
                x = currentIDX % tileRows * squareSize + x_dif
                y = currentIDX // columns * squareSize + y_dif
                result.paste(r_result, (x, y, x + w, y + h))
                currentIDX += 1
    result = result.resize((tileSize, tileSize), Image.ANTIALIAS)
    result.save(f'{foldername}/tile_{tileNumb}.png', "PNG")

In [99]:
generateTiles(metadata[filenameColumn.value],foldername,imageLocation.value)

Generating tiles:   0%|          | 19/4800 [00:09<40:21,  1.97it/s]


KeyboardInterrupt: ignored

## 3) Generate Mappings

Mappings are plots containing 2D coordinates (x,y) of the image objects. 

Here are several methods you can run. The Collection Space Navigator can handle many mappings but needs at least one to work.

In [68]:
from sklearn.preprocessing import StandardScaler
mappings = []
minScale = -25
maxScale = 25

def normalize(embeddings):
    minX = min(embeddings, key=lambda x: x[0])[0]
    rangeX = max(embeddings, key=lambda x: x[0])[0] - minX
    minY = min(embeddings, key=lambda x: x[1])[1]
    rangeY = max(embeddings, key=lambda x: x[1])[1] - minY
    rangeScale = maxScale + 0.9999999999 - minScale
    for index, e in enumerate(embeddings):
        embeddings[index][0] =  (embeddings[index][0] - minX) / rangeX * rangeScale + minScale
        embeddings[index][1] = (embeddings[index][1] - minY) / rangeY * rangeScale + minScale
    return embeddings

def centerEmbeddings(embeddings):
    offsetA = (max(embeddings, key=lambda x: x[0])[0] + min(embeddings, key=lambda x: x[0])[0]) / 2
    offsetB = (max(embeddings, key=lambda x: x[1])[1] + min(embeddings, key=lambda x: x[1])[1]) / 2
    for index, e in enumerate(embeddings):
        embeddings[index][0] = embeddings[index][0] - offsetA
        embeddings[index][1] = embeddings[index][1] - offsetB
    return embeddings
    
class NumpyEncoder(json.JSONEncoder):
    """ Special json encoder for numpy types """
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

### PCA: Principal Component Analysis
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

In [70]:
from sklearn.decomposition import PCA

def generate_PC(df,n,scale):
    print("performing PCA...")
    x = StandardScaler().fit_transform(df)
    pca = PCA(n_components=n)
    embedding = pca.fit_transform(x)
    if scale == True:
      normalized = normalize(embedding)
      centeredEmbedding = centerEmbeddings(normalized)
    else:
      centeredEmbedding = embedding
    print("...done")
    return centeredEmbedding

In [None]:
PCAEembedding = generate_PC(embeddings,2,True)
# save file
with open(f'{foldername}/PCA.json', "w") as out_file:
    out = json.dumps(PCAEembedding, cls=NumpyEncoder)
    out_file.write(out)
print(f"saved PCA.json")
mappings.append({"name": "PCA", "file": "PCA.json"})

performing PCA...
...done
saved PCA.json


#### add Principal Components to metadata and Sliders

In [92]:
def addPC(ADD):
  if ADD == True:
    display(VBox([PCs,addButton]))

def PCButton(v):
  PCAEembedding = generate_PC(embeddings,PCs.value,False)
  print("adding to metadata and Slides...")
  for PC in range(0,PCs.value):
    key = f"PC{PC+1}"
    metadata[key] = PCAEembedding[:,PC]
    sliderCols.append(key)
  print("...done")

addPCA = widgets.Checkbox(value=False,description='add Principal Components',indent=True, style=style, layout=layout)
PCs = widgets.IntSlider(value=3,min=1,max=5,step=1,description='number of Components:',readout=True,readout_format='d', style=style, layout=layout)
addButton = widgets.Button(description='make PC data',icon='check', layout=layoutButtons)
addButton.on_click(PCButton)
display(interactive(addPC, ADD = addPCA))

interactive(children=(Checkbox(value=False, description='add Principal Components', layout=Layout(width='600px…

performing PCA...
...done
adding to metadata and Slides...
...done


### UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
https://umap-learn.readthedocs.io/en/latest/

In [None]:
try:
  import umap.umap_ as umap
except:
  print("Installing umap-learn via Pip")
  !pip install umap-learn --quiet
  import umap.umap_ as umap

# UMAP parameter
n_neighbors=15
min_dist=0.15
metric="correlation"
verbose=True

def generateUMAP(df):
    print("generating UMAP...")
    scaled_penguin_data = StandardScaler().fit_transform(df)
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        metric=metric,
                        verbose=verbose)
    embedding = reducer.fit_transform(scaled_penguin_data)
    normalized = normalize(embedding)
    centeredEmbedding = centerEmbeddings(normalized)
    print("...done")
    return centeredEmbedding

Installing umap-learn via Pip
[K     |████████████████████████████████| 88 kB 3.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 48.3 MB/s 
[?25h  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone


In [None]:
fullEmbeddings = generateUMAP(embeddings)
# save file
with open(f'{foldername}/UMAP.json', "w") as out_file:
    out = json.dumps(fullEmbeddings, cls=NumpyEncoder)
    out_file.write(out)
print(f"saved UMAP.json")
mappings.append({"name": "UMAP", "file": "UMAP.json"})

generating UMAP...
UMAP(angular_rp_forest=True, metric='correlation', min_dist=0.15, verbose=True)
Fri Dec  2 15:23:42 2022 Construct fuzzy simplicial set
Fri Dec  2 15:23:42 2022 Finding Nearest Neighbors
Fri Dec  2 15:23:42 2022 Building RP forest with 8 trees
Fri Dec  2 15:23:49 2022 NN descent for 12 iterations
	 1  /  12
	 2  /  12
	 3  /  12
	 4  /  12
	 5  /  12
	Stopping threshold met -- exiting after 5 iterations
Fri Dec  2 15:24:22 2022 Finished Nearest Neighbor Search
Fri Dec  2 15:24:25 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Fri Dec  2 15:24:38 2022 Finished embedding
...done
saved UMAP.json


### t-SNE: t-distributed Stochastic Neighbor Embedding

https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

In [None]:
from sklearn.manifold import TSNE

# t-SNE parameter
n_components = 2
verbose = 1
random_state = 123

def generateTSNE(df):
    print("generating t-SNE...")
    x = StandardScaler().fit_transform(df)
    tsne = TSNE(n_components=n_components, verbose=verbose, random_state=random_state)
    embedding = tsne.fit_transform(x)
    normalized = normalize(embedding)
    centeredEmbedding = centerEmbeddings(normalized)
    print("...done")
    return centeredEmbedding

In [None]:
tsneEembedding = generateTSNE(embeddings)
# save file
with open(f'{foldername}/tSNE.json', "w") as out_file:
    out = json.dumps(tsneEembedding, cls=NumpyEncoder)
    out_file.write(out)
print(f"saved tSNE.json")
mappings.append({"name": "t-SNE", "file": "tSNE.json"})

generating t-SNE...




[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 4800 samples in 0.013s...
[t-SNE] Computed neighbors for 4800 samples in 3.905s...
[t-SNE] Computed conditional probabilities for sample 1000 / 4800
[t-SNE] Computed conditional probabilities for sample 2000 / 4800
[t-SNE] Computed conditional probabilities for sample 3000 / 4800
[t-SNE] Computed conditional probabilities for sample 4000 / 4800
[t-SNE] Computed conditional probabilities for sample 4800 / 4800
[t-SNE] Mean sigma: 12.923576
[t-SNE] KL divergence after 250 iterations with early exaggeration: 82.684647
[t-SNE] KL divergence after 1000 iterations: 1.637312
...done
saved tSNE.json


### 2D plots

Choose 2 metadadata fields (float or integer) and click "make plot". Repeat for every combination you want to add.

In [None]:
def makePlot(v):
  A = AColumn.value
  B = BColumn.value
  if pd.api.types.is_numeric_dtype(metadata[A]):
    print(f"'{A}' looks ok")
    if pd.api.types.is_numeric_dtype(metadata[B]):
      print(f"'{B}' looks ok")
      plot = metadata[[A,B]]
      normalizedPlot = normalize(plot.values)
      centeredEmbedding = centerEmbeddings(normalizedPlot)
      filename = (A + "_" + B).replace(" ","")
      # save file
      with open(f'{foldername}/{filename}.json', "w") as out_file:
        out = json.dumps(centeredEmbedding, cls=NumpyEncoder)
        out_file.write(out)
      print(f"saved {filename}.json")
      mappings.append({"name": filename, "file": filename + ".json"})
    else:
      print(f"ERROR: metadata field '{B}' doesn't seem right. Data type must be integer or float!")
  else:
    print(f"ERROR: metadata field '{A}' doesn't seem right. Data type must be integer or float!")
    
AColumn = widgets.Dropdown(description="x-axis:",options=list(metadata.columns), style=style, layout=layout)
BColumn = widgets.Dropdown(description="y-axis:",options=list(metadata.columns), style=style, layout=layout)
button2DPlot = widgets.Button(description='make plot',icon='check')
button2DPlot.on_click(makePlot)
left = VBox([AColumn,BColumn])
right = VBox([button2DPlot])
HBox([left,right])

HBox(children=(VBox(children=(Dropdown(description='x-axis:', layout=Layout(width='600px'), options=('index', …

'Colorfulness' looks ok
'Colors' looks ok
saved Colorfulness_Colors.json
'Colors' looks ok
'File Size' looks ok
saved Colors_FileSize.json
'Colorfulness' looks ok
'File Size' looks ok
saved Colorfulness_FileSize.json


## 4) Settings


#### Sliders

In [95]:
if len(sliderCols) > 0:    
  try:
    import distinctipy
  except:
    print("Installing distinctipy via Pip")
    !pip install distinctipy --quiet
    import distinctipy
  layoutCol = {'width': '110px'}
  sliderColorDict = {}
  left = [Label('display name')]
  middle = [Label('description text')]
  right = [Label('histogram color')]
  colors = distinctipy.get_colors(len(sliderCols),pastel_factor=1)
  for i, sliderName in enumerate(sliderCols):
    sliderColorDict[sliderName] = widgets.ColorPicker(concise=False,value=distinctipy.get_hex(colors[i]),layout=layoutCol)
    right.append(sliderColorDict[sliderName])
  sliderInfoDict = {}
  for sliderName in sliderCols:
    sliderInfoDict[sliderName] = widgets.Text(placeholder="info text for slider",layout=layout)
    middle.append(sliderInfoDict[sliderName])
  sliderNameDict = {}
  for sliderName in sliderCols:
    sliderNameDict[sliderName] = widgets.Text(placeholder="name of slider",value=sliderName)
    left.append(sliderNameDict[sliderName])
  print("\nSlider Settings:\n") 
  idx = VBox([Label('')]+[Label(f"{n}:") for n in sliderCols])
  left_box = VBox([l for l in left])
  middle_box = VBox([m for m in middle])
  right_box = VBox([r for r in right])
  display(HBox([idx,left_box,middle_box,right_box]))
else:
  print("No Cluster fields selected!")

Installing distinctipy via Pip

Slider Settings:



HBox(children=(VBox(children=(Label(value=''), Label(value='Colorfulness:'), Label(value='Colors:'), Label(val…

#### Cluster colors

In [96]:
if len(classColumns.value) > 0:
  classColorDict = {}
  amount = len(classColumns.value)
  styleCol = {'description_width': '25px'}
  layoutCl = {'width': '135px'}
  allClasses = {}
  for className in classColumns.value:
    clusters = metadata[className].unique()
    allClasses[className] = len(clusters)
  l = sorted(allClasses.items(), key=lambda item: item[1])[0]
  length = l[1]
  allColors = {}
  colors = distinctipy.get_colors(length)
  col = 5
  row = math.ceil(length/col)
  i=0
  rows = []
  for r in range(0,col):
    newRow = []
    for c in range(0,row):
      # classColorDict[className] = widgets.ColorPicker(concise=True, value=distinctipy.get_hex(colors[i]))
      if i < len(colors):
        allColors[i] = widgets.ColorPicker(concise=False, description=str(i), value=distinctipy.get_hex(colors[i]),layout=layoutCl,style=styleCol)
        newRow.append(allColors[i])
        i+=1
    rows.append(VBox([nr for nr in newRow]))
  display(HBox(rows))
else:
  print("No cluster was selected.")

No cluster was selected.


## 5) Create/update Config Files

All customization and component settings are defined in the config files.

#### calculate and save histograms  
The CSN Range Sliders come with interactive histograms. This step calculates the buckets and prepares the data.

In [13]:
def prepareBuckets(MIN,MAX, data):
    # prepare Slider Bar Historgram
    buckets = {}
    bucketsSize = {}
    bucketCount = 50
    if (MIN < 0):
        stepSize = (abs(MIN) + abs(MAX)) / bucketCount
    else:
        stepSize = abs((abs(MIN) - abs(MAX)) / bucketCount)
    for i in range(0, bucketCount):
        buckets[i] = []
        bucketsSize[i] = 0
    for index, e in enumerate(data):
        if (e == MAX):
            targetBucket = bucketCount-1
        else:
            targetBucket = math.floor((e - MIN) / stepSize)
        buckets[targetBucket].append(index)
        bucketsSize[targetBucket]+=1
    return {"histogram":list(bucketsSize.values()), "selections":list(buckets.values()), "range":[int(MIN),int(MAX)]}

def getBarChartData(df, selectionList):
    bucketData =  {} 
    for PC in selectionList:
        print("preparing Slider Bar Historgram data", PC)
        bucketData[PC] = {str(PC):{"histogram":[], "selections":[]}}
        bucketData[PC] = prepareBuckets(df[PC].min(),df[PC].max(), df[PC].values.tolist())
    return bucketData

def update_config(metadata,mappings):
    configData = {"title": datasetTitle.value, "datasetInfo": description.value, "metadata": "metadata.json", "embeddings": []}
    if mappings:
        configData["embeddings"] = mappings    
    configData["clusters"] = clusters
    configData["total"] = len(metadata)
    if tileSize:
        configData["sprite_side"] = tileRows
        configData["sprite_number"] = numbTiles
        configData["sprite_image_size"] = squareSize
        configData["sprite_actual_size"] = tileSize
    configData["sliders"] = sliderSetting
    if infoColumns.value:
        configData["info"] = infoColumns.value
    configData["search"] = searchFields
    return configData

def save_datasetsJSON():
  with open(f'datasets_config.json', "w") as fd:
    json.dump(datasetsJSON , fd)
  print("saved datasets_config.json")

def make_default(DEFAULT):
  datasetsJSON["default"] = DEFAULT
  print(f"changed default dataset to {datasetsJSON['data'][DEFAULT]['name']}")
  save_datasetsJSON()

#### write histogram data

In [14]:
BarChartData = getBarChartData(metadata,sliderCols)
with open(f'{foldername}/barData.json', "w") as f:
    json.dump(BarChartData , f)
print(f'saved barData.json')

preparing Slider Bar Historgram data Colorfulness
preparing Slider Bar Historgram data Colors
preparing Slider Bar Historgram data Contrast
preparing Slider Bar Historgram data File Size
saved barData.json


#### write config files

In [41]:
sliderSetting = []
def zip_folder(v):
    buttonZip.description = "...zipping dataset"
    buttonZip.disabled = True
    !7z a {foldername}.zip {foldername}
    print(f"\nDataset '{foldername}.zip' and 'datasets_config.json' are ready to download.")
    print(f"Place the unpacked dataset folder in your local CSN directory 'public/datasets/' in development mode or in '/datasets/' in the build production.")
    print(f"Also, replace 'datasets/datasets_config.json' with the new file.")
    buttonZip.description = "zipping done"

for k in sliderCols:
  dtype = 'float'
  if pd.api.types.is_integer_dtype(metadata[k]):
    dtype = 'integer'
  slider = {"id":k,"title":sliderNameDict[k].value,"info":sliderInfoDict[k].value,"typeNumber":dtype,"color":sliderColorDict[k].value}
  sliderSetting.append(slider)
searchFields = []
for k in filterColumns.value:
  filter = {"columnField":k,"type":"selection"}
  searchFields.append(filter)
clusters = {"clusterList":list(classColumns.value),"clusterColors":[allColors[g].value for g in allColors]}
configData = update_config(metadata,mappings)
with open(f'{foldername}/config.json', "w") as fb:
    json.dump(configData , fb)
print(f'saved config.json')
newDataset = {'name': datasetTitle.value, 'directory': foldername}
if buildTool.value == 2:
    if newDataset not in datasetsJSON["data"]:
      datasetsJSON["data"].append(newDataset)
    print("change default dataset:")
    defaultOptions = [(e["name"],i) for i,e in enumerate(datasetsJSON["data"])]
    defaultDataset = widgets.Dropdown(description="Default dataset:",options=defaultOptions, style=style, layout=layout)
    defaulInteracive = interactive(make_default,DEFAULT = defaultDataset)
    display(defaulInteracive)
    # save_datasetsJSON()
else:
    datasetsJSON = {"default": 0, "data": [newDataset]}
    save_datasetsJSON()
if buildTool.value == 2:
    buttonZip = widgets.Button(description='zip dataset',icon='check')
    buttonZip.on_click(zip_folder)
    display(buttonZip)
else:
    print("\nContinue with building the Collection Space Navigator in the next step...")

saved config.json
change default dataset:


interactive(children=(Dropdown(description='Default dataset:', layout=Layout(width='600px'), options=(('Basic_…

Button(description='zip dataset', icon='check', style=ButtonStyle())


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Open archive: basic_concepts.zip
--
Path = basic_concepts.zip
Type = zip
Physical Size = 67090738

Scanning the drive:
  0M Scan           1 folder, 14 files, 128458158 bytes (123 MiB)

Updating archive: basic_concepts.zip

Items to compress: 15

  0%      0% 7 U basic_concepts/metadata.json                                       1% 8 U basic_concepts/metadata_basic_concepts.csv                                                     3% 10 U basic_concepts/tile_0.png                                     5% 10 U basic_concepts/ti

----------------

## 6) Building custom Collection Space Navigator

Depending on how you choose to run the CSN, we need to prepare the tool.

#### Clone CSN repository and install packages

In [19]:
%cd /content
!git clone https://github.com/CulturalDataTools/CSN/
%cd CSN
!npm install --loglevel=error

/content
Cloning into 'CSN'...
remote: Enumerating objects: 104, done.[K
remote: Counting objects: 100% (104/104), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 104 (delta 42), reused 88 (delta 26), pack-reused 0[K
Receiving objects: 100% (104/104), 1.16 MiB | 3.32 MiB/s, done.
Resolving deltas: 100% (42/42), done.
/content/CSN
[K[?25h
> node-sass@6.0.1 install /content/CSN/node_modules/node-sass
> node scripts/install.js

Cached binary found at /root/.npm/node-sass/6.0.1/linux-x64-83_binding.node
[K[?25h
> core-js@2.6.12 postinstall /content/CSN/node_modules/babel-runtime/node_modules/core-js
> node -e "try{require('./postinstall')}catch(e){}"

[K[?25h
> core-js@3.26.1 postinstall /content/CSN/node_modules/core-js
> node -e "try{require('./postinstall')}catch(e){}"


> core-js-pure@3.26.1 postinstall /content/CSN/node_modules/core-js-pure
> node -e "try{require('./postinstall')}catch(e){}"

[K[?25h
> ejs@2.7.4 postinstall /content/CSN/node_module

#### move the new Dataset to the Collection Space Navigator

In [23]:
import shutil
shutil.move(f"/content/{foldername}", f"/content/CSN/public/datasets/{foldername}")
shutil.move(f"/content/datasets_config.json", f"/content/CSN/public/datasets/datasets_config.json")

'/content/CSN/public/datasets/datasets_config.json'

#### create production build
this could take a few minutes

In [47]:
%cd CSN
!npm run build

/content/CSN

> CSN@0.1.0 build /content/CSN
> react-scripts build

Creating an optimized production build...
[33m[39m
src/Sliders/RangeSlider.js
  [1mLine 158:17:[22m  The href attribute requires a valid value to be accessible. Provide a valid, navigable address as the href value. If you cannot provide a valid href, but still need the element to resemble a link, use a button and change it with appropriate styles. Learn more: https://github.com/jsx-eslint/eslint-plugin-jsx-a11y/blob/HEAD/docs/rules/anchor-is-valid.md  [33m[4mjsx-a11y/anchor-is-valid[24m[39m

To ignore, add [36m// eslint-disable-next-line[39m to the line before.

File sizes after gzip:

  512.92 KB  [2mbuild/static/js/[22m[36m2.5cbf7f96.chunk.js[39m
  11.36 KB   [2mbuild/static/js/[22m[36mmain.0ac3b6a6.chunk.js[39m
  2.81 KB    [2mbuild/static/css/[22m[36mmain.fe97f2c8.chunk.css[39m
  772 B      [2mbuild/static/js/[22m[36mruntime-main.083b1ddb.js[39m

The project was built assuming it is hosted

#### zip folder to download your version of the Collection Space Navigator

In [20]:
!7z a CSN.zip CSN
print("download your CSN version 'CSN.zip'")


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive:
  0M Scan           
CSN

0 files, 0 bytes

Creating archive: CSN.zip

Items to compress: 0

  0%    
Files read from disk: 0
Archive size: 22 bytes (1 KiB)


CSN : No more files
----------------
download your CSN version 'CSN.zip'


## 7) Using the Collection Space Navigator

depending on how you chose to use the tool

In [48]:
buildFolderPath = "build" # ToDo: create everything needed for local server to run
webFolderPath = "" # ToDo: create everything needed for server to run as website
if usage.value != 2:
  print(f"-> To run your CSN version locally, download the {buildFolderPath} folder and run the virtual server by running [] \n")
if usage.value != 1:
  print(f"-> To run your CSN version as a web tool, download the {webFolderPath} folder and upload it to your server. \n")
print("-> Continue with the next step to run your CSN version with a proxy server.")

-> To run your CSN version locally, download the build folder and run the virtual server by running [] 

-> To run your CSN version as a web tool, download the  folder and upload it to your server. 

-> Continue with the next step to run your CSN version with a proxy server.


#### run a proxy server and test your CSN version

In [49]:
def run_server():
  def server_entry():
    from functools import partial
    import socketserver, http.server
    Handler = partial(http.server.SimpleHTTPRequestHandler, directory='/content/CSN/build/')              
    httpd = socketserver.TCPServer(("", port), Handler)
    # Handle a single request then exit the thread.
    httpd.serve_forever()
  import portpicker, threading, socket
  port = portpicker.pick_unused_port()
  thread = threading.Thread(target=server_entry)
  thread.start()
  from google.colab import output
  print(f"... ready!")
  print(f"\nUse the CSN here:")
  output.serve_kernel_port_as_window(port)
def run_ngrok(ngrokAuthtoken):
  try:
    from pyngrok import ngrok
    from flask_ngrok import run_with_ngrok
    from flask import Flask, render_template,send_from_directory
  except:
    print("Installing flask, flask_ngrok and pyngrok via Pip")
    !pip install flask flask_ngrok pyngrok --quiet
    from flask import Flask, render_template,send_from_directory
    from pyngrok import ngrok
    from flask_ngrok import run_with_ngrok
  from multiprocessing import Process
  app = Flask(__name__,static_folder='/content/CSN/build/',template_folder='/content/CSN/build/')
  ngrok.set_auth_token(ngrokAuthtoken)
  run_with_ngrok(app)
  @app.route('/<path:path>')
  def send_report(path):
    # remove the replace in next to lines later later <-- important !!!!!!!!
    print("files",path)
    return send_from_directory('/content/CSN/build/', str(path))
  @app.route("/")
  def home():
      return render_template('index.html')  
  if __name__ == "__main__":
    server = Process(target=app.run)
    server.start()

In [50]:
def button_run(v):
  if serverChoice.value == 1:
    print(f"\nStarting your CSN version in colab...")
    buttonRun.description = '...running'
    buttonRun.disabled = True
    run_server()
  else:
    if len(ngrokKey.value)>10:
      print("\nRunning your CSN version with ngrok...")
      buttonRun.description = '...running'
      buttonRun.disabled = True
      run_ngrok(ngrokKey.value)
    else:
      print("\nERROR: no ngrok authentication token found!")
def get_server(SERVER):
  if SERVER == 1:
    buttonRun.description = 'run colab proxy'
    infoLabel = Label("colab proxy: The link only works until reloaded and can't be shared. Perfect for quick testing.")
    display(VBox([infoLabel,buttonRun]))
  else:
    buttonRun.description = 'run ngrok proxy'
    infoLabel = Label("ngrok proxy: External service. The link works while running and can be shared with others.")
    warnLabel = Label("NOTE: This is NOT a solution for production (make web tool instead)! Needs registration -> https://ngrok.com/")
    display(VBox([ngrokKey,infoLabel,warnLabel,buttonRun]))
serverChoice = widgets.Dropdown(description="run server:",options=[("colab proxy",1),("ngrok proxy",2)])
ngrokKey = widgets.Text(placeholder="authentication token from your ngrok account",description="ngrok token:",layout=layout)
buttonRun = widgets.Button(description='run colab proxy',icon='check',disabled=False)
buttonRun.on_click(button_run)
interactive(get_server,SERVER = serverChoice)


interactive(children=(Dropdown(description='run server:', options=(('colab proxy', 1), ('ngrok proxy', 2)), va…


Starting your CSN version in colab...
... ready!

Use the CSN here:


<IPython.core.display.Javascript object>

In [None]:
# for testing ngrok, don't ship!
# 214xqlGaK4ILkWe0trnzvyD8CCo_zsbBD48amdfb6LQ9t7Kw 