In [6]:
import os
import time
from io import StringIO

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import keyfi as kf
from keyfi.dimred import UMAP
from keyfi.cluster import HDBSCAN

from sklearn.preprocessing import StandardScaler, MaxAbsScaler


Parameters: input data location and output paths

In [7]:
# Input: get_data() reads <DATA_PATH>/<snapshot>/data.vtk, and SNAPSHOT is the
# snapshot directory name used throughout this notebook.
DATA_PATH = "/home/azureuser/project/data/plane"
SNAPSHOT = "285140.078369"

# Output: cached embeddings are stored per snapshot; figures share one folder.
embedding_path = os.path.join(
    "/home/azureuser/project/data/embeddings/scaling_test4_scale_U_ind",
    SNAPSHOT,
)
figpath = "/home/azureuser/project/figures/scaling_test4_scale_U_ind"

# Create both output folders up front; existing folders are left as they are.
for out_dir in (figpath, embedding_path):
    os.makedirs(out_dir, exist_ok=True)

helper functions

In [8]:
def get_data(snapshot):
    """Load the VTK data file belonging to one snapshot.

    Parameters
    ----------
    snapshot : str
        Name of the snapshot sub-directory below ``DATA_PATH``.

    Returns
    -------
    The unmodified result of ``kf.import_vtk_data`` for
    ``<DATA_PATH>/<snapshot>/data.vtk``. Callers in this notebook unpack it
    as ``df, mesh``.
    """
    vtk_file = os.path.join(DATA_PATH, snapshot, "data.vtk")
    return kf.import_vtk_data(vtk_file)

data prep

In [9]:
df, mesh = get_data(SNAPSHOT)

# Serialise the frame to CSV text and parse it straight back, using the first
# CSV column as the index.
# NOTE(review): presumably done to normalise dtypes/index of the imported
# frame -- confirm whether this round trip is still required.
df = pd.read_csv(StringIO(df.to_csv()), index_col=0)

# Drop the listed variables; `dim=2` is forwarded to keyfi unchanged.
cleaned_data = kf.clean_data(df, dim=2, vars_to_drop=["N2", "NO2", "rho"])

# One scaler *class* (not instance) per remaining feature column.
features = cleaned_data.columns
standard_scalers = [StandardScaler for _ in features]
maxabs_scalers = [MaxAbsScaler for _ in features]

features

Index(['T', 'N2O4', 'Qdot', 'U:0', 'U:1'], dtype='object')

UMAP params

In [10]:
# Values tried for UMAP's `n_neighbors`; the embedding loop iterates over them
# in exactly this order (outermost loop).
n_neighbors_range = [200, 250, 300, 150]
# Values tried for UMAP's `min_dist`; iterated for every n_neighbors/scaling
# combination.
min_dist_range = [0.1, 0.05]

Scaling: clip Qdot, then apply MaxAbs scaling

In [11]:
# Build three variants of the cleaned data that differ only in how tightly the
# "Qdot" column is clipped (+/-300, +/-200, +/-400). All other columns are
# carried over unchanged and `cleaned_data` itself is never modified.
#
# The clipping is done with `assign` + a non-inplace `clip` on purpose:
# `frame["Qdot"].clip(..., inplace=True)` is chained in-place mutation of a
# selected column. Under pandas Copy-on-Write that only changes a temporary
# Series, so the frame would silently stay unclipped and all three variants
# would end up identical.
cleaned_data300 = cleaned_data.assign(
    Qdot=cleaned_data["Qdot"].clip(lower=-300, upper=300)
)

cleaned_data200 = cleaned_data.assign(
    Qdot=cleaned_data["Qdot"].clip(lower=-200, upper=200)
)

cleaned_data400 = cleaned_data.assign(
    Qdot=cleaned_data["Qdot"].clip(lower=-400, upper=400)
)

In [12]:
# Scale every clipped variant with keyfi, passing the full feature list and
# one MaxAbsScaler class per feature. The calls run in the order
# 200 -> 300 -> 400.
maxabs200, maxabs300, maxabs400 = (
    kf.scale_data(clipped, features, maxabs_scalers)
    for clipped in (cleaned_data200, cleaned_data300, cleaned_data400)
)

Dimensionality reduction (UMAP embeddings)

In [None]:
# Snapshot of the cache folder taken once, before the sweep starts.
saved_embeddings = os.listdir(embedding_path)

# (label, scaled frame) pairs; the label becomes part of the cache file name.
scaled_inputs = (
    ("maxabs200", maxabs200),
    ("maxabs300", maxabs300),
    ("maxabs400", maxabs400),
)

for n_neighbors in n_neighbors_range:
    for scaling, data in scaled_inputs:
        for min_dist in min_dist_range:

            start_time = time.time()
            print(n_neighbors, min_dist, scaling)

            cache_name = f"{n_neighbors}_{min_dist}_{scaling}.npy"

            if cache_name not in saved_embeddings:
                # No cached result for this combination: compute the embedding.
                embedding, mapper = kf.embed_data(
                    data=data,
                    algorithm=UMAP,
                    n_neighbors=n_neighbors,
                    min_dist=min_dist,
                    #ensures reproducibility, disable for faster compute
                    # random_state=0,
                    #how many dimensions to reduce to
                    n_components=2,
                )

                # np.save appends ".npy" itself, so the stem given here ends up
                # as the same file name that is looked up above.
                save_stem = os.path.join(
                    embedding_path, f"{n_neighbors}_{min_dist}_{scaling}"
                )
                np.save(save_stem, embedding)

            else:
                # Reuse the embedding stored by an earlier run.
                embedding = np.load(os.path.join(embedding_path, cache_name))
                print("loaded from existing")

            print("time:", time.time()-start_time)


200 0.1 maxabs200
time: 117.91648769378662
200 0.05 maxabs200
time: 106.11140036582947
200 0.1 maxabs300
time: 102.9315574169159
200 0.05 maxabs300
time: 101.54374718666077
200 0.1 maxabs400


make figures

In [None]:
# Reload and clean the snapshot so the colour values come from data that has
# been cleaned but neither clipped nor scaled.
df, mesh = get_data(SNAPSHOT)
df = pd.read_csv(StringIO(df.to_csv()), index_col=0)
cleaned_data = kf.clean_data(df, dim=2, vars_to_drop=["N2", "NO2", "rho"])

for n_neighbors in n_neighbors_range:
    for min_dist in min_dist_range:
        for scaling in ("maxabs200", "maxabs300", "maxabs400"):

            saved_file = os.path.join(
                embedding_path, f"{n_neighbors}_{min_dist}_{scaling}.npy"
            )
            embedding = np.load(saved_file)

            # Clip limit encoded in the scaling label: the first of 200, 300,
            # 400 that occurs in it, or None if none does.
            qdot_limit = next(
                (limit for limit in (200, 300, 400) if str(limit) in scaling),
                None,
            )

            for var in cleaned_data.columns:

                # Only Qdot gets an explicit colour range (matching its clip
                # limit); every other variable passes an empty list.
                if var == "Qdot" and qdot_limit is not None:
                    cmap_minmax = [-qdot_limit, qdot_limit]
                else:
                    cmap_minmax = []

                # The same label is used as figure title and as file name.
                label = f"{n_neighbors}_{min_dist}_{scaling}_{var}"

                kf.plot_embedding(
                    embedding=embedding,
                    data=cleaned_data,
                    scale_points=True,
                    cmap_var=var,
                    cmap_minmax=cmap_minmax,
                    save=True,
                    title=label,
                    figname=label,
                    figpath=figpath,
                    view=(None, None),
                )
                
            