In [19]:
import os
import time
from io import StringIO

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import keyfi as kf
from keyfi.dimred import UMAP
from keyfi.cluster import HDBSCAN

from sklearn.preprocessing import StandardScaler, MaxAbsScaler


In [20]:
# --- Input: root folder of the plane data and the snapshot to analyse ---
DATA_PATH = "/home/azureuser/project/data/plane"
SNAPSHOT = "285140.078369"

# --- Output: figures and cached embeddings are grouped per test name ---
TEST_NAME = "no_species_scaling_final_umap"
figpath = os.path.join("/home/azureuser/project/figures", TEST_NAME)
embedding_path = os.path.join("/home/azureuser/project/data/embeddings", TEST_NAME, SNAPSHOT)

# Create both output folders up front; exist_ok makes this cell safe to re-run.
for out_dir in (figpath, embedding_path):
    os.makedirs(out_dir, exist_ok=True)

In [21]:
def get_data(snapshot):
    """Load the VTK file of one snapshot.

    Parameters
    ----------
    snapshot : str
        Name of the snapshot sub-directory inside ``DATA_PATH``.

    Returns
    -------
    The result of ``kf.import_vtk_data`` for
    ``<DATA_PATH>/<snapshot>/data.vtk``, returned unchanged.
    """
    vtk_file = os.path.join(DATA_PATH, snapshot, "data.vtk")
    return kf.import_vtk_data(vtk_file)

In [22]:
# Read the snapshot: a table of field values plus the mesh object.
df, mesh = get_data(SNAPSHOT)

# Round-trip the table through an in-memory CSV and re-read it with the first
# column as index.
# NOTE(review): presumably done to normalise dtypes/column labels - confirm.
csv_buffer = StringIO(df.to_csv())
df = pd.read_csv(csv_buffer, index_col=0)

# Clean the table as 2-D data and drop the variables not used in this study.
cleaned_data = kf.clean_data(
    df,
    dim=2,
    vars_to_drop=["N2", "NO2", "rho"],
)

In [25]:
def _clip_qdot(data, limit):
    """Return a copy of ``data`` whose ``Qdot`` column is limited to [-limit, limit].

    ``assign`` builds a new DataFrame, so ``data`` itself is left untouched.
    This replaces ``frame["Qdot"].clip(..., inplace=True)``, which mutates a
    temporary Series and does not update the frame under pandas Copy-on-Write.
    """
    return data.assign(Qdot=data["Qdot"].clip(-limit, limit))


# Three variants of the cleaned data that differ only in the Qdot clipping range.
data200 = _clip_qdot(cleaned_data, 200)
data300 = _clip_qdot(cleaned_data, 300)
data400 = _clip_qdot(cleaned_data, 400)

In [26]:
# Scalar features: every cleaned column except the last two, minus N2O4.
# NOTE(review): the last two columns are presumably "U:0" and "U:1" - confirm.
features_no_U = list(cleaned_data.columns[:-2])
features_no_U.remove("N2O4")

# Same scalar features plus the two velocity components as ONE grouped entry
# (a nested list).
features_combined_U = features_no_U + [["U:0", "U:1"]]

# One scaler class per feature entry. The counts are derived from the feature
# lists so the two can never get out of step if the columns change.
scalers_combined_U = [MaxAbsScaler] * len(features_combined_U)
scalers_no_U = [MaxAbsScaler] * len(features_no_U)
features_combined_U

['T', 'Qdot', ['U:0', 'U:1']]

In [27]:
# "c" variants: scaled with the feature set that includes the grouped velocity.
data200c, data300c = (
    kf.scale_data(clipped, features_combined_U, scalers_combined_U)
    for clipped in (data200, data300)
)

# "n" variants: scaled with the feature set without velocity.
data200n, data300n, data400n = (
    kf.scale_data(clipped, features_no_U, scalers_no_U)
    for clipped in (data200, data300, data400)
)

In [28]:
# UMAP runs to perform. Each row is:
#   [n_neighbours, min_distance, data, description]
# The description names the data set ("200"/"300"/"400" = Qdot clip limit,
# "c" = combined-U features, "n" = no-U features) and is used, together with
# the first two values, to build the file name of the cached embedding.
# NOTE(review): two rows appear twice - [300, 0.1, data200c, "200c"] and
# [250, 0.05, data400n, "400n"]. Confirm whether the repeats are intentional.
param_sets = [
    [300, 0.1, data300c, "300c"],
    [300, 0.05, data300c, "300c"],
    [300, 0.1, data200c, "200c"],
    [250, 0.1, data300c, "300c"],
    [300, 0.1, data200c, "200c"],  # repeat of the third row
    [200, 0.1, data200n, "200n"],
    [250, 0.1, data200n, "200n"],
    [250, 0.1, data300n, "300n"],
    [250, 0.05, data400n, "400n"],
    [300, 0.1, data300n, "300n"],
    [150, 0.1, data200c, "200c"],
    [200, 0.1, data300c, "300c"],
    [150, 0.1, data400n, "400n"],
    [250, 0.05, data400n, "400n"],  # repeat of the ninth row
    [300, 0.1, data400n, "400n"],
]

In [None]:
# Compute one UMAP embedding per parameter set, caching each result as a
# .npy file in embedding_path so that re-running the cell is cheap.
for params in param_sets:
    n_neighbors, min_dist, scaled_data, label = params
    start_time = time.perf_counter()

    print(n_neighbors, min_dist, label)

    # Build the cache file name once. The explicit ".npy" suffix matches what
    # np.save would otherwise append on its own.
    embedding_file = os.path.join(
        embedding_path, f"{n_neighbors}_{min_dist}_{label}.npy"
    )

    # Check the file system on every iteration rather than a directory listing
    # taken before the loop: an embedding written earlier in this same run
    # (e.g. for a repeated parameter set) is then loaded, not recomputed.
    if os.path.isfile(embedding_file):
        embedding = np.load(embedding_file)
        print("loaded from existing")

    else:
        # NOTE: mapper is only (re)assigned on this branch; after a cache hit
        # it still refers to a previous run, or is undefined.
        embedding, mapper = kf.embed_data(
            data=scaled_data,
            algorithm=UMAP,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            #ensures reproducibility, disable for faster compute
            random_state=0,
            #how many dimensions to reduce to
            n_components=2,
        )

        np.save(embedding_file, embedding)

    print("time:", time.perf_counter() - start_time)

300 0.1 300c
time: 183.11263132095337
300 0.05 300c
time: 183.32432866096497
300 0.1 200c
