In [1]:
import os
import time
from io import StringIO

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import keyfi as kf
from keyfi.dimred import UMAP
from keyfi.cluster import HDBSCAN

from sklearn.preprocessing import MaxAbsScaler


In [2]:
# UMAP hyperparameters forwarded to kf.embed_data in the embedding loop.
n_neighbors, min_dist = 250, 0.1

# Symmetric bound (+/- qdot_clip) used when clipping the "Qdot" column;
# also encoded in the name of the embeddings cache directory.
qdot_clip = 300

In [3]:
# Directory whose entries are the individual snapshots (one sub-directory each).
DATA_PATH = "/home/azureuser/project/data/plane"

# Plain-text progress log appended to by log().
logfile = "/home/azureuser/project/all_snapshots_N2O4_not_scaled.log"

# Cache directory for computed embeddings; its name encodes the
# hyperparameters so runs with different settings do not share a cache.
embeddings_path = os.path.join(
    "/home/azureuser/project/data/embeddings/all_snapshots_N2O4_not_scaled",
    f"{n_neighbors}_{min_dist}_{qdot_clip}c",
)
os.makedirs(embeddings_path, exist_ok=True)

# Listing of the cache taken once, at configuration time; files written
# later in the same session are not reflected here.
saved_embeddings = os.listdir(embeddings_path)

# Total number of entries in DATA_PATH, used for the progress percentage.
num_snapshots = len(os.listdir(DATA_PATH))

In [4]:
def get_data(snapshot):
    """Load the VTK data belonging to one snapshot.

    Parameters
    ----------
    snapshot : str
        Name of an entry inside ``DATA_PATH``; the file that is read is
        ``<DATA_PATH>/<snapshot>/data.vtk``.

    Returns
    -------
    The result of ``kf.import_vtk_data`` for that file. The embedding
    loop unpacks it as ``df, mesh``.
    """
    vtk_file = os.path.join(DATA_PATH, snapshot, "data.vtk")
    return kf.import_vtk_data(vtk_file)

def log(*msg):
    """Append one line to ``logfile``.

    The positional arguments are converted with ``str`` and joined by a
    single space, followed by a newline (the same layout ``print``
    produces with its default separator and terminator).
    """
    with open(logfile, "a") as file:
        line = " ".join(str(part) for part in msg)
        file.write(line + "\n")

In [7]:
# Features handed to kf.scale_data. The nested list presumably groups the
# two velocity components under a single scaler -- confirm against
# kf.scale_data.
features_to_scale = ["T", "Qdot", ["U:0", "U:1"]]

# One scaler class (not an instance) per entry of features_to_scale.
scalers = [MaxAbsScaler for _ in features_to_scale]

In [None]:
# Compute (or reload from cache) a 2-D UMAP embedding for every second
# entry of DATA_PATH, logging the elapsed time and progress for each one.
#
# NOTE(review): os.listdir returns entries in arbitrary order, so which
# snapshots land on even indices is not guaranteed to be stable between
# runs or machines -- consider iterating over a sorted listing.
for index, snapshot in enumerate(os.listdir(DATA_PATH)):
    # Only every second directory entry is processed.
    if index % 2 != 0:
        continue

    start_time = time.time()

    # --- data preparation ---
    df, mesh = get_data(snapshot)

    data = kf.clean_data(
        df,
        dim=2,
        vars_to_drop=["N2", "NO2", "rho"],
    )

    # clip() returns a new object instead of modifying the column in place,
    # so the result has to be assigned back for the clipping to take effect.
    # Embeddings cached by runs that discarded this result were built from
    # unclipped Qdot and should be regenerated.
    data["Qdot"] = data["Qdot"].clip(-qdot_clip, qdot_clip)

    data = kf.scale_data(data, features_to_scale, scalers)

    log(snapshot)
    print(snapshot)

    if f"{snapshot}.npy" in saved_embeddings:
        # Reuse an embedding saved by an earlier run with the same settings.
        embedding = np.load(
            os.path.join(embeddings_path, f"{snapshot}.npy")
        )
        log("loaded from existing")
        print("loaded from existing")
    else:
        embedding, mapper = kf.embed_data(
            data=data,
            algorithm=UMAP,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            # Uncomment for reproducible embeddings:
            # random_state=0,
            n_components=2,
        )

        # np.save appends the ".npy" suffix, matching the cache lookup above.
        np.save(
            os.path.join(embeddings_path, f"{snapshot}"),
            embedding,
        )

    # Measure once so the log file and stdout report the same value.
    elapsed = time.time() - start_time
    progress = f"{100*(index+1)/num_snapshots:.2f}%"
    log("time:", elapsed)
    log(progress)
    print("time:", elapsed)
    print(progress)

285573.578369
