In [1]:
import dask
import numpy as np
import sklearn
from matplotlib import pyplot as plt
from matplotlib import cm
import copy
import astropy
import hdbscan
import pandas as pd

from astropy.io import fits

In [2]:
# Load the label table from CSV; later cells read its `iGrID` column.
y = pd.read_csv(filepath_or_buffer='rcsed_iGrID.csv')

In [3]:
# Rows of the label table that actually carry an iGrID value (non-NaN).
labelled_mask = ~y.iGrID.isna()
sdss_indx = list(y.index[labelled_mask])

# Keep only the iGrID column as a 1-D array. Selecting the column explicitly
# (instead of flattening every column of the frame) guarantees exactly one
# label per selected row even if the CSV carries extra columns such as a
# saved index.
# NOTE: this rebinds `y` from a DataFrame to an ndarray, so the cell cannot be
# re-run without re-running the cell that reads the CSV first.
y = y.loc[sdss_indx, 'iGrID'].to_numpy()

sdss_labels = pd.Series(y)

In [4]:
# Open the FITS catalogue. The memmap flag is taken from astropy's configured
# default for `use_memmap`, and HDUs are loaded lazily.
hdul = fits.open(
    'rcsed_v2_clean.fits',
    memmap=fits.Conf.use_memmap.defaultvalue,
    lazy_load_hdus=True,
)

In [5]:
# Column definitions of HDU 1 (the second HDU in the file).
# NOTE(review): `cols` is not used in any cell visible here — confirm it is needed.
cols = hdul[1].columns

In [6]:
# Data of HDU 1; converted to a pandas DataFrame in a later cell.
# NOTE(review): the file handle is closed before `data` is consumed; this
# presumably relies on astropy keeping the data accessible after close() —
# confirm for the memmap setting in use.
data = hdul[1].data

In [7]:
# Release the FITS file handle and drop the HDU list reference; `cols` and
# `data` obtained above are the only things kept from the file.
hdul.close()
del hdul

In [None]:
# Copy the FITS records into a plain structured ndarray, swap the bytes of
# every field, and reinterpret the result with the opposite byte order so the
# values are unchanged but stored in the swapped endianness before being
# handed to pandas.
# `ndarray.newbyteorder()` was removed in NumPy 2.0; viewing the swapped
# buffer with `dtype.newbyteorder()` is the equivalent that works on both
# NumPy 1.x and 2.x.
_fits_records = np.array(data)
DATA = pd.DataFrame(
    _fits_records.byteswap().view(_fits_records.dtype.newbyteorder())
)
# Drop the intermediate copy so it does not stay alive in the kernel.
del _fits_records

In [None]:
# Drop the reference to the raw FITS data; only the DataFrame `DATA` is used
# from here on.
del data

In [None]:
# Convenience handles on the `ra`, `dec` and `z` columns of the catalogue.
RA, DEC, Z = (DATA[column] for column in ('ra', 'dec', 'z'))

In [None]:
# Three-column feature table (ra, dec, z) fed to the scaler and the clusterer.
ra_dec_z = DATA.loc[:, ['ra', 'dec', 'z']]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise ra/dec/z using statistics fitted on the full table, keeping the
# fitted scaler around as `stdscaler`.
# (A commented-out remnant in the original hints that fitting on the
# `sdss_indx` subset was tried at some point.)
stdscaler = StandardScaler()
ra_dec_z_scaled = stdscaler.fit_transform(ra_dec_z)

In [None]:
# `import sklearn` alone does not guarantee the `metrics` submodule is loaded;
# import it explicitly so the score calls below do not depend on another
# package having imported it as a side effect.
import sklearn.metrics

# Offset added to a row's position to build a unique label for each noise point.
# Assumes fewer than 5,000,000 real clusters so the ids cannot collide with
# genuine cluster labels — TODO confirm for the catalogue size in use.
NOISE_LABEL_OFFSET = 5000000

# Cluster the standardised (ra, dec, z) table with HDBSCAN, using the public
# `hdbscan.HDBSCAN` entry point (the same class as `hdbscan.hdbscan_.HDBSCAN`).
# Trailing comments record the alternative values left in the original cell.
hdbScan = hdbscan.HDBSCAN(
    min_cluster_size=3,            # 2
    min_samples=15,                # 3
    algorithm='boruvka_kdtree',
    leaf_size=50,                  # 30
    approx_min_span_tree=True,
    gen_min_span_tree=True,
    core_dist_n_jobs=4,
    cluster_selection_method='eom',
    allow_single_cluster=False,
    prediction_data=False,
    match_reference_implementation=False,
).fit(ra_dec_z_scaled)

# Every row labelled -1 gets its own unique label (row position + offset),
# presumably so the metrics treat noise points as singletons rather than as one
# shared "-1" cluster. The assignment is done in place on `hdbScan.labels_`
# (no copy is taken), exactly as the original per-row loop did, but vectorised.
rcsed_labels = hdbScan.labels_
noise_rows = np.flatnonzero(rcsed_labels == -1)
rcsed_labels[noise_rows] = noise_rows + NOISE_LABEL_OFFSET

true = y
pred = rcsed_labels

# Compare against the reference labels only on the rows that have one.
pred_sdss = pred[sdss_indx]

fms = round(sklearn.metrics.fowlkes_mallows_score(true, pred_sdss), 5)
ars = round(sklearn.metrics.adjusted_rand_score(true, pred_sdss), 5)
nmi = round(sklearn.metrics.normalized_mutual_info_score(true, pred_sdss), 5)

print(fms)