In [3]:
import numpy as np
import pandas as pd
import glob
from astropy.table import Table
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import hdbscan
import matplotlib.pyplot as plt
from itertools import combinations
import umap
from joblib import Parallel, delayed
import seaborn as sns; sns.set()
sns.set_theme(style="ticks")

In [None]:
data = Table.read("../Lomeli-data/catalog_all_bands_all_fovs_all_sources_106_rband.dat", format='ascii')

In [None]:
data

In [None]:
combined_df = data.to_pandas()

In [None]:
for columns in combined_df.columns:
    print(columns)

In [None]:
m_x =  (combined_df["r"] >= 13) & (combined_df["r"] <= 22)
m_err = (combined_df["rerr"] <= 0.2) & (combined_df["gerr"] <= 0.2) & \
        (combined_df["ierr"] <= 0.2) & (combined_df["uerr"] <= 0.2) & \
        (combined_df["F378err"] <= 0.2) & (combined_df["F395err"] <= 0.2) & \
        (combined_df["F410err"] <= 0.2) & (combined_df["F430err"] <= 0.2) & \
        (combined_df["F515err"] <= 0.2) & (combined_df["F660err"] <= 0.2) & \
        (combined_df["F861err"] <= 0.2) & (combined_df["zerr"] <= 0.2)

flags = (combined_df["flags_r"] == 0) & (combined_df["flags_i"] == 0)

mask = m_x & m_err & flags

df_cleanErr = combined_df[mask]
len(df_cleanErr)

In [None]:
#Selecting columns
columns = ["r",
"g",
"i",
"u",
"z",
"F378",
"F395",
"F410",
"F430",
"F515",
"F660",
"F861"]


In [None]:
df_mag = df_cleanErr[columns]
df_mag

In [None]:
# Generate all combinations of magnitude columns
color_index_pairs = list(combinations(df_mag, 2))
len(color_index_pairs)

In [None]:
def calculate_earnings(df, index_pairs):
    for index_pair in index_pairs:
        color_index_name = f"{index_pair[0]} - {index_pair[1]}"
        df.loc[:, color_index_name] = df[index_pair[0]] - df[index_pair[1]]
    return df


In [None]:
df_colors_mag = calculate_earnings(df_mag, color_index_pairs)

In [None]:
df_colors_mag

In [None]:
# Drop magniytudes
df_colors = df_colors_mag.drop(columns=columns)

In [None]:
df_colors

# Umap

In [None]:
# Standarized the data
X_stand = StandardScaler().fit_transform(df_colors)


Try cross-validation

To perform cross-validation for choosing the optimal number of components in UMAP.


In [39]:
# Split data into training and validation sets
X_train, X_val = train_test_split(X_stand, test_size=0.2, random_state=42)

In [40]:
# Define a range of number of components to try
num_components_range = [2, 3, 4, 5, 10, 20, 50]
n_neighbors_range = [5, 10, 20, 50, 100]

best_num_components = None
best_n_neighbors = None
best_silhouette_score = -1  # Initialize with a low value

In [None]:
# Loop over different numbers of components and neighbors
for num_components in num_components_range:
    for n_neighbors in n_neighbors_range:
        # Fit UMAP model
        reducer_ = umap.UMAP(n_neighbors=n_neighbors, n_components=num_components, random_state=42)
        X_train_trans = reducer_.fit_transform(X_train)

        # Cluster the transformed data using KMeans
        kmeans = KMeans(n_clusters=num_components, random_state=42)
        labels = kmeans.fit_predict(X_train_trans)

        # Evaluate performance on validation set using Silhouette Score
        silhouette = silhouette_score(X_train_trans, labels, random_state=42)
        print(f"Number of components: {num_components}, Number of neighbors: {n_neighbors}, Silhouette Score: {silhouette}")

        # Update best parameters if necessary
        if silhouette > best_silhouette_score:
            best_silhouette_score = silhouette
            best_num_components = num_components
            best_n_neighbors = n_neighbors
            best_labels = labels

Number of components: 2, Number of neighbors: 5, Silhouette Score: 0.437427818775177
Number of components: 2, Number of neighbors: 10, Silhouette Score: 0.4708039164543152
Number of components: 2, Number of neighbors: 20, Silhouette Score: 0.5006764531135559
Number of components: 2, Number of neighbors: 50, Silhouette Score: 0.5226908922195435
Number of components: 2, Number of neighbors: 100, Silhouette Score: 0.5398621559143066
