In [2]:
# Script that receives an unclassified PSF dataset and returns the dataset with the corresponding SEDs assigned by the PCA classifier

import numpy as np
import tensorflow as tf
from sklearn.decomposition import PCA
import sklearn.metrics as skm

2023-07-25 16:22:59.350131: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [49]:
# Load datasets
#
# `dataset_path` and `dataset_name` identify the dataset being classified (dataset_1).
# The save cell writes its result to `dataset_path + 'assigned_' + dataset_name`, so these
# two variables must always describe the file that dataset_1 is actually loaded from;
# dataset_1 is therefore loaded through them instead of through a separate hard-coded path.
# NOTE: the files are pickled object arrays (allow_pickle=True); only load trusted files.

dataset_path = '/Users/as274094/Documents/psf_dataset2/'
dataset_name = 'test_Euclid_res_20000_TestStars_id_002GT_100_bins_SNR_100_400.npy'
dataset_1 = np.load(dataset_path + dataset_name, allow_pickle=True)[()] # The dataset to classify

# The other dataset, concatenated with dataset_1 in order to make the PCA
dataset_2 = np.load('/Users/as274094/Documents/psf_dataset2/train_Euclid_res_52000_TrainStars_id_002GT_100_bins.npy', allow_pickle=True)[()]
# Loaded for reference; not used by the cells visible in this notebook
dataset_3 = np.load('/Users/as274094/Documents/psf_dataset2/test_Euclid_res_20000_TestStars_id_002GT_100_bins.npy', allow_pickle=True)[()]

# Inputs used by earlier runs, kept for reference:
#   dataset_1: '/Users/as274094/Documents/psf_dataset4/train_Euclid_2000_stars_id_004GT_350_bins.npy'
#   dataset_2: '/Users/as274094/Documents/psf_dataset3/test_Euclid_400_stars_id_003GT_350_bins.npy'

# Load the stars
noisy_stars_1 = dataset_1['noisy_stars']
noisy_stars_2 = dataset_2['noisy_stars']
noisy_stars_3 = dataset_3['noisy_stars']

# TODO: decide whether the validation stars should be excluded


In [52]:
# PCA decomposition
#
# The PCA basis is fitted on the stars of dataset_1 and dataset_2 together, then the
# stars of dataset_1 are projected onto it to obtain the classifier inputs.

fit_selection = np.concatenate((noisy_stars_1, noisy_stars_2), axis = 0)
N_components = 30
# Fixed seed: depending on the data size scikit-learn may pick the randomized SVD solver,
# which would otherwise give a slightly different basis on every run.
PCA_RANDOM_STATE = 0

# One row per star, all pixels of the stamp flattened into the columns. Using the star
# count for the first axis (instead of a hard-coded 1024 pixels) prevents a different
# stamp size from silently spreading one star over several rows.
# Assumes axis 0 indexes the stars — TODO confirm against the dataset generator.
flat_fit_selection = fit_selection.reshape(fit_selection.shape[0], -1)
flat_stars_1 = noisy_stars_1.reshape(noisy_stars_1.shape[0], -1)

pca = PCA(n_components= N_components, random_state=PCA_RANDOM_STATE)
pca.fit(flat_fit_selection)
x_to_convert = pca.transform(flat_stars_1)

In [3]:
# Load model

# Directory of the saved Keras model that is loaded below.
# NOTE(review): the directory name suggests a classifier trained on 30 PCA components of
# "dataset2" — confirm it is compatible with N_components and with the PCA basis that is
# fitted in this notebook.
model_path = '/Users/as274094/GitHub/Refractored_star_classifier/tensorflow_version/best_models/config1_PCA_dataset2B30/'
# Trained classifier; the prediction cell calls classifier.predict on the PCA features.
classifier = tf.keras.models.load_model(model_path)

def CtoSEDarray(c_values, variance):
    """Convert predicted scalar C values into integer SED class ids.

    The interval [1.25, 7.75) is cut into 13 bins of width 0.5, giving the classes
    0..12. Every other prediction is labelled as an anomaly with the id 20.

    Parameters
    ----------
    c_values : array-like of float
        Predicted C parameter, one value per star.
    variance : array-like of float
        Variance associated with each prediction (same shape as `c_values`, or
        broadcastable to it). Predictions with a variance above 1.00 are anomalies.

    Returns
    -------
    numpy.ndarray of int
        Class id per star: 0..12 for a valid prediction, 20 for an anomaly, i.e. when
        C is outside [1.25, 7.75), C is NaN/inf, or the variance is above 1.00.
    """
    c_values = np.asarray(c_values)
    variance = np.asarray(variance)

    # The upper bound is exclusive: C == 7.75 would fall into a 14th bin (class 13),
    # which does not exist. Non-finite predictions are rejected here as well, because
    # casting NaN/inf to int gives an arbitrary integer.
    in_range = np.isfinite(c_values) & (c_values >= 1.25) & (c_values < 7.75)
    valid = in_range & ~(variance > 1.00)

    # Replace rejected entries by a harmless in-range value before binning so that the
    # integer cast never sees NaN/inf; those entries are overwritten with 20 below.
    safe_c = np.where(valid, c_values, 1.25)
    sed_classes = ((safe_c - 1.25) // 0.5).astype(int)
    return np.where(valid, sed_classes, 20)
    
def calculate_success_rate(confusion_matrix):
    """Return the fraction of entries lying on the main diagonal or next to it.

    An entry counts as a success when it is on the main diagonal of the matrix or
    on the first diagonal directly above or below it. The sum of those entries is
    divided by the sum of the whole matrix.
    """
    exact_hits = np.trace(confusion_matrix)

    # First off-diagonals on either side of the main diagonal
    upper_band = np.diagonal(confusion_matrix, offset=1)
    lower_band = np.diagonal(confusion_matrix, offset=-1)
    near_misses = np.sum(upper_band) + np.sum(lower_band)

    return (exact_hits + near_misses) / np.sum(confusion_matrix)


In [53]:
# Make predictions and calculate metrics

SED_CLASS_LABELS = np.arange(13)  # the SED classes 0..12 that CtoSEDarray can return
ANOMALY_CLASS = 20                # id CtoSEDarray gives to rejected predictions
true_ids = np.ravel(dataset_1['SED_ids'])

C_pred = classifier.predict(x_to_convert, verbose = 1).reshape(-1) # Predict the scalar parameter C
# No variance estimate is available here: zeros are passed, so the variance cut never triggers.
class_pred = CtoSEDarray(C_pred,np.zeros_like(C_pred))

# The labels are passed explicitly so that the layout of the per-class scores and of the
# confusion matrix does not depend on which classes happen to occur in the data.
# scikit-learn ignores samples whose label is not listed, so report them instead of
# dropping them silently.
matrix_labels = np.append(SED_CLASS_LABELS, ANOMALY_CLASS)
unexpected_labels = np.setdiff1d(np.concatenate((true_ids, class_pred)), matrix_labels)
if unexpected_labels.size:
    print('Warning: labels ignored by the metrics below:', unexpected_labels)

# Mean of the per-class F1 scores over the SED classes only (the anomaly class is left out)
f1_per_class = skm.f1_score(true_ids, class_pred, labels = SED_CLASS_LABELS, average = None)
f1_mean = np.mean(f1_per_class)
print('Average F1 score:', f1_mean)

# Rows are the true classes, columns the predicted ones; the last row/column is the anomaly class
confusion_matrix = skm.confusion_matrix(true_ids, class_pred, labels = matrix_labels)
print("\nConfusion matrix:")
print(confusion_matrix)

# A star counts as a success when its predicted class is the true class or a neighbouring
# SED class. The anomaly column sits next to class 12 in the matrix but is not a
# neighbouring SED class, so only the SED block contributes hits; the anomalies still
# count in the total number of classified stars.
n_sed_classes = len(SED_CLASS_LABELS)
sed_block = confusion_matrix[:n_sed_classes, :n_sed_classes]
near_diagonal_hits = np.trace(sed_block) + np.trace(sed_block, offset=1) + np.trace(sed_block, offset=-1)
success_rate = near_diagonal_hits / np.sum(confusion_matrix)
print('\nSuccess rate:', success_rate)

Average F1 score: 0.34343918046252575

Confusion matrix:
[[337 324 200 129 112  50  19   6   1   0   0   0   0  11]
 [264 350 212 152 126  56  24  16   3   0   0   0   0  19]
 [ 68 177 206 273 240 113  55  44  24   4   0   0   0  13]
 [ 33 125 160 269 302 150  69  55  27   2   0   0   0  23]
 [ 31  35  86 150 365 246 135  80  54  21   4   0   0  24]
 [ 15   9  37  90 184 313 262 144 129  60  21   0   0   9]
 [ 14   8  16  45 109 220 243 221 156 110  56   1   0  14]
 [  8   6   7  13  51 103 197 266 291 211 120   2   0   8]
 [  6   9   9  12  43  76 128 241 292 242 148   9   0   8]
 [  8   6   2   9  13  52 100 153 268 310 254  17   0   3]
 [  3   2   3   0   9  12  10  46  72 211 665 155   0   6]
 [  3   0   0   3   1   6   2   2   9  55 335 828   1   5]
 [  0   0   1   1   0   1   1   2   1  11  10 242 958   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]]

Success rate: 0.6799723843595055


In [47]:
# Assign SEDs
#
# Look up, for every star, the concatenated SED that belongs to its predicted class.

concatenated_SEDs = np.load('concatenated_SEDs.npy', allow_pickle=True)[()]

ANOMALY_SED_ID = 20        # class id that marks an anomalous prediction
ANOMALY_FALLBACK_SED = 0   # SED class assigned to anomalies
# TODO: decide what should really be done about the anomalies; for now they receive
# the SED of class 0.

# Report the anomalies once, instead of printing one debug line per anomalous star
n_anomalies = int(np.count_nonzero(np.asarray(class_pred) == ANOMALY_SED_ID))
if n_anomalies:
    print(n_anomalies, 'anomalous stars were assigned the SED of class', ANOMALY_FALLBACK_SED)

SED_list = [
    concatenated_SEDs[ANOMALY_FALLBACK_SED if spectral_class == ANOMALY_SED_ID else spectral_class]
    for spectral_class in class_pred
]
SED_array = np.array(SED_list, dtype=object)

In [48]:
# Save the new dataset
#
# The result is stored in a shallow copy, so dataset_1 keeps its original 'SED_ids'.
# Overwriting them in place would make a re-run of the metrics cell compare the
# predictions with themselves and report perfect scores.
# NOTE(review): the output name is built from dataset_path / dataset_name — make sure
# they describe the file dataset_1 was loaded from.

assigned_dataset_1 = dict(dataset_1)
assigned_dataset_1['SEDs'] = SED_array
assigned_dataset_1['SED_ids'] = class_pred        # predicted classes replace the ids in the saved copy
assigned_dataset_1['F1'] = f1_mean
assigned_dataset_1['success_rate'] = success_rate

np.save(
        dataset_path + 'assigned_' + dataset_name,
        assigned_dataset_1,
        allow_pickle=True
    )

In [4]:
# Verification
#
# Reload a saved "assigned" dataset and print the F1 score and success rate stored in it.
# NOTE(review): this path is hard-coded and is not derived from dataset_path / dataset_name,
# so it may point to a file written by an earlier run rather than by the save cell above —
# confirm before trusting the printed numbers.

assigned_dataset = np.load('/Users/as274094/Documents/psf_dataset4/assigned_train_Euclid_2000_stars_id_004GT_350_bins.npy', allow_pickle=True)[()]
print(assigned_dataset['F1'],assigned_dataset['success_rate'])

0.6190589051319667 0.9545
