In [22]:
# Script that receives an unclassified PSF dataset and adds the remaining stars in the FOV with SEDs assigned by a CNN classifier

import numpy as np
import tensorflow as tf
import sklearn.metrics as skm

In [23]:
# Load the two datasets. They are assumed to be coherent with each other
# (nothing here checks that assumption).
dataset_path = '/Users/as274094/Documents/psf_dataset4/'
dataset_name_1 = 'train_Euclid_500_stars_id_004GT_350_bins.npy'  # Dataset with true SEDs
dataset_name_2 = 'train_Euclid_2000_stars_id_004GT_350_bins.npy'  # The dataset containing the whole FOV

# NOTE(review): allow_pickle=True executes pickled code on load; only use trusted files.
# Indexing the loaded 0-d array with () unwraps the stored object
# (presumably a dict, since later cells index it by string key).
dataset_1, dataset_2 = (
    np.load(f'{dataset_path}{file_name}', allow_pickle=True)[()]
    for file_name in (dataset_name_1, dataset_name_2)
)

In [24]:
# Stars of dataset_1 are assumed to be the first rows of dataset_2, so the
# stars still lacking an SED are every row after them.
n_labelled_stars = dataset_1['noisy_stars'].shape[0]
n_remaining_stars = dataset_2['noisy_stars'].shape[0] - n_labelled_stars
if n_remaining_stars < 0:
    raise ValueError(
        'dataset_2 has fewer stars than dataset_1; the datasets are not coherent'
    )

# Slice from an explicit start index instead of [-n_remaining_stars:]:
# with n_remaining_stars == 0 the negative form becomes [-0:] == [0:] and
# would select every star rather than none.
# Assumes 'SED_ids' and 'noisy_stars' list the stars in the same order and
# have the same length -- TODO confirm.
true_SEDs = np.array(dataset_2['SED_ids'][n_labelled_stars:])
# Add a trailing axis at position 3 (assumes the stars array is 3-D,
# e.g. (n_stars, height, width) -- TODO confirm).
x_to_convert = np.expand_dims(dataset_2['noisy_stars'][n_labelled_stars:], axis = 3)
true_SEDs.shape

(1500,)

In [25]:
# Load model

# Directory holding the saved CNN classifier
model_path = '/Users/as274094/GitHub/Refractored_star_classifier/tensorflow_version/best_models/CNN_model/'
# Restore the trained Keras model from that directory
classifier = tf.keras.models.load_model(filepath=model_path)
    
def calculate_success_rate(confusion_matrix):
    """Fraction of samples on the main diagonal or one cell off it.

    Args:
        confusion_matrix: 2-D array of counts.

    Returns:
        (sum of the main diagonal + sum of the diagonals at offsets +1 and -1)
        divided by the sum of all entries.
    """
    exact_hits = np.trace(confusion_matrix)
    # Entries one step above and one step below the main diagonal
    near_hits = np.trace(confusion_matrix, offset=1) + np.trace(confusion_matrix, offset=-1)
    return (exact_hits + near_hits) / np.sum(confusion_matrix)


In [26]:
# Make predictions and calculate metrics

# Number of leading per-class F1 scores that are averaged
# (presumably the 13 spectral classes -- TODO confirm).
n_f1_classes = 13

y_test_pred = classifier.predict(x_to_convert, verbose = 1)
# Predicted class = index of the largest network output for each star
class_predictions = np.argmax(y_test_pred, axis = 1)

f1_mean = np.mean(skm.f1_score(true_SEDs, class_predictions, average = None)[:n_f1_classes])
print('Average F1 score:', f1_mean)

# Pin the label set to every class index that can occur. Without `labels`,
# sklearn only keeps classes that appear in the data, so a missing class would
# compress the matrix and the +/-1 off-diagonals used by
# calculate_success_rate would no longer pair up neighbouring classes.
# Assumes class ids are non-negative integers starting at 0 -- TODO confirm.
n_labels = max(y_test_pred.shape[1], int(np.max(true_SEDs)) + 1)
confusion_matrix = skm.confusion_matrix(true_SEDs, class_predictions, labels = np.arange(n_labels))
print("\nConfusion matrix:")
print(confusion_matrix)

success_rate = calculate_success_rate(confusion_matrix)
print('\nSuccess rate:', success_rate)

Average F1 score: 0.6830978949430104

Confusion matrix:
[[ 69  37   3   0   0   0   0   0   0   0   0   0   0]
 [ 45  68  13   0   1   0   0   0   0   0   0   0   0]
 [  1   7  73  26   2   0   0   0   0   0   0   0   0]
 [  0   2  35  54  19   0   0   0   0   0   0   0   0]
 [  0   0   1  11  88  10   0   0   0   0   0   0   0]
 [  0   0   0   0  22  71  29   1   0   0   0   0   0]
 [  0   0   0   0   4  20  85  12   0   0   0   0   0]
 [  0   0   0   0   1   0  20  67  17   8   0   0   0]
 [  0   0   0   0   0   0   2  51  30  28   0   0   0]
 [  0   0   0   0   0   0   0   9  21  76   1   0   0]
 [  0   0   0   0   0   0   0   0   0   2 109   1   0]
 [  0   0   0   0   0   0   0   0   0   0   0 127   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 121]]

Success rate: 0.9766666666666667


In [27]:
# Assign SEDs
# Lookup of reference SEDs, indexed by predicted class
concatenated_SEDs = np.load('concatenated_SEDs.npy', allow_pickle=True)[()]

# One SED per star, picked by that star's predicted class
SED_list = [concatenated_SEDs[predicted_class] for predicted_class in class_predictions]
SED_array = np.array(SED_list, dtype=object)
SED_array.shape

(1500, 350, 2)

In [28]:
# Save the new dataset

# True labels/SEDs from dataset_1 first, then the classifier's assignments
all_SED_ids = np.append(dataset_1['SED_ids'], class_predictions)
all_SEDs = np.concatenate((dataset_1['SEDs'], SED_array))

# dataset_2 is updated in place and then written out, metrics included
dataset_2.update({
    'SED_ids': all_SED_ids,
    'SEDs': all_SEDs,
    'F1': f1_mean,
    'success_rate': success_rate,
})

np.save(f'{dataset_path}expanded_CNN_{dataset_name_1}', dataset_2, allow_pickle=True)

In [29]:
# Verification

# Build the path from the same pieces used when saving, so this check always
# reads the file that was just written even if dataset_path or
# dataset_name_1 is changed.
expanded_dataset_path = dataset_path + 'expanded_CNN_' + dataset_name_1
expanded_dataset = np.load(expanded_dataset_path, allow_pickle=True)[()]
print(expanded_dataset['F1'],expanded_dataset['success_rate'])

0.6830978949430104 0.9766666666666667
