In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [2]:
# Load the train and test datasets
data_path = '/Users/as274094/Documents/psf_dataset2/'

# Each file is read with pickling enabled and unwrapped with an empty-tuple index
# (presumably a dict written with np.save — confirm against the dataset generator).
train_dataset = np.load(
    data_path + 'train_Euclid_res_52000_TrainStars_id_002GT_100_bins.npy',
    allow_pickle=True,
)[()]
test_dataset = np.load(
    data_path + 'test_Euclid_res_20000_TestStars_id_002GT_100_bins.npy',
    allow_pickle=True,
)[()]

# The PCA datasets are written next to the input data.
# Alternative location: '/Users/as274094/GitHub/Refractored_star_classifier/tensorflow_version/'
output_path = data_path

In [3]:
# Pull the star images out of the loaded datasets

# Training set: noiseless and noisy star images
noiseless_train_stars = train_dataset['stars']
noisy_train_stars = train_dataset['noisy_stars']

# Test set: noiseless and noisy star images
noiseless_test_stars = test_dataset['stars']
noisy_test_stars = test_dataset['noisy_stars']

# Test-set subset with SNR >= 50: indices first, then the matching noisy stars and SED ids.
# NOTE(review): the "_50_400" suffix hints at an upper SNR of 400, but only the lower
# bound is enforced here — confirm the SNR range of the test set.
snr_50_400_indexes = np.where(test_dataset['SNR'] >= 50)[0]
noisy_test_stars_50_400 = noisy_test_stars[snr_50_400_indexes]
classes_50_400 = np.asarray(test_dataset['SED_ids'])[snr_50_400_indexes]

In [4]:
def perform_PCA(fit_selection, *transform_selection, n_components=24, n_features=1024, random_state=None):
    '''
    Fits a PCA on fit_selection and projects every set of transform_selection onto it.

    Each set is flattened to shape (-1, n_features) before being handed to the PCA, so
    with the defaults this is a 24 component PCA of 32x32 arrays.

    Parameters
    ----------
    fit_selection : array-like
        Images used to fit the PCA.
    *transform_selection : array-like
        Variable number of image sets to project with the fitted PCA.
    n_components : int, keyword-only
        Number of principal components to keep (default 24).
    n_features : int, keyword-only
        Number of pixels per flattened image (default 1024, i.e. 32x32).
    random_state : int, RandomState instance or None, keyword-only
        Forwarded to sklearn's PCA. Pass an int to get reproducible components when a
        randomized SVD solver is used. The default None keeps sklearn's default behaviour.

    Returns
    -------
    list
        One transformed array per set in transform_selection, in the same order.
    '''

    # Create the PCA object and fit it on the flattened images
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(np.asarray(fit_selection).reshape(-1, n_features))

    # Project every requested set with the same fitted PCA
    return [
        pca.transform(np.asarray(selection).reshape(-1, n_features))
        for selection in transform_selection
    ]



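- A: fit one PCA on the noiseless train + test sets, transform the noiseless train and test sets
- B: fit one PCA on the noisy train + test sets, transform the noisy train and test sets
- C: fit one PCA on the noiseless train + test sets, transform the noisy train and test sets
- D: fit one PCA on the noisy train set + the noisy test stars with SNR >= 50, transform those same two sets (only the test set is SNR-filtered)
- E: fit and transform the noisy train set, then separately fit and transform the noisy test set (one PCA per set)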

In [15]:
# Dataset A: PCA fitted on the noiseless train + test stars, applied to each noiseless set
fit_selection = np.concatenate([noiseless_train_stars, noiseless_test_stars])
x_train, x_test = perform_PCA(
    fit_selection,
    noiseless_train_stars,
    noiseless_test_stars,
)


In [18]:
# Dataset B: PCA fitted on the noisy train + test stars, applied to each noisy set
fit_selection = np.concatenate([noisy_train_stars, noisy_test_stars])
x_train, x_test = perform_PCA(
    fit_selection,
    noisy_train_stars,
    noisy_test_stars,
)

In [21]:
# Dataset C: PCA fitted on the noiseless train + test stars, applied to each noisy set
fit_selection = np.concatenate([noiseless_train_stars, noiseless_test_stars])
x_train, x_test = perform_PCA(
    fit_selection,
    noisy_train_stars,
    noisy_test_stars,
)

In [24]:
# Dataset D: PCA fitted on the noisy train stars + the SNR >= 50 noisy test stars,
# applied to those same two sets
fit_selection = np.concatenate([noisy_train_stars, noisy_test_stars_50_400])
x_train, x_test = perform_PCA(
    fit_selection,
    noisy_train_stars,
    noisy_test_stars_50_400,
)

In [28]:
# Dataset E: two independent PCAs, one fitted on (and applied to) the noisy train stars,
# the other fitted on (and applied to) the noisy test stars.
# perform_PCA returns a one-element list here, unpacked directly.
(x_train,) = perform_PCA(noisy_train_stars, noisy_train_stars)
(x_test,) = perform_PCA(noisy_test_stars, noisy_test_stars)

In [29]:
def SEDlisttoC(SED_list):
    '''
    Converts a sequence of SED ids into an array of C values, with C = 0.5 * id + 1.5.
    '''
    return 1.5 + 0.5 * np.asarray(SED_list)

# Regression targets for the full training set
y_train = SEDlisttoC(train_dataset['SED_ids'])

# Reserve 20,000 stars for validation. The split is not shuffled, so the PCA features,
# the targets and the SED ids are all cut at the same position.
x_train, x_val, y_train, y_val, sed_train, sed_val = train_test_split(
    x_train,
    y_train,
    train_dataset['SED_ids'],
    test_size=20000,
    shuffle=False,
)

# SED ids and regression targets for the test set
sed_test = test_dataset['SED_ids']
y_test = SEDlisttoC(sed_test)

In [26]:
# Test labels restricted to the SNR >= 50 stars. This goes with Dataset D, whose x_test
# only contains those stars.
# The length check keeps the labels aligned with x_test: when this cell runs after another
# variant (A, B, C or E, e.g. on "Restart & Run All"), x_test holds every test star and the
# labels must not be truncated. If every test star passes the SNR cut, the selection is a no-op.
if len(x_test) == len(snr_50_400_indexes):
    y_test = SEDlisttoC(test_dataset['SED_ids'])[snr_50_400_indexes]
    sed_test = np.array(test_dataset['SED_ids'])[snr_50_400_indexes]

In [30]:
# Gather the PCA features, the C targets and the SED ids of every split in one dict
PCA_dataset = {
    'train_stars_pca' : x_train,
    'validation_stars_pca' : x_val,
    'test_stars_pca' : x_test,
    'train_C' : y_train,
    'validation_C' : y_val,
    'test_C' : y_test,
    'train_SEDs': sed_train,
    'validation_SEDs' : sed_val,
    'test_SEDs' : sed_test
}

# Refuse to write a dataset whose features and labels are misaligned. This happens when
# the cells above were run for different dataset variants (e.g. SNR-filtered labels
# combined with unfiltered test features).
for split_name in ('train', 'validation', 'test'):
    split_sizes = (
        len(PCA_dataset[split_name + '_stars_pca']),
        len(PCA_dataset[split_name + '_C']),
        len(PCA_dataset[split_name + '_SEDs']),
    )
    if len(set(split_sizes)) != 1:
        raise ValueError(
            'Inconsistent sizes for the %s split (stars_pca, C, SEDs): %s'
            % (split_name, split_sizes)
        )

# The dict is pickled by np.save; the file name carries the dataset variant (here: E)
np.save(
    output_path + 'PCA_dataset2E.npy',
    PCA_dataset,
    allow_pickle=True
)