# Imports

In [None]:
# Data
import numpy  as np
import pandas as pd

# Graphs
import matplotlib
import matplotlib.pyplot as plt
import seaborn           as sns

# Others
from   math import ceil, floor
import random as rd
import os
import warnings
from tqdm.notebook import tqdm
from tqdm.keras import TqdmCallback
warnings.filterwarnings("ignore", category=Warning)

# Tensorflow
import tensorflow as tf

# SKLearn
import sklearn
from sklearn.model_selection     import train_test_split
from sklearn.model_selection     import KFold
from sklearn.preprocessing       import StandardScaler, LabelEncoder

# Get current time to save files
from datetime import datetime

# Datasets

In [None]:
# Fix seed
seed = 1994
np.random.seed(seed)

# Loading CSVs
All_Data = pd.read_csv('AllData.csv')

# Preprocessing Data: one boolean mask per selection cut
Al0 = All_Data['r_auto'].values >= 16      # Lower bound on r_auto
Al1 = All_Data['r_auto'].values <= 21      # Upper bound on r_auto
Al2 = All_Data['nDet_auto']     >= 8       # Number of features
Al3 = All_Data['PhotoFlag']     == 0       # No problems in photometry
Al4 = All_Data['PROB_GAL']      >= 0.5     # Only galaxies
Al5 = All_Data['z_SDSS']        >  1e-4    # Filter out SDSS stars
Al7 = All_Data['zErr']          <  0.4     # Low z error
Al8 = All_Data['class_SDSS']    != 'QSO'   # No QSOs

# Keep only the rows that pass every cut.
# The explicit copy matters: new columns are added to All_Data and values are overwritten
# through .loc further down, which should happen on an independent frame and not on a
# filtered selection of the frame returned by read_csv.
All_Data = All_Data[Al0 & Al1 & Al2 & Al3 & Al4 & Al5 & Al7 & Al8].copy()

# Calculating ellipticity (flattening) from the 'A' and 'B' columns
All_Data['Ellipticity'] = 1 - All_Data['B'] / All_Data['A']

# Defining column names
# Features: columns containing 'auto' or '_mag' that neither start with 'e'/'n' nor end with 'err'
# Errors:   columns containing 'auto' or '_mag' that start with 'e' or end with 'err'
Extra_F   = ['FWHM_n', 'MUMAX', 'Ellipticity']
Features  = [s for s in All_Data.columns.values
             if ('auto' in s or '_mag' in s) and not s.startswith(('e', 'n')) and not s.endswith('err')]
Errors    = [s for s in All_Data.columns.values
             if ('auto' in s or '_mag' in s) and (s.startswith('e') or s.endswith('err'))]
Target    = ['z_SDSS']
Target_er = ['zErr']
zBPZ      = ['zb']

# Cap every error column at 1
for error in Errors:
    All_Data[error] = All_Data[error].clip(upper=1)

# Fix missing features: anything outside [0, 50] is replaced by 0
for feature in Features:
    All_Data.loc[(All_Data[feature] < 0) | (All_Data[feature] > 50), feature] = 0

# Colors, all taken relative to r_auto: new column = first column - second column
for _color, _first, _second in (
        ('u-r',   'uJAVA_auto', 'r_auto'),
        ('378-r', 'F378_auto',  'r_auto'),
        ('395-r', 'F395_auto',  'r_auto'),
        ('410-r', 'F410_auto',  'r_auto'),
        ('430-r', 'F430_auto',  'r_auto'),
        ('g-r',   'g_auto',     'r_auto'),
        ('515-r', 'F515_auto',  'r_auto'),
        ('r-660', 'r_auto',     'F660_auto'),
        ('r-i',   'r_auto',     'i_auto'),
        ('r-861', 'r_auto',     'F861_auto'),
        ('r-z',   'r_auto',     'z_auto'),
        ('r-W1',  'r_auto',     'w1_mag'),
        ('r-W2',  'r_auto',     'w2_mag'),
):
    All_Data[_color] = All_Data[_first] - All_Data[_second]
del _color, _first, _second

# Every column whose name contains '-' is treated as a color
Colors    = [s for s in All_Data.columns.values if ('-' in s)]

# Fix colors from missing features: limit them to the range [-10, 10]
for color in Colors:
    All_Data[color] = All_Data[color].clip(lower=-10, upper=10)

# Full list of network inputs
TrainingFeatures = Features + Extra_F + Colors

print('# Features:\n# %s' %(TrainingFeatures))
print('# Target:\n# %s' %Target)
print('# Errors:\n# %s' %Errors)
print()

##############################################################
# Splitting in training and test samples (30% of the rows go to the test sample)
TrainingSample, TestingSample = train_test_split(All_Data, test_size=0.30, random_state=seed)

##############################################################
# Verifying the number of objects in each sample
# (the two samples partition All_Data, so len(All_Data) is the total)
print('# Training Sample Objects = %s, Percentage of total = %.3g%%'
      %(len(TrainingSample), 100*len(TrainingSample)/len(All_Data)))
print('# Testing Sample Objects  = %s, Percentage of total = %.3g%%'
      %(len(TestingSample), 100*len(TestingSample)/len(All_Data)))
print('# Total Sample Objects    = %s' %len(All_Data))
print()
# Verifying the existence of duplicates in the samples:
# an inner merge on all shared columns keeps only rows present in both
print('# Number of matching rows between two samples (should be zero):')
print('# Train/Test = %s' %len(TrainingSample.merge(TestingSample, how='inner')))
print()
# Statistics
print('# Train max Z = %s' %TrainingSample['z_SDSS'].max())
print('# Test max Z  = %s' %TestingSample['z_SDSS'].max())
print()

##############################################################
# Scaling Data
# The scaler is fitted on the training sample only and then applied unchanged to the test sample.
Scaler = StandardScaler()

# `columns` receives the flat list of names. Wrapping it in another list
# (columns=[TrainingFeatures]) would make pandas build a one-level MultiIndex instead of
# plain column labels.
Scaled_Train_X = pd.DataFrame(Scaler.fit_transform(TrainingSample[TrainingFeatures]), columns=TrainingFeatures)
Scaled_Test_X  = pd.DataFrame(Scaler.transform(TestingSample[TrainingFeatures]),      columns=TrainingFeatures)

# Features equal to 0 in the unscaled samples are put back to 0 after scaling.
# The scaled frames have a fresh 0..N-1 index, so the masks are taken positionally
# (.values) rather than re-indexing the original samples for every feature.
for feature in Features:
    Scaled_Train_X.loc[TrainingSample[feature].values == 0, feature] = 0
    Scaled_Test_X .loc[TestingSample[feature].values == 0, feature] = 0
##############################################################
# Binning the redshift value
# Num_Bins equally spaced edges between z=0 and MaximumZ are passed to pd.cut, which gives
# Num_Bins-1 intervals labelled 0 .. Num_Bins-2. The one-hot target below still has Num_Bins
# columns, so its last column is never populated.
Num_Bins = 200

# Get maximum Z between the samples, rounded UP to two decimals.
# round() rounds to the nearest value and can end up below the true maximum, which would
# leave the highest-redshift object outside the last bin (NaN label).
MaximumZ = max(np.max(TrainingSample[Target].values), np.max(TestingSample[Target].values))
MaximumZ = float(max(np.ceil(MaximumZ*100)/100, MaximumZ))
print('# MaximumZ  = %s\n# Bin width = %s' %(MaximumZ, MaximumZ/(Num_Bins-1)))

# Creates the redshift bins between 0 and MaximumZ (intervals are open on the left edge)
Bin_Edges  = np.linspace(0, MaximumZ, Num_Bins)
Bin_Labels = np.arange(0, Num_Bins-1, 1)
TrainingSample['Bin']    = pd.cut(TrainingSample.z_SDSS, bins=Bin_Edges, labels=Bin_Labels)
TestingSample['Bin']     = pd.cut(TestingSample.z_SDSS,  bins=Bin_Edges, labels=Bin_Labels)

# Fail loudly instead of encoding objects that fell outside every bin
if TrainingSample['Bin'].isna().any():
    raise ValueError('Some training redshifts fall outside (0, MaximumZ] and have no bin')

# Encode class values as integers.
# The encoder is fitted on ALL possible bin labels, not only on the ones present in the
# training sample: fitting on the observed labels would renumber the classes whenever a
# bin is empty, and class k would no longer correspond to redshift bin k.
encoder = LabelEncoder()
encoder.fit(Bin_Labels)
encoded_Y = encoder.transform(TrainingSample['Bin'].astype(int))

# Convert integers to dummy variables (i.e. one hot encoded)
Target_Bins = tf.keras.utils.to_categorical(encoded_Y, num_classes=Num_Bins, dtype='float32')

##############################################################
# Plot Feature Histograms
# Set Plot_Hists to 1 to draw one train/test histogram per column.
Plot_Hists = 0
if Plot_Hists == 1:
    Features_to_plot = Features + Extra_F + Colors + zBPZ + Target

    # At least the 7x5 layout, with extra rows added when there are more than 35 columns
    # (a fixed 7x5 grid raises as soon as a 36th panel is requested).
    n_cols = 5
    n_rows = max(7, -(-len(Features_to_plot) // n_cols))

    # Create the whole grid at once instead of a single full-figure Axes that the
    # per-feature panels would then be drawn over.
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(15,15), squeeze=False)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    print()

    panels = ax.ravel()
    for feature, panel in zip(Features_to_plot, panels):
        # Common range so the train and test histograms share the same bin edges
        Feature_min = min(np.min(TrainingSample[feature]), np.min(TestingSample[feature]))
        Feature_max = max(np.max(TrainingSample[feature]), np.max(TestingSample[feature]))

        panel.hist(TrainingSample[feature], lw=2, range=(Feature_min, Feature_max), bins=20, histtype='step')
        panel.hist(TestingSample[feature], lw=2, range=(Feature_min, Feature_max), bins=20, histtype='step')

        panel.set_yscale('log')
        panel.set_xlabel(feature)
        panel.grid(lw=.5)

    # Hide the grid cells that received no histogram
    for panel in panels[len(Features_to_plot):]:
        panel.set_visible(False)

    fig.tight_layout()
    plt.show()

# Training

In [None]:
# Short aliases for the Keras pieces used by the model definition and the training cells
Input, Dense, BatchNormalization = (
    tf.keras.layers.Input,
    tf.keras.layers.Dense,
    tf.keras.layers.BatchNormalization,
)
K     = tf.keras.backend
Model = tf.keras.models.Model

In [None]:
# Fix seed for Python, NumPy and TensorFlow
seed = 0
rd.seed(seed)
np.random.seed(seed)
tf.compat.v1.random.set_random_seed(seed)
# NOTE: hash randomisation of the running interpreter is decided at start-up, so this
# assignment only reaches processes started from here on.
os.environ['PYTHONHASHSEED'] = str(seed)

# Scaling Data:
# The scaler is fitted on the training sample only and then applied unchanged to the test sample.
Scaler = StandardScaler()

# `columns` receives the flat list of names. Wrapping it in another list
# (columns=[TrainingFeatures]) would make pandas build a one-level MultiIndex instead of
# plain column labels.
Scaled_Train_X = pd.DataFrame(Scaler.fit_transform(TrainingSample[TrainingFeatures]), columns=TrainingFeatures)
Scaled_Test_X  = pd.DataFrame(Scaler.transform(TestingSample[TrainingFeatures]),      columns=TrainingFeatures)

# Fix missing features after scaling: features equal to 0 in the unscaled samples are put
# back to 0. The scaled frames have a fresh 0..N-1 index, so the masks are positional.
for feature in Features:
    Scaled_Train_X.loc[TrainingSample[feature].values == 0, feature] = 0
    Scaled_Test_X .loc[TestingSample[feature].values == 0, feature] = 0

# Hyper-parameters read by DN_Model_Def and by the training loop
kernel  = 'he_normal'   # Kernel initializer of the hidden Dense layers
Neurons = 5             # Base width; each hidden layer uses a multiple of it
Epochs  = 200

# Reset the Keras global state before building new models
K.clear_session()
def DN_Model_Def(NumFeat):
    """Build and compile the dense network that outputs a binned redshift PDF.

    The network is a stack of bias-free ReLU Dense layers, each followed by a
    BatchNormalization layer without learned scale, then one more Dense layer
    and a softmax layer with `Num_Bins` units named 'PDF'.

    Layer widths are multiples of the module-level `Neurons`; the initializer of
    the hidden layers is the module-level `kernel`. Both, as well as `Num_Bins`,
    are read when the function is called.

    Parameters
    ----------
    NumFeat : int
        Number of input features (length of one input row).

    Returns
    -------
    Model
        Model compiled with categorical cross-entropy, the Nadam optimizer
        (learning rate 0.01) and the categorical accuracy metric.
    """
    Input_Mags = Input(shape=(NumFeat,), name='Input_Dimensions')

    # Width multipliers of the Dense + BatchNormalization stages, in order
    stage_widths = (7, 6, 5, 5, 5, 4, 4, 4, 3, 3, 3)

    layer = Input_Mags
    for width in stage_widths:
        layer = Dense(Neurons*width, kernel_initializer=kernel, activation='relu', use_bias=False)(layer)
        layer = BatchNormalization(scale=False)(layer)

    # Last hidden layer: same form as the stages above but without batch normalization
    last_layer = Dense(Neurons*3, kernel_initializer=kernel, activation='relu', use_bias=False)(layer)

    # One softmax unit per redshift class
    pdf_output = Dense(Num_Bins, activation='softmax', name='PDF')(last_layer)

    # Define model linking inputs to outputs
    model = Model(inputs=Input_Mags, outputs=pdf_output)

    # Compile model.
    # `learning_rate` is the supported keyword (`lr` is a deprecated alias that newer Keras
    # versions reject) and `metrics` is passed as a list, as documented for compile().
    model.compile(loss='categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Nadam(learning_rate=0.01),
                  metrics=['categorical_accuracy'])

    return model

In [None]:
# Train: one independent model per fold of a 4-fold split of the training sample
kfold = KFold(n_splits=4, shuffle=True, random_state=seed)

# Current time and output folder
TimeNow = datetime.now().strftime("%d-%m-%Y/%Hh%Mm/")
Output_Dir = 'Results/DN/'+ TimeNow
print("# Output_Dir = '%s'" %Output_Dir)
# exist_ok=True replaces the separate isdir() test (no gap between check and creation)
os.makedirs(Output_Dir, exist_ok=True)

# The test rows are the same for every fold: build the frame used by the duplicate checks once
Test_Rows = pd.DataFrame(Scaled_Test_X.values)

# Save each fit separately, keyed by fold number
DN_Model     = {}
DN_Model_Fit = {}
for i, (train, validation) in enumerate(kfold.split(Scaled_Train_X, encoded_Y)):
    Train_Rows      = pd.DataFrame(Scaled_Train_X.values[train])
    Validation_Rows = pd.DataFrame(Scaled_Train_X.values[validation])

    print('##################################################################')    
    # Train/validation percentages are relative to the fold (train + validation)
    print('# Training Sample Objects   = %s,\tPercentage of total = %.3g%%' %(len(train), (100*len(train)/(len(train)+len(validation)))))
    print('# Validation Sample Objects = %s,\tPercentage of total = %.3g%%' %(len(validation), (100*len(validation)/(len(train)+len(validation)))))
    # NOTE(review): the test percentage is computed against the test sample itself (always 100%)
    print('# Testing Sample Objects    = %s,\tPercentage of total = %.3g%%' %(len(Scaled_Test_X), (100*len(Scaled_Test_X)/(len(Scaled_Test_X)))))
    print('#')
    # An inner merge on all columns keeps only rows that appear in both frames
    print('# Number of matching rows between two samples (should be zero):')
    print('# Train/Validation = %s' %len(pd.merge(Train_Rows, Validation_Rows, how='inner')))
    print('# Test/Validation  = %s' %len(pd.merge(Test_Rows,  Validation_Rows, how='inner')))
    print('# Train/Test       = %s' %len(pd.merge(Train_Rows, Test_Rows,       how='inner')))
    print()

    # Fresh model for this fold; the progress bar callback replaces the Keras log (verbose=0)
    DN_Model[i] = DN_Model_Def(len(TrainingFeatures))
    DN_Model_Fit[i] = DN_Model[i].fit(Scaled_Train_X[TrainingFeatures].iloc[train], Target_Bins[train],
                                      validation_data=(Scaled_Train_X[TrainingFeatures].iloc[validation], Target_Bins[validation]),
                                      epochs=Epochs, batch_size=256, verbose=0, callbacks=[TqdmCallback()])

In [None]:
# Plot and save Loss: one panel per fold with the training and validation curves
Folds = DN_Model_Fit.keys()

# Two panels per row; at least the 2x2 layout, with rows added when there are more than
# 4 folds (a fixed 2x2 grid raises as soon as a fifth panel is requested).
n_cols = 2
n_rows = max(2, -(-len(Folds) // n_cols))

# Create the whole grid at once instead of a single full-figure Axes that the
# per-fold panels would then be drawn over.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(14, 7), squeeze=False)
panels = ax.ravel()
for plt_idx, (fold, panel) in enumerate(zip(Folds, panels), start=1):
    panel.plot(DN_Model_Fit[fold].history['loss'], lw=2, alpha=1, label='Training')
    panel.plot(DN_Model_Fit[fold].history['val_loss'], lw=2, alpha=1, label='Validation')
    panel.set_ylim(bottom=2, top=4)

    panel.set_ylabel('Loss (CCE)')
    panel.set_xlabel('Epochs')
    # The legend is identical for all panels, so it is shown only once
    if plt_idx == 1:
        panel.legend()

# Hide the grid cells that received no curve
for panel in panels[len(Folds):]:
    panel.set_visible(False)

fig.tight_layout()
fig.savefig(Output_Dir+'Loss.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Save model: write every fold's model and its training history under Output_Dir
Save_Model = True
Folds = DN_Model_Fit.keys()

if Save_Model:
    for fold in Folds:
        Model_Path   = Output_Dir+'Fold_%s' %fold
        History_Path = Output_Dir+'/Seed'+str(seed)+'_Fold%s.csv' %fold

        # Save model in TF format
        DN_Model[fold].save(Model_Path, overwrite=True)

        # Save training history (one column per entry of the Keras history dict)
        History_DF = pd.DataFrame(DN_Model_Fit[fold].history)
        History_DF.to_csv(History_Path, index=False)

In [None]:
# Make predictions
from scipy import integrate # To calculate the CDF of objects

# To calculate PDFs
def Calc_PDF(x, Weights, Means, STDs):
    """Evaluate a weighted sum of Gaussians on the grid `x`, normalised to unit area.

    Parameters
    ----------
    x : array-like, 1-D
        Grid on which the PDF is evaluated.
    Weights, Means, STDs : array-like
        Weight, mean and standard deviation of each Gaussian component; they
        are broadcast against a column of `x` values.

    Returns
    -------
    numpy.ndarray
        The mixture evaluated at every point of `x`, divided by its trapezoidal
        integral over `x`.
    """
    # Accept plain sequences as well as arrays (x[:, None] needs an ndarray)
    x, Weights, Means, STDs = (np.asarray(a, dtype=float) for a in (x, Weights, Means, STDs))

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use whichever exists
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # One row per grid point, one column per Gaussian component
    Gaussians = np.exp(-0.5 * ((x[:, None] - Means) / STDs)**2) / (STDs * np.sqrt(2*np.pi))
    PDF = np.sum(Weights * Gaussians, axis=1)
    return PDF / trapezoid(PDF, x)

# General function to find the nearest idx of an item in a list
def find_nearest_idx(array, value):
    """Return the index of the element of `array` closest to `value`.

    `array` may be any array-like; when several elements are equally close,
    the first one wins (argmin behaviour).
    """
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()

# Step function
def step(x,y):
    """Return 1 where `x` is strictly greater than `y` and 0 elsewhere (element-wise)."""
    is_above = x > y
    return is_above * 1

# Calculate HPDCI per object
# https://stackoverflow.com/questions/33345780/empirical-cdf-in-python-similiar-to-matlabs-one
def Check_Intervals(x):
    """Split a sequence of numbers into runs of neighbouring values.

    Consecutive elements stay in the same run while they differ by at most 1;
    a larger jump starts a new run. An empty input gives `[[]]`.
    """
    runs = [[]]
    previous = None
    for value in x:
        starts_new_run = previous is not None and abs(previous - value) > 1
        if starts_new_run:
            runs.append([value])
        else:
            runs[-1].append(value)
        previous = value
    return runs

def Calculate_HPDCI(x, pdf_object, zspec):
    """Integrate the PDF over the region where it is at least as high as at `zspec`.

    Parameters
    ----------
    x : numpy.ndarray, 1-D
        Grid on which `pdf_object` is sampled.
    pdf_object : numpy.ndarray, 1-D
        PDF values on `x`.
    zspec : float
        Reference value; the PDF at the grid point nearest to it sets the threshold.

    Returns
    -------
    float
        Sum of the trapezoidal integrals over every run of neighbouring grid
        points whose PDF value reaches the threshold. A run made of a single
        grid point contributes zero area.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use whichever exists
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # Grid points at least as probable as the one nearest to zspec ...
    threshold = pdf_object[find_nearest_idx(x, zspec)]
    HPDCI_Indexes = np.where(pdf_object >= threshold)[0].tolist()
    # ... grouped into contiguous index runs, each integrated on its own
    HPDCI_Indexes = Check_Intervals(HPDCI_Indexes)

    Object_HPDCI = 0
    for run in HPDCI_Indexes:
        Object_HPDCI += trapezoid(pdf_object[run], x[run])

    return Object_HPDCI

Folds = DN_Model.keys()

if len(Folds) > 1:
    print('# Predicting for %s folds' %len(Folds))

    # Redshift grid: one point per network output (Num_Bins of them) between 0 and MaximumZ
    x = np.linspace(0, MaximumZ, Num_Bins)

    # Spectroscopic redshifts of the test objects, extracted once
    Test_Z = TestingSample['z_SDSS'].values

    # np.trapz -> np.trapezoid (NumPy 2.0) and integrate.cumtrapz -> cumulative_trapezoid
    # (the old SciPy name has been removed): use whichever name the installed version has
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz
    cumulative_trapezoid = getattr(integrate, 'cumulative_trapezoid', None) or integrate.cumtrapz

    Fold_PDFs    = {}
    Fold_PhotoZs = {}
    Fold_CRPS    = {}
    Fold_PITs    = {}
    Fold_Odds    = {}

    for fold in tqdm(Folds):
        # Raw network output: one row of Num_Bins softmax values per test object
        Fold_PDFs[fold] = DN_Model[fold].predict(Scaled_Test_X)

        # Calculate ZPhots, PITs, CRPS, and Odds per fold
        Fold_PhotoZs[fold] = []
        Fold_Odds[fold]    = []
        Fold_PITs[fold]    = []
        Fold_CRPS[fold]    = []

        for Obj_PDF, Obj_Z in zip(Fold_PDFs[fold], Test_Z):
            # Photo-z: mean of the grid weighted by the PDF of the object
            Zphot = np.average(x, weights=Obj_PDF)
            Fold_PhotoZs[fold].append(Zphot)

            # The softmax values sum to 1 but do not integrate to 1 over x. Normalise to
            # unit area first, otherwise the "CDF" below would end near the grid spacing
            # instead of 1 and PIT, Odds and CRPS would be on the wrong scale.
            Obj_Density = Obj_PDF / trapezoid(Obj_PDF, x)

            # From the normalised PDF, calculate the CDF
            Obj_CDF = cumulative_trapezoid(Obj_Density, x, initial=0)
            # Calculate the Odds of object i (arXiv 9811189, eq. 17): here the integral of
            # the PDF between Zphot +/- 0.02, read from the CDF at the nearest grid points
            Fold_Odds[fold].append( Obj_CDF[find_nearest_idx(x, Zphot+0.02)] - Obj_CDF[find_nearest_idx(x, Zphot-0.02)] )
            # Calculate the PIT of object i (arXiv 1608.08016, eq. 2)
            Fold_PITs[fold].append( Obj_CDF[find_nearest_idx(x, Obj_Z)] )
            # Calculate the CRPS of object i (arXiv 1608.08016, eq. 4)
            Fold_CRPS[fold].append( trapezoid((Obj_CDF - step(x, Obj_Z))**2, x) )

    # Per-object results: every statistic is averaged over the folds
    Result_DF = pd.DataFrame()
    Result_DF['r_auto']      = TestingSample['r_auto'].values
    Result_DF['z']           = Test_Z
    Result_DF['class_SDSS']  = TestingSample['class_SDSS'].values
    Result_DF['zml']         = np.mean([Fold_PhotoZs[fold] for fold in Folds], axis=0)
    Result_DF['Odds']        = np.mean([Fold_Odds[fold] for fold in Folds], axis=0)
    Result_DF['PIT']         = np.mean([Fold_PITs[fold] for fold in Folds], axis=0)
    Result_DF['CRPS']        = np.mean([Fold_CRPS[fold] for fold in Folds], axis=0)