# Import all the needed packages

In [1]:
from natsort import natsorted
from collections import defaultdict
import numpy as np
import os 
import pandas as pd
import joblib

# Load the dataset

In [2]:
# Directory that contains the input file -- change this to your own location.
path = '/home/elias/Projects/Github_rep/SpectralClassifier/'
# Name of the comma-separated input file inside `path`.
file_with_data = 'test_sample'
# os.path.join only inserts a separator when one is needed, so `path` works with or
# without a trailing slash (plain string concatenation produced a wrong path without it).
infile_data = os.path.join(path, file_with_data)

In [3]:
# Master dictionary: unseen_data[target_name][spectral_line] = EW value.
# Values are kept as the raw text read from the file; conversion to float happens later.
unseen_data = defaultdict(dict)
with open(infile_data, 'r') as inf:
    # Iterate over the file object directly instead of loading every line with readlines().
    for line_number, line in enumerate(inf, start=1):
        record = line.strip()
        # Skip blank lines (they used to crash with IndexError on col[1]) and '#' comments.
        if not record or record.startswith('#'):
            continue
        col = record.split(',')
        if len(col) < 4:
            raise ValueError(
                'Line {} of {!r} has {} comma-separated field(s); at least 4 are required'
                .format(line_number, infile_data, len(col))
            )
        # Columns used: 0 = target ID, 1 = spectral line name, 3 = EW value.
        # Strip surrounding whitespace so keys match the names in the line lists exactly.
        target_name = col[0].strip()
        spec_lines = col[1].strip()
        ew_values = col[3].strip()
        unseen_data[target_name][spec_lines] = ew_values

# Spectral lines available for the first target (empty list when the file held no data rows).
all_spec_lines = list(next(iter(unseen_data.values()), {}))
print('Spectral Lines: {}'.format(all_spec_lines))

Spectral Lines - Features: ['HeI/4009', 'HeI/4026', 'SiIV/4088', 'Hd/4100', 'SiIV/4116', 'HeI/4121', 'SiII/4130', 'HeI/4144', 'HeII/4200', 'FeII/4233', 'HeI/4387', 'OII/4416', 'HeI/4471', 'MgII/4481', 'HeII/4541', 'SiIII/4553', 'OII+CIII/4645', 'HeII/4686']


# Select the characteristic spectral lines 

In [4]:
# The 17 spectral lines handed to the classifier as features.
# NOTE(review): the order presumably has to match the feature order the model
# was trained with -- confirm before reordering.
selected_lines = [
    'HeI/4009',
    'HeI/4026',
    'SiIV/4088',
    'SiIV/4116',
    'HeI/4121',
    'SiII/4130',
    'HeI/4144',
    'HeII/4200',
    'FeII/4233',
    'HeI/4387',
    'OII/4416',
    'HeI/4471',
    'MgII/4481',
    'HeII/4541',
    'SiIII/4553',
    'OII+CIII/4645',
    'HeII/4686',
]
# Header and list are emitted on separate lines, exactly as two print() calls would.
print('Spectral Features are:', selected_lines, sep='\n')

Spectral Features are:
['HeI/4009', 'HeI/4026', 'SiIV/4088', 'SiIV/4116', 'HeI/4121', 'SiII/4130', 'HeI/4144', 'HeII/4200', 'FeII/4233', 'HeI/4387', 'OII/4416', 'HeI/4471', 'MgII/4481', 'HeII/4541', 'SiIII/4553', 'OII+CIII/4645', 'HeII/4686']


# Check the number of spectra that will be classified by the model

In [5]:
# Report how many targets were loaded, followed by a 25-character dashed rule.
print(
    'Unseen data: {}'.format(len(unseen_data)),
    '-' * 25,
    sep='\n',
)

Unseen data: 8
-------------------------


# Create the arrays with the target IDs and the EW values of each selected line

In [6]:
# Feature matrix: one row per target (dictionary order), one column per entry of
# selected_lines (same order). The stored EW values are converted to float64 here.
# A target that lacks one of the selected lines raises KeyError.
EW_values_unseen = np.asarray(
    [
        [unseen_data[target_id][line_name] for line_name in selected_lines]
        for target_id in unseen_data.keys()
    ],
    dtype=np.float64,
)

# Target IDs, in the same order as the rows of EW_values_unseen.
Targets_unseen = np.asarray(list(unseen_data.keys()))

# Load the pre-trained model and apply it 

In [7]:
# Deserialize the pre-trained classifier. The file name carries no directory, so it is
# resolved against the current working directory (not against `path` used for the data).
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading -- only open model files that come from a trusted source.
loaded_kderf = joblib.load("KDE_RF_best_model_17_lines_FINAL.joblib")
# Predicted label for every row of EW_values_unseen.
unseen_predicted = loaded_kderf.predict(EW_values_unseen)
# Per-class probabilities for every row.
# NOTE(review): the column order is presumably that of the model's classes -- confirm
# it matches the class columns written to the results file.
unseen_predicted_probs = loaded_kderf.predict_proba(EW_values_unseen)

# Produce a CSV file with the final results and the confidence level of each prediction

In [10]:
# Output file (comma-separated text; no extension, the reader below opens this exact name).
filename = 'Classification_Results_KDE_RF'

# Confidence thresholds applied to the highest class probability of each prediction.
STRONG_CANDIDATE_MIN = 0.64   # strictly above  -> "Strong Candidate"
CANDIDATE_MAX = 0.49          # strictly below  -> "Candidate"

# Header names of the per-class probability columns.
# NOTE(review): assumed to follow the column order of predict_proba -- confirm against the model.
class_columns = ['B0', 'B1', 'B2', 'B3-B4', 'B5-B7', 'B8', 'B9', 'O2-O6', 'O7', 'O8', 'O9']

# Positions of the probability columns that are written for every row (0..10).
indexx = np.arange(0, 11, 1)


def _confidence_level(probability):
    """Map the top class probability to its confidence label.

    Values above 0.64 are "Strong Candidate", values below 0.49 are "Candidate",
    and everything in between -- including exactly 0.49 and 0.64 -- is
    "Good Candidate". The boundary values previously matched none of the three
    tests, so those predictions were silently left out of the file.
    """
    if probability > STRONG_CANDIDATE_MIN:
        return "Strong Candidate"
    if probability < CANDIDATE_MAX:
        return "Candidate"
    return "Good Candidate"


with open(filename, 'w') as flname:
    header = ['ID', 'KDE_RF_ST', 'Probability', 'Confidence Level'] + class_columns
    flname.write(','.join(header) + '\n')
    for ID, KDE_RF_ST, Probs in zip(Targets_unseen, unseen_predicted, unseen_predicted_probs):
        top_probability = max(Probs)
        # Exactly one row per prediction: ID, label, top probability, confidence label ...
        row = [
            str(ID),
            str(KDE_RF_ST),
            str(np.round(top_probability, 4)),
            _confidence_level(top_probability),
        ]
        # ... followed by the rounded probability of every class.
        row.extend(str(np.round(Probs[i], 4)) for i in indexx)
        flname.write(','.join(row) + '\n')

# See the results as a DataFrame

In [11]:
# Load the results file back for inspection.
# index_col=False keeps pandas from using the first column (the IDs) as the index.
df = pd.read_csv(
    filepath_or_buffer="Classification_Results_KDE_RF",
    index_col=False,
)
# Show up to the first 30 predictions as the cell output.
df.head(n=30)

Unnamed: 0,ID,KDE_RF_ST,Probability,Confidence Level,B0,B1,B2,B3-B4,B5-B7,B8,B9,O2-O6,O7,O8,O9
0,id_0,B9,0.4216,Candidate,0.0252,0.0372,0.0076,0.0857,0.1327,0.2256,0.4216,0.0553,0.0014,0.0069,0.0007
1,id_1,B9,0.303,Candidate,0.1056,0.0538,0.0028,0.0511,0.1079,0.2158,0.303,0.134,0.0022,0.0111,0.0128
2,id_2,B9,0.3811,Candidate,0.0246,0.0357,0.0066,0.0632,0.119,0.2874,0.3811,0.0721,0.0,0.0097,0.0007
3,id_3,B9,0.3811,Candidate,0.0246,0.0357,0.0066,0.0632,0.119,0.2874,0.3811,0.0721,0.0,0.0097,0.0007
4,id_4,B9,0.3804,Candidate,0.0607,0.0749,0.0087,0.071,0.1235,0.2176,0.3804,0.05,0.0022,0.0075,0.0035
5,id_5,B9,0.4273,Candidate,0.0239,0.0377,0.0078,0.0698,0.1295,0.2314,0.4273,0.0645,0.0,0.0075,0.0007
6,id_6,B9,0.4189,Candidate,0.0267,0.0391,0.0078,0.0797,0.1316,0.2281,0.4189,0.0601,0.0,0.0075,0.0007
7,id_7,B9,0.3443,Candidate,0.0286,0.0282,0.0017,0.0721,0.1285,0.2458,0.3443,0.13,0.0022,0.0124,0.0063
