# Import all the needed packages

In [1]:
from natsort import natsorted
from collections import defaultdict
import numpy as np
import os 
import pandas as pd
import joblib

# Load the dataset

In [2]:
# Directory that holds the input file -- change this to your own location.
path = '/home/elias/Projects/Github_rep/SpectralClassifier/'
# Name of the file, inside `path`, that contains the spectra to classify.
file_with_data = 'test_sample'
# Full location of the input file: directory immediately followed by the file name.
infile_data = '{}{}'.format(path, file_with_data)

In [3]:
# Master dictionary: {target name: {spectral line: EW value (kept as text)}}.
unseen_data = defaultdict(dict)
with open(infile_data, 'r') as inf:
    # Iterate over the file object directly; there is no need to read every
    # line into memory first.
    for line in inf:
        record = line.strip()
        # Skip blank lines (they would otherwise fail on col[1] below) and
        # comment lines starting with '#'.
        if not record or record.startswith('#'):
            continue
        # Comma-separated record; surrounding whitespace (including the
        # trailing newline) is removed from every field.
        col = [field.strip() for field in record.split(',')]
        target_name = col[0]
        spec_lines  = col[1]
        ew_values   = col[3]  # col[2] is not used by this notebook
        unseen_data[target_name][spec_lines] = ew_values

# Spectral lines recorded for the first target in the file; an empty list when
# the file contains no data records at all.
all_spec_lines = list(next(iter(unseen_data.values()), {}))
print('Spectral Lines: {}'.format(all_spec_lines))

Spectral Lines: ['HeI/4009', 'HeI/4026', 'SiIV/4088', 'Hd/4100', 'SiIV/4116', 'HeI/4121', 'SiII/4130', 'HeI/4144', 'HeII/4200', 'FeII/4233', 'HeI/4387', 'OII/4416', 'HeI/4471', 'MgII/4481', 'HeII/4541', 'SiIII/4553', 'OII+CIII/4645', 'HeII/4686']


# Characteristic spectral lines

**Do not change these spectral lines, because these features have been used for the training of the models**

In [4]:
# Spectral lines used as model features, in the order in which their values are
# later assembled into the feature array.
selected_lines = [
    'HeI/4009', 'HeI/4026', 'SiIV/4088', 'SiIV/4116',
    'HeI/4121', 'SiII/4130', 'HeI/4144', 'HeII/4200',
    'FeII/4233', 'HeI/4387', 'OII/4416', 'HeI/4471',
    'MgII/4481', 'HeII/4541', 'SiIII/4553', 'OII+CIII/4645',
    'HeII/4686',
]
# Heading on one line, the list itself on the next.
print('Spectral Features are:', selected_lines, sep='\n')

Spectral Features are:
['HeI/4009', 'HeI/4026', 'SiIV/4088', 'SiIV/4116', 'HeI/4121', 'SiII/4130', 'HeI/4144', 'HeII/4200', 'FeII/4233', 'HeI/4387', 'OII/4416', 'HeI/4471', 'MgII/4481', 'HeII/4541', 'SiIII/4553', 'OII+CIII/4645', 'HeII/4686']


# Check the number of spectra that will be classified by the model

In [5]:
# Number of distinct targets read from the input file.
n_unseen = len(unseen_data)
print('Unseen data: {}'.format(n_unseen))
# Separator rule, 25 dashes wide.
print(25 * '-')

Unseen data: 8
-------------------------


# Create the arrays with the target IDs and the EW of each line

In [6]:
# Target identifiers, in the iteration order of `unseen_data`.
Targets_unseen = np.asarray(list(unseen_data.keys()))

# One row per target and one column per entry of `selected_lines`; the stored
# text values are converted to float64 when the array is built.
EW_values_unseen = np.asarray(
    [
        [unseen_data[target][line_name] for line_name in selected_lines]
        for target in unseen_data.keys()
    ],
    dtype=np.float64,
)

# Load the pre-trained model and apply it 

In [7]:
# Model file, resolved relative to the current working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files obtained from a trusted source.
model_file = "RF_best_model_17_lines_FINAL.joblib"
loaded_rf = joblib.load(model_file)

# Predicted label for every row of the feature array ...
unseen_predicted = loaded_rf.predict(EW_values_unseen)
# ... and the per-class probabilities behind each prediction.
unseen_predicted_probs = loaded_rf.predict_proba(EW_values_unseen)

# Produce a csv file with the final results and the confidence level of each prediction

In [8]:
filename='Classification_Results_RF'
# Positions of the per-class probabilities written after the four leading
# columns; 11 matches the number of class columns in the header below.
indexx=np.arange(0,11,1)

# Header of the results file. The class columns are presumably in the same
# order as the model's probability output -- TODO confirm against
# loaded_rf.classes_.
header_columns = ['ID', 'RF_ST', 'Probability', 'Confidence Level',
                  'B0', 'B1', 'B2', 'B3-B4', 'B5-B7', 'B8', 'B9',
                  'O2-O6', 'O7', 'O8', 'O9']


def _confidence_level(top_prob):
    """Return the confidence label for a prediction's highest class probability.

    Parameters
    ----------
    top_prob : float
        Largest of the class probabilities of one prediction.

    Returns
    -------
    str
        "Strong Candidate" above 0.64, "Candidate" below 0.49, and
        "Good Candidate" for everything in between. The two boundary values
        0.49 and 0.64 are included in "Good Candidate"; previously they matched
        no branch and the row was silently left out of the file.
    """
    if top_prob > 0.64:
        return "Strong Candidate"
    if top_prob < 0.49:
        return "Candidate"
    return "Good Candidate"


with open(filename,'w') as flname:
    flname.write(','.join(header_columns) + '\n')
    for ID,RF_ST,Probs in zip(Targets_unseen,unseen_predicted,unseen_predicted_probs):
        # Highest class probability; computed once and reused for both the
        # "Probability" column and the confidence label.
        top_prob = max(Probs)
        row = [str(ID), str(RF_ST), str(np.round(top_prob,4)), _confidence_level(top_prob)]
        # Per-class probabilities, rounded to 4 decimals, in `indexx` order.
        row.extend(str(np.round(Probs[i],4)) for i in indexx)
        # Exactly one comma-separated line per classified target.
        flname.write(','.join(row) + '\n')

# See the results as a DataFrame

In [9]:
# Read the results file written above back into a DataFrame; index_col=False
# keeps pandas from using the first column as the index.
results_file = "Classification_Results_RF"
df = pd.read_csv(results_file, index_col=False)
# Show up to the first 30 rows.
df.head(30)

Unnamed: 0,ID,RF_ST,Probability,Confidence Level,B0,B1,B2,B3-B4,B5-B7,B8,B9,O2-O6,O7,O8,O9
0,id_0,B9,0.3335,Candidate,0.0244,0.0322,0.0038,0.0549,0.1852,0.282,0.3335,0.0681,0.0091,0.0032,0.0036
1,id_1,B8,0.2787,Candidate,0.0875,0.0411,0.0011,0.0338,0.1459,0.2787,0.2755,0.1108,0.0094,0.0039,0.0124
2,id_2,B9,0.3119,Candidate,0.0256,0.0305,0.0038,0.0471,0.1692,0.3088,0.3119,0.0847,0.0109,0.0032,0.0043
3,id_3,B9,0.3119,Candidate,0.0256,0.0305,0.0038,0.0471,0.1692,0.3088,0.3119,0.0847,0.0109,0.0032,0.0043
4,id_4,B9,0.3151,Candidate,0.0477,0.0538,0.0049,0.0493,0.1825,0.2577,0.3151,0.0686,0.0107,0.0025,0.0072
5,id_5,B9,0.3381,Candidate,0.0251,0.0314,0.0038,0.0531,0.1811,0.2814,0.3381,0.0699,0.0091,0.0032,0.0036
6,id_6,B9,0.3324,Candidate,0.0251,0.0326,0.0038,0.054,0.1905,0.2757,0.3324,0.0699,0.0091,0.0032,0.0036
7,id_7,B9,0.3067,Candidate,0.0278,0.0279,0.0019,0.0471,0.176,0.275,0.3067,0.1173,0.0108,0.0046,0.0048
