In [1]:
%matplotlib inline
import pandas as pd
from astropy.table import Table
import numpy as np
from sklearn.model_selection import train_test_split
import shutil
from pathlib import Path
import matplotlib.pyplot as plt
from astropy.io import fits

from specbox import SpecSDSS
from scipy import stats
import multiprocessing

def sdss_spname(plate, mjd, fiberid):
    filename = 'spec-{:0>4}-{}-{:0>4}.fits'.format(plate, mjd, fiberid)
    return filename

In [2]:
df = Table.read('/store/public/databases/SDSS_QSO/DR16Q_v4.fits', character_as_bytes=False)

In [3]:
df = Table(df.as_array())

### Remove multidimensional columns 

In [4]:
df.remove_columns(['Z_DLA', 'NHI_DLA', 'CONF_DLA', 'PLATE_DUPLICATE', 
                   'MJD_DUPLICATE', 'FIBERID_DUPLICATE', 
                   'SPECTRO_DUPLICATE', 'PSFFLUX', 'PSFFLUX_IVAR', 
                   'PSFMAG', 'PSFMAGERR', 'EXTINCTION'])

In [5]:
df = df.to_pandas()

In [6]:
df.query('SN_MEDIAN_ALL>=10 and \
Z>0.6 and Z<2.3 and \
(ZWARNING==0 or ZWARNING==-1)', inplace=True)
# df.query('SN_MEDIAN_ALL>=3', inplace=True)

In [7]:
# df = df.sample(n=100, random_state=8888)# 取100个测试

In [8]:
df.shape

(70948, 171)

In [9]:
df['Filename'] = df.apply(lambda x: sdss_spname(x['PLATE'],x['MJD'],x['FIBERID']),axis=1)

In [10]:
df.loc[df.PLATE<3000, 'basepath'] = '/store/public/databases/sdss_sample/old_sdss/'
df.loc[df.PLATE>3000, 'basepath'] = '/store/public/databases/sdss_sample/new_eboss/'

In [11]:
df['fullpath'] = df['basepath'] + df['Filename']

In [12]:
df

Unnamed: 0,SDSS_NAME,RA,DEC,PLATE,MJD,FIBERID,AUTOCLASS_PQN,AUTOCLASS_DR14Q,IS_QSO_QN,Z_QN,...,GAIA_G_MAG,GAIA_G_FLUX_SNR,GAIA_BP_MAG,GAIA_BP_FLUX_SNR,GAIA_RP_MAG,GAIA_RP_FLUX_SNR,SDSS2GAIA_SEP,Filename,basepath,fullpath
0,000000.15+353104.2,0.000629,35.517841,7750,58402,802,QSO,QSO,1,0.844138,...,18.572096,338.031250,18.798977,36.917076,18.126667,48.053558,0.010250,spec-7750-58402-0802.fits,/store/public/databases/sdss_sample/new_eboss/,/store/public/databases/sdss_sample/new_eboss/...
4,000000.45+092308.2,0.001914,9.385637,11277,58450,705,QSO,QSO,1,2.033733,...,18.343128,318.294739,18.520456,56.963249,17.896278,73.990738,0.068326,spec-11277-58450-0705.fits,/store/public/databases/sdss_sample/new_eboss/,/store/public/databases/sdss_sample/new_eboss/...
17,000001.27-020159.7,0.005317,-2.033273,9345,57713,519,QSO,QSO,1,1.349210,...,19.468128,310.784088,19.931030,17.889416,18.682333,39.320343,0.061786,spec-9345-57713-0519.fits,/store/public/databases/sdss_sample/new_eboss/,/store/public/databases/sdss_sample/new_eboss/...
62,000003.94+263645.6,0.016441,26.612677,6877,56544,564,QSO,QSO,1,2.181968,...,19.460083,278.808929,19.643711,41.628937,19.083792,40.916561,0.061476,spec-6877-56544-0564.fits,/store/public/databases/sdss_sample/new_eboss/,/store/public/databases/sdss_sample/new_eboss/...
66,000004.10+335133.0,0.017118,33.859176,7750,58402,217,QSO,QSO,1,2.010720,...,18.895454,370.019806,19.271940,39.402596,18.227650,75.003082,0.047771,spec-7750-58402-0217.fits,/store/public/databases/sdss_sample/new_eboss/,/store/public/databases/sdss_sample/new_eboss/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750390,235958.66-011225.4,359.994458,-1.207057,9159,57666,9,QSO,QSO,1,1.772507,...,18.731903,511.626129,18.830873,51.431175,18.293493,51.251629,0.077930,spec-9159-57666-0009.fits,/store/public/databases/sdss_sample/new_eboss/,/store/public/databases/sdss_sample/new_eboss/...
750394,235959.06-090944.0,359.996092,-9.162233,650,52143,448,UNK,UNK,-1,-1.000000,...,18.411589,167.735901,18.501886,37.408081,17.989172,40.908638,0.065764,spec-0650-52143-0448.fits,/store/public/databases/sdss_sample/old_sdss/,/store/public/databases/sdss_sample/old_sdss/s...
750401,235959.44+103350.6,359.997704,10.564075,11546,58488,78,QSO,QSO,1,1.280721,...,19.234463,361.596771,19.428453,38.630497,18.630774,46.729725,0.049568,spec-11546-58488-0078.fits,/store/public/databases/sdss_sample/new_eboss/,/store/public/databases/sdss_sample/new_eboss/...
750405,235959.70+084505.2,359.998774,8.751452,6152,56164,12,QSO,QSO,1,2.196123,...,19.416855,216.896667,19.699894,24.340977,18.846588,41.665398,0.049323,spec-6152-56164-0012.fits,/store/public/databases/sdss_sample/new_eboss/,/store/public/databases/sdss_sample/new_eboss/...


In [13]:
df.reset_index(drop=True, inplace=True)
flist = df.fullpath

### Use the same `spectransform` function for preprocessing 

In [14]:
from pathlib import PurePath, PureWindowsPath
from astropy import units as u

In [15]:
# 光谱数据预处理：转到静止系、选定波长范围并重采样、空值处理、平滑、zscore
new_wave = np.linspace(2000, 3199, num=1200) * u.AA 

def spectransform(filename, z):
    try:
        sp = SpecSDSS(filename)
        sp.to_restframe(z, inplace=True)
        sp.trim([1950, 3250])
        sp.flux_conserve_resample(new_wave, True)
        idx = np.isnan(sp.flux.value)
        sp.flux.value[idx] = np.nanmedian(sp.flux.value)
        #         sp.smooth(5, 3, inplace=True, plot=False)
        sp.smooth(5, 3, inplace=True, plot=False, sigclip=True)
        f_norm = stats.zscore(sp.spec.flux.value)
    except:
        f_norm = np.zeros(1200)
    return f_norm

# def spectransform(filename, z):
#     sp = SpecSDSS(filename)
#     sp.to_restframe(z, inplace=True)
#     sp.trim([1950, 3250])
#     sp.flux_conserve_resample(new_wave, True)
#     idx = np.isnan(sp.flux.value)
#     sp.flux.value[idx] = np.nanmedian(sp.flux.value)
#     sp.smooth(5, 3, inplace=True, plot=False)
#     f_norm = stats.zscore(sp.spec.flux.value)
#     return f_norm

In [16]:
flux_list = []

In [17]:
flist

0        /store/public/databases/sdss_sample/new_eboss/...
1        /store/public/databases/sdss_sample/new_eboss/...
2        /store/public/databases/sdss_sample/new_eboss/...
3        /store/public/databases/sdss_sample/new_eboss/...
4        /store/public/databases/sdss_sample/new_eboss/...
                               ...                        
70943    /store/public/databases/sdss_sample/new_eboss/...
70944    /store/public/databases/sdss_sample/old_sdss/s...
70945    /store/public/databases/sdss_sample/new_eboss/...
70946    /store/public/databases/sdss_sample/new_eboss/...
70947    /store/public/databases/sdss_sample/new_eboss/...
Name: fullpath, Length: 70948, dtype: object

In [18]:
pool = multiprocessing.Pool(processes=80)
outputs = pool.starmap(spectransform, zip(flist.values,df.Z.values))

  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-

  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err 

  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance 

  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufun

  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  self.err = data['ivar']**-0.5 * 1e-17
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel

  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  self.err = data['ivar']**-0.5 * 1e-17
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  o

  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  result = super().__array_ufunc__(function, m

  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  result = super().__array_ufunc__(function, method, *arrays, **kwargs)
  out_variance = np.sum(pixel_uncer * resample_grid**2, axis=-1) / np.sum(
  out_variance = np.sum(pixel_uncer * re

In [19]:
# from os.path import basename
# for i, item in enumerate(flist):
#     f_norm = spectransform(item,df.iloc[i,9])
#     filename = basename(item)
#     flux_list.append(f_norm)
# #     print("No. {}: file {} done.".format(i, filename))

In [38]:
dfspec = pd.DataFrame(outputs)

In [39]:
dfspec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1190,1191,1192,1193,1194,1195,1196,1197,1198,1199
0,-0.502274,0.817633,0.841372,0.540732,0.598625,-0.185498,-0.373632,0.111117,0.805581,0.421076,...,-1.679034,-2.236290,-2.346670,-1.990350,-1.531344,-1.827857,-2.080005,-2.242348,-2.052466,-2.048533
1,2.385174,2.400318,2.305688,2.170084,2.158930,2.175016,2.167791,2.265539,2.291760,2.217184,...,-1.449068,-1.295823,-1.265804,-1.303415,-1.334047,-1.165335,-1.257762,-1.362080,-1.637229,-1.778415
2,-1.251200,-1.248959,-0.930249,-0.782729,-0.678302,-0.538710,-0.645687,-0.555659,-0.759908,-0.972962,...,0.216105,0.139883,-0.020445,0.072952,0.018701,-0.311196,-0.258092,-0.126815,-0.098484,-0.531955
3,1.114323,1.942271,1.970470,1.684900,1.455580,1.590525,1.525256,1.458184,1.549711,1.586158,...,-1.390632,-1.351587,-2.113100,-2.429761,-2.342877,-2.385254,-2.443711,-1.696821,-0.552631,-0.563039
4,1.812995,1.526468,1.753273,1.851525,1.713133,1.296644,1.091082,1.217587,1.691732,1.802180,...,-1.158123,-1.033174,-1.076161,-1.277758,-1.648644,-1.614341,-1.049001,-0.737170,-1.111809,-1.423360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70943,0.965142,1.325491,1.492452,1.313460,1.113065,1.129128,1.364720,1.518579,1.479688,-0.363857,...,-1.445646,-1.365981,-1.364484,-1.267746,-1.174022,-1.252856,-1.253364,-1.397802,-1.459718,-1.586850
70944,2.480158,2.055108,2.092333,2.269349,2.256526,2.295370,2.205232,2.277969,2.069695,2.096582,...,-1.575729,-1.444024,-1.410086,-1.507280,-1.502961,-1.595755,-1.552708,-1.532336,-1.369621,-1.724152
70945,1.178570,-0.267685,0.035759,0.533447,0.262927,0.248176,-0.041205,-0.136229,-0.092258,0.173166,...,-1.421607,-1.038397,-0.686900,-0.718665,-0.988417,-0.563946,-0.167370,-0.518354,-1.341732,-1.195662
70946,1.796319,1.409790,1.382368,1.335148,1.232228,1.259283,1.620753,1.600383,1.353516,1.134101,...,1.369613,-0.657458,-2.294022,-3.850269,-0.904658,-0.170311,-0.047470,-0.760304,-0.445848,-0.733999


#### Remove Nan

In [42]:
dfspec1 = dfspec.replace(0, np.nan)
dfspec1.dropna(subset=[0,1,2,100,500,900], inplace=True)
# dfspec1.dropna(subset=pd.Index(np.arange(70948)), inplace=True)

In [43]:
if dfspec1.isna().values.any():
    print('exist np.nan')
else:
    print('no np.nan')

no np.nan


#### Scale to 0-1

In [44]:
from sklearn import preprocessing
dfspec1 = preprocessing.minmax_scale(dfspec1.T).T
dfspec1 = pd.DataFrame(dfspec1)

In [45]:
dfspec1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1190,1191,1192,1193,1194,1195,1196,1197,1198,1199
0,0.318173,0.545868,0.549963,0.498100,0.508087,0.372820,0.340365,0.423988,0.543789,0.477459,...,0.115173,0.019041,0.000000,0.061468,0.140650,0.089499,0.046002,0.017996,0.050753,0.051431
1,0.960294,0.963273,0.944654,0.917973,0.915778,0.918943,0.917522,0.936754,0.941914,0.927240,...,0.205871,0.236023,0.241930,0.234530,0.228502,0.261698,0.243512,0.222987,0.168848,0.141069
2,0.191679,0.191965,0.232620,0.251437,0.264758,0.282564,0.268918,0.280402,0.254348,0.227171,...,0.378847,0.369125,0.348673,0.360587,0.353667,0.311585,0.318359,0.335105,0.338719,0.283426
3,0.817283,0.925940,0.929640,0.892163,0.862068,0.879778,0.871212,0.862410,0.874422,0.879205,...,0.488543,0.493667,0.393729,0.352172,0.363574,0.358013,0.350341,0.448360,0.598519,0.597153
4,0.862676,0.813048,0.852332,0.869349,0.845379,0.773241,0.737637,0.759548,0.841672,0.860803,...,0.348063,0.369705,0.362259,0.327341,0.263102,0.269043,0.366963,0.420974,0.356085,0.302122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70911,0.336543,0.372973,0.389852,0.371757,0.351498,0.353121,0.376939,0.392493,0.388562,0.202186,...,0.092821,0.100875,0.101027,0.110806,0.120282,0.112312,0.112260,0.097658,0.091399,0.078546
70912,1.000000,0.901511,0.910137,0.951153,0.948182,0.957183,0.936297,0.953151,0.904891,0.911121,...,0.060208,0.090725,0.098589,0.076068,0.077069,0.055567,0.065542,0.070262,0.107965,0.025816
70913,0.388412,0.225739,0.259870,0.315850,0.285422,0.283763,0.251214,0.240525,0.245471,0.275326,...,0.095948,0.139051,0.178587,0.175014,0.144673,0.192417,0.237023,0.197545,0.104932,0.121362
70914,0.890651,0.854669,0.852116,0.847721,0.838140,0.840658,0.874307,0.872411,0.849430,0.829005,...,0.850929,0.662229,0.509881,0.365010,0.639217,0.707577,0.719013,0.652655,0.681928,0.655104


#### Add Label

In [46]:
dfspec1['filename'] = flist
dfspec1['Label']='DR16Q'
flist_z = df.Z
dfspec1['redshift'] = flist_z

In [47]:
dfspec1.to_csv('/store/public/xiang/train_dr16q_v3.csv', index=False)