In [9]:
import os
# Root of the scratch area that holds the input tables and the output store.
# The value is also published through the process environment.
os.environ['SCRATCH'] = '/global/cscratch1/sd/bos0109'
# Read it back with plain Python rather than the `%env` line magic so that the
# cell stays valid outside IPython (e.g. when the notebook is exported to .py).
SCRATCH = os.environ['SCRATCH']

In [1]:
import numpy as np
import pandas as pd
from astropy.stats import sigma_clipped_stats 

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the three CSV inputs from the results directory in one go; the tuple
# order below fixes which file ends up in which name.
# Tip: for a quick dry run, subsample the bogus table afterwards, e.g.
# bogus = bogus.sample(frac=0.01).
reals, bogus, matched_lcs = (
    pd.read_csv(csv_path)
    for csv_path in (
        f'{SCRATCH}/results/reals_table_post.csv',
        f'{SCRATCH}/results/bogus_table_post.csv',
        f'{SCRATCH}/results/matched_lcs_visit_by_visit.csv',
    )
)

In [4]:
# Attach the class label: every row of `reals` is a positive, every row of
# `bogus` a negative.
reals['REAL'], bogus['REAL'] = True, False

In [5]:
# Row counts per class (reals, bogus) -- a quick look at the class balance.
reals.shape[0], bogus.shape[0]

(1432, 634976)

In [6]:
from sklearn import preprocessing

In [7]:
# Candidate feature columns: start from everything in `reals`, then strip the
# identifier, label, cross-match, detector and sky-position columns.
# list.remove raises ValueError if any of these is absent from the table.
cols = reals.columns.tolist()
for acol in (
    'id', 'Unnamed: 0', 'REAL', 'cxmatch',
    'sn_row', 'match_ang_dist', 'sn_id',
    'raft', 'sensor', 'filter',
    'coord_ra', 'coord_dec',
):
    cols.remove(acol)

In [8]:
# Remove the positive-lobe centroid coordinates by name.
# NOTE(review): these four names also contain '_x'/'_y', so the substring
# filter applied further down would have caught them as well.
for acol in (
    'ip_diffim_NaiveDipoleCentroid_pos_x',
    'slot_Centroid_pos_x',
    'ip_diffim_NaiveDipoleCentroid_pos_y',
    'slot_Centroid_pos_y',
):
    cols.remove(acol)

In [9]:
# Collect every remaining column whose name contains '_x' or '_y'.
# NOTE(review): this is a substring match anywhere in the name, so it also
# picks up names such as '..._xx', '..._xy' or '..._xErr' -- confirm that is
# intended and not only the plain coordinate columns.
to_remove = [
    name for name in cols
    if any(tag in name for tag in ('_x', '_y'))
]

In [10]:
# Take the flagged names out of `cols` in place, one first-occurrence at a time.
for acol in to_remove:
    del cols[cols.index(acol)]

In [11]:
# Stack reals on top of bogus ONCE and cut both the feature table and the
# label vector out of that single frame, instead of concatenating the two
# full tables twice.
labelled = pd.concat([reals, bogus])
X = labelled[cols]      # feature columns only
Y = labelled['REAL']    # boolean class label, same row order as X
# The stacked frame is only a stepping stone; drop the name so the notebook
# namespace does not keep it around.
del labelled
# NOTE(review): the row labels of the two input tables are carried over as-is
# (no ignore_index), so the index of X and Y is presumably not unique --
# align downstream objects by position, not by label.

In [12]:
# Plain-numpy working copy of the features (everything cast to float64) plus
# the matching list of column names; position i in x_names labels column i
# of X_data.
X_data = X.values.astype(np.float64)
x_names = X.columns.tolist()

In [13]:
# Peek at the float feature matrix; numpy elides the middle of large arrays
# in the repr, so only the corners are shown.
X_data

array([[1.82432000e+05, 0.00000000e+00, 0.00000000e+00, ...,
        5.44414864e+00, 5.22739647e+00, 5.09918130e-01],
       [1.83031000e+05, 0.00000000e+00, 0.00000000e+00, ...,
        4.87152714e+00, 5.21383765e+00, 1.32872084e+00],
       [1.83144000e+05, 0.00000000e+00, 0.00000000e+00, ...,
        5.20209479e+00, 4.72667356e+00, 5.49518789e-01],
       ...,
       [6.29601000e+05, 0.00000000e+00, 0.00000000e+00, ...,
        4.08146246e+02, 3.19156180e+00,            nan],
       [6.29602000e+05, 0.00000000e+00, 0.00000000e+00, ...,
                   nan, 3.29763322e+00,            nan],
       [6.29603000e+05, 0.00000000e+00, 0.00000000e+00, ...,
        2.41309338e+00, 3.18519382e+00, 7.62940416e-01]])

In [14]:
# Scan the feature matrix column by column:
#   * a column without a single finite entry is reported and queued for
#     removal (by name and by position);
#   * in every other column +inf is replaced by the largest finite value and
#     -inf by the smallest finite value of that column.
# NaNs are left untouched here.
columns_to_throw, icolumns_to_throw = [], []
for icol in range(X_data.shape[1]):
    column = X_data[:, icol]
    finite_values = column[np.isfinite(column)]
    if finite_values.size == 0:
        print(x_names[icol], ' has no finite values')
        columns_to_throw.append(x_names[icol])
        icolumns_to_throw.append(icol)
    else:
        # Masks and bounds are computed before any write, so neither
        # replacement can influence the other.
        pos_mask, neg_mask = np.isposinf(column), np.isneginf(column)
        largest, smallest = finite_values.max(), finite_values.min()
        X_data[pos_mask, icol] = largest
        X_data[neg_mask, icol] = smallest

ip_diffim_NaiveDipoleFlux_neg_instFluxErr  has no finite values
ip_diffim_NaiveDipoleFlux_neg_instFlux_SN  has no finite values


In [15]:
# Names of the columns that were found to contain no finite value at all.
columns_to_throw

['ip_diffim_NaiveDipoleFlux_neg_instFluxErr',
 'ip_diffim_NaiveDipoleFlux_neg_instFlux_SN']

In [16]:
# Remove the columns without any finite value from all three parallel views
# of the feature set: the DataFrame, the name list and the float matrix.
#
# DataFrame.drop returns a NEW frame, so the result has to be bound back to X;
# a bare `X.drop(...)` call leaves X untouched, and X is what gets written to
# the HDF5 store further down.
X = X.drop(columns=columns_to_throw)
for acolumn in columns_to_throw:
    x_names.remove(acolumn)
# The positions in icolumns_to_throw refer to the un-trimmed matrix, so this
# cell is meant to run exactly once after the scan that produced them.
X_data = np.delete(X_data, icolumns_to_throw, axis=1)
# The name list must stay in lock-step with the matrix columns.
assert len(x_names) == X_data.shape[1]

In [17]:
from sklearn.impute import SimpleImputer
# Imputer that fills NaN entries with the mean of their column.
imp = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [18]:
# Feature names that survived the column pruning, in matrix-column order.
x_names

['Unnamed: 0.1',
 'parent',
 'flags_negative',
 'base_NaiveCentroid_flag',
 'base_NaiveCentroid_flag_noCounts',
 'base_NaiveCentroid_flag_edge',
 'base_NaiveCentroid_flag_resetToPeak',
 'base_PeakCentroid_flag',
 'base_SdssCentroid_flag',
 'base_SdssCentroid_flag_edge',
 'base_SdssCentroid_flag_noSecondDerivative',
 'base_SdssCentroid_flag_almostNoSecondDerivative',
 'base_SdssCentroid_flag_notAtMaximum',
 'base_SdssCentroid_flag_resetToPeak',
 'ip_diffim_NaiveDipoleCentroid_flag',
 'base_CircularApertureFlux_flag_badCentroid',
 'base_GaussianFlux_flag_badCentroid',
 'base_NaiveCentroid_flag_badInitialCentroid',
 'base_PeakLikelihoodFlux_flag_badCentroid',
 'base_PsfFlux_flag_badCentroid',
 'base_SdssCentroid_flag_badInitialCentroid',
 'base_SdssShape_flag_badCentroid',
 'slot_Centroid_flag',
 'ip_diffim_NaiveDipoleCentroid_pos_flag',
 'slot_Centroid_pos_flag',
 'ip_diffim_NaiveDipoleCentroid_neg_flag',
 'slot_Centroid_neg_flag',
 'base_SdssShape_instFlux',
 'slot_Shape_instFlux',
 'ba

In [19]:
# Sanity check: the number of names should equal the second dimension of
# the matrix.
len(x_names), X_data.shape

(194, (636408, 194))

In [20]:
# Total number of NaN cells still present in the matrix before imputation.
np.isnan(X_data).sum()

13226164

In [21]:
# Fit the mean-strategy imputer on the matrix; the call's return value is
# what the cell displays.
# NOTE(review): the fit uses the full reals+bogus sample, i.e. before any
# train/test split -- confirm this is acceptable for the later evaluation.
imp.fit(X_data)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [22]:
# Apply the fitted imputer: NaNs in X_data are filled according to the
# 'mean' strategy. The result is later labelled with x_names, so it has to
# keep one column per name.
X_trsf = imp.transform(X_data)

In [23]:
# Standardise the imputed matrix with sklearn's preprocessing.scale using its
# default settings.
# NOTE(review): the scaling statistics come from the whole sample, not from a
# training subset only -- confirm this is intended.
X_scaled = preprocessing.scale(X_trsf)



In [32]:
# Open the HDF5 store that will hold the ML-ready tables. Constructing an
# HDFStore already opens the underlying file, so no separate store.open()
# call is needed (it would only close and re-open the same handle).
# The handle stays open until store.close() is called in a later cell.
store = pd.HDFStore(f'{SCRATCH}/results/ML_dataset_store.hdf5')

In [33]:
# Persist the feature DataFrame (neither imputed nor scaled) under key 'X'.
store.put('X', X)

In [34]:
# Persist the imputed matrix as a labelled frame under key 'X_cleaned'.
# The frame is built from a bare array, so it gets a fresh default index;
# rows are in the same order as in X_trsf.
store.put(
    'X_cleaned',
    pd.DataFrame(X_trsf, columns=x_names),
)

In [35]:
# Persist the imputed-and-scaled matrix as a labelled frame under key
# 'X_scaled' (fresh default index, same row order as X_scaled).
store.put(
    'X_scaled',
    pd.DataFrame(X_scaled, columns=x_names),
)

In [36]:
# Persist the label series (the 'REAL' column) under key 'Y'.
store.put('Y', Y)

In [37]:
# Release the HDF5 file handle now that all four tables have been written.
store.close()

-------

Now that we have the sample scaled, let's select features