In [1]:
import numpy as np
import pandas as pd
from astropy.stats import sigma_clipped_stats 

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the candidate tables from ../results. The bogus table is subsampled to
# a small fraction of its rows; the fixed seed makes the drawn subsample (and
# everything computed from it further down) identical on every
# Restart & Run All instead of changing from run to run.
BOGUS_SAMPLE_FRAC = 0.01
BOGUS_SAMPLE_SEED = 42
reals = pd.read_csv('../results/reals_table.csv')
bogus = pd.read_csv('../results/bogus_table.csv').sample(
    frac=BOGUS_SAMPLE_FRAC, random_state=BOGUS_SAMPLE_SEED)
matched_lcs = pd.read_csv('../results/matched_lcs_visit_by_visit.csv')

In [4]:
# Tag every row with its class: True for the reals table, False for the bogus one.
for frame, is_real in ((reals, True), (bogus, False)):
    frame['REAL'] = is_real

In [5]:
# Number of rows in each class (reals, bogus).
reals.shape[0], bogus.shape[0]

(1432, 6350)

In [6]:
from sklearn import preprocessing

In [7]:
# Candidate feature list: every column of the reals table except the ones
# named below. list.remove raises ValueError if a name is absent, which
# doubles as a check that the table has the expected layout.
non_feature_columns = (
    'id', 'Unnamed: 0', 'REAL', 'cxmatch', 'sn_row', 'match_ang_dist',
    'sn_id', 'raft', 'sensor', 'filter', 'coord_ra', 'coord_dec',
)
cols = reals.columns.tolist()
for excluded in non_feature_columns:
    cols.remove(excluded)

In [8]:
# Also remove the positive-lobe centroid x/y columns and their error columns
# from the feature list (same raise-if-missing behaviour as list.remove).
centroid_pos_columns = (
    'ip_diffim_NaiveDipoleCentroid_pos_x',
    'slot_Centroid_pos_x',
    'ip_diffim_NaiveDipoleCentroid_pos_y',
    'slot_Centroid_pos_y',
    'ip_diffim_NaiveDipoleCentroid_pos_xErr',
    'slot_Centroid_pos_xErr',
    'ip_diffim_NaiveDipoleCentroid_pos_yErr',
    'slot_Centroid_pos_yErr',
)
for excluded in centroid_pos_columns:
    cols.remove(excluded)

In [9]:
# Stack reals on top of bogus once, then split the result into the feature
# frame X (the columns listed in cols) and the label series Y (the REAL flag).
labelled = pd.concat([reals, bogus])
X = labelled[cols]
Y = labelled['REAL']
del labelled  # temporary only; leaves the namespace as it was before

In [10]:
# Keep the feature names next to a plain float matrix of the same columns.
# np.array copies, so later in-place edits of X_data never touch X.
x_names = X.columns.tolist()
X_data = np.array(X.values, dtype=float)

In [11]:
# Display the float feature matrix (numpy abbreviates the printout).
X_data

array([[0.00000000e+00, 0.00000000e+00, 6.82028149e+02, ...,
        0.00000000e+00, 6.82600000e+03, 1.79000000e+02],
       [0.00000000e+00, 0.00000000e+00, 2.00595615e+03, ...,
        0.00000000e+00, 6.82600000e+03, 1.88000000e+02],
       [0.00000000e+00, 0.00000000e+00, 2.22912969e+03, ...,
        0.00000000e+00, 6.85500000e+03, 7.20000000e+01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 8.20958701e+02, ...,
        0.00000000e+00, 5.15772000e+05, 1.74000000e+02],
       [0.00000000e+00, 0.00000000e+00, 2.81428489e+03, ...,
        0.00000000e+00, 2.19947000e+05, 2.10000000e+01],
       [0.00000000e+00, 0.00000000e+00, 1.70305400e+03, ...,
        0.00000000e+00, 6.94238000e+05, 8.80000000e+01]])

In [12]:
# Walk over the feature columns of X_data:
#   * a column without a single finite entry is reported and recorded (by name
#     and by position) so it can be removed afterwards;
#   * otherwise +inf is replaced by the column's largest finite value and -inf
#     by its smallest finite value. NaNs are left untouched here.
columns_to_throw = []
icolumns_to_throw = []
for ii in range(X_data.shape[1]):
    column = X_data[:, ii]
    finite_mask = np.isfinite(column)
    if not finite_mask.any():
        print(x_names[ii], ' has no finite values')
        columns_to_throw.append(x_names[ii])
        icolumns_to_throw.append(ii)
        continue

    # Take the finite extremes before writing anything back into the column.
    finite_values = column[finite_mask]
    upper, lower = np.max(finite_values), np.min(finite_values)
    pos_inf_rows = np.isposinf(column)
    neg_inf_rows = np.isneginf(column)
    X_data[pos_inf_rows, ii] = upper
    X_data[neg_inf_rows, ii] = lower

ip_diffim_NaiveDipoleCentroid_xErr  has no finite values
slot_Centroid_xErr  has no finite values
ip_diffim_NaiveDipoleCentroid_yErr  has no finite values
slot_Centroid_yErr  has no finite values
ip_diffim_NaiveDipoleCentroid_neg_xErr  has no finite values
slot_Centroid_neg_xErr  has no finite values
ip_diffim_NaiveDipoleCentroid_neg_yErr  has no finite values
slot_Centroid_neg_yErr  has no finite values
ip_diffim_NaiveDipoleFlux_neg_instFluxErr  has no finite values
ip_diffim_PsfDipoleFlux_pos_centroid_xErr  has no finite values
ip_diffim_PsfDipoleFlux_pos_centroid_yErr  has no finite values
ip_diffim_PsfDipoleFlux_neg_centroid_xErr  has no finite values
ip_diffim_PsfDipoleFlux_neg_centroid_yErr  has no finite values
ip_diffim_PsfDipoleFlux_centroid_xErr  has no finite values
ip_diffim_PsfDipoleFlux_centroid_yErr  has no finite values


In [13]:
# Names of the feature columns that were flagged as having no finite values.
columns_to_throw

['ip_diffim_NaiveDipoleCentroid_xErr',
 'slot_Centroid_xErr',
 'ip_diffim_NaiveDipoleCentroid_yErr',
 'slot_Centroid_yErr',
 'ip_diffim_NaiveDipoleCentroid_neg_xErr',
 'slot_Centroid_neg_xErr',
 'ip_diffim_NaiveDipoleCentroid_neg_yErr',
 'slot_Centroid_neg_yErr',
 'ip_diffim_NaiveDipoleFlux_neg_instFluxErr',
 'ip_diffim_PsfDipoleFlux_pos_centroid_xErr',
 'ip_diffim_PsfDipoleFlux_pos_centroid_yErr',
 'ip_diffim_PsfDipoleFlux_neg_centroid_xErr',
 'ip_diffim_PsfDipoleFlux_neg_centroid_yErr',
 'ip_diffim_PsfDipoleFlux_centroid_xErr',
 'ip_diffim_PsfDipoleFlux_centroid_yErr']

In [14]:
# Remove the columns flagged as having no finite values from all three views of
# the data so that they stay in sync: the DataFrame X, the name list x_names
# and the numeric matrix X_data.
#
# DataFrame.drop returns a new frame rather than modifying X, so the result has
# to be bound back to X (previously it was discarded and X kept the columns).
# errors='ignore' plus the name-based lookup below make the cell safe to re-run:
# a second execution finds nothing left to drop instead of raising or deleting
# unrelated columns by stale position.
dropped_names = set(columns_to_throw)
X = X.drop(columns=columns_to_throw, errors='ignore')

# x_names labels the columns of X_data one-to-one; locate the flagged columns by
# name in the *current* list and delete exactly those positions.
assert len(x_names) == X_data.shape[1], 'x_names and X_data are out of sync'
drop_positions = np.array(
    [pos for pos, name in enumerate(x_names) if name in dropped_names],
    dtype=int)
X_data = np.delete(X_data, drop_positions, axis=1)
x_names = [name for name in x_names if name not in dropped_names]

In [15]:
from sklearn.impute import SimpleImputer
# Imputer that fills NaN entries with the mean of their column.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

In [16]:
# Sanity check: the number of feature names should equal X_data's column count.
(len(x_names), X_data.shape)

(224, (7782, 224))

In [17]:
# Total number of NaN entries still present in the feature matrix.
np.isnan(X_data).sum()

287374

In [18]:
# Fit the imputer on the feature matrix; the fitted estimator's repr is the
# cell output.
imp.fit(X_data)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [19]:
# Apply the fitted imputer to X_data; X_trsf is the imputed feature matrix.
X_trsf = imp.transform(X_data)

In [20]:
# Standardize the imputed matrix with sklearn's preprocessing.scale
# (default settings).
# NOTE(review): the scaling statistics come from the whole sample; if a
# train/test split is made later, confirm that this is intended.
X_scaled = preprocessing.scale(X_trsf)

In [21]:
# Open (or create) the HDF5 file that will hold the prepared datasets.
# The HDFStore constructor already opens the file, so the separate
# store.open() call that used to follow was redundant and has been removed.
store = pd.HDFStore('ML_dataset_store.hdf5')

In [22]:
# Persist the feature DataFrame (before imputation and scaling) under key 'X'.
store.put('X', X)

In [23]:
# Persist the imputed (unscaled) matrix, labelled with x_names, as 'X_cleaned'.
# NOTE(review): the frame is built from a bare array, so it gets a fresh
# 0..n-1 index, whereas X and Y keep the index produced by pd.concat —
# presumably downstream code matches rows by position; confirm.
X_cleaned_frame = pd.DataFrame(X_trsf, columns=x_names)
store['X_cleaned'] = X_cleaned_frame

In [24]:
# Persist the standardized matrix, labelled with x_names, as 'X_scaled'.
# NOTE(review): the frame is built from a bare array, so it gets a fresh
# 0..n-1 index, whereas X and Y keep the index produced by pd.concat —
# presumably downstream code matches rows by position; confirm.
X_scaled_frame = pd.DataFrame(X_scaled, columns=x_names)
store['X_scaled'] = X_scaled_frame

In [25]:
# Persist the REAL/bogus label series under key 'Y'.
store.put('Y', Y)

In [26]:
# Close the HDF5 store now that the datasets have been written to it.
store.close()

-------

Now that we have the sample scaled, let's select features