In [1]:
%run ../code/wavelets_pca_with_zhost_1100_average_precision.py

In [2]:
%run ../code/wavelets_pca_with_zhost_1100_roc_auc.py

In [3]:
%run ../processing/read_data.py

In [6]:
import george as gg
import george.kernels as kr
import sncosmo as snc
import scipy.optimize as op
import pywt as wt

import os
import time
import numpy as np

from sklearn.decomposition import PCA

import pickle

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold


In [7]:
pd.set_option("display.max_rows", 6)

## Loading GP and Wavelets function

In [8]:
def cleaning_df(df, method, clean_neg = False, percentage = 0.5):
    """Filter noisy points out of a light-curve array.

    Parameters
    ----------
    df : numpy.ndarray
        2-D array; column 1 is read as the measured value and column 2 as its
        error. Column 0 is carried along untouched.
    method : str
        'std_dev'    -> keep rows whose error is below mean(error) + std(error).
        'percentage' -> keep rows whose error is below percentage * max(value).
        anything else -> no error-based cut.
    clean_neg : bool, optional
        If True, negative values that lie within one error bar of zero are
        clamped to 0 and the remaining negative values are dropped.
    percentage : float, optional
        Fraction of the maximum value used by the 'percentage' method.

    Returns
    -------
    numpy.ndarray
        The filtered rows. The caller's array is never modified.
    """
    if clean_neg:
        # Work on a copy so the clamp below does not leak into the caller's data.
        df = df.copy()
        within_error = (df[:, 1] < 0) & (df[:, 1] > -df[:, 2])
        df[within_error, 1] = 0
        # Use >= 0 so the points just clamped to zero are kept; a strict > 0
        # would throw them away again together with the real negatives.
        df = df[df[:, 1] >= 0]

    # Nothing left to filter; also avoids max()/mean() on an empty array.
    if df.shape[0] == 0:
        return df

    if method == 'std_dev':
        # Cut points whose error exceeds the mean error by more than one std.
        threshold = df.mean(axis = 0)[2] + df.std(axis = 0)[2]
        df_filter = df[threshold > df[:, 2]]
    elif method == 'percentage':
        # Cut points whose error exceeds a fraction of the largest value.
        threshold = df.max(axis = 0)[1] * percentage
        df_filter = df[threshold > df[:, 2]]
    else:
        df_filter = df
    return df_filter

In [9]:
#the keys are the names of the filters in order to be the dict keys
def get_wavelets(sn, keys, wavelet = 'sym2', mlev = 2):
    """Build a wavelet feature vector from the GP-smoothed light curve.

    Parameters
    ----------
    sn : light-curve table passed straight to ``gaussian_process``.
    keys : sequence of str
        Filter names; they are the keys of the per-band dict returned by
        ``gaussian_process`` and fix the order of the output features.
    wavelet : str, optional
        Name of the wavelet handed to ``pywt.Wavelet``.
    mlev : int, optional
        Decomposition level handed to ``pywt.swt``.

    Returns
    -------
    numpy.ndarray
        1-D array: the flattened ``pywt.swt`` output of every filter in
        ``keys``, concatenated in order.

    Notes
    -----
    Earlier revisions rebound ``coeffs`` on each loop iteration and therefore
    returned only the last filter's coefficients. Feature files cached from
    that version are shorter and must be regenerated.
    """
    wav = wt.Wavelet(wavelet)

    # Only the per-band predictive means are needed here.
    _, _, mu, _ = gaussian_process(sn, keys)

    # One flattened coefficient block per filter, kept in `keys` order.
    coeffs = [np.array(wt.swt(mu[filt], wav, level=mlev)).flatten()
              for filt in keys]

    return np.concatenate(coeffs)

In [10]:
def gaussian_process(data, filters):
    """Fit one Gaussian process per band and evaluate it on a common time grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``MJD``, ``FLT``, ``FLUXCAL`` and
        ``FLUXCALERR``; rows are grouped by ``FLT``.
    filters : sequence of str
        Band names used to pre-seed the output dicts. Any number of filters is
        accepted. A filter with no rows in ``data`` keeps an empty list.

    Returns
    -------
    tuple
        ``(0, x, mus, stds)``: a constant 0 placeholder, the 100-point grid
        ``x`` spanning the MJD range, and two dicts mapping band to the GP
        predictive mean and standard deviation on ``x``.
    """
    x = np.linspace(data.MJD.min(), data.MJD.max(), 100)
    data_dict = {band: df[['MJD', 'FLUXCAL', 'FLUXCALERR']].values
                 for band, df in data.groupby('FLT')}

    # Seed from `filters` directly instead of hard-coding four entries.
    mus = {band: [] for band in filters}
    stds = {band: [] for band in filters}

    for band, dat in data_dict.items():
        t, flux, flux_err = dat[:, 0], dat[:, 1], dat[:, 2]

        gp = gg.GP((500**2)*kr.ExpSquaredKernel(metric=20**2), fit_mean=True)
        gp.compute(t, flux_err)

        # Objective: negative log-likelihood of the observed fluxes.
        def nll(p):
            gp.set_parameter_vector(p)
            ll = gp.log_likelihood(flux, quiet=True)
            return -ll if np.isfinite(ll) else 1e25

        # Gradient of the objective.
        def grad_nll(p):
            gp.set_parameter_vector(p)
            return -gp.grad_log_likelihood(flux, quiet=True)

        p0 = gp.get_parameter_vector()
        results = op.minimize(nll, p0, jac=grad_nll, method="L-BFGS-B")
        # The optimiser leaves the GP at the last point it evaluated, which is
        # not guaranteed to be the optimum, so set the best parameters
        # explicitly before predicting.
        gp.set_parameter_vector(results.x)

        mu, var = gp.predict(flux, x, return_var=True)
        stds[band] = np.sqrt(var)
        mus[band] = mu

    return 0, x, mus, stds

In [75]:
#path_to_read = '../../data/raw_data/'
#files = []
#for r, d, f in os.walk(path_to_read):
#    for file in f:
#        if '.DAT' in file:
#            files.append(os.path.join(r, file))

In [11]:
#labels = []
#for f in files:
#    #print(f[10:])
#    
#    file_name = f[20:] #edit here everytime we change the folder
#    read = read_sn(path_to_read + file_name)
#    label = read['SIM_COMMENT'][3]
#    labels.append(label)
#    #results.append(get_wavelets(read_sn(path_to_read + f[20:])['df'], keys))
#    

In [12]:
#pickle_out = open("../../models/labels.pickle","wb")
#pickle.dump(labels, pickle_out)
#pickle_out.close()

### Note: some files could not be processed with the wavelet pipeline (reason not yet determined). Those were:
### DES_SN076747, DES_SN076747 (listed twice in the original note; possibly a typo for a different ID), DES_SN813144

In [13]:
#samples_wavelets = []
#
#keys = ['desg' , 'desi' , 'desr' , 'desz']
#
#start = time.time()
#
#results = []
#for f in files:
#    #print(f[10:])
#    
#    #file_name = f[20:] #edit here everytime we change the folder
#    #read = read_sn(path_to_read + file_name)
#    #df = read['df']
#    
#    results.append(get_wavelets(read_sn(path_to_read + f[20:])['df'], keys))
#    
#
#end = time.time()
#print("Time running: " , (end - start)/3600) 
### Time running:  0.9278092284997305

In [14]:
#pickle_out = open("../../models/wavelet_df_pipeline_Marcelo.pickle","wb")
#pickle.dump(results, pickle_out)
#pickle_out.close()

In [81]:
# Load the cached wavelet feature vectors and their class labels.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The `with` blocks make sure both file handles are closed after reading.
with open("../../models/wavelet_df_pipeline_Marcelo.pickle", "rb") as pickle_results:
    results = pickle.load(pickle_results)

with open("../../models/labels.pickle", "rb") as pickle_labels:
    labels = pickle.load(pickle_labels)

In [82]:
## Checking if the dimensions are all the same
#for f in wavlets_results:
#    if len(f) != 400:
#        print('error')

## Wavelets done, PCA now

In [83]:
# Fit a 20-component PCA on the wavelet features, then report the share of
# variance and the singular value carried by each component.
pca = PCA(n_components=20).fit(results)
print(pca.explained_variance_ratio_, pca.singular_values_, sep="\n")

[8.69e-01 7.41e-02 3.83e-02 7.21e-03 6.25e-03 2.62e-03 1.08e-03 7.77e-04
 4.46e-04 1.61e-04 1.20e-04 6.50e-05 3.89e-05 2.27e-05 1.02e-05 8.11e-06
 5.22e-06 3.55e-06 2.25e-06 4.82e-07]
[3.52e+05 1.03e+05 7.39e+04 3.21e+04 2.98e+04 1.93e+04 1.24e+04 1.05e+04
 7.97e+03 4.79e+03 4.14e+03 3.04e+03 2.36e+03 1.80e+03 1.20e+03 1.08e+03
 8.62e+02 7.12e+02 5.66e+02 2.62e+02]


In [84]:
# Project the wavelet features onto the 20 principal components.
# Note: fit_transform re-fits `pca` on `results` before projecting;
# pca.transform(results) would instead reuse the fit from the previous cell.
df_pca_20 = pca.fit_transform(results)

## Using Pipelines

In [115]:
# Integer code assigned to each supernova type label.
conversion = {
    'IIL': 5, 'IIP': 7, 'II': 4, 'IIn': 6,
    'Ia': 0, 'Ib': 1, 'Ibc': 2, 'Ic': 3,
}
# Binary variant: 1 for type Ia, 0 for every other type.
conversion_bool = {sn_type: int(sn_type == 'Ia') for sn_type in conversion}

# Translate the string labels into their integer codes.
labels_as_num = [conversion[sn_type] for sn_type in labels]

# K-FOLD

In [133]:
# Split the PCA features and integer labels into 5 consecutive folds and keep
# every fold's train/test arrays in parallel lists.
X, y = df_pca_20, np.array(labels_as_num)
kf = KFold(n_splits=5)

splits = list(kf.split(X))
for train_index, test_index in splits:
    print("TRAIN:", train_index, "TEST:", test_index)

X_train = [X[train_rows] for train_rows, _ in splits]
X_test = [X[test_rows] for _, test_rows in splits]
y_train = [y[train_rows] for train_rows, _ in splits]
y_test = [y[test_rows] for _, test_rows in splits]

TRAIN: [ 4264  4265  4266 ... 21313 21314 21315] TEST: [   0    1    2 ... 4261 4262 4263]
TRAIN: [    0     1     2 ... 21313 21314 21315] TEST: [4264 4265 4266 ... 8524 8525 8526]
TRAIN: [    0     1     2 ... 21313 21314 21315] TEST: [ 8527  8528  8529 ... 12787 12788 12789]
TRAIN: [    0     1     2 ... 21313 21314 21315] TEST: [12790 12791 12792 ... 17050 17051 17052]
TRAIN: [    0     1     2 ... 17050 17051 17052] TEST: [17053 17054 17055 ... 21313 21314 21315]


In [146]:
X_train[0].shape

(17052, 20)

In [148]:
df_pca_20[:18000].shape

(18000, 20)

In [161]:
scores = model_selection.cross_val_score(exported_pipeline, df_pca_20, labels_as_num, cv=5)

In [164]:
scores

array([0.74, 0.74, 0.74, 0.73, 0.75])

In [159]:
from sklearn.model_selection import train_test_split


In [160]:
from sklearn import model_selection

In [157]:
sklearn.cross_validation

AttributeError: module 'sklearn' has no attribute 'cross_validation'

In [None]:
# Train one independent pipeline per fold.
# Pipeline.fit returns the estimator itself, so calling fit on
# `exported_pipeline` in a loop would store the same object once per fold,
# holding only the last fold's fit. clone() gives each fold its own unfitted
# copy and leaves `exported_pipeline` untouched.
from sklearn.base import clone

models = [
    clone(exported_pipeline).fit(X_fold, y_fold)
    for X_fold, y_fold in zip(X_train, y_train)
]

In [150]:
exported_pipeline.fit(df_pca_20[:18000], labels_as_num[:18000])


###Pipeline(memory=None,
###     steps=[('featureunion', FeatureUnion(n_jobs=None,
###       transformer_list=[('stackingestimator-1', StackingEstimator(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
###           metric_params=None, n_jobs=None, n_neighbors=21, p=1,
###           weights='uniform'))), ('....8500000000000001, tol=0.0001,
###              validation_fraction=0.1, verbose=0, warm_start=False))])

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('stackingestimator-1', StackingEstimator(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=21, p=1,
           weights='uniform'))), ('....8500000000000001, tol=0.0001,
              validation_fraction=0.1, verbose=0, warm_start=False))])

In [33]:
#pickle_out = open("../../models/trained_model_Marcelo_pipeline.pickle","wb")
#pickle.dump(exported_pipeline, pickle_out)
#pickle_out.close()

In [34]:
# Load the previously trained pipeline from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The `with` block makes sure the file handle is closed after reading.
with open("../../models/trained_model_Marcelo_pipeline.pickle", "rb") as model_pickle:
    model = pickle.load(model_pickle)

In [149]:
model

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('nystroem', Nystroem(coef0=None, degree=None, gamma=0.15000000000000002, kernel='linear',
     kernel_params=None, n_components=10, random_state=None)), ('stackingestimator', StackingEstimator(estimator=Pipeline(memory=None,....9000000000000001, tol=0.0001,
              validation_fraction=0.1, verbose=0, warm_start=False))])

In [65]:
import matplotlib
matplotlib.use('WebAgg')
import matplotlib.pyplot as plt

In [66]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    y_true, y_pred : true and predicted labels, passed to `confusion_matrix`.
    classes        : tick labels used on both axes.
    normalize      : if True, every row is divided by its sum before display.
    title          : figure title; a default describing the mode is used
                     when it is empty or None.
    cmap           : colormap handed to `imshow`.

    Returns the matplotlib Axes holding the plot.
    """
    # The same text is used for the console banner and the default title.
    banner = ('Normalized confusion matrix' if normalize
              else 'Confusion matrix, without normalization')
    if not title:
        title = banner

    # Compute the confusion matrix, optionally scaling each row to sum to 1.
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
    print(banner)
    print(cm)

    fig, ax = plt.subplots()
    heat = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(heat, ax=ax)

    # Show every tick and label it with the matching entry of `classes`.
    n_rows, n_cols = cm.shape
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the x tick labels and anchor them so they stay aligned.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value, using white text on the darker cells.
    cell_fmt = '.2f' if normalize else 'd'
    midpoint = cm.max() / 2.
    for row, col in np.ndindex(n_rows, n_cols):
        value = cm[row, col]
        ax.text(col, row, format(value, cell_fmt),
                ha="center", va="center",
                color="white" if value > midpoint else "black")
    fig.tight_layout()
    return ax

In [None]:
np.set_printoptions(precision=2)
class_names = ['Ia', 'Ib', 'Ibc', 'Ic', 'II', 'IIL', 'IIP', 'IIn']

In [69]:
# Predict on the rows from index 18000 onward and build the confusion matrix
# of true vs. predicted integer labels.
# NOTE(review): presumably these rows are the hold-out complement of the
# [:18000] slice used in the fit call — confirm `model` was trained on that slice.
y_pred = model.predict(df_pca_20[18000:])
y_true = labels_as_num[18000:]
cm = confusion_matrix(y_true, y_pred)

In [70]:
cm

array([[ 662,   30,    0,    1,  116,    0,    5,    0],
       [ 100,   46,    0,    1,   46,    3,    1,    0],
       [  13,    4,    0,    0,   10,    2,    1,    0],
       [  72,    6,    0,    6,   85,    1,    4,    0],
       [  97,    4,    0,    3, 1744,    3,    3,    0],
       [  30,    4,    0,    0,   30,   12,    0,    0],
       [  46,    0,    0,    1,   76,    1,   14,    0],
       [   3,    0,    0,    0,   29,    0,    0,    1]])

In [68]:
# For every K-fold split, predict the test rows with the already-loaded `model`
# and plot both the raw and the row-normalized confusion matrix.
# NOTE(review): `model` is not refit inside the loop, so the train split is
# never used; if a fold's test rows overlap the rows the model was trained on,
# the matrices are optimistic — confirm how `model` was trained.
# NOTE(review): this loop rebinds X_train/X_test/y_train/y_test to single-fold
# arrays, replacing the per-fold lists built under the same names in the
# K-fold cell.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    y_pred = model.predict(X_test)
    # Plot non-normalized confusion matrix
    plot_confusion_matrix(y_test, y_pred, classes=class_names,
                          title='Confusion matrix, without normalization')
    
    # Plot normalized confusion matrix
    
    plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    
    #plt.savefig('demo.pdf')
# Render all figures created in the loop.
plt.show()

TRAIN: [ 4264  4265  4266 ... 21313 21314 21315] TEST: [   0    1    2 ... 4261 4262 4263]
Confusion matrix, without normalization
[[ 970    0    0    0   28    0    0    0]
 [   0  287    0    0   14    0    0    0]
 [   0    0   61    0    0    0    0    0]
 [   0    0    0  208    6    0    0    0]
 [   0    0    0    0 2417    0    0    0]
 [   0    0    0    0    3   80    0    0]
 [   0    0    0    0    9    0  139    0]
 [   0    0    0    0    0    0    0   42]]
Normalized confusion matrix
[[0.97 0.   0.   0.   0.03 0.   0.   0.  ]
 [0.   0.95 0.   0.   0.05 0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.97 0.03 0.   0.   0.  ]
 [0.   0.   0.   0.   1.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.04 0.96 0.   0.  ]
 [0.   0.   0.   0.   0.06 0.   0.94 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.  ]]
TRAIN: [    0     1     2 ... 21313 21314 21315] TEST: [4264 4265 4266 ... 8524 8525 8526]
Confusion matrix, without normalization
[[ 976    0    0    



TRAIN: [    0     1     2 ... 17050 17051 17052] TEST: [17053 17054 17055 ... 21313 21314 21315]
Confusion matrix, without normalization
[[ 879   30    0    1  125    0    5    0]
 [ 100  103    0    1   49    3    1    0]
 [  13    4    6    0   10    2    1    0]
 [  72    6    0   49   87    1    4    0]
 [  97    4    0    3 2298    3    3    0]
 [  30    4    0    0   30   28    0    0]
 [  46    0    0    1   77    1   45    0]
 [   3    0    0    0   29    0    0    9]]




Normalized confusion matrix
[[0.85 0.03 0.   0.   0.12 0.   0.   0.  ]
 [0.39 0.4  0.   0.   0.19 0.01 0.   0.  ]
 [0.36 0.11 0.17 0.   0.28 0.06 0.03 0.  ]
 [0.33 0.03 0.   0.22 0.4  0.   0.02 0.  ]
 [0.04 0.   0.   0.   0.95 0.   0.   0.  ]
 [0.33 0.04 0.   0.   0.33 0.3  0.   0.  ]
 [0.27 0.   0.   0.01 0.45 0.01 0.26 0.  ]
 [0.07 0.   0.   0.   0.71 0.   0.   0.22]]


In [88]:
X_train.shape

(17053, 20)

In [None]:
##DRAFT VERSION USED TO 'SN_examples.ipyb'
#def get_wavelets(sn, wavelet = 'sym2', mlev = 2):
#    keys = ['sdssg', 'sdssi', 'sdssr', 'sdssz']
#    wav = wt.Wavelet(wavelet)
#    
#    fmin, xstar, mu, std = gaussian_process(sn)
#    for filt in keys: 
#        coeffs = [np.array(wt.swt(mu[filt], wav, level=mlev)).flatten()]
#
#    return np.concatenate(coeffs)
#
#
#def gaussian_process(data):
#    x = np.linspace(data.time.min(), data.time.max(), 100)
#    data_dict = {band: df[['time', 'flux', 'fluxerr']].values for band, df in data.groupby('band')}
#    
#    mus = {'sdssg' : [], 'sdssi' : [], 'sdssr' : [], 'sdssz' : []}
#    for band, dat in data_dict.items():
#        gp = gg.GP((500**2)*kr.ExpSquaredKernel(metric=20**2), fit_mean=True)
#        gp.compute(dat[:,0], dat[:,2])  
#        # Define the objective function (negative log-likelihood in this case).
#        def nll(p):
#            gp.set_parameter_vector(p)
#            ll = gp.log_likelihood(dat[:,1], quiet=True)
#            return -ll if np.isfinite(ll) else 1e25
#    
#        # And the gradient of the objective function.
#        def grad_nll(p):
#            gp.set_parameter_vector(p)
#            return -gp.grad_log_likelihood(dat[:,1], quiet=True)
#          
#        p0 = gp.get_parameter_vector()
#        results = op.minimize(nll, p0, jac=grad_nll, method="L-BFGS-B")
#        
#        mu, var = gp.predict(dat[:,1], x, return_var=True)
#        std = np.sqrt(var)
#        stds[band] = std
#        mus[band] = mu 
#    
#    return 0, x, mus, stds

###Numpy approach:
##files = np.empty([0])
##for r, d, f in os.walk(path_to_read):
##    for file in f:
##        if '.DAT' in file:
##            print(file)
##            files = np.append(files, os.path.join(r, file))