In [None]:
# Execute the external script named below; its contents are not visible from this notebook.
%run ../code/wavelets_pca_with_zhost_1100_average_precision.py

In [None]:
# Execute the external script named below; its contents are not visible from this notebook.
%run ../code/wavelets_pca_with_zhost_1100_roc_auc.py

In [None]:
# Execute the external data-reading script.
# NOTE(review): presumably this is where `read_sn` (referenced in the commented-out
# cells further down) and possibly `pd` come from - confirm.
%run ../processing/read_data.py

In [None]:
import george as gg
import george.kernels as kr
import sncosmo as snc
import scipy.optimize as op
import pywt as wt

import os
import time
import numpy as np

from sklearn.decomposition import PCA

import pickle

from sklearn.metrics import confusion_matrix


In [None]:
# `pd` is not imported by any cell of this notebook, so this line only worked when
# an earlier `%run` script happened to leave it in the namespace. Import it
# explicitly so the cell does not depend on that hidden state.
import pandas as pd

# Show at most 6 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 6)

## Loading the GP and wavelet functions

In [None]:
def cleaning_df(df, method, clean_neg = False, percentage = 0.5):
    """Filter the rows of a 2-D array by value sign and by error size.

    The input array is never modified; a filtered copy/view is returned.

    Parameters
    ----------
    df : numpy.ndarray
        2-D array in which column 1 holds the measured value and column 2 its
        error (assumed layout: time, flux, flux error - TODO confirm with callers).
    method : str
        'std_dev'    - keep rows whose error is below mean(error) + std(error).
        'percentage' - keep rows whose error is below ``percentage`` times the
                       maximum value in column 1.
        anything else - no error-based filtering.
    clean_neg : bool, optional
        When True, negative values that lie within one error of zero are set
        to zero and kept; all remaining negative values are dropped.
    percentage : float, optional
        Fraction of the maximum value used by the 'percentage' method.

    Returns
    -------
    numpy.ndarray
        The rows that survive the filtering.
    """
    if clean_neg:
        # Work on a copy so the caller's array is not silently edited in place.
        df = df.copy()
        within_error = (df[:, 1] < 0) & (df[:, 1] > -df[:, 2])
        df[within_error, 1] = 0
        # Keep the points that were just zeroed (>= 0); a strict "> 0" would
        # discard them again and make the zeroing above pointless.
        df = df[df[:, 1] >= 0]

    # Nothing left to filter; also avoids max()/mean() on an empty array.
    if df.shape[0] == 0:
        return df

    if method == 'std_dev':
        # Cut the points whose error exceeds the mean error + 1 std.
        threshold = df.mean(axis = 0)[2] + df.std(axis = 0)[2]
        df_filter = df[(threshold>df[:,2])]
    elif method == 'percentage':
        # Cut the points whose error exceeds a fraction of the peak value.
        threshold = df.max(axis = 0)[1] * percentage
        df_filter = df[(threshold>df[:,2])]
    else:
        df_filter = df
    return df_filter

In [None]:
# `keys` lists the filter names; they are used as the dictionary keys of the
# per-band GP results.
def get_wavelets(sn, keys, wavelet = 'sym2', mlev = 2):
    """Return the stationary-wavelet coefficients of every band's GP mean curve.

    Parameters
    ----------
    sn : object accepted by ``gaussian_process`` as its ``data`` argument.
    keys : sequence of str
        Filter names, in the order in which their coefficients are concatenated.
    wavelet : str, optional
        Wavelet name passed to ``wt.Wavelet``.
    mlev : int, optional
        Decomposition level passed to ``wt.swt``.

    Returns
    -------
    numpy.ndarray
        1-D array with the flattened coefficients of all bands in ``keys``,
        one band after another.

    Notes
    -----
    An earlier version re-created the coefficient list on every loop
    iteration, so only the last band in ``keys`` reached the output. Feature
    files cached from that version hold a single band per object and must be
    regenerated to be comparable with the output of this function.
    """
    wav = wt.Wavelet(wavelet)

    # Only the per-band GP means are needed here.
    _, _, mu, _ = gaussian_process(sn, keys)

    # Collect the coefficients of every band instead of overwriting them.
    coeffs = [
        np.array(wt.swt(mu[filt], wav, level=mlev)).flatten()
        for filt in keys
    ]

    return np.concatenate(coeffs)

In [None]:
def gaussian_process(data, filters):
    """Fit one Gaussian process per band and evaluate it on a common time grid.

    Parameters
    ----------
    data : DataFrame-like
        Must expose the columns 'MJD', 'FLUXCAL', 'FLUXCALERR' and 'FLT';
        rows are grouped by 'FLT' and each group is fitted separately.
    filters : sequence of str
        Band names used to pre-populate the result dictionaries, so a band
        with no observations maps to an empty list.

    Returns
    -------
    tuple
        ``(0, x, mus, stds)`` where ``0`` is a constant placeholder, ``x`` is
        the 100-point grid spanning the full MJD range of ``data``, and
        ``mus`` / ``stds`` map each band to the GP predictive mean and
        standard deviation evaluated on ``x``.
    """
    x = np.linspace(data.MJD.min(), data.MJD.max(), 100)
    data_dict = {band: df[['MJD', 'FLUXCAL', 'FLUXCALERR']].values for band, df in data.groupby('FLT')}

    # One entry per requested filter, whatever their number.
    mus = {band: [] for band in filters}
    stds = {band: [] for band in filters}

    for band, dat in data_dict.items():
        gp = gg.GP((500**2)*kr.ExpSquaredKernel(metric=20**2), fit_mean=True)
        gp.compute(dat[:,0], dat[:,2])

        # Objective function: negative log-likelihood of this band's values.
        def nll(p):
            gp.set_parameter_vector(p)
            ll = gp.log_likelihood(dat[:,1], quiet=True)
            return -ll if np.isfinite(ll) else 1e25

        # Gradient of the objective function.
        def grad_nll(p):
            gp.set_parameter_vector(p)
            return -gp.grad_log_likelihood(dat[:,1], quiet=True)

        p0 = gp.get_parameter_vector()
        results = op.minimize(nll, p0, jac=grad_nll, method="L-BFGS-B")
        # The optimiser leaves `gp` at the last point it evaluated, which is not
        # necessarily the optimum; install the returned solution before predicting.
        gp.set_parameter_vector(results.x)

        mu, var = gp.predict(dat[:,1], x, return_var=True)
        std = np.sqrt(var)
        stds[band] = std
        mus[band] = mu

    return 0, x, mus, stds

In [None]:
#files = []
#for r, d, f in os.walk(path_to_read):
#    for file in f:
#        if '.DAT' in file:
#            files.append(os.path.join(r, file))
#            
###Numpy approach:
##files = np.empty([0])
##for r, d, f in os.walk(path_to_read):
##    for file in f:
##        if '.DAT' in file:
##            print(file)
##            files = np.append(files, os.path.join(r, file))

In [None]:
#labels = []
#for f in files:
#    #print(f[10:])
#    
#    file_name = f[20:] #edit here everytime we change the folder
#    read = read_sn(path_to_read + file_name)
#    label = read['SIM_COMMENT'][3]
#    labels.append(label)
#    #results.append(get_wavelets(read_sn(path_to_read + f[20:])['df'], keys))
#    

In [None]:
#pickle_out = open("../../models/labels.pickle","wb")
#pickle.dump(labels, pickle_out)
#pickle_out.close()

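### NOTE: some files could not be processed using wavelets, for reasons not yet understood. Those were:
### DES_SN076747, DES_SN076747, DES_SN813144 (the first ID is listed twice - TODO: confirm whether a third, different ID was meant)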

In [None]:
#samples_wavelets = []
#path_to_read = '../../data/raw_data/'
#
#keys = ['desg' , 'desi' , 'desr' , 'desz']
#
#start = time.time()
#
#results = []
#for f in files:
#    #print(f[10:])
#    
#    #file_name = f[20:] #edit here everytime we change the folder
#    #read = read_sn(path_to_read + file_name)
#    #df = read['df']
#    
#    results.append(get_wavelets(read_sn(path_to_read + f[20:])['df'], keys))
#    
#
#end = time.time()
#print("Time running: " , (end - start)/3600) 
### Time running:  0.9278092284997305

In [None]:
#pickle_out = open("../../models/wavelet_df_pipeline_Marcelo.pickle","wb")
#pickle.dump(results, pickle_out)
#pickle_out.close()

In [None]:
# Load the cached wavelet features and their labels.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` blocks make sure both file handles are closed after loading.
with open("../../models/wavelet_df_pipeline_Marcelo.pickle","rb") as pickle_results:
    results = pickle.load(pickle_results)

with open("../../models/labels.pickle","rb") as pickle_labels:
    labels = pickle.load(pickle_labels)

In [None]:
## Checking if the dimensions are all the same
#for f in results:
#    if len(f) != 400:
#        print('error')

## Wavelets done, PCA now

In [None]:
# Fit a 20-component PCA on the wavelet features and print its diagnostics.
pca = PCA(n_components=20).fit(results)
for diagnostic in (pca.explained_variance_ratio_, pca.singular_values_):
    print(diagnostic)

In [None]:
# Project with the PCA that was already fitted in the previous cell instead of
# fitting it a second time: this avoids the redundant decomposition and keeps
# the projected features tied to the components whose diagnostics were printed.
df_pca_20 = pca.transform(results)

## Using Pipelines

In [None]:
# Integer code assigned to each class name.
conversion = {
    'IIL': 5,
    'IIP': 7,
    'II': 4,
    'IIn': 6,
    'Ia': 0,
    'Ib': 1,
    'Ibc': 2,
    'Ic': 3,
}
# Translate every label into its integer code, preserving order.
labels_as_num = [conversion[name] for name in labels]

In [None]:
#exported_pipeline.fit(df_pca_20[:18000], labels_as_num[:18000])


###Pipeline(memory=None,
###     steps=[('featureunion', FeatureUnion(n_jobs=None,
###       transformer_list=[('stackingestimator-1', StackingEstimator(estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
###           metric_params=None, n_jobs=None, n_neighbors=21, p=1,
###           weights='uniform'))), ('....8500000000000001, tol=0.0001,
###              validation_fraction=0.1, verbose=0, warm_start=False))])

In [None]:
#pickle_out = open("../../models/trained_model_Marcelo_pipeline.pickle","wb")
#pickle.dump(exported_pipeline, pickle_out)
#pickle_out.close()

In [None]:
# Load the previously trained pipeline.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The `with` block makes sure the file handle is closed after loading.
with open("../../models/trained_model_Marcelo_pipeline.pickle","rb") as model_pickle:
    model = pickle.load(model_pickle)

In [None]:
# Evaluate on every sample from index 18000 onwards.
y_true = labels_as_num[18000:]
y_pred = model.predict(df_pca_20[18000:])
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)

In [None]:
import matplotlib
# Select the 'WebAgg' backend before pyplot is imported.
matplotlib.use('WebAgg')
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix of `y_pred` against `y_true` and draw it as an
    annotated heat map.

    With `normalize=True` every row is divided by its sum before it is printed
    and plotted. `classes` supplies the tick labels of both axes. When no
    `title` is given, a default matching the normalization mode is used.
    Returns the matplotlib axes that hold the plot.
    """
    banner = ('Normalized confusion matrix' if normalize
              else 'Confusion matrix, without normalization')
    if not title:
        title = banner

    # Compute the matrix, optionally scaling each row by its total.
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_sums
    print(banner)

    print(cm)

    fig, ax = plt.subplots()
    heatmap = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(heatmap, ax=ax)

    # One tick per matrix row/column, labelled with the entries of `classes`.
    n_rows, n_cols = cm.shape
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Tilt the x tick labels and anchor them at their right end.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value on top of it, in white on the darker cells.
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row, col in np.ndindex(*cm.shape):
        value = cm[row, col]
        text_color = "white" if value > half_max else "black"
        ax.text(col, row, format(value, cell_format),
                ha="center", va="center",
                color=text_color)
    fig.tight_layout()
    return ax

In [None]:
np.set_printoptions(precision=2)
# The k-th tick label annotates the k-th row/column of the confusion matrix,
# so the names must follow the integer codes assigned in `conversion`
# (..., 'IIL' -> 5, 'IIn' -> 6, 'IIP' -> 7). The previous list had 'IIP' and
# 'IIn' swapped, which mislabelled those two classes on the plots.
# Assumes all eight codes occur in the evaluated split - TODO confirm.
class_names = ['Ia', 'Ib', 'Ibc', 'Ic', 'II', 'IIL', 'IIn', 'IIP']

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_true, y_pred, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix

plot_confusion_matrix(y_true, y_pred, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

#plt.savefig('demo.pdf')
plt.show()

In [None]:
# Second plt.show() call; the previous cell already ends with plt.show().
plt.show()

In [None]:
##DRAFT VERSION USED TO 'SN_examples.ipyb'
#def get_wavelets(sn, wavelet = 'sym2', mlev = 2):
#    keys = ['sdssg', 'sdssi', 'sdssr', 'sdssz']
#    wav = wt.Wavelet(wavelet)
#    
#    fmin, xstar, mu, std = gaussian_process(sn)
#    for filt in keys: 
#        coeffs = [np.array(wt.swt(mu[filt], wav, level=mlev)).flatten()]
#
#    return np.concatenate(coeffs)
#
#
#def gaussian_process(data):
#    x = np.linspace(data.time.min(), data.time.max(), 100)
#    data_dict = {band: df[['time', 'flux', 'fluxerr']].values for band, df in data.groupby('band')}
#    
#    mus = {'sdssg' : [], 'sdssi' : [], 'sdssr' : [], 'sdssz' : []}
#    for band, dat in data_dict.items():
#        gp = gg.GP((500**2)*kr.ExpSquaredKernel(metric=20**2), fit_mean=True)
#        gp.compute(dat[:,0], dat[:,2])  
#        # Define the objective function (negative log-likelihood in this case).
#        def nll(p):
#            gp.set_parameter_vector(p)
#            ll = gp.log_likelihood(dat[:,1], quiet=True)
#            return -ll if np.isfinite(ll) else 1e25
#    
#        # And the gradient of the objective function.
#        def grad_nll(p):
#            gp.set_parameter_vector(p)
#            return -gp.grad_log_likelihood(dat[:,1], quiet=True)
#          
#        p0 = gp.get_parameter_vector()
#        results = op.minimize(nll, p0, jac=grad_nll, method="L-BFGS-B")
#        
#        mu, var = gp.predict(dat[:,1], x, return_var=True)
#        std = np.sqrt(var)
#        stds[band] = std
#        mus[band] = mu 
#    
#    return 0, x, mus, stds