In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import json, datetime, os, sys
from matplotlib.colors import LogNorm
from matplotlib.ticker import MultipleLocator
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Global matplotlib settings applied to every figure drawn after this cell:
# font family, base font size, and the line width of the axes edges.
plt.rcParams.update({
    "font.family": "Times New Roman",
    "font.size": 14,
    "axes.linewidth": 1.5,
})

In [2]:
# Directory holding the model artefacts. The trailing slash matters because
# file paths are built from it by plain string concatenation.
model_dir = "models/ld-vgg6-n10/"

# Parse the JSON report stored in the model directory.
report_path = model_dir + "report.json"
with open(report_path, 'r') as report_file:
    report = json.load(report_file)

In [3]:
# Echo the parsed report so its full contents appear as this cell's output.
report

{'Run time stamp': '20220809_150529',
 'Model name': 'VGG6_20220809_150529',
 'Model trained': 'vgg6_lowdrop',
 'Batch size': 64,
 'Optimizer': 'adam',
 'Requested number of train epochs': 100,
 'Early stopping after epochs': 50,
 'Training+validation/test split': 0.1,
 'Training/validation split': 0.1,
 'Weight training data by class': {'0': 0.8215523494801321, '1': 1.0},
 'Random state': 2,
 'Number of training examples': 45200,
 'Number of val examples': 5090,
 'X_train shape': [45200, 63, 63, 3],
 'Y_train shape': [45200],
 'X_val shape': [5090, 63, 63, 3],
 'Y_val shape': [5090],
 'Data augmentation': {'horizontal_flip': True,
  'vertical_flip': True,
  'rotation_range': 0,
  'fill_mode': 'constant',
  'cval': 1e-09},
 'Confusion matrix': [[0.6743844174935686, 0.32561558250643147],
  [0.32334318277754326, 0.6766568172224567]],
 'Misclassified val candids': [1428489660115015002,
  1472178350915015005,
  754496753715015009,
  493232896015015003,
  1723224871715015002,
  176920657501

In [4]:
# Input files for this analysis.
candidates_csv = "data/candidates_pd_gr_10max.csv"
triplets_npy = "data/triplets_pd_gr_10max.npy"

# Alert metadata table; index_col=False keeps pandas from using the first
# column as the index.
df = pd.read_csv(candidates_csv, index_col=False)

# Image array, opened as a read-only memory map instead of being read eagerly.
triplets = np.load(triplets_npy, mmap_mode='r')

In [5]:
# ---------------------------------------------------------------------------
# Split configuration
# ---------------------------------------------------------------------------
test_split = 0.1        # fraction of all objects held out as unseen/test
validation_split = 0.1  # fraction of the seen objects used for validation
random_state = 2        # seed shared by every split and shuffle in this cell


def _shuffled_rows(selector):
    """Return the `df` index values picked by boolean `selector`, shuffled with `random_state`."""
    return shuffle(df.index.values[selector], random_state=random_state)


def _print_split(first, second, ratio, unit, n_first, n_second, n_total):
    """Print the two group sizes and each one's percentage of `n_total`."""
    print(f"{n_first} {first} {unit}s")
    print(f"{n_second} {second} {unit}s")
    print(f"{100*(n_first/n_total):.2f}%/{100*(n_second/n_total):.2f}% {ratio} split by {unit}\n")


# Totals used as denominators for every percentage printed below.
all_ztfids = pd.unique(df['objectId'])
num_obj = len(all_ztfids)
num_alr = len(df['objectId'])

# ---------------------------------------------------------------------------
# Seen (train+val) vs. unseen (test): the split is made on unique objectIds,
# so every alert of a given object ends up on the same side.
# ---------------------------------------------------------------------------
ztfids_seen, ztfids_test = train_test_split(
    all_ztfids, test_size=test_split, random_state=random_state
)

# Row indices of the alerts on each side, shuffled with the fixed seed.
# NOTE(review): the earlier comment justified the shuffle by validation being
# the bottom 10% of train; in this cell validation is split explicitly by
# object further down - confirm whether the shuffle is still required.
is_seen = df['objectId'].isin(ztfids_seen)
is_test = ~is_seen
mask_seen = _shuffled_rows(is_seen)
mask_test = _shuffled_rows(is_test)

x_seen, x_test = triplets[mask_seen], triplets[mask_test]
y_seen, y_test = df['label'][mask_seen], df['label'][mask_test]

num_seen_obj = len(ztfids_seen)
num_test_obj = len(ztfids_test)
_print_split("seen/train+val", "unseen/test", "seen/unseen", "object",
             num_seen_obj, num_test_obj, num_obj)

num_seen_alr = len(x_seen)
num_test_alr = len(x_test)
_print_split("seen/train+val", "unseen/test", "seen/unseen", "alert",
             num_seen_alr, num_test_alr, num_alr)

# ---------------------------------------------------------------------------
# Train vs. validation: the seen objects are split again, also by objectId.
# ---------------------------------------------------------------------------
ztfids_train, ztfids_val = train_test_split(
    ztfids_seen, test_size=validation_split, random_state=random_state
)

is_train = df['objectId'].isin(ztfids_train)
is_val = df['objectId'].isin(ztfids_val)
mask_train = _shuffled_rows(is_train)
mask_val = _shuffled_rows(is_val)

x_train, x_val = triplets[mask_train], triplets[mask_val]
y_train, y_val = df['label'][mask_train], df['label'][mask_val]

# Percentages below are relative to ALL objects/alerts, not to the seen subset.
num_train_obj = len(ztfids_train)
num_val_obj = len(ztfids_val)
_print_split("train", "val", "train/val", "object",
             num_train_obj, num_val_obj, num_obj)

num_train_alr = len(x_train)
num_val_alr = len(x_val)
_print_split("train", "val", "train/val", "alert",
             num_train_alr, num_val_alr, num_alr)

5290 seen/train+val objects
588 unseen/test objects
90.00%/10.00% seen/unseen split by object

50290 seen/train+val alerts
5596 unseen/test alerts
89.99%/10.01% seen/unseen split by alert

4761 train objects
529 val objects
81.00%/9.00% train/val split by object

45200 train alerts
5090 val alerts
80.88%/9.11% train/val split by alert



In [6]:
# Metadata rows of the validation alerts, selected by index label using the
# shuffled validation indices, so the row order matches x_val / y_val.
val_alerts = df.loc[mask_val]
# Bare expression: display the selected rows as the cell output.
val_alerts

Unnamed: 0,index,objectId,candid,label,jd,fid,programid,isdiffpos,rcid,field,...,dec,magpsf,sigmapsf,fwhm,ndethist,scorr,distnr,magzpsci,magzpsciunc,drb
862,14227,ZTF20acqzjqz,1425208912915015005,1,2.459180e+06,1,1,True,29,453,...,4.060277,16.342489,0.036401,2.75,22,67.692108,,,,
24433,3294,ZTF18aacnlxz,1428490740115015002,0,2.459183e+06,2,1,True,1,709,...,38.109049,19.101851,0.102994,2.48,18,9.313935,,,,
28114,24635,ZTF19aakpfax,806491844615015012,0,2.458561e+06,1,2,True,46,825,...,62.517710,19.882389,0.203436,1.94,78,8.595606,,,,
41598,173157,ZTF18acvbigh,985459745315015001,0,2.458740e+06,2,1,True,53,300,...,-13.622151,18.579016,0.140799,2.33,21,8.226796,,,,
5031,58340,ZTF20abeeytg,1281301341515015014,1,2.459036e+06,2,1,True,15,820,...,58.915173,18.124447,0.093267,2.65,15,16.164534,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35149,67267,ZTF21aayqrgx,1595305165915015002,0,2.459350e+06,2,1,True,59,845,...,72.117254,18.552069,0.060475,2.42,20,22.698732,,,,
11943,113835,ZTF21aagmoqw,1499275265215015001,1,2.459254e+06,2,1,True,52,305,...,-13.562627,18.190701,0.077641,3.34,5,20.490553,,,,
26361,13595,ZTF18acvwdkk,770454430515015024,0,2.458525e+06,2,2,True,5,1572,...,12.573354,19.043989,0.110844,2.25,5,25.011106,,,,
37765,110600,ZTF18aarrwmi,1227298262515015001,0,2.458982e+06,1,1,True,25,679,...,33.095850,19.226009,0.133922,1.90,292,14.583209,,,,


In [7]:
# Candidate ids stored in the report under 'Misclassified val candids'.
misclass_candids = report['Misclassified val candids']

In [9]:
# For every candid listed in the report, check whether it occurs in the
# candidates table. The lookup set is built once, so each membership test is
# a hash lookup instead of a full linear scan of the candid column.
# NOTE(review): the saved output of this cell contains many False entries,
# i.e. a number of reported candids are absent from this table - confirm the
# report and the CSV were produced from the same dataset version.
known_candids = set(df['candid'].tolist())
[candid in known_candids for candid in misclass_candids]

[False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
