In [1]:
# Imports
import json
import os
import sys
import pickle
import numpy as np
import torch
from sklearn.model_selection import train_test_split
# Directory prefix for the raw flux files. The trailing slash is required:
# the loader builds paths by plain string concatenation (f'{filepath}<name>').
filepath = 'data/'

## Lens Split

In [2]:
# Load the raw pickled flux tables
def _read_flux(path, banner):
    """Open ``path``, print ``banner``, and return the unpickled payload as an ndarray."""
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at files from a trusted source.
    with open(path, 'rb') as stream:
        print(banner)
        # the file is consumed as a binary stream
        return np.asarray(pickle.load(stream))


lens_flux = _read_flux(f'{filepath}lens_flux-3600.data', "Loading lens fluxes")
bg_flux = _read_flux(f'{filepath}bg_flux-3600.data', "Loading background fluxes")

# Row counts per class; nxlen is overwritten and ends up holding the
# background table's column count.
nlenses, nxlen = lens_flux.shape
nbg, nxlen = bg_flux.shape

# Stack the lens rows on top of the background rows
input_data = np.vstack((lens_flux, bg_flux))

# Class ids in the same row order: 0 for lens rows, 1 for background rows
labels = np.repeat([0, 1], [nlenses, nbg])

# Displayed as the cell output
nlenses, nbg, nxlen, labels.shape, input_data.shape

Loading lens fluxes
Loading background fluxes


(11891, 13966, 3600, (25857,), (25857, 3600))

In [3]:
# Extract length of spectra
spectra_length: int = input_data.shape[1]

# Find length of patch ~ 1/100 of spectra length.
# Clamp to 1 so spectra shorter than 100 samples do not produce a zero
# patch size (which would raise ZeroDivisionError just below).
patch_size = max(1, spectra_length // 100)
# Ceiling division: the trailing partial patch must be counted too. With floor
# division new_spectra_length could never exceed spectra_length, so the pad
# width below went negative (np.pad raises ValueError) whenever the spectrum
# length was not an exact multiple of patch_size.
patch_num = -(-spectra_length // patch_size)
new_spectra_length = patch_size * patch_num
# The ratio printed here is patch_size / new_spectra_length.
print(f"Patch size: {patch_size}, Number of Patches: {patch_num}, NumPatch/SpecSize: {patch_size/new_spectra_length}")
print("New Spectra Length: ", new_spectra_length)

# Zero-pad the end of every spectrum so its length is divisible by patch size
# (pad_width is 0 when the length already is a multiple).
pad_width = new_spectra_length - spectra_length
input_data = np.pad(input_data, ((0, 0), (0, pad_width)), mode='constant')

print("Input Data Shape: ", input_data.shape)
print("Labels Shape: ", labels.shape)

# update spectra length to the padded length used by the following cells
spectra_length = input_data.shape[1]

Patch size: 36, Number of Patches: 100, NumPatch/SpecSize: 0.01
New Spectra Length:  3600
Input Data Shape:  (25857, 3600)
Labels Shape:  (25857,)


In [4]:
# Fixed seed so the train/validation/test partition is identical on every
# Restart & Run All; unseeded, each run would write a different split to disk.
SPLIT_SEED = 42

# split into train (60%) and a held-out remainder (40%)
x_train, x_test, y_train, y_test = train_test_split(
    input_data, labels, test_size=1-0.6, random_state=SPLIT_SEED)

# halve the held-out remainder: 20% validation, 20% test of the full data set
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=.2/(.2+.2), random_state=SPLIT_SEED)

# convert to torch tensors; spectra get an explicit singleton middle axis,
# i.e. shape (n_samples, 1, spectra_length)
x_train = torch.Tensor(np.asarray(x_train).reshape(-1, 1, spectra_length))
x_test = torch.Tensor(np.asarray(x_test).reshape(-1, 1, spectra_length))
x_val = torch.Tensor(np.asarray(x_val).reshape(-1, 1, spectra_length))
# labels are cast to integer (long) tensors
y_train = torch.Tensor(y_train).long().squeeze()
y_test = torch.Tensor(y_test).long().squeeze()
y_val = torch.Tensor(y_val).long().squeeze()

print("Training set: ", x_train.shape)
print("Test set: ", x_test.shape)
print("Validation set: ", x_val.shape)

Training set:  torch.Size([15514, 1, 3600])
Test set:  torch.Size([5172, 1, 3600])
Validation set:  torch.Size([5171, 1, 3600])


In [5]:
# Create the output directory first: np.save and open() do not create missing
# parent directories, so on a fresh checkout every write below would fail.
os.makedirs('split', exist_ok=True)

# Save training, testing and validation arrays (features x*, labels y*).
# Tensors are moved to CPU and converted to NumPy explicitly before saving.
split_tensors = {
    'xtrain': x_train, 'ytrain': y_train,
    'xtest': x_test, 'ytest': y_test,
    'xval': x_val, 'yval': y_val,
}
for split_name, tensor in split_tensors.items():
    np.save(f'split/V1_{split_name}.npy', tensor.cpu().numpy())

# Save all hyperparameters needed to re-create the patching downstream
parameters = json.dumps({'patch_size':patch_size, 'patch_num':patch_num, 'spectra_length':spectra_length})

# open file for writing, "w"
with open("split/parameters.json","w") as f:
    f.write(parameters)

In [6]:
# Clear the notebook namespace now that the split has been written to disk.
# Per the IPython docs: -s is a "soft" reset (namespace cleared, history kept)
# and -f skips the confirmation prompt. The explicit % prefix means this does
# not depend on IPython's automagic setting. Keep comments off the magic line
# itself: trailing text would be passed to the magic as arguments.
%reset -sf