## Data Cleaning

In [1]:
# Load h5 spectra and model parameters
import numpy as np
import pandas as pd
import os
import h5py
import tables

In [2]:
from pathlib import Path
# Directory (relative to the notebook) that holds the training HDF5 files.
PATH = "../data/train-data/"

In [3]:
def get_filename(path, is_dir=False):
    """Return the file name(s) designated by ``path`` as a list.

    Parameters
    ----------
    path : str
        Either a single file path or a directory path.
    is_dir : bool, optional
        When falsy (the default) ``path`` is treated as one file and is
        returned wrapped in a one-element list.  When truthy ``path`` is
        treated as a directory and the names of its entries are returned.

    Returns
    -------
    list of str
        ``[path]`` for a single file; otherwise the directory's entry names
        (bare names, not joined with ``path``) in sorted order.
    """
    if not is_dir:
        return [path]
    # os.listdir yields entries in arbitrary order; sorting gives callers the
    # same sequence on every platform and on every run.
    return sorted(os.listdir(path))

In [5]:
# Names of every entry in the training-data directory (names only, without
# the directory prefix).
files = get_filename(PATH, is_dir=True)

In [None]:
def parse_data(PATH, filename):
    """Load one HDF5 file and return its spectra and target parameters.

    Rows that repeat an earlier ``tau_e`` value are dropped (the first
    occurrence is kept).  For every remaining row the first five columns are
    taken as the targets and the sixth column as the spectrum, which is
    converted to magnitudes with ``np.abs`` and transposed.

    Parameters
    ----------
    PATH : str or os.PathLike
        Directory that contains ``filename``.
    filename : str
        Name of the HDF5 file inside ``PATH``.

    Returns
    -------
    X : numpy.ndarray
        One transposed magnitude spectrum per row.  The original note states
        each spectrum is 40 x 86; that shape is not checked here.
    Y : numpy.ndarray
        The five target parameters of each row.
    """
    # os.path.join accepts both plain strings and Path objects, so the
    # directory does not need a trailing separator.
    file = os.path.join(PATH, filename)
    data = pd.read_hdf(file)
    data = data.drop_duplicates(subset='tau_e', keep='first')

    X, Y = [], []
    for _, record in data.iterrows():
        # Explicit positional access: columns 0-4 are targets, column 5 the
        # spectrum.
        Y.append(record.iloc[:5].values)
        X.append(np.abs(record.iloc[5]).transpose())
    return np.array(X), np.array(Y)

In [98]:
def get_data(filepath, is_dir=False):
    """Read a slice of the HDF5 files in a directory into sample arrays.

    For every selected file, rows that repeat an earlier ``tau_e`` value are
    dropped (first occurrence kept).  From each remaining row the first five
    columns become one target vector and the sixth column becomes one
    spectrum, converted to magnitudes with ``np.abs`` and transposed.

    Parameters
    ----------
    filepath : str or os.PathLike
        Directory containing the HDF5 files.  A trailing separator is
        accepted but not required.
    is_dir : bool, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    x : numpy.ndarray
        Stacked spectra, one entry per retained row.
    y : numpy.ndarray
        Stacked target parameters, one 5-vector per retained row.
    filename : numpy.ndarray
        File names with their extension removed, one per file read.
    """
    # Selects up to ten directory entries, always skipping the final one.
    # NOTE(review): the original comment said "only get the last 10 files",
    # but [-11:-1] excludes the very last entry; confirm which is intended.
    # os.listdir order is arbitrary, so the selection may vary by platform.
    files = os.listdir(filepath)[-11:-1]

    x, y, filename = [], [], []
    for file in files:
        # Strip the extension whatever its length (".h5" in practice).
        filename.append(os.path.splitext(file)[0])
        data = pd.read_hdf(os.path.join(filepath, file))
        data = data.drop_duplicates(subset='tau_e', keep='first')
        for _, record in data.iterrows():
            # Explicit positional access; integer keys passed to
            # Series.__getitem__ on a labelled index are deprecated.
            y.append(record.iloc[:5].values)
            x.append(np.abs(record.iloc[5]).transpose())
    x = np.array(x)
    y = np.array(y)
    filename = np.array(filename)
    return x, y, filename

In [99]:
# Build the sample arrays from the training directory: spectra (x), target
# parameters (y) and the extension-less names of the files that were read.
x, y, filenames = get_data(PATH)

In [70]:
x.shape

(11140, 40, 86)

In [100]:
y.shape

(11140, 5)

In [101]:
# split train, test, valid set
# NOTE(review): train_test_split is not imported in the visible cells --
# presumably sklearn.model_selection.train_test_split; confirm the import.
# Step 1: hold out 20% of all samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=27)
# Step 2: take 25% of the remaining 80% as the validation set, which gives
# a 60/20/20 train/validation/test split overall.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=27)

In [102]:
# save the data to disk
# x has 3 dimensions, need to be saved slice by slice
def save_x(filename, data):
    """Write a 3-D array to ``../data/<filename>.txt`` as stacked 2-D slices.

    ``np.savetxt`` only handles 1-D and 2-D input, so each slice along the
    first axis is appended to the same text file in turn.  Reading the file
    back with ``np.loadtxt`` therefore yields a 2-D array that has to be
    reshaped to the original three dimensions.

    Parameters
    ----------
    filename : str
        Base name of the output file, without directory or extension.
    data : iterable of array_like
        The slices to write; each one must be 1-D or 2-D.
    """
    # The extension needs its leading dot: filename + 'txt' produced names
    # such as "x_traintxt".
    with open('../data/' + filename + '.txt', 'w') as file:
        for record in data:
            np.savetxt(file, record)

In [104]:
# Write each spectra split to its own text file via save_x.
# NOTE(review): no matching save of the y_* arrays is visible in these
# cells, although y_*.txt files are read back later -- confirm where they
# are written.
save_x('x_train', x_train)
save_x('x_test', x_test)
save_x('x_val', x_val)

In [122]:
# load data from the disk
# np.loadtxt returns each spectra file as one 2-D array; reshaping with -1
# lets NumPy infer the sample count, so this cell keeps working when the
# dataset or split sizes change. Each sample is restored to 40 x 86.
# NOTE(review): the x_* paths carry no file extension while the y_* paths end
# in ".txt" -- confirm they match the names the save step actually writes.
x_train = np.loadtxt('../data/x_train').reshape(-1, 40, 86)
x_test = np.loadtxt('../data/x_test').reshape(-1, 40, 86)
x_val = np.loadtxt('../data/x_val').reshape(-1, 40, 86)
y_train = np.loadtxt('../data/y_train.txt')
y_test = np.loadtxt('../data/y_test.txt')
# NOTE(review): this binds y_valid, whereas the split cell names the same
# set y_val.
y_valid = np.loadtxt('../data/y_val.txt')

In [123]:
x_train.shape

(6684, 40, 86)