In [None]:
# !pip install -e ../.

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from ExoHunter.params import *
from ExoHunter.cleaner import Cleaner
from ExoHunter.trainer import Trainer



In [2]:
def get_kaggle_data(data_name='kaggle', test_size=0.2, drive=0, random_state=None):
    """Load the raw Kaggle CSVs and carve a validation split off the training file.

    The two files are read from ``DRIVE[drive]/raw_data/<data_name>/`` using
    ``FILEPATHS[data_name][0]`` (training file) and ``FILEPATHS[data_name][1]``
    (test file).

    Parameters
    ----------
    data_name : str, default 'kaggle'
        Key into ``FILEPATHS`` and name of the sub-directory under ``raw_data``.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``; size of the validation split taken
        from the training file.
    drive : int, default 0
        Index into ``DRIVE`` selecting the base directory.
    random_state : int or None, default None
        Passed to ``train_test_split``. ``None`` keeps the previous behaviour
        (a different shuffle on every call); pass an int for a reproducible
        train/validation split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_train, data_val, data_test)``. Note that this is three frames,
        whereas ``get_nasa_data`` returns two.
    """
    raw_dir = os.path.join(DRIVE[drive], 'raw_data', data_name)
    train_file = FILEPATHS[data_name][0]
    test_file = FILEPATHS[data_name][1]

    train_data = pd.read_csv(os.path.join(raw_dir, train_file))
    data_test = pd.read_csv(os.path.join(raw_dir, test_file))

    # Only the training file is split; the test file is returned untouched.
    data_train, data_val = train_test_split(
        train_data, test_size=test_size, random_state=random_state
    )
    return data_train, data_val, data_test

In [3]:
def min_window(data, threshold=-660):
    """Return the smallest per-row count of values above ``threshold``.

    Every column except ``LABEL`` is compared against ``threshold``. For each
    row the number of values strictly greater than it is counted (NaN never
    compares greater, so missing values are not counted), and the minimum of
    those counts over all rows is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``LABEL`` column; each row is counted separately.
    threshold : float, default -660
        Exclusive lower bound a value must exceed to be counted. The default
        is the cutoff this function previously hard-coded.
        NOTE(review): presumably values at or below -660 mark padding or
        invalid readings — confirm against the data preparation step.

    Returns
    -------
    The minimum count across rows (``nan`` when ``data`` has no rows).

    Raises
    ------
    KeyError
        If ``data`` has no ``LABEL`` column.
    """
    flux = data.drop(columns='LABEL')
    # Summing the boolean mask row-wise gives the same counts as masking the
    # transposed frame and counting non-null entries, without the transpose.
    return (flux > threshold).sum(axis=1).min()

In [4]:
def get_nasa_data(data_name='nasa', test_size=0.2, drive=0, random_state=None):
    """Build a labelled train/test split from the two raw NASA CSV files.

    The files are read from ``DRIVE[drive]/raw_data/<data_name>/`` using
    ``FILEPATHS[data_name][0][0]`` (exoplanet file) and
    ``FILEPATHS[data_name][0][1]`` (non-exoplanet file). Each file is
    transposed so that its original columns become rows, then labelled
    (``LABEL`` = 2 for the exoplanet file, 1 for the non-exoplanet file),
    concatenated, and truncated to the first ``min_window(all_data)`` columns
    so every row has the same length.

    Parameters
    ----------
    data_name : str, default 'nasa'
        Key into ``FILEPATHS`` and name of the sub-directory under ``raw_data``.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``.
    drive : int, default 0
        Index into ``DRIVE`` selecting the base directory.
    random_state : int or None, default None
        Passed to ``train_test_split``. ``None`` keeps the previous behaviour
        (a different shuffle on every call); pass an int for a reproducible
        split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)``, each with the truncated value columns
        followed by ``LABEL``.

    Raises
    ------
    KeyError
        If the expected ``Unnamed: 0`` / ``Unnamed: 0.1`` columns are missing
        from the CSV files.
    """
    raw_dir = os.path.join(DRIVE[drive], 'raw_data', data_name)
    exo_path = os.path.join(raw_dir, FILEPATHS[data_name][0][0])
    non_exo_path = os.path.join(raw_dir, FILEPATHS[data_name][0][1])

    # Drop the leftover index columns (presumably written by earlier to_csv
    # calls) without mutating in place, then put one original column per row.
    exo_data = pd.read_csv(exo_path).drop(columns=['Unnamed: 0', 'Unnamed: 0.1']).T
    non_exo_data = pd.read_csv(non_exo_path).drop(columns=['Unnamed: 0']).T

    exo_data['LABEL'] = 2
    non_exo_data['LABEL'] = 1

    all_data = pd.concat([exo_data, non_exo_data])
    minim = min_window(all_data)
    # Kept from the original cell: report the common window length.
    print(minim)

    labels = all_data['LABEL']
    # .copy() makes the truncated frame independent of `all_data`, so adding
    # LABEL below is not an assignment on a slice (SettingWithCopyWarning).
    all_data = all_data.iloc[:, :minim].copy()
    all_data['LABEL'] = labels

    train_data, test_data = train_test_split(
        all_data, test_size=test_size, random_state=random_state
    )

    return train_data, test_data

In [5]:
def get_raw_data(data_name='nasa', test_size=0.2, drive=0):
    """Dispatch to the raw-data loader that matches ``data_name``.

    ``'kaggle'`` is served by ``get_kaggle_data`` and ``'nasa'`` by
    ``get_nasa_data``; both receive ``test_size`` and ``drive`` as keyword
    arguments and fall back to their own default ``data_name``.

    Returns whatever the selected loader returns. In this notebook the Kaggle
    loader yields three frames (train, validation, test) and the NASA loader
    yields two (train, test). Any other ``data_name`` gives ``None``.
    """
    if data_name == 'kaggle':
        loader = get_kaggle_data
    elif data_name == 'nasa':
        loader = get_nasa_data
    else:
        # Unknown dataset: keep the silent None result.
        return None
    return loader(test_size=test_size, drive=drive)

In [6]:
def get_proc_data(data_name='nasa', test_size=0.2, drive=0):
    """Read the already-processed train and test CSV files for a dataset.

    Files are looked up under ``DRIVE[drive]/processed_data/<data_name>/``
    using ``FILEPATHS[data_name][1][0]`` (train) and
    ``FILEPATHS[data_name][1][1]`` (test). For ``'nasa'`` the ``Index``
    column is used as the frame index; other datasets are read with the
    default index.

    ``test_size`` is accepted for signature parity with the raw loaders but
    is not used here.

    NOTE(review): the raw Kaggle loader treats ``FILEPATHS['kaggle']`` as a
    flat list of two file names; if that is the real layout, the ``[1][0]`` /
    ``[1][1]`` lookups here would pick single characters for 'kaggle' —
    confirm against ExoHunter.params.
    NOTE(review): a later cell writes the NASA files with
    ``index_label='KepID'``, not ``'Index'`` — confirm which name the files
    on disk use.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, data_test)``.
    """
    base_dir = os.path.join(DRIVE[drive], 'processed_data', data_name)
    train_path, test_path = (
        os.path.join(base_dir, FILEPATHS[data_name][1][position])
        for position in (0, 1)
    )

    # Only the NASA files carry an explicit index column.
    read_options = {'index_col': 'Index'} if data_name == 'nasa' else {}
    train_data = pd.read_csv(train_path, **read_options)
    data_test = pd.read_csv(test_path, **read_options)

    return train_data, data_test

In [7]:
def get_data(data_name='nasa', test_size=0.2, drive=0, raw=0):
    """Load a dataset from either the raw or the processed files.

    A truthy ``raw`` routes the call to ``get_raw_data``; otherwise
    ``get_proc_data`` is used. ``data_name``, ``test_size`` and ``drive`` are
    forwarded positionally, and the chosen loader's result is returned as is.
    """
    loader = get_raw_data if raw else get_proc_data
    return loader(data_name, test_size, drive)

In [8]:
def get_Xy(data):
    """Split a labelled frame into a feature frame and a 0/1 target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``LABEL`` column holding 1 or 2.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without ``LABEL`` and ``y`` is
        ``LABEL`` recoded as 1 -> 0 and 2 -> 1. Labels outside that mapping
        become NaN. ``data`` itself is not modified.
    """
    label_to_target = {1: 0, 2: 1}
    features = data.drop(columns='LABEL')
    target = data['LABEL'].map(label_to_target)
    return features, target

In [9]:
# Instantiate the packaged Cleaner (imported from ExoHunter.cleaner); the
# following cells load data through it.
cleaner = Cleaner()

In [None]:
# Load a train/test pair with Cleaner.get_data's default arguments.
# NOTE(review): presumably these defaults match the notebook-level get_data
# (processed NASA data) — confirm in ExoHunter.cleaner.
train_nasa, test_nasa = cleaner.get_data()

In [None]:
# NOTE(review): the positional arguments presumably follow the notebook-level
# get_data signature (data_name, test_size, drive, raw), i.e. raw NASA data
# with test_size=0.2 from DRIVE[0] — confirm against Cleaner.get_data.
train_raw_nasa, test_raw_nasa = cleaner.get_data('nasa', 0.2, 0, 1)

In [10]:
# NOTE(review): the positional arguments presumably follow the notebook-level
# get_data signature (data_name, test_size, drive, raw) — confirm against
# Cleaner.get_data. Two frames are unpacked here, whereas the notebook-level
# get_kaggle_data returns three (train, validation, test).
train_raw_kaggle, test_raw_kaggle = cleaner.get_data('kaggle', 0.2, 0, 1)

In [None]:
# FILEPATHS = {
#     'kaggle': ['exoTrain.csv', 'exoTest.csv'],
#     'nasa': [['nasa_exo_wo_label_df.csv', 'nasa_non_exo_wo_label_df.csv'], ['nasaTrain.csv', 'nasaTest.csv']]
# }
# train_nasa, test_nasa = get_data()

In [None]:
# Display the raw NASA training frame loaded through the Cleaner.
train_raw_nasa

In [11]:
# Display the raw Kaggle training frame loaded through the Cleaner.
train_raw_kaggle

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,93.85,83.81,20.10,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,2,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.70,6.46,16.00,19.93
2,2,532.64,535.92,513.73,496.92,456.45,466.00,464.50,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.80,-28.91,-70.02,-96.67
3,2,326.52,347.39,302.35,298.13,317.74,312.70,322.33,311.31,312.42,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,2,-1107.21,-1112.59,-1118.95,-1095.10,-1057.55,-1034.48,-998.34,-1022.71,-989.57,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5082,1,-91.91,-92.97,-78.76,-97.33,-68.00,-68.24,-75.48,-49.25,-30.92,...,139.95,147.26,156.95,155.64,156.36,151.75,-24.45,-17.00,3.23,19.28
5083,1,989.75,891.01,908.53,851.83,755.11,615.78,595.77,458.87,492.84,...,-26.50,-4.84,-76.30,-37.84,-153.83,-136.16,38.03,100.28,-45.64,35.58
5084,1,273.39,278.00,261.73,236.99,280.73,264.90,252.92,254.88,237.60,...,-26.82,-53.89,-48.71,30.99,15.96,-3.47,65.73,88.42,79.07,79.43
5085,1,3.82,2.09,-3.29,-2.88,1.66,-0.75,3.85,-0.03,3.28,...,10.86,-3.23,-5.10,-4.61,-9.82,-1.50,-4.65,-14.55,-6.41,-2.55


In [None]:
# Build a NASA train/test split with the notebook-level loader: data_name
# defaults to 'nasa' and raw=1 routes the call to get_raw_data.
train_data, test_data = get_data(raw=1)

In [None]:
# Display the training split produced by get_data(raw=1).
train_data

In [None]:
# minim = temp[temp>-660].count().min()
# Write the split to CSV one directory above the working directory, storing
# the frame index as a column named 'KepID'.
# NOTE(review): get_proc_data reads the NASA files with index_col='Index'
# from DRIVE[drive]/processed_data/nasa, so these files would not load there
# as written (different index column name and directory) — confirm which
# convention is intended.
train_data.to_csv('../nasaTrain.csv', index=True, index_label='KepID')
test_data.to_csv('../nasaTest.csv', index=True, index_label='KepID')

In [None]:
# Read the exported training file back, restoring 'KepID' as the index, to
# check the round trip.
pd.read_csv('../nasaTrain.csv', index_col='KepID')

In [None]:
# Plot the row at position 700 of X_train.
# NOTE(review): X_train is not defined in any visible cell of this notebook
# (presumably the result of get_Xy(train_data)) — this cell fails on a fresh
# kernel unless it is created first.
plt.plot(X_train.iloc[700])

In [None]:
# exo_data.shape, non_exo_data.shape

In [None]:
# all_data = pd.concat([exo_data, non_exo_data])

In [None]:
# Per-column count of the values in `temp` that are greater than -660.
# NOTE(review): in the visible cells `temp` exists only as a local variable
# inside min_window, so this cell relies on leftover kernel state — confirm
# where `temp` is meant to come from.
temp2 = temp[temp>-660].count()

In [None]:
# Share of all_data's rows whose count in temp2 equals minim.
# NOTE(review): in the visible cells `minim` and `all_data` exist only as
# locals of get_nasa_data, so this cell relies on leftover kernel state.
min_length_frac = temp2[temp2==minim].count()/all_data.shape[0]

In [None]:
# Share of all_data's rows whose count in temp2 is exactly 18422 or 18421.
# NOTE(review): both lengths are hard-coded, presumably read off an earlier
# inspection of temp2 — confirm they still match the data.
majority_frac = temp2[temp2==18422].count()/all_data.shape[0]+temp2[temp2==18421].count()/all_data.shape[0]

In [None]:
# Snippet length reported as "chosen" in the summary table below; the same
# value is one of the two lengths counted in majority_frac.
snip_len = 18422

In [None]:
# Row labels for the summary table; each entry lines up by position with the
# value at the same position in `dats`.
cols = [
    'Min snippet length',
    'Fraction of samples of min snippet length',
    'Fraction of samples of min snippet length+1||min snippet length',
    'Fraction of samples of greater length',
    'Chosen snippet length'
]
# Values for the summary table, in the same order as `cols`.
# 1-majority_frac is everything not counted in majority_frac.
dats = [
    minim,
    min_length_frac,
    majority_frac,
    1-majority_frac,
    snip_len
]

In [None]:
# Render the summary as a single-column table indexed by the labels in `cols`.
pd.DataFrame(dats, index=cols)