In [1]:
import numpy as np
from tensorflow import keras
import pickle
from numpy import exp
import lightkurve
from sklearn import preprocessing
import pandas as pd
import os
import lightkurve as lk
import matplotlib.pyplot as plt
import numpy as np
from io import BytesIO
import os.path
import tensorflow as tf
import base64
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#https://stackoverflow.com/questions/60191681/cannot-load-file-containing-pickled-data-python-npy-i-o

In [2]:
def find_tce(kepid, tce_plnt_num, filenames):
    for filename in filenames:
        for record in tf.compat.v1.python_io.tf_record_iterator(filename):
            ex = tf.train.Example.FromString(record)
            if (ex.features.feature["kepid"].int64_list.value[0] == kepid and
                ex.features.feature["tce_plnt_num"].int64_list.value[0] == tce_plnt_num):
                print("Found {}_{} in file {}".format(kepid, tce_plnt_num, filename))
                return ex
    raise ValueError("{}_{} not found in files: {}".format(kepid, tce_plnt_num, filenames))

def getLocalView(kepid, dir):
    filenames = tf.io.gfile.glob(os.path.join(dir, "*"))
    assert filenames, "No files found in {}".format(dir)
    ex = find_tce(kepid, 1, filenames)

    # Get the local view.
    local_view = np.array(ex.features.feature["local_view"].float_list.value)
    return local_view

def downloadLC_kepler(target):
    #search and download target light curve
    search_result = lk.search_lightcurve(target, author='Kepler', cadence='long')

    kep_id = search_result.target_name.data[0]
    kep_id = kep_id[4:] #substr to remove "kplr" prefix
    kep_id = int(kep_id.lstrip("0"))

    local_view = getLocalView(kep_id, "../Kepler/TFRecords")
    local_view_matrix = np.expand_dims(local_view,axis=0)
    return local_view_matrix

#error handling for downloading
def downloadLC_kaggle(target):
    light_curve = lk.search_lightcurve(target, author='Kepler', cadence='long', quarter=3).download()
    light_curve = light_curve.flatten().remove_outliers()
    flux = np.array(light_curve.flux)
    flux = np.expand_dims(flux, axis=0)
    return flux


In [3]:
def tobool(probs, path):
    if(np.any(probs >= .5)): #potential bug
        return "true"
    else:
        return "false"

def logistic_layer(y):
    y = np.array(y)
    y = 1 / (1 + exp(-y))
    y = y.ravel()
    return y

def normalize(data):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    data = (data - mean) / std
    return data

def light_curve_to_matrix_kaggle_lightkurve(lc):
    inputLC = downloadLC_kaggle(lc)
    flux_copy = np.array(inputLC[0])
    amt_pad = 3197-(len(inputLC[0])%3197)                                              #amount of medians to pad the last row of dataframe
    flux_median = np.full(shape=amt_pad, fill_value=np.median(np.array(inputLC[0])))   #array of medians to pad the last row to reach a factor of 3197
    flux_copy = np.append(flux_copy, flux_median)
    final_matrix = np.reshape(flux_copy, (len(flux_copy)//3197,3197))
    normalized_matrix = normalize(final_matrix)
    return normalized_matrix

def lightcurve_to_kaggle_tsfresh(lc):
    matrix = pd.DataFrame({
        "id" : np.zeros(len(lc), dtype=int),
        "time" : list(range(0, len(lc))),
        "flux" : lc})
    
    extracted_features = extract_features(matrix, column_id= "id", column_sort= "time", 
                                      column_value= "flux", 
                                      default_fc_parameters= EfficientFCParameters())
    extracted_features.dropna(axis=1, inplace=True)  #dropped the nan column
    normalized_matrix = preprocessing.normalize(extracted_features,norm='max', axis=0)
    return normalized_matrix

def lightcurve_to_kepler_tsfresh(lc):
    matrix = pd.DataFrame({
        "id" : np.zeros(len(lc), dtype=int),
        "time" : list(range(0, len(lc))),
        "FLUX" : lc})
    
    extracted_features = extract_features(matrix, column_id= "id", column_sort= "time", 
                                      column_value= "FLUX", 
                                      default_fc_parameters= EfficientFCParameters())
    with open("../Kepler/TSFresh/nanlist.txt") as file:
        for readline in file: 
            line_strip = readline.strip()
            extracted_features.drop(line_strip, axis=1, inplace=True)
    normalized_matrix = preprocessing.normalize(extracted_features,norm='max', axis=0)
    return normalized_matrix

def getPredictions(inputLC, models, path):
    inputLC_keras = np.expand_dims(inputLC, -1)
    predictions = {}
    
    if "rnn" in models:
        reconstructed_model = keras.models.load_model(path + "/keras-models/KerasRNN")
        probability = reconstructed_model.predict(inputLC_keras)
        predictions["RNN"] = {"Probability" : probability, "Classification" : tobool(probability, path)}

    if "lstm" in models:
        reconstructed_model = keras.models.load_model(path + "/keras-models/KerasLSTM")
        probability = reconstructed_model.predict(inputLC_keras)
        predictions["LSTM"] = {"Probability" : probability, "Classification" : tobool(probability, path)}
            
    if "gru" in models:
        reconstructed_model = keras.models.load_model(path + "/keras-models/KerasGRU")
        probability = reconstructed_model.predict(inputLC_keras)
        predictions["GRU"] = {"Probability" : probability, "Classification" : tobool(probability, path)}

    #Logistic Regression
    if "lr" in models:
        with open(path + "/tf-models/multi-lr.npy", 'rb') as f:
            W = np.load(f)
            b = np.load(f)
        predicted_y = W * inputLC + b
        predicted_y = logistic_layer(predicted_y)
        probability = np.max(predicted_y)
        predictions["LR"] = {"Probability" : probability, "Classification" : tobool(np.array(predicted_y), path)}

            
    #Decision Tree & Random Forest
    if "rf" in models:
        with open(path + "/tf-models/decision-tree.pkl", 'rb') as f:
            dt = pickle.load(f)
        probability = dt.predict_proba(inputLC)
        predictions["DT"] = {"Probability" : probability, "Classification" : tobool(probability, path)}

        with open(path + "/tf-models/random-forest.pkl", 'rb') as f:
            rf = pickle.load(f)
        probability = rf.predict_proba(inputLC)
        predictions["RF"] = {"Probability" : probability, "Classification" : tobool(probability, path)}
    
    return predictions
                
def predict(target_name, training_data, processing, models):
    
    if training_data == "kaggle" and processing == "tsfresh":
            inputLC = downloadLC_kaggle(target_name)[0]
            inputLC_kaggle_tsfresh = lightcurve_to_kaggle_tsfresh(inputLC)
            return getPredictions(inputLC_kaggle_tsfresh, models, "../Kaggle/TSFresh")
                
    if training_data == "kaggle" and processing == "lightkurve":
            inputLC_kaggle_lightkurve = light_curve_to_matrix_kaggle_lightkurve(target_name)
            return getPredictions(inputLC_kaggle_lightkurve, models, "../Kaggle/Regular")
    
    if training_data == "kepler" and processing == "tsfresh":
            inputLC = downloadLC_kepler(target_name)[0]
            inputLC_kepler_tsfresh = lightcurve_to_kepler_tsfresh(inputLC)
            return getPredictions(inputLC_kepler_tsfresh, models, "../Kepler/TSFresh")
        
    if training_data == "kepler" and processing == "lightkurve":
            inputLC_kepler_lightkurve = downloadLC_kepler(target_name)
            return getPredictions(inputLC_kepler_lightkurve, models, "../Kepler/Regular")    
    
    return {"error"}

In [4]:
print(predict("KIC 3733346", "kaggle", "tsfresh", ["gru", "lr", "rf"]))
#print(predict("KIC 3733346", "kaggle", "lightkurve", ["gru", "lr", "rf"]))
print(predict("KIC 3733346", "kepler", "lightkurve", ["gru", "lr", "rf"]))
print(predict("KIC 3733346", "kepler", "tsfresh", ["gru", "lr", "rf"]))

Feature Extraction: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]


{'GRU': {'Probability': array([[0.4452046]], dtype=float32), 'Classification': 'false'}, 'LR': {'Probability': 0.5018438824981267, 'Classification': 'true'}, 'DT': {'Probability': array([[0., 1.]]), 'Classification': 'true'}, 'RF': {'Probability': array([[0.43, 0.57]]), 'Classification': 'true'}}
Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`
Found 3733346_1 in file ../Kepler/TFRecords/train-00004-of-00008
{'GRU': {'Probability': array([[0.2680739]], dtype=float32), 'Classification': 'false'}, 'LR': {'Probability': 0.5096319572817811, 'Classification': 'true'}, 'DT': {'Probability': array([[1., 0.]]), 'Classification': 'true'}, 'RF': {'Probability': array([[0.72, 0.28]]), 'Classification': 'true'}}
Found 3733346_1 in file ../Kepler/TFRecords/train-00004-of-00008


Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 11.78it/s]


{'GRU': {'Probability': array([[0.4138532]], dtype=float32), 'Classification': 'false'}, 'LR': {'Probability': 0.552142351657521, 'Classification': 'true'}, 'DT': {'Probability': array([[1., 0.]]), 'Classification': 'true'}, 'RF': {'Probability': array([[0.57, 0.43]]), 'Classification': 'true'}}
