In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import lightkurve as lk
import warnings
from imblearn.over_sampling import SMOTE 
warnings.filterwarnings("ignore")

In [2]:
def search(periodogram, left_perc, right_perc):
    """Return the period of the strongest peak inside a trimmed period window.

    The window is derived from the first and last entries of the period grid:
    ``left_perc`` of that span is cut from the start and ``right_perc`` from
    the end. Both window edges are exclusive.

    Parameters
    ----------
    periodogram : object
        Anything exposing ``period.value`` and ``power.value`` as equal-length
        1-D sequences.
    left_perc, right_perc : float
        Fractions of the period span (last minus first period) to exclude at
        the start and at the end of the grid.

    Returns
    -------
    The period whose power is largest among the samples strictly inside the
    window (the earliest one on ties), or ``0`` when the periodogram is empty
    or no sample with a non-NaN power lies inside the window.
    """
    periods = np.asarray(periodogram.period.value)
    powers = np.asarray(periodogram.power.value)

    # An empty grid has no first/last element to build the window from.
    if periods.size == 0:
        return 0

    span = periods[-1] - periods[0]
    left = periods[0] + span * left_perc
    right = periods[-1] - span * right_perc

    # NaN powers can never win a comparison, so drop them up front; otherwise
    # argmax would report the NaN position instead of the real peak.
    in_window = (periods > left) & (periods < right) & ~np.isnan(powers)
    if not in_window.any():
        return 0

    # argmax returns the first occurrence, matching a strict ">" scan on ties.
    # No positivity floor is applied: the best sample wins even if every power
    # in the window is zero or negative.
    candidates = np.flatnonzero(in_window)
    best = candidates[np.argmax(powers[candidates])]
    return periods[best]

In [3]:
def preprocess(data):    
    """Exploratory pipeline: detrend, fill outliers, normalise, phase-fold and plot.

    Only rows 37 through 59 of ``data`` are processed; every other row of the
    returned array is left at zero. For each processed row the function draws
    several diagnostic plots (scatter plots, the periodogram) and prints the
    selected period, then adds the folded flux to a shared 13x3 subplot grid.

    Parameters
    ----------
    data
        Indexed as ``data[i, 1:3193]``, i.e. 2-D positional indexing that
        skips column 0. Presumably a NumPy array whose first column is the
        label -- TODO confirm against the caller.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), 3192)`` holding the folded flux for the
        processed rows.

    Notes
    -----
    NOTE(review): the function reads a global ``time_x`` that is not defined
    inside it; it must already exist in the notebook namespace or the call
    fails with ``NameError``.
    """
    fig = plt.figure(figsize=(12,40))
    
    #store processed light curve(flux) data
    features = np.zeros(shape=(len(data),3192))
    percent = 0
    # Hard-coded subset of rows; the subplot index further down assumes this
    # exact start value (37).
    for i in range(37,60): 
        
        #create light curve for each sample
        # `time_x` comes from the enclosing (global) scope, not from this function.
        lc = lk.LightCurve(time=time_x, flux=np.array(data[i,1:3193]).flatten())
        #lc.scatter()
        
        #detrend lightcurve
        lc = lc.flatten()
        
        #fill outliers with random noise
        # Only the mask is used; the outlier-free curve `lc_temp` is discarded.
        lc_temp, bool_mask = lc.remove_outliers(return_mask=True)
        # One Gaussian draw per flagged sample, using the mean/std of the
        # detrended flux (outliers included). Unseeded, so results vary per run.
        rands = np.random.normal(np.mean(lc.flux), np.std(lc.flux), np.count_nonzero(bool_mask))
        # NOTE(review): the fill below only reaches `lc` if `lc.flux` hands back
        # a view of the curve's data -- confirm for the installed lightkurve version.
        replica = lc.flux
        replica[bool_mask] = rands
        lc.scatter()
        
        #normalize
        lc = lc.normalize(unit='ppm')
        lc.scatter()
        
        #Phase folding using dominant frequencies from periodogram
        periodogram = lc.to_periodogram("bls", oversample_factor=1)
        periodogram.plot()
        # `search` ignores 1% of the period span at each end of the grid.
        # The "(BUG)" marker is the original author's; what the bug is has not
        # been recorded here.
        period = search(periodogram, .01, .01) #trim edges of periodogram (BUG)
        print(period)
        lc = lc.fold(period)
        features[i,:] = lc.flux
        lc.scatter()
        
        # Progress print: emits at most one line per iteration and advances
        # the counter by one each time it prints.
        if(((i / len(features)) * 100) > percent):
            print(percent, "%")
            percent += 1
            
        #graph all of them together
        # Slot numbers run from 1 (i == 37) to 23 (i == 59) in a 13x3 grid.
        ax = fig.add_subplot(13,3,i+1-37)
        ax.scatter(np.array(range(3192)),features[i,:])
    return features

In [4]:
def preprocess_2(data, sigma_lower=12, sigma_upper=2):
    """Detrend each row of ``data`` and replace clipped outliers with the median.

    Every row is turned into a light curve on an integer time axis, detrended
    with ``LightCurve.flatten()``, and sigma-clipped. Samples flagged by the
    clip are overwritten with the median of the samples that survived it, so
    each output row keeps the full input length.

    Parameters
    ----------
    data : pandas.DataFrame
        One light curve per row, one flux sample per column. The number of
        columns is taken from ``data.shape`` (it is no longer fixed at 3197).
    sigma_lower, sigma_upper : float, optional
        Clipping thresholds forwarded to ``remove_outliers``. The defaults
        (12 and 2) are the values that were previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``data.shape`` with the detrended, outlier-filled flux.

    Progress is printed as the loop advances, at most one line per row.
    """
    n_rows, n_points = data.shape

    # store processed light curve (flux) data
    features = np.zeros(shape=(n_rows, n_points))
    percent = 0
    for i in range(n_rows):
        # create light curve for each sample
        lc = lk.LightCurve(time=np.arange(n_points), flux=np.array(data.iloc[i, :]).flatten())

        # detrend lightcurve
        lc = lc.flatten()

        # Explicit copy of the detrended flux: the fill below must not depend
        # on whether ``lc.flux`` returns a view or a fresh array.
        replica = np.array(lc.flux, dtype=float)

        # ``clipped`` is shorter than the input, so it only supplies the fill
        # value; the mask marks which positions of the full-length row to patch.
        clipped, bool_mask = lc.remove_outliers(
            sigma_lower=sigma_lower, sigma_upper=sigma_upper, return_mask=True
        )
        replica[bool_mask] = np.median(np.asarray(clipped.flux, dtype=float))

        features[i, :] = replica

        # Coarse progress indicator (prints the counter, then bumps it by one).
        if (i / n_rows) * 100 > percent:
            print(percent, "%")
            percent += 1
    return features

In [5]:
# Read the training and test tables from disk.
data = pd.read_csv('archive/exoTrain.csv')
data_test = pd.read_csv('archive/exoTest.csv')

# Shift the labels down by one (presumably 1/2 -> 0/1; confirm against the CSVs).
# Plain subtraction builds new Series, so `data` / `data_test` are never
# modified -- an in-place `-=` on a column taken from the frame may or may not
# write through to the frame depending on the pandas version.
data_y = data['LABEL'] - 1
data_test_y = data_test['LABEL'] - 1

# Feature frames: every column except the label.
data_x = data.drop(columns='LABEL')
data_test_x = data_test.drop(columns='LABEL')

# Detrend and outlier-fill every row; prints progress while running.
train_processing = preprocess_2(data_x)
test_processing = preprocess_2(data_test_x)

0 %
1 %
2 %
3 %
4 %
5 %
6 %
7 %
8 %
9 %
10 %
11 %
12 %
13 %
14 %
15 %
16 %
17 %
18 %
19 %
20 %
21 %
22 %
23 %
24 %
25 %
26 %
27 %
28 %
29 %
30 %
31 %
32 %
33 %
34 %
35 %
36 %
37 %
38 %
39 %
40 %
41 %
42 %
43 %
44 %
45 %
46 %
47 %
48 %
49 %
50 %
51 %
52 %
53 %
54 %
55 %
56 %
57 %
58 %
59 %
60 %
61 %
62 %
63 %
64 %
65 %
66 %
67 %
68 %
69 %
70 %
71 %
72 %
73 %
74 %
75 %
76 %
77 %
78 %
79 %
80 %
81 %
82 %
83 %
84 %
85 %
86 %
87 %
88 %
89 %
90 %
91 %
92 %
93 %
94 %
95 %
96 %
97 %
98 %
99 %
0 %
1 %
2 %
3 %
4 %
5 %
6 %
7 %
8 %
9 %
10 %
11 %
12 %
13 %
14 %
15 %
16 %
17 %
18 %
19 %
20 %
21 %
22 %
23 %
24 %
25 %
26 %
27 %
28 %
29 %
30 %
31 %
32 %
33 %
34 %
35 %
36 %
37 %
38 %
39 %
40 %
41 %
42 %
43 %
44 %
45 %
46 %
47 %
48 %
49 %
50 %
51 %
52 %
53 %
54 %
55 %
56 %
57 %
58 %
59 %
60 %
61 %
62 %
63 %
64 %
65 %
66 %
67 %
68 %
69 %
70 %
71 %
72 %
73 %
74 %
75 %
76 %
77 %
78 %
79 %
80 %
81 %
82 %
83 %
84 %
85 %
86 %
87 %
88 %
89 %
90 %
91 %
92 %
93 %
94 %
95 %
96 %
97 %
98 %
99 %


In [6]:
#sm = SMOTE(random_state=42, k_neighbors=3)
#train_smote_x, train_smote_y = sm.fit_resample(train_processing, data_y)
#test_smote_x, test_smote_y = sm.fit_resample(test_processing, data_test_y)

#unique, counts = np.unique(train_smote_y, return_counts=True)
#print (unique, counts)

#unique, counts = np.unique(test_smote_y, return_counts=True)
#print (unique, counts)

In [7]:
# Persist the processed features and labels back-to-back in a single file.
# The write order is: train features, test features, train labels, test labels;
# a reader has to call np.load once per array, in that order, on one open handle.
with open('archive/ProcessData.npy', 'wb') as f:
    for arr in (train_processing, test_processing, data_y, data_test_y):
        np.save(f, arr)
    