### Dependencies

In [1]:
import sqlite3 # Database library.
import os # Folder management library.
import pickle # Serializing module.
import numpy as np # Scientific computing library.
from sklearn.feature_selection import SelectFromModel # Feature selection library.
# Classifier used in feature selection technique -- SelectFromModel.
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier

### Retrieve Binary Data Sets -- Train, CV, Test

In [2]:
# Load the pickled train / test / CV feature matrices (column X) and label
# vectors (column Y) from the SQLite database and convert them to NumPy arrays.

# Initializing database and cursor.
# NOTE(review): the connection is intentionally left open, as before, in case
# later cells reuse `star_data_cursor`; call `star_data_db.close()` once the
# database is no longer needed.
star_data_db = sqlite3.connect('star_data_sets_binary.db')
star_data_cursor = star_data_db.cursor()

sets = ['train_set', 'test_set', 'cv_set']
set_types = ['X', 'Y']

# Deserialized objects in query order:
# train X, train Y, test X, test Y, cv X, cv Y.
data_temp = []
for dset in sets:
    for set_type in set_types:
        # Table and column names cannot be bound as SQL parameters; they only
        # ever come from the two hard-coded lists above.
        star_data_cursor.execute('SELECT '+set_type+' FROM '+dset)
        rows = star_data_cursor.fetchall()

        # The positional unpacking below is only correct when every query
        # yields exactly one pickled blob. Fail loudly instead of silently
        # shifting the train/test/cv assignments.
        if len(rows) != 1:
            raise ValueError(
                'Expected exactly one row in column {} of table {}, found {}.'
                .format(set_type, dset, len(rows)))

        # Deserializing features.
        # SECURITY: pickle.loads can execute arbitrary code -- only open a
        # database file that comes from a trusted source.
        data_temp.append(pickle.loads(rows[0][0]))

# Same order as the nested loops over `sets` and `set_types`.
X_train, y_train, X_test, y_test, X_cv, y_cv = (
    np.array(data) for data in data_temp
)

print('Done.')

Done.


### The Features

In [3]:
# Names of the 64 features. The list is indexed by position, so the order
# must not change.
feature_names = (
    [
        'Amplitude',
        'AndersonDarling',
        'Autocor_length',
        'Beyond1Std',
        'CAR_mean',
        'CAR_sigma',
        'CAR_tau',
        'Con',
        'Eta_e',
        'FluxPercentileRatioMid20',
        'FluxPercentileRatioMid35',
        'FluxPercentileRatioMid50',
        'FluxPercentileRatioMid65',
        'FluxPercentileRatioMid80',
    ]
    # Freq1..Freq3: four harmonic amplitudes followed by four relative phases
    # for each frequency (24 names in total).
    + [
        'Freq{}_harmonics_{}_{}'.format(freq, quantity, harmonic)
        for freq in (1, 2, 3)
        for quantity in ('amplitude', 'rel_phase')
        for harmonic in range(4)
    ]
    + [
        'Gskew',
        'LinearTrend',
        'MaxSlope',
        'Mean',
        'Meanvariance',
        'MedianAbsDev',
        'MedianBRP',
        'PairSlopeTrend',
        'PercentAmplitude',
        'PercentDifferenceFluxPercentile',
        'Period_fit',
        'PeriodLS',
        'Psi_CS',
        'Psi_eta',
        'Q31',
        'Rcs',
        'Skew',
        'SlottedA_length',
        'SmallKurtosis',
        'Std',
        'StetsonK',
        'StetsonK_AC',
        'StructureFunction_index_21',
        'StructureFunction_index_31',
        'StructureFunction_index_32',
        'Colour',
    ]
)

### Feature Selection Using SelectFromModel and Various Classifiers
The classifiers passed to `SelectFromModel` as its estimator are `LinearSVC` and `ExtraTreesClassifier`.

In [3]:
# Feature selection driven by an L1-penalised linear SVM.
# Lowering C results in fewer features being selected.
svc_estimator = LinearSVC(C=0.1, penalty="l1", dual=False)
fs_model = SelectFromModel(svc_estimator)
fs_model.fit(X_train, y_train)

# Report the training-matrix shape before and after the selection.
original_shape = X_train.shape
reduced_shape = fs_model.transform(X_train).shape
print(original_shape, ' Is the original size.')
print(reduced_shape, ' Is the new size.\n')

# get_support() gives a mask over the features; turn it into the integer
# positions of the features that were kept.
selected_features = fs_model.get_support()
selected_ind = np.flatnonzero(selected_features)

n_selected = len(selected_ind)
print(n_selected, ' features were selected to be important.')
print('The selected features are:\n')
for ind in selected_ind:
    print(feature_names[ind])

(6460, 64)  Is the original size.
(6460, 39)  Is the new size.

39  features were selected to be important.
The selected features are:

AndersonDarling
Autocor_length
Beyond1Std
CAR_mean
CAR_sigma
CAR_tau
FluxPercentileRatioMid20
FluxPercentileRatioMid35
FluxPercentileRatioMid50
FluxPercentileRatioMid65
FluxPercentileRatioMid80
Freq1_harmonics_amplitude_0
Freq1_harmonics_rel_phase_1
Freq1_harmonics_rel_phase_2
Freq2_harmonics_rel_phase_1
Freq2_harmonics_rel_phase_2
Freq2_harmonics_rel_phase_3
Freq3_harmonics_rel_phase_1
Freq3_harmonics_rel_phase_2
Freq3_harmonics_rel_phase_3
Gskew
MaxSlope
Mean
MedianBRP
PairSlopeTrend
Period_fit
Psi_CS
Psi_eta
Q31
Rcs
Skew
SlottedA_length
SmallKurtosis
StetsonK
StetsonK_AC
StructureFunction_index_21
StructureFunction_index_31
StructureFunction_index_32
Colour


In [5]:
# Feature selection driven by an extra-trees ensemble
# (250 estimators, random_state fixed to 0).
trees_estimator = ExtraTreesClassifier(n_estimators=250, random_state=0)
fs_model = SelectFromModel(trees_estimator)
fs_model.fit(X_train, y_train)

# Report the training-matrix shape before and after the selection.
original_shape = X_train.shape
reduced_shape = fs_model.transform(X_train).shape
print(original_shape, ' Is the original size.')
print(reduced_shape, ' Is the new size.\n')

# get_support() gives a mask over the features; turn it into the integer
# positions of the features that were kept.
selected_features = fs_model.get_support()
selected_ind = np.flatnonzero(selected_features)

n_selected = len(selected_ind)
print(n_selected, ' features were selected to be important.')
print('The selected features are:\n')
for ind in selected_ind:
    print(feature_names[ind])

(6460, 64)  Is the original size.
(6460, 29)  Is the new size.

29  features were selected to be important.
The selected features are:

Amplitude
Autocor_length
Beyond1Std
CAR_mean
CAR_tau
FluxPercentileRatioMid20
FluxPercentileRatioMid35
FluxPercentileRatioMid50
FluxPercentileRatioMid65
FluxPercentileRatioMid80
Freq1_harmonics_amplitude_0
Freq1_harmonics_amplitude_1
Gskew
Mean
Meanvariance
MedianAbsDev
MedianBRP
PercentAmplitude
PercentDifferenceFluxPercentile
Period_fit
Psi_eta
Q31
Rcs
Skew
SlottedA_length
Std
StetsonK
StetsonK_AC
Colour
{'estimator__max_features': 'auto', 'estimator__n_estimators': 250, 'estimator__min_samples_leaf': 1, 'estimator__n_jobs': 1, 'estimator__min_impurity_decrease': 0.0, 'estimator__oob_score': False, 'estimator__verbose': 0, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__random_state': 0, 'estimator__max_leaf_nodes': None, 'estimator__min_samples_split': 2, 'estimator__class_weight': None, 'estimator': ExtraTreesClassifier(bootstrap=False, cla