### Dependencies

In [1]:
# Feature selection functions imported from scikit-learn.
#from sklearn.feature_selection import VarianceThreshold
import sqlite3 # Database library.
import os # Folder management library.
import pickle # Serializing module
import numpy as np

### Retrieve Examples from Database

In [2]:
# Initializing database and cursor.
star_data_db = sqlite3.connect('star_data.db')
star_data_cursor = star_data_db.cursor()

# Star classes to retrieve. Each name is used as a table name in the query
# below, and its index in this list is the integer label stored in Y.
classes = ['cep_1o', 'cep_f', 'dsct','eb_ec', 'eb_ed', 'eb_esd', 'lpv_mira_agb_c', 'lpv_mira_agb_o', 'lpv_osarg_agb',
           'lpv_osarg_rgb', 'lpv_srv_agb_c', 'lpv_srv_agb_o', 'rrab', 'rrc', 'rrd', 'rre', 't2cep']

# Per-class split boundaries: at most 600 examples are read per class, of
# which [0, 380) go to train, [380, 490) to test and [490, 600) to cv.
MAX_EXAMPLES_PER_CLASS = 600
TRAIN_END = 380
TEST_END = 490

# X[label] / Y[label] hold the feature vectors / labels of a single class.
X, Y = [], []
try:
    for label, classv in enumerate(classes):
        temp_X, temp_Y = [], []
        # Table names cannot be bound as SQL parameters, so the query is built
        # by concatenation; this is only safe because `classes` is hard-coded.
        star_data_cursor.execute('SELECT star_features FROM '+classv)
        # fetchmany() returns at most MAX_EXAMPLES_PER_CLASS rows instead of
        # loading the whole table and then discarding the excess.
        for row in star_data_cursor.fetchmany(MAX_EXAMPLES_PER_CLASS):
            # Deserializing features.
            # NOTE(review): pickle.loads can execute arbitrary code; only use
            # this with a database file from a trusted source.
            features = pickle.loads(row[0])
            temp_X.append(features)
            temp_Y.append([label])
        X.append(temp_X)
        Y.append(temp_Y)
finally:
    # Close cursor and database, even if a query or unpickling fails.
    # (The original wrote `star_data_cursor.close` without parentheses, which
    # never actually closed the cursor.)
    star_data_cursor.close()
    star_data_db.close()

# Randomly shuffle the examples of each class in place. Only X is shuffled:
# every entry of Y[label] is the same [label], so X and Y stay aligned.
# NOTE(review): no random seed is set in this cell, so the split is not
# reproducible across runs.
for class_set in X:
    np.random.shuffle(class_set)

# Form train, cross-validate, and test sets as flat lists.
# Per class: up to 380 examples in the train set, up to 110 in the test set,
# and up to 110 in the cv set (fewer when a class has fewer than 600 rows).
train_X, test_X, cv_X = [], [], []
train_Y, test_Y, cv_Y = [], [], []

for class_X, class_Y in zip(X, Y):
    train_X.extend(class_X[:TRAIN_END])
    test_X.extend(class_X[TRAIN_END:TEST_END])
    cv_X.extend(class_X[TEST_END:MAX_EXAMPLES_PER_CLASS])

    train_Y.extend(class_Y[:TRAIN_END])
    test_Y.extend(class_Y[TRAIN_END:TEST_END])
    cv_Y.extend(class_Y[TEST_END:MAX_EXAMPLES_PER_CLASS])

### Write Train, Test, CV sets to Database

In [3]:
# Initializing database and cursor.
star_sets_db = sqlite3.connect('star_data_sets.db')
star_sets_cursor = star_sets_db.cursor()

try:
    # One table per data set; each row stores a pickled X list and a pickled
    # Y list as BLOBs.
    star_sets_cursor.execute("CREATE TABLE IF NOT EXISTS train_set(X BLOB, Y BLOB)")
    star_sets_cursor.execute("CREATE TABLE IF NOT EXISTS test_set(X BLOB, Y BLOB)")
    star_sets_cursor.execute("CREATE TABLE IF NOT EXISTS cv_set(X BLOB, Y BLOB)")

    # Serializing Data so that it can be stored in database.
    train_X_pickled = pickle.dumps(train_X, pickle.HIGHEST_PROTOCOL)
    train_Y_pickled = pickle.dumps(train_Y, pickle.HIGHEST_PROTOCOL)
    test_X_pickled = pickle.dumps(test_X, pickle.HIGHEST_PROTOCOL)
    test_Y_pickled = pickle.dumps(test_Y, pickle.HIGHEST_PROTOCOL)
    cv_X_pickled = pickle.dumps(cv_X, pickle.HIGHEST_PROTOCOL)
    cv_Y_pickled = pickle.dumps(cv_Y, pickle.HIGHEST_PROTOCOL)

    # Storing the data sets in database for future reference.
    # NOTE(review): the tables are created with IF NOT EXISTS and never
    # cleared, so every run of this cell appends one more row to each table.
    star_sets_cursor.execute("INSERT INTO train_set(X, Y) VALUES (?,?)",
                             (sqlite3.Binary(train_X_pickled), sqlite3.Binary(train_Y_pickled)))
    star_sets_cursor.execute("INSERT INTO test_set(X, Y) VALUES (?,?)",
                             (sqlite3.Binary(test_X_pickled), sqlite3.Binary(test_Y_pickled)))
    star_sets_cursor.execute("INSERT INTO cv_set(X, Y) VALUES (?,?)",
                             (sqlite3.Binary(cv_X_pickled), sqlite3.Binary(cv_Y_pickled)))
    star_sets_db.commit()
finally:
    # Close cursor and database, even if a statement above fails.
    # (The original wrote `star_sets_cursor.close` without parentheses, which
    # never actually closed the cursor.)
    star_sets_cursor.close()
    star_sets_db.close()


[[0.08800000000000097, 1.0, 1.0, 0.489010989010989, 6.944630877859366, -0.05576545814126668, 2.1916053390848518, 0.0, 6.042266743868836, 0.30120481927710135, 0.5240963855421766, 0.710843373493987, 0.8373493975903556, 0.9337349397590302, 0.08246852258898776, 0.005479340910972905, 0.0028418699652533906, 0.006062250079006834, 0.0, 1.0012177723951088, 2.3707261226147103, 0.4131451226812992, 0.008407622911196557, 0.0006301700138350181, 0.001762160800906117, 0.0015300389094897848, 0.0, -1.4717197584117114, -2.153745700507042, -1.0848633980523668, 0.00586348791031404, 0.0009843682846886622, 0.0009263988120563587, 0.00014403100305964252, 0.0, 0.8330425550140839, 1.4265006334693706, 0.2137081369086926, -0.00999999999999801, 4.445812853411769e-06, 0.08314061275582775, 15.219890109890107, 0.0039012415127720815, 0.05850000000000044, 0.13736263736263737, 0.03333333333333333, 0.006667323545833725, 0.010904194173481813, 2.2766357398164545, 5.185860340689965e-72, 0.23123358495738228, 0.063916452415423

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn the list of single-element label lists in `y` into an indicator
# matrix with one column per distinct label.
# NOTE(review): `y` is not defined in any cell visible in this notebook, and
# this assignment rebinds the name `Y` used by the retrieval cell.
label_binarizer = MultiLabelBinarizer()
Y = label_binarizer.fit_transform(y)
print(Y)

[[1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]
 [0 0 0 ..., 0 0 1]]


### Removing features with low variance

In [24]:
from sklearn.feature_selection import VarianceThreshold

# (A toy boolean X, [[0, 0, 1], [0, 1, 0], ...], was used here while
# experimenting; the selector now runs on the real X.)
print(len(X))

# Variance cut-off of the form p * (1 - p) with p = 0.8; features whose
# variance falls below it are removed by the selector.
p = .8
variance_cutoff = p * (1 - p)
sel = VarianceThreshold(threshold=variance_cutoff)
J = sel.fit_transform(X)

# Show the retained feature values of the first example.
print(J[0])



120
[  1.           6.74065412   2.26944314   9.00431076   2.01618904
   0.25089575   2.51641834  -0.19544798   0.42600746   1.57417675
   0.72741717   0.48184163  -0.42631486   0.14521918  15.29753125
   1.93854131   0.04479627  -0.10384533   3.85524     -1.41473181
   2.0305565   -0.59214443]


### Univariate feature selection

In [33]:
# Relabel y in place for a binary problem: [12] becomes [1], every other
# label becomes [0]. The labels are printed before and after the change.
# NOTE(review): this is not idempotent -- after one run no entry equals [12]
# any more, so running the cell a second time sets every label to [0].
print(y)
for idx, current_label in enumerate(y):
    y[idx] = [1] if current_label == [12] else [0]
print(y)
        

[[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [4], [4], [4], [4], [4], [4], [4], [4], [4], [4], [5], [5], [5], [5], [5], [5], [5], [5], [5], [5], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [7], [7], [7], [7], [7], [7], [7], [7], [7], [7], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [10], [10], [10], [10], [10], [10], [10], [10], [10], [10], [11], [11], [11], [11], [11], [11], [11], [11], [11], [11], [12], [12], [12], [12], [12], [12], [12], [12], [12], [12]]
[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]

In [34]:
# Display the current contents of y (the labels after the in-place relabelling).
print(y)

[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]]


In [5]:
# Scratch check of enumerate() and of slicing past the end of a list.
a = ['hi1', 'hi2', 'hi3']

# Print a 1-based position followed by the element itself.
for position, item in enumerate(a, start=1):
    print(position)
    print(item)

# A slice whose stop exceeds len(a) simply returns everything available.
print(a)
print(a[:2])
print(a[:45645555])

1
hi1
2
hi2
3
hi3
['hi1', 'hi2', 'hi3']
['hi1', 'hi2']
['hi1', 'hi2', 'hi3']


In [55]:
# Scratch check of np.random.shuffle on a list nested inside another list.
a = ['hi1', 'hi2', 'hi3']

# Both entries of b refer to the same list object `a`, so shuffling b[0]
# in place also reorders b[1] and a.
b = [a, a]
print(b)
np.random.shuffle(b[0])
print(b[0][:2])

[['hi1', 'hi2', 'hi3'], ['hi1', 'hi2', 'hi3']]
['hi1', 'hi3']


In [54]:
# Scratch check: a slice stop far beyond the end of the list is clamped to
# the list length rather than raising an error.
c = list('abcd')
e = c[2:45646]
print(e)

['c', 'd']


In [72]:
# Scratch check: flatten a list of lists of unequal length into one list,
# keeping the original element order.
r = [[1, 2, 3], [1, 2], [1, 4, 5, 6, 7]]
flattened = []
for sublist in r:
    flattened.extend(sublist)
r = flattened
print(r)

[1, 2, 3, 1, 2, 1, 4, 5, 6, 7]
