# FaST - Feature (a) Selection Tool

## About: 

The "a" is there because I wanted the name to read FaST, even if the tool itself is slow.

Author: Terek Arce

## TODO:
* Add other feature selection (fs) techniques

In [1]:
# Connection settings handed to psycopg2's connect() when the database is opened.
Database = 'predict_kit2'  # name of the database to open
User = 'terek'             # login role
Password = ''              # left empty
Host = 'localhost'         # database server address
Port = '5432'              # server port, kept as a string

Enter the path where files can be stored and located for future reference.

In [2]:
# Directory intended to hold the generated .npy files.
# NOTE(review): none of the visible cells read this variable; they write to the
# relative "FaST_files/" directory instead -- confirm which location is intended.
Path = (
    "/Users/terek/Dropbox/Mac/FutureConfStability/"
    "iNotebook/FaST_files"
)

Set the following variables to True if you'd like the corresponding feature selection algorithm to be used, False otherwise.

In [3]:
# Switch for chi-squared feature selection: chi2_fs is only called when this is True.
Chi2 = True

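Set the list to the format `[series, pattern]`, where `series` is the series accession and `pattern` is a SQL `SIMILAR TO` pattern (`%` wildcards, `|` alternation) matching the source names of the samples that make up the classes.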

In [4]:
# One [series accession, class pattern] pair per data set.  The pattern is placed
# in the SIMILAR TO clause of the sample query, so it is written with SQL
# wildcards ("%") and "|" alternation.
get_data = [
    ["GSE19804", "%normal%|%tumor%"],
    ["GSE39582", "%C1|%C2|%C3|%C4|%C5|%C6"],
    ["GSE27562", "%normal mammogram|%breast cancer, confirmed by diagnostic biopsy|%benign%"],
]

## Main Program Start:

No user input is needed beyond this point.  When executing the code, responses from the DB and program will be displayed below each code piece.

The packages used by our program are found below:

In [5]:
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from os.path import exists
from psycopg2 import connect

Opens a connection to the database:

In [6]:
# Open the database connection that the following cells query through.
conn = connect(
    database=Database,
    user=User,
    password=Password,
    host=Host,
    port=Port,
)
print("DB Response: Opened connection to database successfully :)")

DB Response: Opened connection to database successfully :)


Given the gene values (n_samples, n_features), the class label of each sample (n_samples), and the number of features to select (k), this function returns the indices of the k features with the highest chi2 scores.

In [7]:
def chi2_fs ( genes, classes, num ):
    """Return the column indices of the `num` features with the highest chi2 scores.

    Parameters
    ----------
    genes : array-like, shape (n_samples, n_features)
        Feature values, one row per sample.  scikit-learn's chi2 scorer
        requires these to be non-negative.
    classes : array-like, shape (n_samples,)
        Class label of each sample.
    num : int
        Number of features to keep (the `k` of SelectKBest).

    Returns
    -------
    numpy.ndarray
        Integer indices of the selected feature columns.
    """
    # `k` must be passed by keyword: it is keyword-only in current scikit-learn
    # releases, and the keyword form is also accepted by older ones.
    selector = SelectKBest( score_func = chi2, k = num )
    selector.fit( genes, classes )
    return selector.get_support( indices = True )

Gets the genes and classes from the database.  Genes are of size [n_samples, n_features] and classes are of size [n_samples].

In [8]:
# Samples of one series whose source name matches the class pattern, ordered by
# source name.  The series accession and the pattern are bound as query
# parameters (%s) instead of being spliced into the SQL text.
get_samples = ("SELECT sample.source_name, series_sample.sample_geo_accession, series_sample.series_geo_accession, series.platform_id "
               "FROM sample "
               "JOIN series_sample ON sample.geo_accession = series_sample.sample_geo_accession "
               "JOIN series ON series.geo_accession = series_sample.series_geo_accession "
               "WHERE series_sample.series_geo_accession = %s AND sample.source_name SIMILAR TO %s "
               "ORDER BY sample.source_name ASC;")

# Per-sample expression values: one row per gb_acc (maximum over duplicates),
# ordered by gb_acc.  {0} = sample accession (a column of the matrix table),
# {1} = series accession, {2} = platform id (both table-name prefixes).
# These are identifiers, which cannot be bound as query parameters, so they are
# still formatted in; they come from the get_samples result, not from user input.
get_genes = ("SELECT gb_acc, MAX({0}) AS {0} "
             "FROM {2}_platform "
             "JOIN {1}_matrix ON {2}_platform.id = {1}_matrix.id_ref "
             "WHERE gb_acc != '' "
             "GROUP BY gb_acc "
             "ORDER BY gb_acc")

for entry in get_data:
    series, class_pattern = entry[0], entry[1]
    values = []    # one list of gene values per sample
    classes = []   # source name (class label) of each sample, same order as values

    # A single transaction and a single cursor per series; the sample rows are
    # fetched in full before the cursor is reused for the per-sample queries.
    with conn:
        with conn.cursor() as curs:
            curs.execute( get_samples, ( series, class_pattern ) )
            samples = curs.fetchall()

            for source_name, sample_acc, series_acc, platform_id in samples:
                classes.append( source_name )
                curs.execute( get_genes.format( sample_acc, series_acc, platform_id ) )
                values.append( [ row[1] for row in curs.fetchall() ] )

    save_classes = np.array( classes )
    save_genes = np.array( values, dtype = float )

    classes_file = ( "FaST_files/%s_classes.npy" % series )
    genes_file = ( "FaST_files/%s_genes.npy" % series )

    np.save( genes_file, save_genes )
    np.save( classes_file, save_classes )

Saves the feature selected indices to a file, size [n_selected_features].

In [9]:
# Numbers of features to keep: 50, 55, ..., 300.
feature_selection_sizes = np.arange( 50, 301, 5 )

for entry in get_data:
    series = entry[0]
    classes_file = "FaST_files/%s_classes.npy" % series
    genes_file = "FaST_files/%s_genes.npy" % series

    # Nothing to select from unless both exported arrays are on disk.
    if not ( exists( genes_file ) and exists( classes_file ) ):
        continue

    genes = np.load( genes_file )
    classes = np.load( classes_file )

    for fs_size in feature_selection_sizes:
        if Chi2:
            selected = chi2_fs( genes, classes, fs_size )
            target = "FaST_files/%s_%03d_fs_indices.npy" % ( series, fs_size )
            np.save( target, selected )
        # TODO: add other fs methods here

Saves the gene values of the features selected, size [n_samples, n_selected_features]

In [10]:
# For every series and every selection size, keep only the selected feature
# columns of the gene matrix and save the result.
# genes is [n_samples, n_features]; fs_genes is [n_samples, n_selected_features]
for entry in get_data:
    series = entry[0]
    genes_file = ( "FaST_files/%s_genes.npy" % series )

    if exists( genes_file ):
        genes = np.load( genes_file )
        for fs_size in feature_selection_sizes:
            indices_file = ( "FaST_files/%s_%03d_fs_indices.npy" % ( series, fs_size ) )

            if exists( indices_file ):
                indices = np.load( indices_file )
                # Column selection by integer-array indexing.  The previous nested
                # comprehension reused the name `i`, shadowing the series loop index
                # (and overwriting it under Python 2, which broke the file name below).
                fs_genes = genes[:, indices]
                fs_genes_file = ( "FaST_files/%s_%03d_fs_genes.npy" % ( series, fs_size ) )
                np.save( fs_genes_file, fs_genes )

Closes the connection to the DB.

In [11]:
# Release the database connection and report it.
conn.close()
print("DB Response: Closed connection to database successfully - Goodbye :(")

DB Response: Closed connection to database successfully - Goodbye :(
