In order to speed up processing time when running classification algorithms, it is often useful to use only the "best" genes.  Various algorithms are available for choosing genes; here we use chi-squared (Chi2) SelectKBest, where K is the number of genes you wish to use for testing stability.  More genes is usually better, but again, in order to speed up processing time, we limit the number of genes used.  This program allows you to set a minimum and maximum number of genes and an interval.  It will in turn set up NumPy arrays containing the class labels and the selected number of genes for further processing by FASTR and FASTrand.

In [139]:
import numpy as np
from random import shuffle
from os import path, getcwd
from sklearn.feature_selection import SelectKBest, chi2

In [96]:
# Name of the data series to process; it is used below as the name of the
# sub-directory (under "GSE") that holds classes.txt and exprs.txt.
series = "GSE27562"

In [140]:
# Resolve the input files for the chosen series.  The project root is taken
# to be two directory levels above the current working directory, and the
# data for a series lives in <root>/GSE/<series>/.
notebook_dir = getcwd()
main_dir = path.dirname(path.dirname(notebook_dir))
series_dir = path.join(main_dir, "GSE", series)
classes_path, exprs_path = (
    path.join(series_dir, file_name)
    for file_name in ("classes.txt", "exprs.txt")
)

In [113]:
# Load the tab-separated expression table.  unpack=True transposes what is
# read from disk, so the rows of the file become the columns of `exprs`.
exprs = np.loadtxt(exprs_path, delimiter="\t", unpack=True)
# Load the class labels as text.  The builtin `str` is used instead of the
# `np.str` alias, which was deprecated in NumPy 1.20 and removed in 1.24
# (where `np.str` raises AttributeError).
classes = np.loadtxt(classes_path, dtype=str, delimiter="\t")

In [117]:
def chi2_fs(genes, classes, num):
    """Return the column indices of the `num` best features by chi-squared score.

    Parameters
    ----------
    genes : array-like
        Feature matrix handed to ``SelectKBest.fit``; one column per
        candidate feature.  Note that scikit-learn's ``chi2`` scorer
        expects non-negative feature values.
    classes : array-like
        Target labels, one per row of `genes`.
    num : int
        Number of features to keep (the ``k`` of ``SelectKBest``).

    Returns
    -------
    numpy.ndarray
        Integer indices of the selected columns of `genes`.
    """
    # `k` must be passed by keyword: recent scikit-learn releases made it
    # keyword-only, so the positional form SelectKBest(chi2, num) fails.
    selector = SelectKBest(score_func=chi2, k=num)
    selector.fit(genes, classes)
    return selector.get_support(indices=True)

In [None]:
def rand_fs(genes, num):
    """Return the indices of `num` randomly chosen columns of `genes`.

    Random counterpart of ``chi2_fs``: instead of scoring the features,
    `num` distinct column indices are drawn uniformly at random.

    Parameters
    ----------
    genes : array-like
        Two-dimensional feature matrix; only its number of columns is used.
    num : int
        Number of column indices to draw.

    Returns
    -------
    numpy.ndarray
        `num` distinct column indices in ascending order.

    Raises
    ------
    ValueError
        If `num` is negative or larger than the number of columns.
    """
    n_genes = np.shape(genes)[1]
    if not 0 <= num <= n_genes:
        raise ValueError(
            "num must be between 0 and the number of columns "
            "(%d), got %r" % (n_genes, num)
        )
    candidates = np.arange(n_genes)
    # Only `shuffle` is imported from `random` in this file, so it is called
    # directly; `random.shuffle` would raise NameError.
    shuffle(candidates)
    # Sort so the result has the same ascending layout as the index arrays
    # printed for chi2_fs.
    return np.sort(candidates[:num])

In [118]:
# Feature-set sizes to try: from 50 up to and including 100 in steps of 25
# (the stop value 101 is exclusive), i.e. 50, 75 and 100.
fs_sizes = np.arange(start=50, stop=101, step=25)

In [125]:
# Run the chi-squared selection once for every feature-set size and print the
# chosen column indices.  `indices` is rebound on each pass, so after the
# loop it holds only the selection for the last size in `fs_sizes`.
for fs in fs_sizes:
    indices = chi2_fs(exprs, classes, fs)
    print(indices)

[ 129  273  344  828 1172 1519 1782 1961 2077 2080 2084 2120 2301 2405
 2575 2760 2852 3091 3484 3681 3892 3895 3943 3973 4195 4277 4283 4618
 4677 4954 5218 5229 5370 5768 6140 6213 6495 6611 6930 7141 7187 7907
 8494 8501 8581 9262 9326 9367 9661 9678]
[ 129  273  299  344  828  900  934 1161 1172 1372 1474 1519 1725 1782
 1961 1992 2077 2080 2081 2084 2120 2251 2279 2301 2405 2575 2760 2852
 2866 3091 3311 3484 3681 3881 3892 3895 3943 3973 4049 4069 4195 4277
 4283 4546 4618 4677 4954 5218 5229 5370 5390 5408 5768 6140 6213 6495
 6611 6826 6930 7141 7155 7187 7907 8494 8501 8537 8581 9262 9326 9353
 9367 9557 9661 9678 9845]
[ 129  157  273  299  344  570  761  828  833  900  934  947 1161 1172
 1372 1474 1519 1725 1782 1961 1992 2077 2080 2081 2084 2120 2251 2279
 2301 2405 2575 2760 2852 2866 2876 3091 3125 3137 3141 3184 3311 3393
 3484 3681 3757 3812 3881 3892 3895 3943 3973 4049 4069 4195 4277 4283
 4546 4576 4618 4624 4677 4954 5218 5229 5370 5390 5408 5429 5664 5768
 5933 61

In [127]:
# Keep every row of `exprs` but only the columns listed in `indices`
# (at this point, the selection left over from the last loop iteration).
selected = exprs[:, indices]

In [128]:
# Display the dimensions of the reduced matrix.
selected.shape

(125, 100)

In [129]:
# Display the contents of the reduced matrix.
selected

array([[4.31152269, 4.4071346 , 5.77272197, ..., 5.83642061, 6.17221809,
        6.24050669],
       [4.7492688 , 3.5923242 , 7.04037055, ..., 5.82981242, 7.350479  ,
        7.94567565],
       [4.43324923, 4.26058516, 5.25456661, ..., 6.98871558, 6.12976211,
        5.41058594],
       ...,
       [4.34395247, 5.26436647, 8.34279915, ..., 4.94476902, 6.67167526,
        6.42701463],
       [4.16553918, 6.06950417, 6.54962712, ..., 4.71671069, 6.30876414,
        6.03027806],
       [3.80756212, 4.93443121, 7.427341  , ..., 4.82043323, 5.45108573,
        5.79667554]])

In [137]:
    # Select the 20 best columns out of the already reduced matrix.  The
    # returned indices are positions within `selected`, not within `exprs`.
    # NOTE(review): this rebinds `indices`, so the earlier index array needed
    # to map these positions back to `exprs` columns is no longer available
    # under that name.
    indices = chi2_fs(selected, classes, 20)
    print(indices)

[ 4 19 22 32 35 47 48 53 61 62 63 69 71 73 77 78 89 90 93 97]


In [138]:
# Select the 20 best columns directly from the full matrix; these indices
# refer to columns of `exprs`.
indices = chi2_fs(exprs,classes,20)
print(indices)

[ 344 1961 2080 2852 3091 3892 3895 4195 4954 5218 5229 5768 6140 6495
 6930 7141 8581 9262 9367 9678]


In [134]:
# Display the dimensions of the full expression matrix.
exprs.shape

(125, 10094)

In [None]:
# Build the index range 0 .. main_size - 1 and shuffle it in place.
# NOTE(review): `main_size` is not defined in any of the cells shown here;
# confirm it is set before this cell is run.
exp = np.arange(0, main_size, 1)
# Only `shuffle` is imported (`from random import shuffle`); the bare name
# `random` is not bound, so `random.shuffle(exp)` would raise NameError.
shuffle(exp)