To speed up processing when running classification algorithms, it is often useful to select only the "best" genes.  Various gene-selection algorithms are available; here we use Chi2 with Select K Best, where K is the number of genes you wish to use for testing stability.  More genes is usually better, but to keep processing time down we limit the number of genes used.  This program allows you to set a minimum and maximum number of genes and an interval.  It then sets up numpy arrays containing the classes and the selected genes for further processing by FASTR and FASTrand.

### Libraries
These libraries must be pre-installed.  Using a virtual environment is recommended.

In [124]:
import numpy as np
from enum import Enum
from random import shuffle
from os import path, getcwd, makedirs
from sklearn.feature_selection import SelectKBest, chi2
from collections import Counter
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import lsqr
from math import sqrt
from random import uniform, choice

## Methods and Classes

In [6]:
class AlterStrategy(Enum):
    """Named strategies, each mapped to a distinct integer value.

    NOTE(review): no visible cell reads these members; presumably they select
    how expression data is altered for the stability tests -- confirm against
    the code that consumes this enum.
    """
    GREEDY = 0
    CHI2 = 1
    RAND = 2
    PERCENT = 3

### NBC.py

In [7]:
class Model:
    """Per-class model expressing every gene through its correlated neighbors.

    A correlation graph is built from one class's training samples: two genes
    are neighbors when the absolute value of their correlation coefficient
    exceeds ``eps``.  For every gene a least-squares fit then expresses that
    gene as a linear combination of its neighbors plus an intercept.
    """

    def __init__(self, samples, eps, class_label):
        """Build the correlation graph and fit one linear equation per gene.

        Args:
            samples: training samples of size [samples, genes].
            eps: epsilon value for correlation cutoff.
            class_label: classification label.
        """

        self.class_label = class_label
        self.samples = np.array(samples)
        self.eps = eps

        # columns are variables, rows are samples.  corrcoef collapses to a
        # scalar when there is a single gene, so force a 2-D matrix.
        self.correlation = np.atleast_2d(
            np.corrcoef(self.samples, y=None, rowvar=False))

        # note that the mask is actually the graph
        self.mask = (np.absolute(self.correlation) > self.eps)

        # Constant column appended to every system so the fit has an
        # intercept; it is the same for every gene, so build it once.
        intercept_column = np.ones((len(self.samples), 1))

        # the coefficients associated with the system of equation: Ax=b,
        # where A is an equation list created from the neighbors of gene
        # n and b is the value of gene n.
        self.geneFuncMasks = []  # these are the coefficients in Ax=b
        for gene in range(len(self.correlation)):
            # A gene never explains itself, even though corr(gene, gene) = 1.
            is_neighbor = self.mask[gene].copy()
            is_neighbor[gene] = False

            # Non-neighbor columns are zeroed rather than dropped so every
            # coefficient vector has the same length (genes + 1).
            neighbor_values = np.where(is_neighbor, self.samples, 0.0)
            A = np.hstack((neighbor_values, intercept_column))
            b = self.samples[:, gene]

            coeff = self.solver(A, b, 2)
            self.geneFuncMasks.append(coeff.tolist())

        self.coefficients = np.array(self.geneFuncMasks)

    def solver(self, neighbors, sols, choice):
        """Solve ``neighbors @ x = sols`` in the least-squares sense.

        Args:
            neighbors: the matrix A, one row per sample.
            sols: the right-hand side b, one value per sample.
            choice: unused; kept so existing callers keep working.

        Returns:
            The solution vector x found by ``scipy.sparse.linalg.lsqr``.
        """
        A = np.array(neighbors)
        b = np.array(sols)
        x = lsqr(A, b)[0]
        return x

    def expression(self, sample):
        """Given a sample, return the hypothetical expression.

        Args:
            sample: the sample whose hypothetical expression we wish to
            calculate; one value per gene.
        Returns:
            expr: A numpy array with the expression values of size number of
            genes.
        """
        coefficients = np.asarray(self.coefficients)
        sample = np.asarray(sample)

        # Each coefficient row is [one weight per gene..., intercept].  Every
        # gene weight must take part in the sum; the previous loop stopped one
        # short and silently ignored the contribution of the last gene.
        weights = coefficients[:, :-1]
        intercepts = coefficients[:, -1]
        return weights.dot(sample) + intercepts

    def label (self):
        """Return the classification label of this model."""
        return self.class_label


class NetworkBasedClassifier:
    """Classifier that keeps one ``Model`` per class.

    A sample is assigned the label of the model whose hypothetical expression
    is closest to the sample, measured by root-mean-squared error.
    """

    def __init__(self, epsilon):
        """Initialize a NBF classifier.

        Args:
            epsilon: correlation cutoff passed to every ``Model`` built by
                ``fit``.
        """
        self.models = []
        self.epsilon = epsilon

    def fit(self, X, y):
        """Fit the data with classes to create class models.

        Fits the data [num_samples, num_genes] with classifications
        [num_samples] to the model.  Creates as many models as classes.
        Calling ``fit`` again replaces the models of any earlier fit.

        Args:
            X: the data we wish to train the classifier on
            y: the classifications associated with the samples
        """
        y = np.array(y)
        X = np.array(X)

        # Build into a fresh list: appending to self.models would keep the
        # models of a previous fit and let them compete in predict().
        models = []
        # Counter preserves first-seen label order; that order decides which
        # model wins when two RMSEs tie in predict().
        for key in Counter(y):
            models.append(Model(X[y == key], self.epsilon, key))
        self.models = models

    def predict(self, X):
        """Predict the classification of each sample.

        Must fit the classifier before this method is called.

        Args:
            X: the samples we wish to predict classification for.

        Returns:
            classifications: numpy array with the predicted label of every
            sample, in input order.

        Raises:
            ValueError: if no models are available because ``fit`` has not
                been called.
        """
        if not self.models:
            raise ValueError(
                "predict() called before fit(): no class models available")

        classifications = []
        for sample in X:
            rmses = [
                sqrt(mean_squared_error(sample, model.expression(sample)))
                for model in self.models
            ]
            # index(min(...)) selects the first model on ties.
            best = rmses.index(min(rmses))
            classifications.append(self.models[best].label())
        return np.array(classifications)

### Common.py

### Alter.py

In [147]:
def alter(exprs, percent):
    """Return ``exprs`` with every value shifted by ``percent`` of itself.

    Each value ``v`` is replaced by either ``v - v * percent`` or
    ``v + v * percent``; one of the two is picked at random for every value.

    Args:
        exprs: iterable of rows, each an iterable of numeric values.
        percent: fraction of each value used as the size of the shift.

    Returns:
        A numpy array holding the shifted rows.
    """
    def _perturb(value):
        # Offer the lowered and raised variants and let choice() pick one.
        shift = value * percent
        candidates = [value - shift, value + shift]
        return choice(candidates)

    return np.array([[_perturb(value) for value in row] for row in exprs])

## START MAIN PROGRAM

###### Enter the series and feature_size to use
Must be all upper case. e.g. `"GSE27562"`

In [140]:
# Run configuration: the series identifier and the number of genes to keep.
series = "GSE27562"
feature_size = 10
# NOTE(review): `fsStrategy` is neither defined nor imported in the visible
# notebook (only `AlterStrategy` is), so this line raises NameError unless it
# is provided elsewhere -- confirm the intended definition.
fs_strategy = fsStrategy.KBEST

### Get/Create Directories
Assumes this notebook is in `GenClass-Stability/main/notebooks/`

In [141]:
# Derive the input and output directories from the working directory, which
# is taken to be two levels below the project root (main_dir).
notebook_dir = getcwd()
main_dir = path.dirname(path.dirname(notebook_dir))
load_path = path.join(main_dir, "GSE", series)
gsa_path = path.join(main_dir, "GSA", series, str(feature_size))
# exist_ok=True creates the output directory when missing and does nothing
# when it already exists, without a separate check-then-create step.
makedirs(gsa_path, exist_ok=True)

### Import Classes and Expressions
Load original data. Assumes SIT and the custom GSE script have been run to import data.

In [142]:
# Load the class labels as strings and the expression matrix as floats from
# the tab-delimited files in load_path.  The builtin `str` is used as dtype
# because the `np.str` alias was removed from NumPy (AttributeError on 1.24+).
classes = np.loadtxt(path.join(load_path, "classes.txt"), dtype=str, delimiter="\t")
exprs = np.loadtxt(path.join(load_path, "exprs.txt"), delimiter="\t")

Select K best genes for analysis.

In [143]:
# Score every gene with chi2 against the class labels and keep the
# feature_size best ones.  `k` is passed by keyword because current
# scikit-learn releases reject it as a positional argument.
b = SelectKBest(chi2, k=feature_size).fit(exprs, classes)
# Column indices of the selected genes.
a = b.get_support(indices=True)
# Reduce the expression matrix to the selected gene columns.
exprs = exprs[:, a]

Save the selected expression data for potential later use.

In [144]:
# Persist the reduced expression matrix and the class labels under gsa_path.
np.save(path.join(gsa_path,"exprs.npy"), exprs)
# NOTE(review): the file name is spelled "classses.npy" (three s) -- confirm
# what any downstream reader expects before renaming it.
np.save(path.join(gsa_path,"classses.npy"), classes)

## Stability Test I

In [145]:
# Display the current expression matrix.
exprs

array([[ 9.98205891,  8.80499698,  6.06705536, ...,  4.04860832,
         6.00040582,  6.4677598 ],
       [ 6.72960679,  8.07265421,  5.59216432, ...,  4.15901196,
         4.26277999,  4.51382143],
       [10.05329346,  7.24772335,  5.82537025, ...,  3.89825586,
         3.42084859,  6.05324247],
       ...,
       [10.36843308,  8.24140776,  4.75391267, ...,  6.20912277,
         3.13763694,  8.21837823],
       [11.05542227,  8.69241259,  4.89380999, ...,  7.26503729,
         4.0318941 ,  8.70174055],
       [10.47816641, 10.40235359,  4.85256164, ...,  6.62386944,
         6.99643153,  7.62414493]])

In [150]:
# Preview alter() on the expression matrix with percent=0.5; the result is
# only displayed, not stored.
alter(exprs,0.5)

array([[14.97308836,  4.40249849,  9.10058304, ...,  6.07291248,
         9.00060873,  3.2338799 ],
       [ 3.36480339, 12.10898131,  8.38824648, ...,  2.07950598,
         6.39416998,  6.77073214],
       [15.07994019, 10.87158503,  8.73805537, ...,  5.84738379,
         1.7104243 ,  9.07986371],
       ...,
       [ 5.18421654,  4.12070388,  7.13086901, ...,  3.10456138,
         4.70645542, 12.32756734],
       [ 5.52771113, 13.03861888,  2.446905  , ..., 10.89755594,
         2.01594705,  4.35087027],
       [ 5.2390832 , 15.60353039,  2.42628082, ...,  3.31193472,
        10.4946473 , 11.4362174 ]])

In [127]:
# Keep only the gene columns listed in `indices`.
# NOTE(review): `indices` is only assigned in cells that appear later in this
# notebook (the chi2_fs calls), so this cell depends on out-of-order
# execution -- confirm the intended order.
selected = exprs[:, indices]

In [128]:
# Display the dimensions of the selected sub-matrix.
selected.shape

(125, 100)

In [129]:
# Display the selected sub-matrix.
selected

array([[4.31152269, 4.4071346 , 5.77272197, ..., 5.83642061, 6.17221809,
        6.24050669],
       [4.7492688 , 3.5923242 , 7.04037055, ..., 5.82981242, 7.350479  ,
        7.94567565],
       [4.43324923, 4.26058516, 5.25456661, ..., 6.98871558, 6.12976211,
        5.41058594],
       ...,
       [4.34395247, 5.26436647, 8.34279915, ..., 4.94476902, 6.67167526,
        6.42701463],
       [4.16553918, 6.06950417, 6.54962712, ..., 4.71671069, 6.30876414,
        6.03027806],
       [3.80756212, 4.93443121, 7.427341  , ..., 4.82043323, 5.45108573,
        5.79667554]])

In [137]:
    # Call chi2_fs on the already-reduced matrix with an argument of 20 and
    # print what it returns.
    # NOTE(review): `chi2_fs` is not defined in the visible notebook;
    # presumably it returns the column indices of the chosen genes -- confirm.
    indices = chi2_fs(selected, classes, 20)
    print(indices)

[ 4 19 22 32 35 47 48 53 61 62 63 69 71 73 77 78 89 90 93 97]


In [138]:
# Call chi2_fs on the full expression matrix with an argument of 20 and print
# what it returns.
# NOTE(review): `chi2_fs` is not defined in the visible notebook; presumably
# it returns the column indices of the chosen genes -- confirm.
indices = chi2_fs(exprs,classes,20)
print(indices)

[ 344 1961 2080 2852 3091 3892 3895 4195 4954 5218 5229 5768 6140 6495
 6930 7141 8581 9262 9367 9678]


In [134]:
# Display the dimensions of the expression matrix.
exprs.shape

(125, 10094)

In [None]:
# Build the integer sequence 0 .. main_size-1 and shuffle it in place.
# NOTE(review): `main_size` is not assigned anywhere in the visible notebook --
# confirm where it is meant to come from.
exp = np.arange(0, main_size, 1)
# The notebook imports `shuffle` directly (`from random import shuffle`) and
# never binds the module name `random`, so `random.shuffle` raised NameError.
shuffle(exp)