## Assignment 3 Breast Cancer Stage Classification

Breast cancer (BRCA) is the most common cancer in women. One important task to improve the survival rate of BRCA patients is identifying the cancer stage and applying different treatment strategies. We can train a model to classify cancer stages using RNA-seq of patient samples. 

Tasks:
1.	Prepare a dataset using TCGA-BRCA RNA-Seq data as features and cancer stages as labels. (Hint: you can find the processed RNA-Seq data and patient phenotype data from UCSC Xena)
2.	Applying data processing methods. (Normalization, Training-Test split, etc.)
3.	Applying three different classification estimators and optimizing the parameters through cross-validation.
4.	Comparing three estimators by evaluating the performance on the test dataset.
5.	Applying feature selection to improve performance.


### Task 1

In [29]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

In [30]:
class Classifier(object):
    def __init__(self, reads, label):
        self.reads = reads
        self.label = label
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        self.svm = SVC() # will be updated after the grid search
        self.rf = RandomForestClassifier()
        self.knn = KNeighborsClassifier()

    def preprossessing(self):
        '''This function calls the preprocess_label and preprocess_reads functions and merge two dataframes
        '''
        self.preprocess_reads()
        self.preprocess_label()
        data = pd.merge(self.reads, self.label, on='sample_ID', how='inner') # merge two dataframes
        newname = data.columns[-1:1].str.split('.').str[0]
        data.rename(columns=dict(zip(data.columns[1:-1], newname)), inplace=True)
        print('The shape of dataframe after merging is: ', data.shape)
        self.data = data
        print('-'*14, 'Finish preprocessing', '-'*14, '\n')
    
    def preprocess_label(self):
        '''This function merges some substages into one stage, drop the samples with no diagnosis, 
        and drop the samples with "not reported" diagnosis
        '''
        print('Processing labels')
        label = self.label
        label.loc[label['tumor_stage.diagnoses'] == 'stage ia', 'tumor_stage.diagnoses'] = 'stage i'
        label.loc[label['tumor_stage.diagnoses'] == 'stage ib', 'tumor_stage.diagnoses'] = 'stage i'
        label.loc[label['tumor_stage.diagnoses'] == 'stage iia', 'tumor_stage.diagnoses'] = 'stage ii'
        label.loc[label['tumor_stage.diagnoses'] == 'stage iib', 'tumor_stage.diagnoses'] = 'stage ii'
        label.loc[label['tumor_stage.diagnoses'] == 'stage iiia', 'tumor_stage.diagnoses'] = 'stage iii'
        label.loc[label['tumor_stage.diagnoses'] == 'stage iiib', 'tumor_stage.diagnoses'] = 'stage iii'
        label.loc[label['tumor_stage.diagnoses'] == 'stage iiic', 'tumor_stage.diagnoses'] = 'stage iii'
        label = label.loc[:, ['submitter_id.samples', 'tumor_stage.diagnoses']].rename(columns={'submitter_id.samples':'sample_ID', 'tumor_stage.diagnoses': 'diagnosis'})
        label.dropna(inplace=True) # drop samples with no diagnosis
        label = label.query('diagnosis != "not reported"') # exclude samples with 'not reported' diagnoses
        self.label = label
        print('After processing, the shape of the label dataframe is:', label.shape)
        print('Summary of labels:\n', label.diagnosis.value_counts())
        print('Finish processing labels')
        return label
    
    def preprocess_reads(self):
        '''This function transposes the reads dataframe, rename the first column into sample_ID to prepare for merging, duplicates and NA are also dropped'''
        print('Processing reads')
        print('This procedure may take 30 seconds to 1 minute.')
        reads = self.reads.set_index('Ensembl_ID').T
        reads.reset_index(inplace=True)
        reads.dropna(inplace=True)
        reads.drop_duplicates(inplace=True)
        reads.rename(columns={'index':'sample_ID'}, inplace=True)
        print('After processing, the shape of the reads dataframe is:', reads.shape)
        print('Finish processing reads')
        self.reads = reads

    def train_test_split(self):
        '''First convert the label to binary numerical values, then split the data into training and testing sets;
        A label dictionary is created to map the numerical values back to the original labels
        '''
        print('Splitting data')
        y = self.data['diagnosis']
        X = self.data.drop(['sample_ID', 'diagnosis'], axis=1)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.3)
        print('The shape of training set is: ', self.X_train.shape)
        print('The shape of testing set is: ', self.X_test.shape)
        print('Finish splitting data')

    def train_test(self):
        '''This function calls three models (SVM, RF, KNN) to train the data'''
        print('SVM, RF, and KNN will be used to train the data.')
        self.train_svm()
        self.train_rf()
        self.train_knn()
        print('\n')
        
    def train_svm(self):
        '''This function trains the SVM model'''
        print('Training SVM...')
        self.svm.fit(self.X_train, self.y_train)
        ypred_svm = self.svm.predict(self.X_test)
        print('The accuracy of SVM is: ', accuracy_score(self.y_test, ypred_svm))
        print('The confusion matrix of SVM is: \n', confusion_matrix(self.y_test, ypred_svm))
        print('\n')
    
    def train_rf(self):
        '''This function trains the RF model'''
        print('Training RF...')
        self.rf.fit(self.X_train, self.y_train)
        ypred_rf = self.rf.predict(self.X_test)
        print('The accuracy of RF is: ', accuracy_score(self.y_test, ypred_rf))
        print('The confusion matrix of RF is: \n', confusion_matrix(self.y_test, ypred_rf))
        print('\n')

    def train_knn(self):
        '''This function trains the KNN model'''
        print('Training KNN...')
        self.knn.fit(self.X_train, self.y_train)
        ypred_knn = self.knn.predict(self.X_test)
        print('The accuracy of KNN is: ', accuracy_score(self.y_test, ypred_knn))
        print('The confusion matrix of KNN is: \n', confusion_matrix(self.y_test, ypred_knn))
        print('\n')

    def hyperparameter_tuning(self):
        '''GridSearchCV is used to find the best parameters for the selected model, this process may take a long while to finish.'''
        print('-'*10, 'Hyperparameter tuning', '-'*10)
        svm_params = {
            'kernel':['linear', 'sigmoid', 'poly', 'rbf'],
            'gamma':['auto', 'scale'],
        }
        grid1 = GridSearchCV(SVC(), svm_params, cv=3, verbose=3)
        grid1.fit(self.X_train, self.y_train)
        print('The best parameters for SVM are:', grid1.best_params_)
        self.svm = grid1.best_estimator_ # save the best svm model
        print('SVM has been updated.')
        print('Evaluate the performance on test set...')
        ypred1 = grid1.predict(self.X_test)
        print('Accuracy:', accuracy_score(self.y_test, ypred1))
        print('Confusion matrix: \n', confusion_matrix(self.y_test, ypred1))

        rf_params = {
            'bootstrap': [True, False],
            'max_depth': [5, 50, 100, None],
            'max_features': [20, 50, 80],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [10, 100, 200]
        }
        grid2 = GridSearchCV(RandomForestClassifier(), rf_params, cv=3, verbose=3, n_jobs=-1)
        grid2.fit(self.X_train, self.y_train)
        print('The best parameters for RF are:', grid2.best_params_)
        self.rf = grid2.best_estimator_ # save the best rf model
        print('RF has been updated.')
        print('Evaluate the performance on test set')
        ypred2 = grid2.predict(self.X_test)
        print('Accuracy:', accuracy_score(self.y_test, ypred2))
        print('Confusion matrix: \n', confusion_matrix(self.y_test, ypred2))

        knn_params = {'n_neighbors':[5, 10, 20, 50, 100]}
        grid3 = GridSearchCV(KNeighborsClassifier(), knn_params, cv=3, verbose=3, n_jobs=-1)
        grid3.fit(self.X_train, self.y_train)
        print('The best parameters for KNN are:', grid3.best_params_)
        self.knn = grid3.best_estimator_ # save the best knn model
        print('KNN has been updated.')
        print('Evaluate the performance on test set')
        ypred3 = grid3.predict(self.X_test)
        print('Accuracy:', accuracy_score(self.y_test, ypred3))
        print('Confusion matrix: \n', confusion_matrix(self.y_test, ypred3))

    def feature_selection(self):
        '''Six methods are called by this method. The first three are three ways to select features, the last three are three ways to evaluate the importance of features.'''
        self.feature_selection_variance()
        self.feature_selection_chi2()
        self.feature_selection_prior()
        self.training1()
        self.training2()
        self.training3()

    def feature_selection_variance(self):
        '''This function uses variance threshold (0.5) to select features'''
        selector = VarianceThreshold(threshold=0.5)
        selector.fit(self.X_train)
        self.X_train1 = selector.transform(self.X_train)
        self.X_test1 = selector.transform(self.X_test)
        print(f'Feature selection: [{self.X_test1.shape[1]}] features with variance > 0.5 are selected.')
        
    def feature_selection_chi2(self):
        '''This function selects genes based on chi2 test'''
        selector = SelectKBest(chi2, k=566)
        selector.fit(self.X_train, self.y_train)
        self.X_train2 = selector.transform(self.X_train)
        self.X_test2 = selector.transform(self.X_test)
        print(f'Feature selection: [{self.X_test2.shape[1]}] features with p-value < 0.01 are selected.')

    def feature_selection_prior(self, genes):
        '''This function selects genes that are mentioned in the literature'''
        ID = genes['ID']
        self.X_train3 = self.X_train.loc[:, ID]
        self.X_test3 = self.X_test.loc[:, ID]
        print(f'Feature selection: [{self.X_test3.shape[1]}] features are selected based on literature.')

    def training1(self):
        '''This function trains the models using the first set of selected features'''
        print('Performance of different models using the first set of selected features:')
        self.svm.fit(self.X_train1, self.y_train)
        ypred1 = self.svm.predict(self.X_test1)
        print('The accuracy of SVM is: ', accuracy_score(self.y_test, ypred1))
        print('The confusion matrix of SVM is: \n', confusion_matrix(self.y_test, ypred1))
        self.rf.fit(self.X_train1, self.y_train)
        ypred2 = self.rf.predict(self.X_test1)
        print('The accuracy of RF is: ', accuracy_score(self.y_test, ypred2))
        print('The confusion matrix of RF is: \n', confusion_matrix(self.y_test, ypred2))
        self.knn.fit(self.X_train1, self.y_train)
        ypred3 = self.knn.predict(self.X_test1)
        print('The accuracy of KNN is: ', accuracy_score(self.y_test, ypred3))
        print('The confusion matrix of KNN is: \n', confusion_matrix(self.y_test, ypred3))
        print('\n')

    def training2(self):
        '''This function trains the models using the second set of selected features'''
        print('Performance of different models using the second set of selected features:')
        self.svm.fit(self.X_train2, self.y_train)
        ypred1 = self.svm.predict(self.X_test2)
        print('The accuracy of SVM is: ', accuracy_score(self.y_test, ypred1))
        print('The confusion matrix of SVM is: \n', confusion_matrix(self.y_test, ypred1))
        self.rf.fit(self.X_train2, self.y_train)
        ypred2 = self.rf.predict(self.X_test2)
        print('The accuracy of RF is: ', accuracy_score(self.y_test, ypred2))
        print('The confusion matrix of RF is: \n', confusion_matrix(self.y_test, ypred2))
        self.knn.fit(self.X_train2, self.y_train)
        ypred3 = self.knn.predict(self.X_test2)
        print('The accuracy of KNN is: ', accuracy_score(self.y_test, ypred3))
        print('The confusion matrix of KNN is: \n', confusion_matrix(self.y_test, ypred3))
        print('\n')
    
    def training3(self):
        '''This function trains the models using the third set of selected features'''
        print('Performance of different models using the third set of selected features:')
        self.svm.fit(self.X_train3, self.y_train)
        ypred1 = self.svm.predict(self.X_test3)
        print('The accuracy of SVM is: ', accuracy_score(self.y_test, ypred1))
        print('The confusion matrix of SVM is: \n', confusion_matrix(self.y_test, ypred1))
        self.rf.fit(self.X_train3, self.y_train)
        ypred2 = self.rf.predict(self.X_test3)
        print('The accuracy of RF is: ', accuracy_score(self.y_test, ypred2))
        print('The confusion matrix of RF is: \n', confusion_matrix(self.y_test, ypred2))
        self.knn.fit(self.X_train3, self.y_train)
        ypred3 = self.knn.predict(self.X_test3)
        print('The accuracy of KNN is: ', accuracy_score(self.y_test, ypred3))
        print('The confusion matrix of KNN is: \n', confusion_matrix(self.y_test, ypred3))
        print('\n')

In [4]:
if __name__ == '__main__':
    reads = pd.read_csv('TCGA-BRCA.htseq_fpkm.tsv', sep='\t', header=0)
    label = pd.read_csv('TCGA-BRCA.GDC_phenotype.tsv', sep='\t', header=0)
    genes = pd.read_csv('genes.csv', header=0)
    genes = genes.drop(['Unnamed: 2'], axis=1).rename(columns={'Gene Symbol': 'symbol', 'Ensembl Gene ID': 'ID'})

    C = Classifier(reads, label)
    C.preprossessing()
    C.train_test_split()
    C.train_test()
    C.hyperparameter_tuning()
    C.feature_selection()

In [33]:
reads = pd.read_csv('TCGA-BRCA.htseq_fpkm.tsv', sep='\t', header=0)
label = pd.read_csv('TCGA-BRCA.GDC_phenotype.tsv', sep='\t', header=0)
genes = pd.read_csv('genes.txt', sep='\t', header=0)
genes = genes.drop(['Unnamed: 2'], axis=1).rename(columns={'Gene Symbol': 'symbol', 'Ensembl Gene ID': 'ID'})

In [35]:
t = Classifier(reads, label)

In [36]:
t.preprossessing()
# t.train_test_split()
# t.training()

Processing reads
This procedure may take 30 seconds to 1 minute.
After processing, the shape of the reads dataframe is: (1217, 60484)
Finish processing reads
Processing labels
After processing, the shape of the label dataframe is: (1270, 2)
Summary of labels:
 stage ii     731
stage iii    289
stage i      215
stage iv      22
stage x       13
Name: diagnosis, dtype: int64
Finish processing labels
The shape of dataframe after merging is:  (1204, 60485)
-------------- Finish preprocessing -------------- 



In [37]:
t.data

Unnamed: 0,sample_ID,ENSG00000242268.2,ENSG00000270112.3,ENSG00000167578.15,ENSG00000273842.1,ENSG00000078237.5,ENSG00000146083.10,ENSG00000225275.4,ENSG00000158486.12,ENSG00000198242.12,...,ENSG00000186115.11,ENSG00000216352.1,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.17,ENSG00000231119.2,ENSG00000280861.1,ENSG00000123685.7,ENSG00000181518.3,diagnosis
0,TCGA-E9-A1NI-01A,0.091708,0.019573,2.235898,0.0,2.321945,3.620056,0.0,0.337087,7.705589,...,0.073008,0.0,0.000000,0.000000,3.680055,0.285640,0.0,0.599579,0.0,stage ii
1,TCGA-A1-A0SP-01A,0.000000,0.004701,1.863334,0.0,4.226699,3.546117,0.0,0.016016,6.835508,...,0.000000,0.0,0.105328,0.055477,3.969785,0.115149,0.0,1.382192,0.0,stage ii
2,TCGA-BH-A1EU-11A,0.057899,0.016302,1.704753,0.0,1.975755,3.396943,0.0,0.041455,7.125310,...,0.039503,0.0,0.092108,0.000000,3.011921,0.384451,0.0,0.629043,0.0,stage i
3,TCGA-A8-A06X-01A,0.000000,0.000000,1.947481,0.0,2.808757,4.723270,0.0,0.002361,7.259318,...,0.118749,0.0,0.000000,0.000000,4.059347,0.345883,0.0,0.396315,0.0,stage ii
4,TCGA-E2-A14T-01A,0.000000,0.000000,2.734690,0.0,1.964479,3.770091,0.0,0.111647,7.643035,...,0.000000,0.0,0.113546,0.000000,4.249147,0.065679,0.0,0.157504,0.0,stage ii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,TCGA-EW-A1P3-01A,0.000000,0.000000,2.861514,0.0,2.120565,2.992269,0.0,0.037693,7.673599,...,0.066625,0.0,0.153477,0.000000,3.772585,0.149851,0.0,0.551165,0.0,stage ii
1200,TCGA-A7-A13F-11A,0.471630,0.000000,1.626213,0.0,1.605675,3.224406,0.0,0.039177,7.439460,...,0.017897,0.0,0.180659,0.127060,3.133380,0.270187,0.0,0.611319,0.0,stage iii
1201,TCGA-A2-A0T6-01A,0.083825,0.000000,1.692995,0.0,2.055448,3.476192,0.0,0.033737,7.138577,...,0.000000,0.0,0.132664,0.000000,4.186347,0.484561,0.0,0.706170,0.0,stage ii
1202,TCGA-A7-A5ZW-01A,0.000000,0.005858,2.051916,0.0,2.195782,3.369091,0.0,0.031773,7.446467,...,0.101503,0.0,0.130170,0.000000,3.340223,0.127070,0.0,0.683710,0.0,stage ii


In [39]:
ID = genes.loc[:,'ID']
ID

0      ENSG00000019186
1      ENSG00000170893
2      ENSG00000213759
3      ENSG00000177238
4      ENSG00000043591
            ...       
104    ENSG00000102003
105    ENSG00000114279
106    ENSG00000132872
107    ENSG00000129990
108    ENSG00000166922
Name: ID, Length: 109, dtype: object

In [49]:
newname = t.data.columns[1:-1].str.split('.').str[0]
t.data.rename(columns=dict(zip(t.data.columns[1:-1], newname)), inplace=True)
t.data

Unnamed: 0,sample_ID,ENSG00000242268,ENSG00000270112,ENSG00000167578,ENSG00000273842,ENSG00000078237,ENSG00000146083,ENSG00000225275,ENSG00000158486,ENSG00000198242,...,ENSG00000186115,ENSG00000216352,ENSG00000267117,ENSG00000273233,ENSG00000105063,ENSG00000231119,ENSG00000280861,ENSG00000123685,ENSG00000181518,diagnosis
0,TCGA-E9-A1NI-01A,0.091708,0.019573,2.235898,0.0,2.321945,3.620056,0.0,0.337087,7.705589,...,0.073008,0.0,0.000000,0.000000,3.680055,0.285640,0.0,0.599579,0.0,stage ii
1,TCGA-A1-A0SP-01A,0.000000,0.004701,1.863334,0.0,4.226699,3.546117,0.0,0.016016,6.835508,...,0.000000,0.0,0.105328,0.055477,3.969785,0.115149,0.0,1.382192,0.0,stage ii
2,TCGA-BH-A1EU-11A,0.057899,0.016302,1.704753,0.0,1.975755,3.396943,0.0,0.041455,7.125310,...,0.039503,0.0,0.092108,0.000000,3.011921,0.384451,0.0,0.629043,0.0,stage i
3,TCGA-A8-A06X-01A,0.000000,0.000000,1.947481,0.0,2.808757,4.723270,0.0,0.002361,7.259318,...,0.118749,0.0,0.000000,0.000000,4.059347,0.345883,0.0,0.396315,0.0,stage ii
4,TCGA-E2-A14T-01A,0.000000,0.000000,2.734690,0.0,1.964479,3.770091,0.0,0.111647,7.643035,...,0.000000,0.0,0.113546,0.000000,4.249147,0.065679,0.0,0.157504,0.0,stage ii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,TCGA-EW-A1P3-01A,0.000000,0.000000,2.861514,0.0,2.120565,2.992269,0.0,0.037693,7.673599,...,0.066625,0.0,0.153477,0.000000,3.772585,0.149851,0.0,0.551165,0.0,stage ii
1200,TCGA-A7-A13F-11A,0.471630,0.000000,1.626213,0.0,1.605675,3.224406,0.0,0.039177,7.439460,...,0.017897,0.0,0.180659,0.127060,3.133380,0.270187,0.0,0.611319,0.0,stage iii
1201,TCGA-A2-A0T6-01A,0.083825,0.000000,1.692995,0.0,2.055448,3.476192,0.0,0.033737,7.138577,...,0.000000,0.0,0.132664,0.000000,4.186347,0.484561,0.0,0.706170,0.0,stage ii
1202,TCGA-A7-A5ZW-01A,0.000000,0.005858,2.051916,0.0,2.195782,3.369091,0.0,0.031773,7.446467,...,0.101503,0.0,0.130170,0.000000,3.340223,0.127070,0.0,0.683710,0.0,stage ii


In [48]:
t.data.columns[1:-1].str.split('.').str[0]

Index(['ENSG00000242268', 'ENSG00000270112', 'ENSG00000167578',
       'ENSG00000273842', 'ENSG00000078237', 'ENSG00000146083',
       'ENSG00000225275', 'ENSG00000158486', 'ENSG00000198242',
       'ENSG00000259883',
       ...
       'ENSG00000238244', 'ENSG00000186115', 'ENSG00000216352',
       'ENSG00000267117', 'ENSG00000273233', 'ENSG00000105063',
       'ENSG00000231119', 'ENSG00000280861', 'ENSG00000123685',
       'ENSG00000181518'],
      dtype='object', length=60483)

In [40]:
t.data.loc[:,ID]

KeyError: "None of [Index(['ENSG00000019186', 'ENSG00000170893', 'ENSG00000213759',\n       'ENSG00000177238', 'ENSG00000043591', 'ENSG00000196091',\n       'ENSG00000159212', 'ENSG00000107014', 'ENSG00000070193',\n       'ENSG00000106631',\n       ...\n       'ENSG00000164850', 'ENSG00000148053', 'ENSG00000161509',\n       'ENSG00000160716', 'ENSG00000008056', 'ENSG00000102003',\n       'ENSG00000114279', 'ENSG00000132872', 'ENSG00000129990',\n       'ENSG00000166922'],\n      dtype='object', length=109)] are in the [columns]"

## Base-line performances
### SVM

In [None]:
# construct multi-class SVM classifier
model = SVC(kernel='linear', gamma="auto")
model.fit(A3.X_train, A3.y_train)
y_pred_svm = model.predict(A3.X_test)
print('Accuracy of SVM:', accuracy_score(A3.y_test, y_pred_svm))
print('Confusion matrix: \n', confusion_matrix(A3.y_test, y_pred_svm))

In [None]:
# CV
svm_params = {
    'kernel':['linear', 'sigmoid', 'poly', 'rbf'],
    'gamma':['auto', 'scale'],
}

grid1 = GridSearchCV(SVC(), svm_params, cv=3, verbose=3)
grid1.fit(A3.X_train, A3.y_train)

In [None]:
grid1.best_params_

In [None]:
ypred1 = grid1.predict(A3.X_test)
print('Accuracy:', accuracy_score(A3.y_test, ypred1))
print('Confusion matrix: \n', confusion_matrix(A3.y_test, ypred1))

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
tree = RandomForestClassifier(n_estimators=100, random_state=0)
tree.fit(A3.X_train, A3.y_train)
ypred_tree = tree.predict(A3.X_test)
print('Accuracy of RF:', accuracy_score(A3.y_test, ypred_tree))
print('Confusion matrix: \n', confusion_matrix(A3.y_test, ypred_tree))

In [None]:
rf_params = {'bootstrap': [True, False],
              'max_depth': [5, 50, 100, None],
              'max_features': [20, 50, 80],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10],
              'n_estimators': [10, 100, 200]}
grid2 = GridSearchCV(RandomForestClassifier(), rf_params, cv=3, verbose=3, n_jobs=-1)
grid2.fit(A3.X_train, A3.y_train)
print(grid2.best_params_)

In [None]:
ypred2 = grid2.predict(A3.X_test)
print('Accuracy:', accuracy_score(ypred2, A3.y_test))
print('Confusion matrix:\n',confusion_matrix(ypred2, A3.y_test))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(A3.X_train, A3.y_train)
ypred_knn = knn.predict(A3.X_test)
print('Accuracy of KNN:', accuracy_score(A3.y_test, ypred_knn))
print('Confusion matrix: \n', confusion_matrix(A3.y_test, ypred_knn))

In [10]:
knn_params = {'n_neighbors':[5, 10, 20, 50, 100]}
grid3 = GridSearchCV(KNeighborsClassifier(), knn_params, cv=3, verbose=3, n_jobs=-1)
grid3.fit(A3.X_train, A3.y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END ....................n_neighbors=20;, score=0.552 total time=  13.7s
[CV 2/3] END ....................n_neighbors=20;, score=0.523 total time=  14.8s
[CV 2/3] END ....................n_neighbors=10;, score=0.498 total time=  14.9s
[CV 1/3] END ....................n_neighbors=10;, score=0.520 total time=  14.9s
[CV 3/3] END ....................n_neighbors=10;, score=0.529 total time=  14.9s
[CV 2/3] END .....................n_neighbors=5;, score=0.459 total time=  15.0s
[CV 3/3] END .....................n_neighbors=5;, score=0.454 total time=  15.0s
[CV 1/3] END .....................n_neighbors=5;, score=0.484 total time=  15.1s
[CV 3/3] END ....................n_neighbors=20;, score=0.568 total time=   4.5s
[CV 1/3] END ...................n_neighbors=100;, score=0.569 total time=   5.3s
[CV 3/3] END ...................n_neighbors=100;, score=0.571 total time=   5.3s
[CV 2/3] END ....................n_neighbors=50;,

In [None]:
print(grid3.best_params_)
ypred3 = grid3.predict(A3.X_test)
print('Accuracy:',accuracy_score(A3.y_test, ypred3))
print('Confusion matrix:\n', confusion_matrix(A3.y_test, ypred3))

### Feature selection

Use RF to predict based on the selected features

### Try something new
I try to select features based on their variance

In [None]:
# feature selection
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.5)
selector.fit(A3.X_train)
A3.sel_X = selector.transform(A3.X)
print(A3.sel_X.shape)
A3.sel_train_test_split()

In [None]:
# use random forest
rfc2 = RandomForestClassifier(n_estimators=50, max_depth=7, max_features=20, min_samples_leaf=1)
rfc2.fit(A3.X_train_sel, A3.y_train_sel)
y_pred7 = rfc2.predict(A3.X_test_sel)
print('Accuracy: ', accuracy_score(A3.y_test_sel, y_pred7))
print('Confusion matrix: \n', confusion_matrix(A3.y_test_sel, y_pred7))

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
select2 = SelectKBest(chi2, k=566)
X_new_chi = select2.fit_transform(A3.X_train, A3.y_train)
X_test_chi = select2.transform(A3.X_test)

In [None]:
# train a model using the selected features
model4 = RandomForestClassifier(n_estimators=100)
model4.fit(X_new_chi, A3.y_train)
y_pred4 = model4.predict(X_test_chi)
print('Accuracy: ', accuracy_score(A3.y_test, y_pred4))
print('Confusion matrix: \n', confusion_matrix(A3.y_test, y_pred4))

#### Try SVM

In [None]:
svm2 = SVC(kernel='linear', gamma="auto")
svm2.fit(A3.X_train_sel, A3.y_train_sel)
y_pred10 = svm2.predict(A3.X_test_sel)
print('Accuracy: ', accuracy_score(A3.y_test_sel, y_pred10))
print('confusion_matrix\n', confusion_matrix(A3.y_test_sel, y_pred10))

Try to use XGBoost

### try another way of feature selection

In [None]:
data = A3.data
newname = data.columns[1:-1].str.split('.').str[0]
data.rename(columns=dict(zip(data.columns[1:-1], newname)), inplace=True)
data

In [None]:
list_gene = pd.read_csv('list_of_genes.txt', sep='\t', header=0)
list_gene = list_gene.drop(['Unnamed: 2'], axis=1).rename(columns={'Gene Symbol': 'symbol', 'Ensembl Gene ID': 'ID'})
list_gene

In [None]:
data_new = data.loc[:, list_gene['ID']]
# add the label column
data_new['diagnosis'] = data['diagnosis']
data_new

In [None]:
A3.sel_X = data_new.drop(['diagnosis'], axis=1)

In [None]:
A3.sel_train_test_split()

In [None]:
print(A3.y_test_sel.value_counts())

In [None]:
# use random forest to train the model using selected features
rfc3 = RandomForestClassifier(n_estimators=50, max_depth=7, max_features=20, min_samples_leaf=1)
rfc3.fit(A3.X_train_sel, A3.y_train_sel)
y_pred9 = rfc3.predict(A3.X_test_sel)
print('Accuracy: ', accuracy_score(A3.y_test_sel, y_pred9))
print('Confusion matrix\n', confusion_matrix(A3.y_test_sel, y_pred9))

更长的list

In [18]:
list_gene2 = pd.read_csv('list_of_genes2.txt', sep='\t', header=0)
list_gene2 = list_gene2.drop(['Unnamed: 2'], axis=1).rename(columns={'Gene Symbol': 'symbol', 'Ensembl Gene ID': 'ID'})
list_gene2

Unnamed: 0,symbol,ID
0,CYP24A1,ENSG00000019186
1,TRH,ENSG00000170893
2,UGT2B11,ENSG00000213759
3,TRIM72,ENSG00000177238
4,ADRB1,ENSG00000043591
...,...,...
104,SYP,ENSG00000102003
105,FGF12,ENSG00000114279
106,SYT4,ENSG00000132872
107,SYT5,ENSG00000129990


In [None]:
data_new2 = data.loc[:, list_gene2['ID']]
# add the label column
data_new2['diagnosis'] = data['diagnosis']
data_new2

In [None]:
A3.sel_X = data_new2.drop(['diagnosis'], axis=1)
A3.sel_train_test_split()

In [None]:
# use random forest to train the model using selected features
rfc3 = RandomForestClassifier(n_estimators=100, max_depth=7, max_features=20, min_samples_leaf=1)
rfc3.fit(A3.X_train_sel, A3.y_train_sel)
y_pred9 = rfc3.predict(A3.X_test_sel)
print('Accuracy: ', accuracy_score(A3.y_test_sel, y_pred9))
print('Confusion matrix\n', confusion_matrix(A3.y_test_sel, y_pred9))

In [None]:
chosen = np.random.randint(1, 60482, 1)
print(chosen)
A3.sel_X = A3.X.iloc[:, chosen]
A3.sel_train_test_split()
# train using rf
rfc4 = RandomForestClassifier(n_estimators=50, max_depth=7, max_features=20, min_samples_leaf=1)
rfc4.fit(A3.X_train_sel, A3.y_train_sel)
y_pred11 = rfc4.predict(A3.X_test_sel)
print('Accuracy: ', accuracy_score(A3.y_test_sel, y_pred11))
print('Confusion matrix:\n', confusion_matrix(A3.y_test_sel, y_pred11))