In [8]:
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class fisherscore():
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.nSample, self.nDim = X.shape
        self.labels = np.unique(y)
        self.nClass = len(self.labels)
#         print(X)
        self.total_mean = np.mean(self.X, axis=0)
        '''
        [mean(a1), mean(a2), ... , mean(am)]
        '''
        self.class_num, self.class_mean, self.class_std = self.get_mean_std()
        '''
        std(c1_a1), std(c1_a2), ..., std(c1_am)
        std(c2_a1), std(c2_a2), ..., std(c2_am)
        std(c3_a1), std(c3_a2), ..., std(c3_am)
        '''
        self.fisher_score_list = [self.cal_FS(j) for j in range(self.nDim)]

    def get_mean_std(self):
        Num = np.zeros(self.nClass)
        Mean = np.zeros((self.nClass, self.nDim))
        Std = np.zeros((self.nClass, self.nDim))
        for i, lab in enumerate(self.labels):
            idx_list = np.where(self.y == lab)[0]
            Num[i] = len(idx_list)
            Mean[i] = np.mean(self.X[idx_list], axis=0)
            Std[i] = np.std(self.X[idx_list], axis=0)
        return Num, Mean, Std

    def cal_FS(self,j):
        Sb_j = 0.0
        Sw_j = 0.0
        for i in range(self.nClass):
            Sb_j += self.class_num[i] * (self.class_mean[i,j] - self.total_mean[j])**2
            Sw_j += self.class_num[i] * self.class_std[i,j] **2
        return Sb_j / Sw_j

In [9]:
import pandas as pd

labels_organ = []
labels = []
dataset = []

df1 = pd.read_csv("GBM_fpkm_01.csv")
df1 = df1.drop(columns='gene_name')
df1 = df1.drop(columns='gene_type')
df1 = df1.drop(index=0)

LIST1 = df1.columns.values
LIST1 = np.array(LIST1)
LIST1 = LIST1[1:]
for i in range(len(LIST1)):
    data = LIST1[i][:7]
    labels_organ.append(data)

features = df1[df1.columns[0]]
print(features)
for j in range(len(df1.iloc[0])):
    if j != 0:
        data = df1[df1.columns[j]]
        data = [ float(x) for x in data ]
        data = np.array(data)
        dataset.append(data)
        labels.append('GBM')
        #labels_organ.append()

df2 = pd.read_csv("LIHC_fpkm_01.csv")
df2 = df2.drop(columns='gene_name')
df2 = df2.drop(columns='gene_type')
df2 = df2.drop(index=0)
df2 = df2.loc[:,:]

LIST2 = df2.columns.values
LIST2 = np.array(LIST2)
LIST2 = LIST2[1:]
for i in range(len(LIST2)):
    data = LIST2[i][:7]
    labels_organ.append(data)

for j in range(len(df2.iloc[0])):
    if j!=0 :
        data = df2[df2.columns[j]]
        data = [ float(x) for x in data ]
        data = np.array(data)
        dataset.append(data)
        labels.append('LIHC')

df3 = pd.read_csv("LUAD_fpkm_01.csv")
df3 = df3.drop(columns='gene_name')
df3 = df3.drop(columns='gene_type')
df3 = df3.drop(index=0)
df3 = df3.loc[:,:]

LIST3 = df3.columns.values
LIST3 = np.array(LIST3)
LIST3 = LIST3[1:]
for i in range(len(LIST3)):
    data = LIST3[i][:7]
    labels_organ.append(data)

for j in range(len(df3.iloc[0])):
    if j!=0:
        data = df3[df3.columns[j]]
        data = [ float(x) for x in data ]
        data = np.array(data)
        dataset.append(data)
        labels.append('LUAD')

df4 = pd.read_csv("SKCM_fpkm_01.csv")
df4 = df4.drop(columns='gene_name')
df4 = df4.drop(columns='gene_type')
df4 = df4.drop(index=0)
df4 = df4.loc[:,:]

LIST4 = df4.columns.values
LIST4 = np.array(LIST4)
LIST4 = LIST4[1:]
for i in range(len(LIST4)):
    data = LIST4[i][:7]
    labels_organ.append(data)

for j in range(len(df4.iloc[0])):
    if j!=0:
        data = df4[df4.columns[j]]
        data = [ float(x) for x in data ]
        data = np.array(data)
        dataset.append(data)
        labels.append('SKCM')

print(len(dataset))
print(len(labels_organ))
print(len(labels))
if len(dataset)==len(labels_organ):
    print(labels_organ)
    print(labels)

1        ENSG00000000003.13
2         ENSG00000000005.5
3        ENSG00000000419.11
4        ENSG00000000457.12
5        ENSG00000000460.15
                ...        
60479     ENSGR0000275287.3
60480     ENSGR0000276543.3
60481     ENSGR0000277120.3
60482     ENSGR0000280767.1
60483     ENSGR0000281849.1
Name: gene_id, Length: 60483, dtype: object
1153
1153
1153
['TCGA-02', 'TCGA-02', 'TCGA-02', 'TCGA-02', 'TCGA-02', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA-06', 'TCGA

In [12]:
import numpy as np
dataset = np.array(dataset)
labels = np.array(labels)
features = np.array(features)
# fs = fisherscore(dataset,labels)
labels_organ = np.array(labels_organ)
fs = fisherscore(dataset,labels_organ)
print(fs.fisher_score_list)
list = fs.fisher_score_list
list = np.array(list)
cnt = 0
feature_list = []
fs_score_above = 4.0
for i in range(len(list)):
    if list[i]>fs_score_above:
        cnt = cnt+1
        feature_list.append(i)
print('the features number is:'+str(cnt))
print(len(np.unique(labels_organ)))

  return Sb_j / Sw_j
  return Sb_j / Sw_j


[0.3289218578508027, 0.07125227192512663, 0.17942241157434177, 0.3979708588032541, 0.20308673836924407, 0.4870972900402058, 1.4071286374862348, 0.1854147915608063, 0.14086483179208928, 0.4437623549744143, 0.7432237133520572, 0.8599640553503795, 0.2800342638809826, 0.7584627101120888, 0.20960125035283741, 0.0807947761152883, 0.3650100248697324, 0.17329752991069589, 0.20513237161272166, 0.5126916409400865, 0.15813575192256385, 0.3318704079755686, 0.20208605499045237, 1.4291048428717268, 0.38858705448755265, 0.24139907015986573, 0.05504559457571798, 0.37783273317108995, 1.0157155626582985, 0.24040153542028656, 0.615486622470276, 1.8710123422605278, 0.8813957825790628, 1.1945532964018197, 0.17554409449963196, 0.8607722121525824, 0.3932399294535907, 0.18210990207998765, 1.2951858499310178, 0.8154829326111607, 0.3094782215264354, 0.236910230987978, 0.7270849448347523, 0.07327634895452029, 0.14330876437681067, 0.536543182624247, 1.2270661729785644, 0.8472751845690264, 0.41572307532705594, 0.3

In [16]:
filter_sample = []
filter_sample = []
for i in range(len(dataset)):
    data = []
    for j in range(len(dataset[i])):
        if j in feature_list:
            data.append(dataset[i][j])
    filter_sample.append(data)

X_train, X_test, y_train, y_test = train_test_split(filter_sample,labels,test_size=0.1)
model1 = SVC()
model1.fit(X=X_train,y=y_train)
y_hat = model1.predict(X=X_test)
acc = accuracy_score(y_true=y_test,y_pred=y_hat)
print("ACC=",acc)


#  feature selected by kinds of cancer (classify cancer)
#  fs_score(above)  feature_number  accuracy
#     2.5               129          97%+
#      3                49           95%+
#     3.5               24           94%+
#      5                5            83%+ 

#  feature selected by tissue (classify tissue)
#  fs_score(above)  feature_number  accuracy
#     4.0               89          24%+
#  terrible... the samples are too small to converge

ACC= 0.9482758620689655


['TCGA-02' 'TCGA-02' 'TCGA-02' ... 'TCGA-XV' 'TCGA-XV' 'TCGA-YG']
