In [118]:
import statsmodels
import scipy
from scipy import stats
import numpy as np
import pandas as pd
import statsmodels.stats.multitest as smm

In [119]:
def out(name, s):
    """Write ``str(s)`` to the file ``name``, replacing any existing content.

    Parameters
    ----------
    name : str
        Path of the file to create or overwrite.
    s : object
        Value to store; it is converted with ``str`` before writing.
    """
    # The context manager closes the handle even if the conversion or the
    # write raises, which the manual open/close pair did not guarantee.
    with open(name, 'w') as f:
        f.write(str(s))

In [120]:
# Load the expression table from the working directory. Later cells treat the
# first two columns (Patient_id, Diagnosis) as metadata and every column from
# index 2 onwards as a gene.
data = pd.read_csv('gene_high_throughput_sequencing.csv')

In [121]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,Patient_id,Diagnosis,LOC643837,LOC100130417,SAMD11,NOC2L,KLHL17,PLEKHN1,C1orf170,HES4,...,CLIC2,RPS4Y1,ZFY,PRKY,USP9Y,DDX3Y,CD24,CYorf15B,KDM5D,EIF1AY
0,STT5425_Breast_001_normal,normal,1.257614,2.408148,13.368622,9.494779,20.880435,12.722017,9.494779,54.349694,...,4.76125,1.257614,1.257614,1.257614,1.257614,1.257614,23.268694,1.257614,1.257614,1.257614
1,STT5427_Breast_023_normal,normal,4.567931,16.602734,42.477752,25.562376,23.221137,11.622386,14.330573,72.445474,...,6.871902,1.815112,1.815112,1.815112,1.815112,1.815112,10.427023,1.815112,1.815112,1.815112
2,STT5430_Breast_002_normal,normal,2.077597,3.978294,12.863214,13.728915,14.543176,14.141907,6.23279,57.011005,...,7.096343,2.077597,2.077597,2.077597,2.077597,2.077597,22.344226,2.077597,2.077597,2.077597
3,STT5439_Breast_003_normal,normal,2.066576,8.520713,14.466035,7.823932,8.520713,2.066576,10.870009,53.292034,...,5.20077,2.066576,2.066576,2.066576,2.066576,2.066576,49.295538,2.066576,2.066576,2.066576
4,STT5441_Breast_004_normal,normal,2.613616,3.434965,12.682222,10.543189,26.688686,12.484822,1.364917,67.140393,...,11.22777,1.364917,1.364917,1.364917,1.364917,1.364917,23.627911,1.364917,1.364917,1.364917


In [122]:
# Split the samples into one frame per diagnosis group.
normal = data[data.Diagnosis == 'normal']
early = data[data.Diagnosis == 'early neoplasia']
cancer = data[data.Diagnosis == 'cancer']

# Group sizes go last: a notebook cell only displays its final expression, so
# as the first statement this result was computed and silently discarded.
data.Diagnosis.value_counts()

In [123]:
def fold_change(C, T):
    """Signed fold change of the treatment value ``T`` against the control ``C``.

    Returns ``T / C`` when ``T`` is strictly greater than ``C`` and
    ``-C / T`` otherwise, so equal inputs give ``-1.0``. The divisor is not
    checked for zero.
    """
    if not (T > C):
        # Treatment is not above control: report the inverse ratio, negated.
        return -float(C) / T
    return float(T) / C
    
def pract_value(gene, control, treatment):
    """Tell whether the group means of ``gene`` differ by a fold change above 1.5.

    Reads the module-level ``data`` frame. ``control`` and ``treatment`` are
    values of its ``Diagnosis`` column selecting the two groups to compare.
    Returns the boolean ``abs(fold_change(control_mean, treatment_mean)) > 1.5``.
    """
    diagnosis = data.Diagnosis
    control_mean = np.mean(data[diagnosis == control][gene])
    treatment_mean = np.mean(data[diagnosis == treatment][gene])
    change = fold_change(control_mean, treatment_mean)
    return abs(change) > 1.5

#### Двухвыборочный критерий Стьюдента (независимые выборки)

С помощью критерия Стьюдента проверим гипотезу о равенстве средних двух выборок.
Критерий Стьюдента:

H0: средние значения экспрессии гена одинаковы в двух выборках.

H1: средние значения экспрессии гена в двух выборках не одинаковы.

In [124]:
def test_mean_student(data1, data2):
    (stat, pvalue) = scipy.stats.ttest_ind(data1, data2, equal_var = False)
    return pvalue

In [125]:
# Every column after the first two is treated as a gene.
genes = data.columns[2:]

# Per-gene t-test p-values for the two group comparisons.
pvalues1 = [test_mean_student(normal[gene], early[gene]) for gene in genes]
pvalues2 = [test_mean_student(early[gene], cancer[gene]) for gene in genes]

# One row per gene; the explicit column list pins the column order.
pvalue_data = pd.DataFrame(
    {'normal_early_p': pvalues1, 'early_cancer_p': pvalues2},
    index=genes,
    columns=['normal_early_p', 'early_cancer_p']
)

In [126]:
# Preview the raw (uncorrected) p-values for the first genes.
pvalue_data.head()

Unnamed: 0,normal_early_p,early_cancer_p
LOC643837,0.690766,0.413735
LOC100130417,3.2e-05,0.653429
SAMD11,0.060273,0.079556
NOC2L,0.826429,0.287581
KLHL17,0.049876,0.463292


In [127]:
# Uncorrected significance level. It is defined here because this is the first
# cell that uses it; without this line a fresh-kernel run fails with NameError,
# since the next assignment to `alpha` is in a later cell.
alpha = 0.05

# Number of genes whose normal-vs-early p-value falls below the level.
len(pvalue_data['normal_early_p'][pvalue_data['normal_early_p'] < alpha])

1575

In [128]:
alpha = 0.05  # uncorrected significance level

# Count each comparison's significant genes once and reuse the value for both
# the printout and the answer file. NaN p-values compare as False, exactly as
# in the boolean-mask selection this replaces.
normal_early_hits = int((pvalue_data['normal_early_p'] < alpha).sum())
early_cancer_hits = int((pvalue_data['early_cancer_p'] < alpha).sum())

# Single-argument print(...) produces the same text on Python 2 and Python 3;
# the doubled space reproduces the separator of the old two-item print.
print('alpha = %f' % alpha)
print('Answer 1.1:  %d' % normal_early_hits)
out('1.1.txt', normal_early_hits)
print('Answer 1.2:  %d' % early_cancer_hits)
out('1.2.txt', early_cancer_hits)

alpha = 0.050000
Answer 1.1:  1575
Answer 1.2:  3490


Поправка Бонферрони

In [129]:
# Bonferroni step: halve the 0.05 level, presumably one share for each of the
# two group comparisons (normal vs early neoplasia, early neoplasia vs cancer).
# NOTE: this rebinds `alpha`, so every later cell works with 0.025.
alpha = 0.05 / 2
alpha

0.025

Поправка Холма

In [130]:
# Holm correction, applied separately to each comparison's p-values.
holm_options = dict(alpha=alpha, method='holm')
corrected_holm_1 = smm.multipletests(pvals=pvalue_data['normal_early_p'], **holm_options)
corrected_holm_2 = smm.multipletests(pvals=pvalue_data['early_cancer_p'], **holm_options)

# Element 1 of each result holds the corrected p-values.
pvalue_data['normal_early_p_holm'] = corrected_holm_1[1]
pvalue_data['early_cancer_p_holm'] = corrected_holm_2[1]

In [131]:
# Preview the table now that the Holm-corrected columns have been added.
pvalue_data.head()

Unnamed: 0,normal_early_p,early_cancer_p,normal_early_p_holm,early_cancer_p_holm
LOC643837,0.690766,0.413735,1.0,1.0
LOC100130417,3.2e-05,0.653429,0.500174,1.0
SAMD11,0.060273,0.079556,1.0,1.0
NOC2L,0.826429,0.287581,1.0,1.0
KLHL17,0.049876,0.463292,1.0,1.0


In [135]:
# Genes still significant after the Holm correction, then those among them
# whose fold change also passes the practical-significance check.
# `.loc` replaces `.ix`, which was deprecated and later removed from pandas.
genes_corrected1 = pvalue_data.loc[pvalue_data['normal_early_p_holm'] < alpha].index.tolist()
genes_practical1 = [x for x in genes_corrected1 if pract_value(x, 'normal', 'early neoplasia')]

genes_corrected2 = pvalue_data.loc[pvalue_data['early_cancer_p_holm'] < alpha].index.tolist()
genes_practical2 = [x for x in genes_corrected2 if pract_value(x, 'early neoplasia', 'cancer')]

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('alpha = %f' % alpha)
print('Answer 2.1: %i (of %i)' % (len(genes_practical1), len(genes_corrected1)))
out('2.1.txt', len(genes_practical1))
print('Answer 2.2: %i (of %i)' % (len(genes_practical2), len(genes_corrected2)))
out('2.2.txt', len(genes_practical2))

alpha = 0.025000
Answer 2.1: 2 (of 2)
Answer 2.2: 77 (of 79)


Поправка методом Бенджамини-Хохберга

In [136]:
# Benjamini-Hochberg correction, applied separately to each comparison's p-values.
bh_options = dict(alpha=alpha, method='fdr_bh')
corrected_bh_1 = smm.multipletests(pvals=pvalue_data['normal_early_p'], **bh_options)
corrected_bh_2 = smm.multipletests(pvals=pvalue_data['early_cancer_p'], **bh_options)

# Element 1 of each result holds the corrected p-values.
pvalue_data['normal_early_p_bh'] = corrected_bh_1[1]
pvalue_data['early_cancer_p_bh'] = corrected_bh_2[1]

In [137]:
# Genes still significant after the Benjamini-Hochberg correction, then those
# among them whose fold change also passes the practical-significance check.
# `.loc` replaces `.ix`, which was deprecated and later removed from pandas.
genes_corrected1 = pvalue_data.loc[pvalue_data['normal_early_p_bh'] < alpha].index.tolist()
genes_practical1 = [x for x in genes_corrected1 if pract_value(x, 'normal', 'early neoplasia')]

genes_corrected2 = pvalue_data.loc[pvalue_data['early_cancer_p_bh'] < alpha].index.tolist()
genes_practical2 = [x for x in genes_corrected2 if pract_value(x, 'early neoplasia', 'cancer')]

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('alpha = %f' % alpha)
print('Answer 3.1: %i (of %i)' % (len(genes_practical1), len(genes_corrected1)))
out('3.1.txt', len(genes_practical1))
print('Answer 3.2: %i (of %i)' % (len(genes_practical2), len(genes_corrected2)))
out('3.2.txt', len(genes_practical2))

alpha = 0.025000
Answer 3.1: 4 (of 4)
Answer 3.2: 524 (of 832)
