In [2]:
import pandas as pd

In [3]:
# Load the normalized beta-value table; its "Unnamed: 0" column holds the
# probe IDs (cg...) and the remaining columns hold per-sample values.
train = pd.read_csv(filepath_or_buffer="../champ_result/all_beta_normalized.csv")

In [4]:
# The ID file has no header row, so its single column is labelled 0.
id_list = pd.read_csv(
    filepath_or_buffer="../result/id_list.txt",
    header=None,
)

In [5]:
# Keep only the probes whose ID ("Unnamed: 0") is listed in id_list
# (presumably the IDs shared by the train and test sets -- verify against
# how id_list.txt was produced).
train = train.loc[train["Unnamed: 0"].isin(id_list[0])]

In [6]:
# Column 0 is the probe ID. Adjacent sample columns hold identical values in
# the displayed table, so a step of 2 takes one column per pair.
# Columns 1-94 are treated as normal samples, columns 95 onward as tumor.
# After transposing, rows are samples and columns are probes.
train_normal = train.iloc[:, slice(1, 95, 2)].transpose()
train_tumor = train.iloc[:, slice(95, None, 2)].transpose()

##### Calculate Δβ

In [7]:
# Remove outliers column by column using Tukey's fences (the 1.5 * IQR rule).
def IQR(df):
    """Compute the Tukey fences of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; quartiles are computed down each column.

    Returns
    -------
    tuple
        ``(upper_fence, lower_fence)`` in that order, where
        ``upper_fence = Q3 + 1.5 * (Q3 - Q1)`` and
        ``lower_fence = Q1 - 1.5 * (Q3 - Q1)``.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    # Named `spread` so the local no longer shadows this function's own name.
    spread = q3 - q1
    upper_fence = q3 + spread * 1.5
    lower_fence = q1 - spread * 1.5
    return upper_fence, lower_fence


def no_outlier(df):
    """Replace per-column outliers with NaN.

    A value is kept when it lies within the column's Tukey fences
    (inclusive); everything else becomes NaN. The shape, index and
    columns of ``df`` are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; each column is filtered against its own fences.

    Returns
    -------
    pandas.DataFrame
        Same layout as ``df`` with outliers masked as NaN.
    """
    upper_fence, lower_fence = IQR(df)
    # Inclusive bounds: a value sitting exactly on a fence is not an outlier.
    # With strict comparisons a column whose IQR is 0 (Q1 == Q3) would have
    # every value turned into NaN, because both fences equal that value.
    within_fences = (df >= lower_fence) & (df <= upper_fence)
    return df[within_fences]

In [8]:
# Mask per-probe outliers (values outside the IQR fences become NaN) in both
# sample groups. The two calls are independent of each other.
train_normal, train_tumor = no_outlier(train_normal), no_outlier(train_tumor)

In [9]:
# Display the probe-filtered beta table (pandas truncates the view of large frames).
train

Unnamed: 0.1,Unnamed: 0,1,2,3,4,5,6,7,8,9,...,881,882,883,884,885,886,887,888,889,890
0,cg00000957,0.825079,0.825079,0.836188,0.836188,0.855953,0.855953,0.856379,0.856379,0.833668,...,0.901550,0.901550,0.873928,0.873928,0.820407,0.820407,0.880965,0.880965,0.866919,0.866919
1,cg00001349,0.690023,0.690023,0.802989,0.802989,0.744400,0.744400,0.826541,0.826541,0.683470,...,0.864090,0.864090,0.878295,0.878295,0.699745,0.699745,0.851946,0.851946,0.784683,0.784683
2,cg00001583,0.095879,0.095879,0.030527,0.030527,0.058828,0.058828,0.103293,0.103293,0.054348,...,0.788893,0.788893,0.566003,0.566003,0.648568,0.648568,0.755152,0.755152,0.503848,0.503848
3,cg00002028,0.037414,0.037414,0.028130,0.028130,0.036667,0.036667,0.026973,0.026973,0.032372,...,0.053580,0.053580,0.037539,0.037539,0.063686,0.063686,0.067983,0.067983,0.053181,0.053181
4,cg00002837,0.393330,0.393330,0.278496,0.278496,0.354795,0.354795,0.371494,0.371494,0.372948,...,0.799111,0.799111,0.654631,0.654631,0.785799,0.785799,0.709136,0.709136,0.452765,0.452765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349415,cg27656573,0.950538,0.950538,0.950720,0.950720,0.955993,0.955993,0.956690,0.956690,0.947600,...,0.960415,0.960415,0.965732,0.965732,0.966887,0.966887,0.971332,0.971332,0.967507,0.967507
349416,cg27657363,0.959770,0.959770,0.935633,0.935633,0.956899,0.956899,0.947483,0.947483,0.956285,...,0.903175,0.903175,0.956787,0.956787,0.949065,0.949065,0.932598,0.932598,0.887922,0.887922
349417,cg27657537,0.071075,0.071075,0.051070,0.051070,0.052173,0.052173,0.082928,0.082928,0.097815,...,0.318862,0.318862,0.358653,0.358653,0.223531,0.223531,0.232292,0.232292,0.107879,0.107879
349418,cg27662611,0.044393,0.044393,0.038378,0.038378,0.051068,0.051068,0.061392,0.061392,0.050559,...,0.083533,0.083533,0.056394,0.056394,0.073767,0.073767,0.086748,0.086748,0.029125,0.029125


In [10]:
# Per-probe mean over the normal samples; NaNs left by outlier removal are ignored.
train_normal_avg = train_normal.mean(axis="index", skipna=True)

In [11]:
# Subtract the normal-sample mean of each probe from every tumor sample,
# aligning the means on the probe columns.
# NOTE: this overwrites train_tumor, so re-running the cell subtracts the
# baseline a second time.
train_tumor = train_tumor.subtract(train_normal_avg, axis="columns")

In [12]:
# Average the tumor-minus-normal differences over the tumor samples:
# one value per probe, NaNs ignored.
train_tumor_mean = train_tumor.mean(axis="index", skipna=True)
print(train_tumor_mean.shape)

(341413,)


In [13]:
# Attach each probe's mean difference ("dbeta") to its ID. Both sides still
# carry the row labels of `train`, so the join is done on the index.
Δβ = pd.merge(
    left=train.iloc[:, :1],
    right=pd.DataFrame(train_tumor_mean, columns=["dbeta"]),
    left_index=True,
    right_index=True,
)
Δβ

Unnamed: 0.1,Unnamed: 0,dbeta
0,cg00000957,0.016443
1,cg00001349,0.028795
2,cg00001583,0.316867
3,cg00002028,0.014177
4,cg00002837,0.120412
...,...,...
349415,cg27656573,0.003466
349416,cg27657363,-0.014363
349417,cg27657537,0.063332
349418,cg27662611,0.004058


```
After calculating Δβ for each biomarker, each biomarker needs to be mapped to a gene.
The DMP result already contains this information,
so look it up in DMP_result (from the train set) to map each biomarker to its gene.
```

In [14]:
# Load the DMP result and keep only the probe-ID -> gene mapping.
dmp_raw = pd.read_csv("../champ_result/DMP_result.csv")
print(f"raw train shape: {dmp_raw.shape}")
# Build the mapping with a non-inplace chain: calling dropna(inplace=True) on a
# column subset of another frame can raise SettingWithCopyWarning and makes the
# cell harder to reason about when re-run.
# Rows with a missing probe ID or gene are dropped.
dmp_train = dmp_raw[["Unnamed: 0", "gene"]].dropna()
print(f"train shape after dropna: {dmp_train.shape}")

raw train shape: (268112, 24)
train shape after dropna: (206165, 2)


In [15]:
# Annotate each probe's dbeta with its gene; probes without a gene entry in
# dmp_train are dropped by the inner join.
result = pd.merge(
    left=Δβ,
    right=dmp_train,
    how="inner",
    on="Unnamed: 0",
)
result

Unnamed: 0.1,Unnamed: 0,dbeta,gene
0,cg00001583,0.316867,NR5A2
1,cg00002028,0.014177,PINK1
2,cg00003287,0.110206,TNNT2
3,cg00007036,0.001742,ZNF362
4,cg00008647,0.047382,FAIM3
...,...,...,...
201467,cg27611781,0.005099,PDXP
201468,cg27612019,0.088749,SLC25A18
201469,cg27634744,-0.006444,CELSR1
201470,cg27657537,0.063332,MED15


In [16]:
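# Scratch check (disabled): inspect the rows of a single gene, e.g. A2BP1.
# NOTE: the helper defined in this notebook is find_max_dBeta_grouped, not find_max_dBeta.
# a = result.sort_values(by=['gene'])[result['gene'] == "A2BP1"]
# a.groupby(['gene']).apply(find_max_dBeta)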

In [17]:
def find_max_dBeta_grouped(group):
    """Return the row of ``group`` whose 'dbeta' has the largest magnitude.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to choose from; must contain a 'dbeta' column.

    Returns
    -------
    The ``group.loc`` entry at the label where ``abs(dbeta)`` is largest
    (the first such label if there are ties).
    """
    abs_dbeta = group['dbeta'].abs()
    strongest_label = abs_dbeta.idxmax()
    return group.loc[strongest_label]

# For every gene keep the single probe with the largest |dbeta|.
# This uses a grouped idxmax rather than groupby("gene").apply(...) with a
# row-picking function. pandas 2.2 deprecates passing the grouping column to
# the applied function; once it is excluded, "gene" would only live in the
# group index and reset_index(drop=True) would discard it, breaking the later
# lookups on result_max_per_gene['gene'].
# The result is the same selection: genes in sorted order, first probe on ties.
top_probe_idx = result["dbeta"].abs().groupby(result["gene"]).idxmax()
result_max_per_gene = result.loc[top_probe_idx].reset_index(drop=True)
result_max_per_gene

Unnamed: 0.1,Unnamed: 0,dbeta,gene
0,cg03630821,0.328376,A1BG
1,cg06719334,-0.436338,A2BP1
2,cg13776095,-0.269246,A2LD1
3,cg00134295,0.207143,A2M
4,cg15769388,-0.143162,A2ML1
...,...,...,...
18327,cg23995459,0.015419,ZYG11B
18328,cg09704136,0.234926,ZYX
18329,cg26591066,0.292028,ZZEF1
18330,cg26534213,0.006199,ZZZ3


In [18]:
# Tab-separated file without a header row; column 0 is used as the gene list below.
single = pd.read_csv(
    filepath_or_buffer='../comorbidity/matchgene174_single_3Y10__OR2.txt',
    sep='\t',
    header=None,
)
single

Unnamed: 0,0
0,A2M
1,NAT1
2,NAT2
3,SERPINA3
4,AADAC
...,...
10224,MNS16A
10225,TAS2R18P
10226,TAS2R6P
10227,COX11


In [19]:
# Restrict the per-gene table to genes that appear in column 0 of `single`.
result_max_per_gene_single = result_max_per_gene.loc[
    lambda per_gene: per_gene['gene'].isin(single[0])
]
result_max_per_gene_single

Unnamed: 0.1,Unnamed: 0,dbeta,gene
3,cg00134295,0.207143,A2M
4,cg15769388,-0.143162,A2ML1
11,cg13001012,-0.061497,AADAC
14,cg00150882,0.161237,AADAT
15,cg20940607,0.009252,AAGAB
...,...,...,...
18285,cg07135797,-0.288723,ZNRD1
18288,cg20080983,-0.261198,ZNRF3
18304,cg14231297,0.401591,ZSCAN18
18324,cg14642833,-0.017558,ZWINT


In [20]:
# Keep only genes whose |dbeta| is strictly greater than 0.35
# (rows with |dbeta| <= 0.35 are removed).
result_max_per_gene_single = result_max_per_gene_single.loc[
    lambda per_gene: per_gene['dbeta'].abs() > 0.35
]

In [21]:
# Show the per-gene table again. The 0.35 cutoff was applied to
# result_max_per_gene_single only, so this frame is unchanged.
result_max_per_gene

Unnamed: 0.1,Unnamed: 0,dbeta,gene
0,cg03630821,0.328376,A1BG
1,cg06719334,-0.436338,A2BP1
2,cg13776095,-0.269246,A2LD1
3,cg00134295,0.207143,A2M
4,cg15769388,-0.143162,A2ML1
...,...,...,...
18327,cg23995459,0.015419,ZYG11B
18328,cg09704136,0.234926,ZYX
18329,cg26591066,0.292028,ZZEF1
18330,cg26534213,0.006199,ZZZ3


In [22]:
# Write the thresholded per-gene table to disk without the row index.
result_max_per_gene_single.to_csv(
    "../result/result_max_per_gene_single_0.35.csv",
    index=False,
)