### Implementation
1. split normal and tumor samples
2. remove outliers in normal and tumor samples
3. calculate the mean of normal samples
4. tumor - avg(normal)
5. calculate the mean of tumor samples
6. merge with DMP file
7. exclude the genes not in single comorbidity list

### Notes
Normal samples precede tumor samples in the all_beta_normalized file: the first `normal_count` sample columns are normal, the remaining sample columns are tumor.

### Input Columns
1. `Unnamed: 0` - id of each row, i.e. the CpG probe id (e.g. `cg00000957`); every other column holds one sample
> list of serial number for each sample 

### Output Columns
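1. `ID` - CpG probe id (taken from `Unnamed: 0`)
2. `gene` - gene name
3. `dbeta` - mean over tumor samples of (beta value of tumor - average beta value of normal)
4. `feature` - `feature` annotation of the probe, taken from the DMP file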

### Parameters
1. `cancer_type` - cancer type name
2. `data_source` - folder name where all_beta_normalized and DMP files are stored, either a GSE accession number or an associated dataset name (e.g. `GDC_rectal_tissue_450k`)
3. `all_beta_normalized_path` - all_beta_normalized file path within `champ_result`
4. `DMP_path` - DMP file path within `champ_result`
5. `result_folder` - default `train100` folder
6. `normal_count` - number of normal samples
7. `is_duplicate` - column step used when selecting samples: 2 if each sample is duplicated (only every second column is kept), 1 if not duplicated

In [1]:
import pandas as pd
import os

In [2]:
# Cancer type; first directory component of every path built in this notebook.
cancer_type = "rectal"
# Dataset folder used under both `result/` and `champ_result/`.
data_source = "GDC_rectal_tissue_450k"


# File name of the normalized beta matrix (CSV) that is loaded below.
all_beta_normalized_path = "all_beta_normalized_train_oversample_smote.csv"


# File name of the DMP table read from `champ_result/<data_source>/`.
DMP_path = "DMP_result.csv"


# Sub-folder of `result/<data_source>/` into which dbeta.csv is written.
result_folder = "train80"


# Number of sample columns (right after the id column) treated as normal.
normal_count = 330


# Column step used when slicing the sample columns (1 keeps every column).
is_duplicate = 1

In [3]:
# Load the normalized beta matrix. The training sub-folder is taken from
# `result_folder`, so the input is read from the same folder that dbeta.csv is
# later written to and overriding the parameter cannot leave the two out of sync.
all_beta_normalized = pd.read_csv(
    f"../../{cancer_type}/result/{data_source}/{result_folder}/{all_beta_normalized_path}"
)

In [4]:
# Column 0 is the id column. The next `normal_count` columns are treated as
# normal samples and every column after them as tumor samples; `is_duplicate`
# is the column step (1 keeps every column).
all_beta_normalized_normal = all_beta_normalized.iloc[
    :, slice(1, normal_count + 1, is_duplicate)
]
all_beta_normalized_tumor = all_beta_normalized.iloc[
    :, slice(normal_count + 1, None, is_duplicate)
]

In [5]:
# Preview the columns selected as normal samples.
all_beta_normalized_normal

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.320,0.321,0.322,0.323,0.324,0.325,0.326,0.327,0.328,0.329
0,0.856113,0.839813,0.836635,0.839600,0.872039,0.827300,0.867226,0.856248,0.844131,0.828738,...,0.849747,0.882134,0.846986,0.817176,0.836555,0.836717,0.858648,0.852715,0.868460,0.815862
1,0.804604,0.819941,0.846237,0.840729,0.873266,0.785063,0.836406,0.826497,0.811564,0.802157,...,0.767423,0.839149,0.856903,0.710779,0.826441,0.826289,0.805543,0.797866,0.844787,0.707730
2,0.185599,0.100295,0.095473,0.241155,0.081563,0.074698,0.085243,0.095008,0.074505,0.101004,...,0.170414,0.235625,0.175042,0.108461,0.129423,0.129328,0.088176,0.104919,0.158033,0.109189
3,0.522515,0.489466,0.523515,0.450116,0.455411,0.539920,0.507077,0.486059,0.485272,0.404073,...,0.446230,0.535593,0.418445,0.533796,0.471852,0.471482,0.478751,0.392958,0.483998,0.536529
4,0.170130,0.250159,0.257916,0.176988,0.135848,0.194919,0.151219,0.182795,0.184079,0.277530,...,0.109670,0.157546,0.148612,0.184559,0.177539,0.177600,0.208263,0.231410,0.170175,0.185909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364222,0.964895,0.947315,0.947070,0.952066,0.952888,0.947812,0.959611,0.964744,0.953798,0.957385,...,0.955357,0.929954,0.963244,0.941274,0.955550,0.955562,0.957279,0.958502,0.965890,0.940692
364223,0.928772,0.932969,0.930216,0.928250,0.943854,0.905401,0.950299,0.949012,0.936415,0.932015,...,0.949339,0.931991,0.942488,0.885951,0.929400,0.929468,0.945744,0.938616,0.925183,0.884119
364224,0.104119,0.084282,0.074635,0.072809,0.114334,0.118694,0.088020,0.057756,0.095004,0.080336,...,0.076590,0.125420,0.088776,0.077221,0.087660,0.087667,0.081772,0.084081,0.101796,0.077463
364225,0.048974,0.065580,0.069607,0.055919,0.060528,0.055811,0.038455,0.036974,0.042911,0.055916,...,0.024077,0.063161,0.041506,0.041119,0.060444,0.060313,0.045531,0.041465,0.036985,0.041092


In [6]:
# Preview the columns selected as tumor samples.
all_beta_normalized_tumor

Unnamed: 0,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,...,1.320,1.321,1.322,1.323,1.324,1.325,1.326,1.327,1.328,1.329
0,0.854829,0.875830,0.825972,0.889856,0.893317,0.866069,0.827269,0.905498,0.867410,0.890144,...,0.888416,0.837523,0.884360,0.875896,0.859817,0.845340,0.885423,0.805013,0.740653,0.824682
1,0.756300,0.856087,0.775657,0.872075,0.889066,0.859926,0.845616,0.848822,0.678133,0.877380,...,0.905431,0.816566,0.915958,0.831712,0.862241,0.805669,0.853217,0.868710,0.755751,0.867710
2,0.595009,0.493012,0.487701,0.837026,0.637440,0.395609,0.816834,0.666147,0.032599,0.597777,...,0.315933,0.025216,0.678708,0.512912,0.456226,0.825931,0.707738,0.300958,0.737305,0.481995
3,0.114031,0.273195,0.192698,0.157257,0.141368,0.321180,0.538645,0.318996,0.800777,0.226349,...,0.432962,0.509807,0.604417,0.438432,0.282946,0.159257,0.691693,0.283691,0.216872,0.658773
4,0.092132,0.113851,0.131657,0.069189,0.088359,0.183186,0.076780,0.162886,0.540924,0.079249,...,0.109457,0.083747,0.111750,0.112370,0.181156,0.083998,0.105068,0.089882,0.113225,0.124865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364222,0.938310,0.968771,0.951855,0.895664,0.979077,0.940950,0.927105,0.980799,0.979572,0.946278,...,0.964225,0.968654,0.947149,0.952817,0.967555,0.935477,0.943424,0.961357,0.960380,0.964356
364223,0.895693,0.899945,0.723128,0.620364,0.661122,0.928049,0.797565,0.923006,0.943278,0.956340,...,0.957469,0.685197,0.963930,0.930816,0.946185,0.904104,0.759986,0.742072,0.891878,0.946043
364224,0.060459,0.056313,0.074530,0.083636,0.144531,0.088819,0.037921,0.042754,0.051983,0.130468,...,0.068791,0.071161,0.059028,0.081142,0.084324,0.070557,0.040480,0.049996,0.062273,0.070444
364225,0.023476,0.044411,0.047230,0.075666,0.034805,0.042890,0.042764,0.053074,0.042871,0.035589,...,0.020892,0.046774,0.064979,0.038843,0.043956,0.064004,0.026731,0.016975,0.051021,0.039027


In [7]:
def IQR(df, factor=1.5):
    """Return the Tukey fences of ``df`` as ``(upper_fence, lower_fence)``.

    Quartiles come from ``df.quantile``: for a DataFrame this gives one pair
    of fences per column, for a Series a pair of scalars.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Numeric data.
    factor : float, default 1.5
        Multiple of the inter-quartile range added above Q3 / subtracted
        below Q1.

    Returns
    -------
    tuple
        ``(upper_fence, lower_fence)`` -- note the order, upper first.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    # Named `spread` so the local no longer shadows the function's own name.
    spread = q3 - q1
    return q3 + spread * factor, q1 - spread * factor


def no_outlier(df, factor=1.5):
    """Mask the values of ``df`` that lie outside the Tukey fences.

    A value is kept when ``lower_fence <= value <= upper_fence``. The bounds
    are inclusive: a value sitting exactly on a fence is not an outlier, and a
    column with zero inter-quartile range is kept instead of being wiped out.

    For a DataFrame the result has the same shape, with rejected entries set
    to NaN; the fences are computed per column (see ``IQR``).
    NOTE(review): on the probe-by-sample matrices used in this notebook that
    means one pair of fences per sample column, computed across all rows --
    confirm that this, rather than per-row fences, is the intended outlier
    definition.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Numeric data.
    factor : float, default 1.5
        Fence multiplier forwarded to ``IQR``.
    """
    upper_fence, lower_fence = IQR(df, factor)
    within_fences = (df >= lower_fence) & (df <= upper_fence)
    return df[within_fences]

In [8]:
# Replace IQR outliers with NaN in both groups before any averaging.
# Both names are rebound, so re-running this cell masks the already-masked data.
all_beta_normalized_normal = no_outlier(all_beta_normalized_normal)
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [9]:
# Preview the tumor matrix after outlier masking.
all_beta_normalized_tumor

Unnamed: 0,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,...,1.320,1.321,1.322,1.323,1.324,1.325,1.326,1.327,1.328,1.329
0,0.854829,0.875830,0.825972,0.889856,0.893317,0.866069,0.827269,0.905498,0.867410,0.890144,...,0.888416,0.837523,0.884360,0.875896,0.859817,0.845340,0.885423,0.805013,0.740653,0.824682
1,0.756300,0.856087,0.775657,0.872075,0.889066,0.859926,0.845616,0.848822,0.678133,0.877380,...,0.905431,0.816566,0.915958,0.831712,0.862241,0.805669,0.853217,0.868710,0.755751,0.867710
2,0.595009,0.493012,0.487701,0.837026,0.637440,0.395609,0.816834,0.666147,0.032599,0.597777,...,0.315933,0.025216,0.678708,0.512912,0.456226,0.825931,0.707738,0.300958,0.737305,0.481995
3,0.114031,0.273195,0.192698,0.157257,0.141368,0.321180,0.538645,0.318996,0.800777,0.226349,...,0.432962,0.509807,0.604417,0.438432,0.282946,0.159257,0.691693,0.283691,0.216872,0.658773
4,0.092132,0.113851,0.131657,0.069189,0.088359,0.183186,0.076780,0.162886,0.540924,0.079249,...,0.109457,0.083747,0.111750,0.112370,0.181156,0.083998,0.105068,0.089882,0.113225,0.124865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364222,0.938310,0.968771,0.951855,0.895664,0.979077,0.940950,0.927105,0.980799,0.979572,0.946278,...,0.964225,0.968654,0.947149,0.952817,0.967555,0.935477,0.943424,0.961357,0.960380,0.964356
364223,0.895693,0.899945,0.723128,0.620364,0.661122,0.928049,0.797565,0.923006,0.943278,0.956340,...,0.957469,0.685197,0.963930,0.930816,0.946185,0.904104,0.759986,0.742072,0.891878,0.946043
364224,0.060459,0.056313,0.074530,0.083636,0.144531,0.088819,0.037921,0.042754,0.051983,0.130468,...,0.068791,0.071161,0.059028,0.081142,0.084324,0.070557,0.040480,0.049996,0.062273,0.070444
364225,0.023476,0.044411,0.047230,0.075666,0.034805,0.042890,0.042764,0.053074,0.042871,0.035589,...,0.020892,0.046774,0.064979,0.038843,0.043956,0.064004,0.026731,0.016975,0.051021,0.039027


In [10]:
# Row-wise mean over the normal samples; masked (NaN) entries are skipped.
train_normal_avg = all_beta_normalized_normal.mean(axis="columns", skipna=True)

In [11]:
# Preview the per-row mean of the normal samples.
train_normal_avg

0         0.849299
1         0.814150
2         0.134108
3         0.481266
4         0.182342
            ...   
364222    0.956150
364223    0.932237
364224    0.085179
364225    0.045981
364226    0.831724
Length: 364227, dtype: float64

In [12]:
# Per-row difference: every tumor column minus that row's normal mean
# (`train_normal_avg` is aligned on the row index).
all_beta_normalized_tumor = all_beta_normalized_tumor.subtract(
    train_normal_avg, axis="index"
)

In [13]:
# Preview the tumor matrix after subtracting the normal mean.
all_beta_normalized_tumor

Unnamed: 0,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,...,1.320,1.321,1.322,1.323,1.324,1.325,1.326,1.327,1.328,1.329
0,0.005530,0.026531,-0.023328,0.040556,0.044017,0.016770,-0.022031,0.056199,0.018111,0.040845,...,0.039117,-0.011777,0.035061,0.026597,0.010517,-0.003959,0.036124,-0.044286,-0.108647,-0.024617
1,-0.057850,0.041937,-0.038493,0.057925,0.074916,0.045776,0.031466,0.034672,-0.136017,0.063230,...,0.091281,0.002416,0.101809,0.017562,0.048091,-0.008481,0.039067,0.054561,-0.058399,0.053560
2,0.460901,0.358905,0.353593,0.702918,0.503333,0.261501,0.682726,0.532039,-0.101508,0.463669,...,0.181825,-0.108891,0.544600,0.378804,0.322118,0.691823,0.573630,0.166850,0.603197,0.347887
3,-0.367234,-0.208071,-0.288568,-0.324009,-0.339898,-0.160086,0.057379,-0.162270,0.319512,-0.254916,...,-0.048303,0.028541,0.123151,-0.042834,-0.198320,-0.322009,0.210428,-0.197575,-0.264394,0.177507
4,-0.090210,-0.068490,-0.050685,-0.113153,-0.093983,0.000844,-0.105562,-0.019456,0.358582,-0.103093,...,-0.072885,-0.098595,-0.070592,-0.069972,-0.001186,-0.098344,-0.077273,-0.092460,-0.069117,-0.057477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364222,-0.017840,0.012620,-0.004295,-0.060487,0.022927,-0.015201,-0.029046,0.024649,0.023422,-0.009873,...,0.008075,0.012504,-0.009001,-0.003333,0.011405,-0.020674,-0.012727,0.005206,0.004230,0.008205
364223,-0.036544,-0.032292,-0.209109,-0.311873,-0.271115,-0.004188,-0.134672,-0.009231,0.011041,0.024103,...,0.025232,-0.247040,0.031692,-0.001421,0.013948,-0.028133,-0.172251,-0.190165,-0.040359,0.013806
364224,-0.024720,-0.028866,-0.010649,-0.001544,0.059352,0.003640,-0.047258,-0.042425,-0.033196,0.045288,...,-0.016388,-0.014018,-0.026151,-0.004037,-0.000855,-0.014622,-0.044699,-0.035183,-0.022906,-0.014735
364225,-0.022505,-0.001570,0.001248,0.029684,-0.011177,-0.003091,-0.003217,0.007093,-0.003110,-0.010392,...,-0.025089,0.000793,0.018998,-0.007138,-0.002025,0.018023,-0.019250,-0.029006,0.005040,-0.006954


In [14]:
# Second outlier pass, this time on the tumor-minus-normal differences.
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [15]:
# Row-wise mean of the differences over the tumor samples; NaN entries are skipped.
train_tumor_mean = all_beta_normalized_tumor.mean(axis="columns", skipna=True)

In [16]:
# Pair the id column (first column of the input matrix) with the per-row mean
# difference, matching rows on the shared index.
delta_beta = all_beta_normalized.iloc[:, :1].merge(
    pd.DataFrame(train_tumor_mean, columns=["dbeta"]),
    left_index=True,
    right_index=True,
)

print(delta_beta.shape)
delta_beta

(364227, 2)


Unnamed: 0.1,Unnamed: 0,dbeta
0,cg00000957,-0.001540
1,cg00001349,0.010555
2,cg00002719,0.002800
3,cg00002837,-0.055733
4,cg00003287,-0.045245
...,...,...
364222,cg27656573,-0.002945
364223,cg27657363,-0.039050
364224,cg27657537,-0.020922
364225,cg27662611,-0.003758


In [17]:
# Load the DMP table and keep only the id, gene and feature columns.
dmp = pd.read_csv(f"../../{cancer_type}/champ_result/{data_source}/{DMP_path}")
print(f"dmp shape: {dmp.shape}")
# Chain the column selection and `dropna()` instead of `dropna(inplace=True)`:
# dropping rows in place on a column subset can trigger SettingWithCopyWarning.
dmp = dmp[["Unnamed: 0", "gene", "feature"]].dropna()
print(f"dmp shape after dropna: {dmp.shape}")

dmp shape: (262248, 24)
dmp shape after dropna: (198417, 3)


In [18]:
# Keep only the ids present in both tables (inner join on the id column).
result = delta_beta.merge(dmp, how="inner", on="Unnamed: 0")

In [19]:
def find_max_dBeta_grouped(group):
    """Return the row of ``group`` whose ``dbeta`` has the largest magnitude.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``dbeta`` column.

    Returns
    -------
    pandas.Series
        The row, looked up by index label, where ``abs(dbeta)`` is largest.
    """
    return group.loc[group["dbeta"].abs().idxmax()]


# For every gene keep only the row with the largest |dbeta|.
# `as_index=False` returns the gene key as a regular column, and
# `include_groups=False` keeps that key out of the frame handed to the function.
max_dbeta_per_gene = result.groupby("gene", as_index=False).apply(
    find_max_dBeta_grouped, include_groups=False
)

In [20]:
# Rename the id column by label instead of overwriting all column names by
# position: a positional assignment silently swaps the `gene` and `ID` labels
# when this cell is re-run, whereas the label-based rename is idempotent.
max_dbeta_per_gene = max_dbeta_per_gene.rename(columns={"Unnamed: 0": "ID"})[
    ["ID", "gene", "dbeta", "feature"]
]
max_dbeta_per_gene

Unnamed: 0,ID,gene,dbeta,feature
0,cg22286978,A1BG,-0.057195,Body
1,cg27394794,A1CF,0.000501,Body
2,cg01597629,A2BP1,-0.126290,5'UTR
3,cg07218357,A2LD1,0.057881,TSS200
4,cg00134295,A2M,-0.106726,TSS1500
...,...,...,...,...
18540,cg09704136,ZYX,0.065520,Body
18541,cg12087627,ZZEF1,0.061508,Body
18542,cg05776075,ZZZ3,-0.010260,TSS1500
18543,cg20009101,psiTPTE22,0.061513,Body


In [22]:
# Tab-separated file without a header row; column 0 is used as the gene list below.
comorbidity = pd.read_table(
    "../external_result/matchgene174_single_3Y10__OR2.txt", header=None
)

In [23]:
# Keep only the genes that appear in the first column of the comorbidity list.
result_max_per_gene_single = max_dbeta_per_gene.loc[
    max_dbeta_per_gene["gene"].isin(comorbidity[0])
]

result_max_per_gene_single

Unnamed: 0,ID,gene,dbeta,feature
3,cg08300930,A2M,0.055621,Body
4,cg21416544,A2ML1,0.074046,Body
11,cg13001012,AADAC,-0.039686,TSS1500
14,cg06339629,AADAT,0.035697,TSS1500
15,cg20940607,AAGAB,0.009760,1stExon
...,...,...,...,...
18318,cg07135797,ZNRD1,-0.063229,Body
18321,cg13298682,ZNRF3,-0.087323,Body
18337,cg05616010,ZSCAN18,0.084201,TSS1500
18357,cg14642833,ZWINT,-0.015362,TSS1500


In [21]:
# Write the per-gene dbeta table, creating the output folder if it is missing.
# `exist_ok=True` replaces the separate exists-then-create check.
# NOTE(review): this saves `max_dbeta_per_gene` (all genes), not the
# comorbidity-filtered `result_max_per_gene_single` -- confirm which table is
# meant to end up in dbeta.csv.
output_dir = f"../../{cancer_type}/result/{data_source}/{result_folder}"
os.makedirs(output_dir, exist_ok=True)
max_dbeta_per_gene.to_csv(f"{output_dir}/dbeta.csv", index=False)