### Implementation
1. split normal and tumor samples by phenotype_distinguisher
2. remove outliers in normal and tumor samples
3. calculate the mean of normal samples
4. tumor - avg(normal)
5. calculate the mean of tumor samples
6. merge with gene and CpG Site Location
7. exclude the genes not in single comorbidity list

### Notes
normal and tumor samples are mixed
external gene list needs further investigation

### Input Columns
The first column is the ids of the samples. It could be different between datasets for not being generated by ChAMP. It will be rename to `ID` in the output. 
> list of serial number for each sample 

### Output Columns
1. `ID` - ids of the samples
2. `gene` - genes name
3. `dbeta` - the difference of beta values between tumor and normal samples
4. `feature` - CpG Site Location

### Parameters
1. `cancer_type` - cancer type
2. `data_source` - folder name where all_beta_normalized and phenotyep (if any) are stored, either GSE accession number or associated cancer name
4. `result_folder` - default `train100` folder. Specify the folder name if the dataset is not 100% used as training set in later steps.
4. `phenotype_distinguisher` - varies between datasets. It is used to distinguish between normal and tumor samples.
5. `phenotype_negative` - the value of normal samples in the phenotype_distinguisher column

In [80]:
import pandas as pd
import numpy as np
import os

In [81]:
cancer_type = "breast"
data_source = "GSE89093_nc"

result_folder = "train100"

phenotype_distinguisher = "cancer_status"
phenotype_negative = "healthy"

In [82]:
all_beta_normalized = pd.read_csv(
    f"../{cancer_type}/champ_result/{data_source}/all_beta_normalized.csv"
)

In [83]:
all_beta_normalized

Unnamed: 0,ID_REF,0,1,2,3,4,5,6,7,8,...,82,83,84,85,86,87,88,89,90,91
0,cg00000029,0.486644,0.472790,0.440157,0.401009,0.403449,0.453738,0.505430,0.525945,0.539579,...,0.586559,0.557733,0.515352,0.570047,0.586289,0.428182,0.410767,0.535594,0.561125,0.534982
1,cg00000108,0.989213,0.993538,0.991012,0.995878,0.996242,0.992823,0.990628,0.992809,0.999285,...,0.980657,0.994747,0.987606,0.994711,0.982437,0.982954,0.994045,0.987665,0.993932,0.991112
2,cg00000109,0.984875,0.951471,0.948615,0.949571,0.963231,0.972137,0.972501,0.969343,0.950113,...,0.966235,0.960497,0.955022,0.955616,0.936133,0.955798,0.937965,0.965013,0.942853,0.945216
3,cg00000165,0.102033,0.168589,0.158073,0.141595,0.124284,0.179538,0.181141,0.198009,0.171876,...,0.164425,0.146259,0.162503,0.170724,0.151290,0.113963,0.176928,0.133602,0.147707,0.166224
4,cg00000236,0.798909,0.720664,0.745585,0.751120,0.738786,0.777767,0.776491,0.839313,0.830973,...,0.795153,0.799779,0.796590,0.831788,0.794952,0.783985,0.720944,0.756649,0.848983,0.820339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453622,ch.9.98463211R,0.000802,0.017480,0.013947,0.011970,0.010956,0.011151,0.009306,0.008481,0.001029,...,0.007897,0.004232,0.011624,0.011549,0.007974,0.015673,0.013178,0.014985,0.016102,0.007668
453623,ch.9.98937537R,0.003578,0.016064,0.022522,0.009246,0.004903,0.017005,0.011053,0.009776,0.004001,...,0.010352,0.012666,0.015397,0.008783,0.011411,0.018158,0.014805,0.012377,0.012630,0.017469
453624,ch.9.98957343R,0.001104,0.025162,0.030792,0.011537,0.020435,0.014164,0.012570,0.012651,0.001225,...,0.014115,0.024711,0.018897,0.023771,0.023488,0.024783,0.018400,0.021078,0.032479,0.032993
453625,ch.9.98959675F,0.051028,0.062762,0.086426,0.094496,0.059940,0.091989,0.098019,0.132013,0.001345,...,0.135202,0.047782,0.079773,0.072900,0.141466,0.141630,0.130800,0.076828,0.080214,0.182504


In [49]:
print(all_beta_normalized.shape)

(453627, 93)


In [51]:
all_beta_normalized_t = all_beta_normalized.iloc[:, 1:]
print(all_beta_normalized_t.shape)
all_beta_normalized_t.head()

(453627, 92)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
0,0.486644,0.47279,0.440157,0.401009,0.403449,0.453738,0.50543,0.525945,0.539579,0.378885,...,0.586559,0.557733,0.515352,0.570047,0.586289,0.428182,0.410767,0.535594,0.561125,0.534982
1,0.989213,0.993538,0.991012,0.995878,0.996242,0.992823,0.990628,0.992809,0.999285,0.99813,...,0.980657,0.994747,0.987606,0.994711,0.982437,0.982954,0.994045,0.987665,0.993932,0.991112
2,0.984875,0.951471,0.948615,0.949571,0.963231,0.972137,0.972501,0.969343,0.950113,0.964967,...,0.966235,0.960497,0.955022,0.955616,0.936133,0.955798,0.937965,0.965013,0.942853,0.945216
3,0.102033,0.168589,0.158073,0.141595,0.124284,0.179538,0.181141,0.198009,0.171876,0.160247,...,0.164425,0.146259,0.162503,0.170724,0.15129,0.113963,0.176928,0.133602,0.147707,0.166224
4,0.798909,0.720664,0.745585,0.75112,0.738786,0.777767,0.776491,0.839313,0.830973,0.844176,...,0.795153,0.799779,0.79659,0.831788,0.794952,0.783985,0.720944,0.756649,0.848983,0.820339


In [52]:
phenotype = pd.read_csv(f"../{cancer_type}/champ_result/{data_source}/phenotype.csv")
phenotype = pd.DataFrame(phenotype[phenotype_distinguisher])
mask = phenotype == phenotype_negative

In [53]:
all_beta_normalized_normal = (
    all_beta_normalized_t.iloc[:, np.nonzero(mask.T)[1]].T.reset_index(drop=True).T
)
all_beta_normalized_tumor = (
    all_beta_normalized_t.iloc[:, np.nonzero(~mask.T)[1]].T.reset_index(drop=True).T
)

In [54]:
def IQR(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    upper_fence = Q3 + IQR * 1.5
    lower_fence = Q1 - IQR * 1.5
    return upper_fence, lower_fence


def no_outlier(df):
    upper_fence, lower_fence = IQR(df)
    ddf = df[(df > lower_fence) & (df < upper_fence)]
    return ddf

In [55]:
all_beta_normalized_normal = no_outlier(all_beta_normalized_normal)
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [56]:
train_normal_avg = all_beta_normalized_normal.mean(skipna=True, axis=0)

In [57]:
all_beta_normalized_tumor = (all_beta_normalized_tumor).subtract(
    train_normal_avg, axis=1
)

In [58]:
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [59]:
train_tumor_mean = all_beta_normalized_tumor.mean(skipna=True, axis=0)

In [67]:
delta_beta = pd.merge(
    all_beta_normalized.iloc[:, :1],
    pd.DataFrame(train_tumor_mean, columns=["dbeta"]),
    left_index=True,
    right_index=True,
)


print(delta_beta.shape)


delta_beta.head()

(46, 2)


Unnamed: 0,ID_REF,dbeta
0,cg00000029,0.008309
1,cg00000108,0.004831
2,cg00000109,0.003466
3,cg00000165,-0.00022
4,cg00000236,0.003781


In [68]:
delta_beta.columns = ["ID", "dbeta"]

In [69]:
gene_info = pd.read_csv("../external_result/GPL13534-11288.txt", sep="\t")

  gene_info = pd.read_csv("../external_result/GPL13534-11288.txt", sep="\t")


In [71]:
gene_info = gene_info[["ID", "UCSC_RefGene_Name", "UCSC_RefGene_Group"]]
gene_info.dropna(inplace=True)
print(gene_info.shape)
gene_info.head()

(365600, 3)


Unnamed: 0,ID,UCSC_RefGene_Name,UCSC_RefGene_Group
0,cg00035864,TTTY18,TSS1500
1,cg00050873,TSPY4;FAM197Y2,Body;TSS1500
2,cg00061679,DAZ1;DAZ4;DAZ4,Body;Body;Body
3,cg00063477,EIF1AY,Body
4,cg00121626,BCORL2,Body


In [72]:
result = pd.merge(delta_beta, gene_info, on="ID", how="inner")
print(result.shape)
result.head()

(36, 4)


Unnamed: 0,ID,dbeta,UCSC_RefGene_Name,UCSC_RefGene_Group
0,cg00000029,0.008309,RBL2,TSS1500
1,cg00000108,0.004831,C3orf35;C3orf35,Body;3'UTR
2,cg00000109,0.003466,FNDC3B;FNDC3B,Body;Body
3,cg00000236,0.003781,VDAC3;VDAC3,3'UTR;3'UTR
4,cg00000289,0.002234,ACTN1;ACTN1;ACTN1,3'UTR;3'UTR;3'UTR


In [73]:
result.columns = ["ID", "dbeta", "gene", "feature"]

In [74]:
# fetch first gene name
result["gene"] = result["gene"].str.split(";").str[0]
result["feature"] = result["feature"].str.split(";").str[0]
result.dropna(inplace=True)

In [75]:
def find_max_dBeta_grouped(group):
    idx_max = group["dbeta"].abs().idxmax()
    return group.loc[idx_max]


max_dbeta_per_gene = result.groupby("gene", as_index=False).apply(
    find_max_dBeta_grouped, include_groups=False
)

In [77]:
# relocate columns
max_dbeta_per_gene = max_dbeta_per_gene[["ID", "gene", "dbeta", "feature"]]


max_dbeta_per_gene

Unnamed: 0,ID,gene,dbeta,feature
0,cg00000289,ACTN1,0.002234,3'UTR
1,cg00001874,ATOH7,-0.002518,1stExon
2,cg00000292,ATP2A1,0.003465,1stExon
3,cg00000108,C3orf35,0.004831,Body
4,cg00001687,CDK10,0.004254,Body
5,cg00000734,CNBP,-0.011372,5'UTR
6,cg00000769,DDX55,0.017234,TSS200
7,cg00001854,DNAJA2,0.005271,TSS1500
8,cg00001446,ELOVL1,0.002029,Body
9,cg00001793,ETV6,0.012314,Body


In [38]:
comorbidity = pd.read_csv(
    "../external_result/matchgene174_single_3Y10__OR2.txt", sep="\t", header=None
)

In [78]:
result_max_per_gene_single = max_dbeta_per_gene[
    max_dbeta_per_gene["gene"].isin(comorbidity[0])
]


result_max_per_gene_single

Unnamed: 0,ID,gene,dbeta,feature
0,cg00000289,ACTN1,0.002234,3'UTR
3,cg00000108,C3orf35,0.004831,Body
4,cg00001687,CDK10,0.004254,Body
5,cg00000734,CNBP,-0.011372,5'UTR
7,cg00001854,DNAJA2,0.005271,TSS1500
8,cg00001446,ELOVL1,0.002029,Body
9,cg00001793,ETV6,0.012314,Body
10,cg00001534,FAF1,-0.012317,Body
12,cg00000109,FNDC3B,0.003466,Body
13,cg00000924,KCNQ1,0.006598,Body


In [79]:
# check if the folder exists
if not os.path.exists(f"../{cancer_type}/result/{data_source}/{result_folder}"):
    os.makedirs(f"../{cancer_type}/result/{data_source}/{result_folder}")


result_max_per_gene_single.to_csv(
    f"../{cancer_type}/result/{data_source}/{result_folder}/dbeta.csv", index=False
)