### Implementation
1. split normal and tumor samples
2. remove outliers in normal and tumor samples
3. calculate the mean of normal samples
4. tumor - avg(normal)
5. calculate the mean of tumor samples
6. merge with DMP file
7. exclude the genes not in single comorbidity list

### Notes
Normal samples are processed together with tumor samples in the `all_beta_normalized` file; the normal-sample columns come first, followed by the tumor-sample columns.

### Input Columns
1. `Unnamed: 0` - id of the sample
> list of serial number for each sample 

### Output Columns
1. `id` - id of the CpG probe (taken from `Unnamed: 0`) that has the largest absolute `dbeta` for the gene
2. `gene` - gene name
3. `dbeta` - mean over tumor samples of (beta value of tumor - average beta value of normal)

### Parameters
1. `cancer_type` - cancer type name
2. `data_source` - folder name where all_beta_normalized and DMP files are stored, either GSE accession number or associated 
3. `all_beta_normalized_path` - all_beta_normalized file path within `champ_result`
4. `DMP_path` - DMP file path within `champ_result`
5. `result_folder` - default `train100` folder
6. `normal_count` - number of normal samples
7. `is_duplicate` - column step used when selecting samples: 2 if each sample is duplicated (only every other column is kept), 1 if not duplicated

In [40]:
import pandas as pd
import os

In [100]:
# --- Run configuration --------------------------------------------------
# Cancer type: first path component (under "../") of every input and output.
cancer_type = "breast"
# Dataset folder inside champ_result/ (inputs) and result/ (outputs).
data_source = "GDC_breast_tissue"

# Input file names, resolved inside ../{cancer_type}/champ_result/{data_source}/.
all_beta_normalized_path = "all_beta_normalized_0.csv"
DMP_path = "DMP_result_0.csv"

# Output sub-folder inside ../{cancer_type}/result/{data_source}/.
result_folder = "train100"

# Column-selection parameters for separating normal from tumor samples.
# Columns 1..normal_count of the beta matrix are treated as normal samples.
normal_count = 47
# Column step: 2 keeps every other column, 1 keeps every column.
is_duplicate = 2

In [42]:
# Load the normalized beta-value matrix. As used further down, column 0 holds
# the probe id and the remaining columns are sliced as per-sample values.
all_beta_normalized = pd.read_csv(
    f"../{cancer_type}/champ_result/{data_source}/{all_beta_normalized_path}"
)

In [43]:
# Sanity check: (rows, columns) of the loaded matrix.
print(all_beta_normalized.shape)

(349420, 891)


In [44]:
# Split the sample columns into a normal and a tumor frame, then transpose so
# that each row is one sample and each column is one row of the input matrix.
# Column 0 (the id column) is skipped; `is_duplicate` is the column step.
# NOTE(review): if is_duplicate == 2 means every sample occupies two adjacent
# columns, the normal range may need to span normal_count * is_duplicate
# columns - confirm against the layout of the input file.
all_beta_normalized_normal = all_beta_normalized.iloc[
    :, slice(1, normal_count + 1, is_duplicate)
].transpose()


all_beta_normalized_tumor = all_beta_normalized.iloc[
    :, slice(normal_count + 1, None, is_duplicate)
].transpose()

In [45]:
def IQR(df):
    """Compute Tukey's outlier fences for each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; quartiles are computed column by column.

    Returns
    -------
    tuple
        ``(upper_fence, lower_fence)`` - in that order - where
        ``upper_fence = Q3 + 1.5 * IQR`` and ``lower_fence = Q1 - 1.5 * IQR``.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    # Lower-case local name so the enclosing function name is not shadowed.
    iqr = q3 - q1
    upper_fence = q3 + iqr * 1.5
    lower_fence = q1 - iqr * 1.5
    return upper_fence, lower_fence


def no_outlier(df):
    """Mask outliers of ``df`` column by column using Tukey's fences.

    Values outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of their column are
    replaced by NaN; the shape of the frame is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, one variable per column.

    Returns
    -------
    pandas.DataFrame
        Same shape as ``df`` with outliers set to NaN.
    """
    upper_fence, lower_fence = IQR(df)
    # Inclusive comparisons: a value lying exactly on a fence is not an
    # outlier. With strict comparisons a column whose IQR is 0 (for example
    # all-identical values) would be masked to NaN in its entirety.
    within_fences = (df >= lower_fence) & (df <= upper_fence)
    return df[within_fences]

In [46]:
# Mask per-column outliers (set to NaN) separately within the normal and the
# tumor frames. Both names are rebound to the filtered result, so re-running
# this cell filters already-filtered data again.
all_beta_normalized_normal = no_outlier(all_beta_normalized_normal)
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [47]:
# Column-wise mean over the normal samples; NaN entries (masked outliers) are skipped.
train_normal_avg = all_beta_normalized_normal.mean(skipna=True, axis=0)

In [48]:
# Delta beta for every tumor sample: subtract the normal average of the
# matching column. The series index is aligned with the frame's columns.
all_beta_normalized_tumor = all_beta_normalized_tumor.sub(
    train_normal_avg, axis="columns"
)

In [49]:
# Second outlier pass, now on the tumor delta-beta values.
# NOTE(review): the tumor frame was already filtered before the subtraction;
# confirm that filtering it a second time is intended.
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [50]:
# Column-wise mean of the tumor delta-beta values; NaN entries (masked outliers) are skipped.
train_tumor_mean = all_beta_normalized_tumor.mean(skipna=True, axis=0)

In [51]:
# Pair the id column (first column of the input matrix) with the mean delta
# beta, joining on the shared row index.
delta_beta = all_beta_normalized.iloc[:, :1].merge(
    train_tumor_mean.to_frame(name="dbeta"),
    left_index=True,
    right_index=True,
)


print(delta_beta.shape)
delta_beta

(349420, 2)


Unnamed: 0.1,Unnamed: 0,dbeta
0,cg00000957,0.009047
1,cg00001349,0.021041
2,cg00001583,0.285006
3,cg00002028,0.015331
4,cg00002837,0.100378
...,...,...
349415,cg27656573,0.002886
349416,cg27657363,-0.017049
349417,cg27657537,0.020609
349418,cg27662611,0.005775


In [60]:
# Load the DMP result table and report its full size.
dmp = pd.read_csv(f"../{cancer_type}/champ_result/{data_source}/{DMP_path}")
print(f"dmp shape: {dmp.shape}")
# Keep only the probe id and gene columns and drop rows missing either one.
# dropna() is chained instead of being called with inplace=True on the column
# subset, which avoids pandas' SettingWithCopyWarning and in-place mutation.
dmp = dmp[["Unnamed: 0", "gene"]].dropna()
print(f"dmp shape after dropna: {dmp.shape}")

dmp shape: (268112, 24)
dmp shape after dropna: (206165, 2)


In [68]:
# Attach the gene annotation to every probe's delta beta. Inner join: probes
# without a matching row in `dmp` are dropped.
result = delta_beta.merge(dmp, how="inner", on="Unnamed: 0")

In [94]:
def find_max_dBeta_grouped(group):
    """Return the row of ``group`` whose ``dbeta`` has the largest magnitude.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain a ``dbeta`` column.

    Returns
    -------
    pandas.Series
        The row (looked up by index label) with the maximum ``abs(dbeta)``.
    """
    magnitude = group["dbeta"].abs()
    strongest_label = magnitude.idxmax()
    return group.loc[strongest_label]


# For every gene keep the single probe row with the largest |dbeta|.
# include_groups=False keeps the grouping column ("gene") out of the frame
# handed to the function.
# NOTE(review): the include_groups keyword needs a recent pandas (2.2+,
# presumably) - confirm the environment.
max_dbeta_per_gene = result.groupby("gene", as_index=False).apply(
    find_max_dBeta_grouped, include_groups=False
)

In [95]:
# Rename the probe-id column by label and put the columns in output order.
# A positional `columns = [...]` assignment would silently swap the "id" and
# "gene" labels if this cell were run a second time; the label-based rename
# is a no-op on re-run and raises a KeyError if an expected column is missing.
max_dbeta_per_gene = max_dbeta_per_gene.rename(columns={"Unnamed: 0": "id"})[
    ["id", "gene", "dbeta"]
]
max_dbeta_per_gene

Unnamed: 0,id,gene,dbeta
0,cg03630821,A1BG,0.280839
1,cg06719334,A2BP1,-0.410930
2,cg13776095,A2LD1,-0.241959
3,cg00134295,A2M,0.210242
4,cg15769388,A2ML1,-0.129877
...,...,...,...
18360,cg23995459,ZYG11B,0.019597
18361,cg09704136,ZYX,0.239694
18362,cg26591066,ZZEF1,0.299207
18363,cg04127303,ZZZ3,0.005078


In [96]:
# Load the single-comorbidity list: tab-separated, no header row, so columns
# get integer labels. Column 0 is used below as the set of gene names.
comorbidity = pd.read_csv(
    "../external_result/matchgene174_single_3Y10__OR2.txt", sep="\t", header=None
)

In [97]:
# Keep only the genes that appear in column 0 of the comorbidity list.
result_max_per_gene_single = max_dbeta_per_gene.loc[
    max_dbeta_per_gene["gene"].isin(comorbidity[0])
]


result_max_per_gene_single

Unnamed: 0,id,gene,dbeta
3,cg00134295,A2M,0.210242
4,cg15769388,A2ML1,-0.129877
11,cg13001012,AADAC,-0.052858
14,cg00150882,AADAT,0.133425
15,cg20940607,AAGAB,0.009609
...,...,...,...
18318,cg07135797,ZNRD1,-0.277232
18321,cg20080983,ZNRF3,-0.232866
18337,cg14231297,ZSCAN18,0.406579
18357,cg14642833,ZWINT,-0.014116


In [102]:
# Write the per-gene result to <output_dir>/dbeta.csv.
# exist_ok=True creates the folder (and any missing parents) only when needed,
# replacing the separate exists() check and its check-then-create race.
output_dir = f"../{cancer_type}/result/{data_source}/{result_folder}"
os.makedirs(output_dir, exist_ok=True)
result_max_per_gene_single.to_csv(f"{output_dir}/dbeta.csv", index=False)