### Implementation
1. split normal and tumor samples
2. remove outliers in normal and tumor samples
3. calculate the mean of normal sammples
4. tumor - avg(normal)
5. calculate the mean of tumor samples
6. merge with DMP file
7. exclude the genes not in single comorbidity list

### Notes
normal samples are procceed with tumor samples in all_beta_normalized file

### Input Columns
1. `Unnamed: 0` - id of the sample
> list of serial number for each sample 

### Output Columns
1. `id` - id of the sample
2. `gene` - gene name
3. `dbeta` - beta value of tumor - average of beta value of normal

### Parameters
1. `cancer_type` - cancer type name
2. `data_source` - folder name where all_beta_normalized and DMP files are stored, either GSE accession number or associated 
3. `all_beta_normalized_path` - all_beta_normalized file path within `champ_result`
4. `DMP_path` - DMP file path within `champ_result`
5. `result_folder` - default `train100` folder
6. `normal_count` - number of normal samples
7. `is_duplicate` - if the sample is duplicate or not, 2 if duplicate, 1 if not duplicate

In [1]:
import pandas as pd
import os

In [21]:
cancer_type = "lung"
data_source = "GDC_lung_tissue"


all_beta_normalized_path = "all_beta_normalized_train.csv"


DMP_path = "DMP_result_0.csv"


result_folder = "train80"


normal_count = 30


is_duplicate = 1

In [4]:
all_beta_normalized = pd.read_csv(
    f"../result/{data_source}/{result_folder}/{all_beta_normalized_path}"
)

In [5]:
print(all_beta_normalized.shape)

(115120, 601)


In [9]:
all_beta_normalized_normal = all_beta_normalized.iloc[
    :-1, 1 : normal_count * 2 + 1 : is_duplicate
]


all_beta_normalized_tumor = all_beta_normalized.iloc[
    :-1, normal_count * 2 + 1 :: is_duplicate
]

In [11]:
def IQR(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    upper_fence = Q3 + IQR * 1.5
    lower_fence = Q1 - IQR * 1.5
    return upper_fence, lower_fence


def no_outlier(df):
    upper_fence, lower_fence = IQR(df)
    ddf = df[(df > lower_fence) & (df < upper_fence)]
    return ddf

In [12]:
all_beta_normalized_normal = no_outlier(all_beta_normalized_normal)
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [13]:
train_normal_avg = all_beta_normalized_normal.mean(skipna=True, axis=1)

In [14]:
all_beta_normalized_tumor = (all_beta_normalized_tumor).subtract(
    train_normal_avg, axis=0
)

In [15]:
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [16]:
train_tumor_mean = all_beta_normalized_tumor.mean(skipna=True, axis=1)

In [17]:
delta_beta = pd.merge(
    all_beta_normalized.iloc[:, :1],
    pd.DataFrame(train_tumor_mean, columns=["dbeta"]),
    left_index=True,
    right_index=True,
)


print(delta_beta.shape)
delta_beta

(115119, 2)


Unnamed: 0.1,Unnamed: 0,dbeta
0,cg16031338,-0.000718
1,cg08167951,0.022218
2,cg12712270,-0.005783
3,cg24422826,-0.001865
4,cg16247931,0.011955
...,...,...
115114,cg05175333,-0.006219
115115,cg21082271,-0.013783
115116,cg23376870,0.004818
115117,cg26284982,0.007483


In [32]:
dmp = pd.read_csv(f"../champ_result/{data_source}/{DMP_path}")
print(f"dmp shape: {dmp.shape}")
dmp = dmp[["Unnamed: 0", "gene", "feature"]]
dmp.dropna(inplace=True)
print(f"dmp shape after dropna: {dmp.shape}")

dmp shape: (289650, 24)
dmp shape after dropna: (219382, 3)


In [33]:
result = pd.merge(delta_beta, dmp, on="Unnamed: 0", how="inner")

In [34]:
def find_max_dBeta_grouped(group):
    idx_max = group["dbeta"].abs().idxmax()
    return group.loc[idx_max]


max_dbeta_per_gene = result.groupby("gene", as_index=False).apply(
    find_max_dBeta_grouped, include_groups=False
)

In [35]:
max_dbeta_per_gene.columns = ["gene", "ID", "dbeta", "feature"]
max_dbeta_per_gene = max_dbeta_per_gene[["ID", "gene", "dbeta", "feature"]]
max_dbeta_per_gene

Unnamed: 0,ID,gene,dbeta,feature
0,cg03630821,A1BG,0.010959,Body
1,cg03986562,A2BP1,0.047753,Body
2,cg17902007,A2LD1,0.002987,5'UTR
3,cg08300930,A2M,-0.000617,Body
4,cg15384867,A2ML1,-0.005984,Body
...,...,...,...,...
14414,cg00397635,ZYG11A,0.015986,Body
14415,cg03935117,ZYG11B,0.008274,TSS1500
14416,cg05102190,ZYX,-0.019736,TSS200
14417,cg21517865,ZZEF1,0.011545,TSS1500


In [None]:
# comorbidity = pd.read_csv(
#     "../external_result/matchgene174_single_3Y10__OR2.txt", sep="\t", header=None
# )

In [None]:
# result_max_per_gene_single = max_dbeta_per_gene[
#     max_dbeta_per_gene["gene"].isin(comorbidity[0])
# ]


# result_max_per_gene_single

Unnamed: 0,ID,gene,dbeta,feature
3,cg08300930,A2M,0.055621,Body
4,cg21416544,A2ML1,0.074046,Body
11,cg13001012,AADAC,-0.039686,TSS1500
14,cg06339629,AADAT,0.035697,TSS1500
15,cg20940607,AAGAB,0.009760,1stExon
...,...,...,...,...
18318,cg07135797,ZNRD1,-0.063229,Body
18321,cg13298682,ZNRF3,-0.087323,Body
18337,cg05616010,ZSCAN18,0.084201,TSS1500
18357,cg14642833,ZWINT,-0.015362,TSS1500


In [40]:
# check if the folder exists
if not os.path.exists(f"../result/{data_source}/{result_folder}"):
    os.makedirs(f"../result/{data_source}/{result_folder}")
max_dbeta_per_gene.to_csv(
    f"../result/{data_source}/{result_folder}/dbeta.csv", index=False
)