### Import Packages

In [43]:
import pandas as pd 
import warnings
from modules.utils import save_json
warnings.filterwarnings('ignore') 


import numpy as np
def downsample_majority_to_ratio(
    df: pd.DataFrame,
    label_col: str = "Diagnosis",
    control_label: str = "Control",
    tumor_label: str = "Tumor",   # use "AD" here if that's your positive label
    ratio: float = 1.10,
    random_state: int = 42
) -> pd.DataFrame:
    """Keep all Control; downsample Tumor to â‰ˆ ratio * n_control."""
    df_ctrl = df[df[label_col] == control_label]
    df_tum  = df[df[label_col] == tumor_label]

    n_ctrl = len(df_ctrl)
    target_tum = int(np.ceil(ratio * n_ctrl))

    # If we have more tumor than target, sample down; otherwise keep as is
    if len(df_tum) > target_tum:
        df_tum = df_tum.sample(n=target_tum, random_state=random_state)

    out = pd.concat([df_ctrl, df_tum], axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True)
    return out

#### 1. Prepare ROSMAP DATASETS

In [2]:
# load rosmap datasets
miRNA = pd.read_csv('../data/ROSMAP/preprocessed/miRNA_data.csv').drop(columns = ['Gender',	'Race',	'PMI',	'Braak'])
mRNA = pd.read_csv('../data/ROSMAP/preprocessed/gene_expression_data.csv').drop(columns = ['Gender',	'Race',	'PMI',	'Braak'])
meth = pd.read_csv('../data/ROSMAP/preprocessed/dna_methylation_data.csv').drop(columns = ['Gender',	'Race',	'PMI',	'Braak'])

# a few samples have repeats in mRNA
df = mRNA.copy()

# Separate categorical and numeric columns
categorical_cols = ['Diagnosis']  # adjust if needed
numeric_cols = df.columns.difference(['Sample ID'] + categorical_cols)

# Aggregate numeric columns by mean
df_numeric = df.groupby("Sample ID", as_index=False)[numeric_cols].mean()

# For categorical columns, take the first occurrence for each Sample ID
df_categorical = df.groupby("Sample ID", as_index=False)[categorical_cols].first()

# Merge numeric and categorical data back together
mRNA = pd.merge(df_categorical, df_numeric, on="Sample ID")

In [3]:
# save experimental data

# save single omics
miRNA.to_csv('../data/ROSMAP/prepared/miRNA_data.csv', index=False)
mRNA.to_csv('../data/ROSMAP/prepared/gene_expression_data.csv', index=False)
meth.to_csv('../data/ROSMAP/prepared/dna_methylation_data.csv', index=False)  

# select and save samples
miRNA_and_gene_expression_samples = list(set(miRNA['Sample ID']) & set(mRNA['Sample ID']))
miRNA_and_dna_methylation_samples = list(set(miRNA['Sample ID']) & set(meth['Sample ID']))
gene_expression_and_dna_methylation_samples = list(set(mRNA['Sample ID']) & set(meth['Sample ID'])) 
miRNA_gene_expression_and_dna_methylation_samples = list(set(miRNA['Sample ID']) & set(mRNA['Sample ID']) & set(meth['Sample ID'])) 


pd.merge(miRNA, mRNA).to_csv('../data/ROSMAP/prepared/miRNA_and_gene_expression_data.csv', index=False)
pd.merge(miRNA, meth).to_csv('../data/ROSMAP/prepared/miRNA_and_dna_methylation_data.csv', index=False)
pd.merge(mRNA, meth).to_csv('../data/ROSMAP/prepared/gene_expression_and_dna_methylation_data.csv', index=False)
pd.merge(miRNA, pd.merge(mRNA, meth)).to_csv('../data/ROSMAP/prepared/miRNA_and_gene_expression_and_dna_methylation_data.csv', index=False)

print(f'Number of miRNA samples: {miRNA.shape[0]}')
print(f'Number of gene expression samples: {mRNA.shape[0]}')
print(f'Number of dna methylation samples: {meth.shape[0]}')

print(f'Number of miRNA and gene expression Multiltomics samples: {len(miRNA_and_gene_expression_samples)}')
print(f'Number of miRNA and dna methylation Multiltomics samples: {len(miRNA_and_gene_expression_samples)}')
print(f'Number of gene expression and dna methylation Multiltomics samples: {len(gene_expression_and_dna_methylation_samples )}')
print(f'Number of miRNA, gene expression and dna methylation Multiltomics samples: {len(miRNA_gene_expression_and_dna_methylation_samples)}')

Number of miRNA samples: 378
Number of gene expression samples: 378
Number of dna methylation samples: 375
Number of miRNA and gene expression Multiltomics samples: 378
Number of miRNA and dna methylation Multiltomics samples: 378
Number of gene expression and dna methylation Multiltomics samples: 375
Number of miRNA, gene expression and dna methylation Multiltomics samples: 375


#### 2. Prepare MayoRNASeq DATASETS

In [15]:
# load rosmap datasets
metabolomics = pd.read_csv('../data/MayoRNASeq/preprocessed/metabolomics_data.csv').drop(columns = ['Gender',	'ageDeath', 'Braak'])
mRNA = pd.read_csv('../data/MayoRNASeq/preprocessed/gene_expression_data.csv').drop(columns = ['Gender',	'ageDeath', 'Braak'])
proteomics = pd.read_csv('../data/MayoRNASeq/preprocessed/proteomics_data.csv').drop(columns = ['Gender',	'ageDeath', 'Braak']) 
metabolomics['Sample ID'] = metabolomics['Sample ID'].astype(str)
mRNA['Sample ID'] = mRNA['Sample ID'].astype(str)
proteomics['Sample ID'] = proteomics['Sample ID'].astype(str)

In [16]:
# save experimental data

# save single omics
metabolomics.to_csv('../data/MayoRNASeq/prepared/metabolomics_data.csv', index=False)
mRNA.to_csv('../data/MayoRNASeq/prepared/gene_expression_data.csv', index=False)
proteomics.to_csv('../data/MayoRNASeq/prepared/proteomics_data.csv', index=False)  

# select and save samples
metabolomics_and_gene_expression_samples = list(set(metabolomics['Sample ID']) & set(mRNA['Sample ID']))
metabolomics_and_proteomics_samples = list(set(metabolomics['Sample ID']) & set(proteomics['Sample ID']))
gene_expression_and_proteomics_samples = list(set(mRNA['Sample ID']) & set(proteomics['Sample ID'])) 
metabolomics_gene_expression_and_proteomics_samples = list(set(metabolomics['Sample ID']) & set(mRNA['Sample ID']) & set(proteomics['Sample ID'])) 


df1 = pd.merge(metabolomics, mRNA, on='Sample ID')
df1.rename(columns={'Diagnosis_x':'Diagnosis'}, inplace=True)
df1.drop(columns=['Diagnosis_y'], inplace=True)
df1.to_csv('../data/MayoRNASeq/prepared/metabolomics_and_gene_expression_data.csv', index=False)

df2 = pd.merge(metabolomics, proteomics, on='Sample ID') 
df2.rename(columns={ column: column.split('_x')[0] for column in df2.columns if '_x' in column}, inplace=True) 
df2.drop(columns=['Diagnosis_y'], inplace=True)
df2.to_csv('../data/MayoRNASeq/prepared/metabolomics_and_proteomics_data.csv', index=False)

df3= pd.merge(mRNA, proteomics, on='Sample ID') 
df3.rename(columns={'Diagnosis_x':'Diagnosis'}, inplace=True) 
df3.drop(columns=['Diagnosis_y'], inplace=True)
df3.to_csv('../data/MayoRNASeq/prepared/gene_expression_and_proteomics_data.csv', index=False)

df4 = pd.merge(metabolomics, pd.merge(mRNA, proteomics, on='Sample ID'), on='Sample ID')   
df4.drop(columns=['Diagnosis_y', 'Diagnosis_x'], inplace=True) 
df4.rename(columns={ column: column.split('_x')[0] for column in df4.columns if '_x' in column}, inplace=True)
df4.to_csv('../data/MayoRNASeq/prepared/metabolomics_and_gene_expression_and_proteomics_data.csv', index=False)

print(f'Number of metabolomics samples: {metabolomics.shape[0]}')
print(f'Number of gene expression samples: {mRNA.shape[0]}')
print(f'Number of proteomics samples: {proteomics.shape[0]}')

print(f'Number of metabolomics and gene expression Multiltomics samples: {len(metabolomics_and_gene_expression_samples)}')
print(f'Number of metabolomics and proteomics Multiltomics samples: {len(metabolomics_and_gene_expression_samples)}')
print(f'Number of gene expression and proteomics Multiltomics samples: {len(gene_expression_and_proteomics_samples )}')
print(f'Number of metabolomics, gene expression and proteomics Multiltomics samples: {len(metabolomics_gene_expression_and_proteomics_samples)}')

Number of metabolomics samples: 98
Number of gene expression samples: 162
Number of proteomics samples: 112
Number of metabolomics and gene expression Multiltomics samples: 98
Number of metabolomics and proteomics Multiltomics samples: 98
Number of gene expression and proteomics Multiltomics samples: 112
Number of metabolomics, gene expression and proteomics Multiltomics samples: 97


#### 3. Prepare BRCA DATASETS

In [37]:
# load rosmap datasets
miRNA = pd.read_csv('../data/BRCA/preprocessed/miRNA_expression_data.csv').drop(columns = ['Age','Gender','Stage'])
mRNA = pd.read_csv('../data/BRCA/preprocessed/gene_expression_data.csv').drop(columns = ['Age','Gender','Stage'])
meth = pd.read_csv('../data/BRCA/preprocessed/dna_methylation_data.csv').drop(columns = ['Age','Gender','Stage'])

print(f'Number of miRNA samples before downsampling: {miRNA.shape[0]}')
print(f'Number of gene expression samples before downsampling: {mRNA.shape[0]}')
print(f'Number of dna methylation samples before downsampling: {meth.shape[0]}')

Number of miRNA samples before downsampling: 819
Number of gene expression samples before downsampling: 1202
Number of dna methylation samples before downsampling: 877


In [38]:
# balance dataset
miRNA1 = downsample_majority_to_ratio(
    df=miRNA,
    label_col="Diagnosis",
    control_label="Control",
    tumor_label="Tumor",  
    ratio=1.10,
    random_state=42
)

# balance dataset
mRNA1 = downsample_majority_to_ratio(
    df=mRNA,
    label_col="Diagnosis",
    control_label="Control",
    tumor_label="Tumor",  
    ratio=1.10,
    random_state=42
)

# balance dataset
meth1 = downsample_majority_to_ratio(
    df=meth,
    label_col="Diagnosis",
    control_label="Control",
    tumor_label="Tumor",  
    ratio=1.10,
    random_state=42
)

In [39]:
df4.Diagnosis.value_counts()

Diagnosis
Tumor      57
Control    51
Name: count, dtype: int64

In [40]:
# save experimental data

# save single omics
miRNA1.to_csv('../data/BRCA/prepared/miRNA_data.csv', index=False)
mRNA1.to_csv('../data/BRCA/prepared/gene_expression_data.csv', index=False)
meth1.to_csv('../data/BRCA/prepared/dna_methylation_data.csv', index=False)  

# select and save samples
miRNA_and_gene_expression_samples = list(set(miRNA['Sample ID']) & set(mRNA['Sample ID']))
miRNA_and_dna_methylation_samples = list(set(miRNA['Sample ID']) & set(meth['Sample ID']))
gene_expression_and_dna_methylation_samples = list(set(mRNA['Sample ID']) & set(meth['Sample ID'])) 
miRNA_gene_expression_and_dna_methylation_samples = list(set(miRNA['Sample ID']) & set(mRNA['Sample ID']) & set(meth['Sample ID'])) 

df1 = pd.merge(miRNA, mRNA)
df1 = downsample_majority_to_ratio(
    df=df1,
    label_col="Diagnosis",
    control_label="Control",
    tumor_label="Tumor",  
    ratio=1.10,
    random_state=42
)
df1.to_csv('../data/BRCA/prepared/miRNA_and_gene_expression_data.csv', index=False)

df2 = pd.merge(miRNA, meth)
df2 = downsample_majority_to_ratio(
    df=df2,
    label_col="Diagnosis",
    control_label="Control",
    tumor_label="Tumor",  
    ratio=1.10,
    random_state=42
)
df2.to_csv('../data/BRCA/prepared/miRNA_and_dna_methylation_data.csv', index=False)

df3 = pd.merge(mRNA, meth)
df3 = downsample_majority_to_ratio(
    df=df3,
    label_col="Diagnosis",
    control_label="Control",
    tumor_label="Tumor",  
    ratio=1.10,
    random_state=42
)
df3.to_csv('../data/BRCA/prepared/gene_expression_and_dna_methylation_data.csv', index=False) 

df4 = pd.merge(miRNA, pd.merge(mRNA, meth))
df4 = downsample_majority_to_ratio(
    df=df4,
    label_col="Diagnosis",
    control_label="Control",
    tumor_label="Tumor",  
    ratio=1.10,
    random_state=42
)
df4.to_csv('../data/BRCA/prepared/miRNA_and_gene_expression_and_dna_methylation_data.csv', index=False)

print(f'Number of miRNA samples: {miRNA1.shape[0]}')
print(f'Number of gene expression samples: {mRNA1.shape[0]}')
print(f'Number of dna methylation samples: {meth1.shape[0]}')

print(f'Number of miRNA and gene expression Multiltomics samples: {df1.shape[0]}')
print(f'Number of miRNA and dna methylation Multiltomics samples: {df2.shape[0]}')
print(f'Number of gene expression and dna methylation Multiltomics samples: {df3.shape[0]}')
print(f'Number of miRNA, gene expression and dna methylation Multiltomics samples: {df4.shape[0]}')

Number of miRNA samples: 158
Number of gene expression samples: 236
Number of dna methylation samples: 202
Number of miRNA and gene expression Multiltomics samples: 158
Number of miRNA and dna methylation Multiltomics samples: 108
Number of gene expression and dna methylation Multiltomics samples: 175
Number of miRNA, gene expression and dna methylation Multiltomics samples: 108


In [46]:
## feature names
feature_names = {
    1:df1.columns[1:].to_list(),
    2:df2.columns[1:].to_list(),
    3:df3.columns[1:].to_list()
}
save_json("../results/BRCA/featurenames.json", feature_names)