In [1]:
%load_ext autoreload
# %reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import argparse
from methylVA.utils.common import load_config



In [23]:
import pandas as pd
import pickle
import argparse
from methylVA.utils.common import load_config


from sklearn.model_selection import train_test_split

def select_HV_cpgs(config):

    input_dir = config['input_dir']
    thresholds = config['thresholds']
    output_dir = config['output_dir']

    
    print("Loading a sample of raw data for testing, dropping NaNs, and selecting highly variable CpGs...")
    data_files = [f'{input_dir}methyl_scores_v2_HM450k_{i}.pkl' for i in range(1, 12)]


    print("Loading raw data, dropping nans, and select highly variable cpgs ...")
    data_files = [f'{input_dir}methyl_scores_v2_HM450k_{i}.pkl' for i in range(1, 12)]
    # dataframes = [pd.read_pickle(file, compression="bz2") for file in data_files]
    # df = pd.concat(dataframes, axis=0)
    sample_size = 100  # Specify the number of rows to load
    df = pd.read_pickle(data_files[3], compression="bz2").head(sample_size)


    metadata_columns = [
        'id', 'geo_accession', 'title', 'sex', 'age', 'race', 'tissue',
        'geo_platform', 'inferred_age_Hannum', 'inferred_age_SkinBlood',
        'inferred_age_Horvath353'
    ]
    label_column = 'disease'
    sex_condition_column = 'inferred_sex'
    age_condition_column = 'inferred_age_MepiClock'

    numerical_data = df.drop(
        metadata_columns + [label_column, sex_condition_column, age_condition_column],
        axis=1
    )

    # Fix FutureWarning
    df[label_column] = df[label_column].fillna('no_label')

    # Fix PerformanceWarning
    labels_encoded = df[label_column].astype('category').cat.codes
    df = pd.concat([df, labels_encoded.rename('labels_encoded')], axis=1)
    df = df.reset_index()

    nan_percentage = numerical_data.isna().sum() / numerical_data.shape[0] * 100
    selected_columns = nan_percentage[nan_percentage < 10].index.tolist()
    numerical_data_filtered = numerical_data[selected_columns]
    print("Data is processed successfully.")

    print("Creating metadata with labels ...")
    metadata_columns_with_labels = metadata_columns + [label_column, sex_condition_column, age_condition_column, 'labels_encoded']
    df_metadata = df[metadata_columns_with_labels]


    print("Splitting the data to train and test and select the variable features based on the train data.")
    data_train, data_test, meta_data_train, meta_data_test = train_test_split(
        numerical_data_filtered, df_metadata, test_size=0.1, random_state=42, stratify=df_metadata['labels_encoded']
    )

    print("Calculating column variances ...")
    column_variances = data_train.var()


    for threshold in thresholds:
        print(f"Number of columns with variance > {threshold}: {(column_variances > threshold).sum()}")

    print(" ***** Descriptive statistics of column variances *****")
    print(f"Mean: {column_variances.mean()}")
    print(f"Median: {column_variances.median()}")
    print(f"Min: {column_variances.min()}")
    print(f"Max: {column_variances.max()}")
    print(f"Standard deviation: {column_variances.std()}")
    print(f"Variance: {column_variances.var()}")

    for threshold in thresholds:
        print(f"Saving train data with variance > {threshold} ...")
        data_train[column_variances.index[(column_variances>threshold)]].to_csv(f'{output_dir}train_data_filtered_{threshold}.csv')
        print(f"Saving test data with variance > {threshold} ...")
        data_test[column_variances.index[(column_variances>threshold)]].to_csv(f'{output_dir}test_data_filtered_{threshold}.csv')
        
    print("Saving train and test metadata with labels ...")
    meta_data_train.to_csv(f'{output_dir}train_metadata_with_labels.csv')
    meta_data_test.to_csv(f'{output_dir}test_metadata_with_labels.csv')



In [24]:
if __name__ == '__main__':
    # argparser = argparse.ArgumentParser()
    # argparser.add_argument('--config', default="../methylVA/configs/config_hv_cpg_selection.yaml" , type=str, required=True)
    # args = argparser.parse_args()
    # config = load_config(args.config)
    config = load_config("../methylVA/configs/config_hv_cpg_selection.yaml")

    select_HV_cpgs(config['hvcpg_selection'])



Loading a sample of raw data for testing, dropping NaNs, and selecting highly variable CpGs...
Loading raw data, dropping nans, and select highly variable cpgs ...
Data is processed successfully.
Creating metadata with labels ...
Splitting the data to train and test and select the variable features based on the train data.
Calculating column variances ...
Number of columns with variance > 0.01: 149569
Number of columns with variance > 0.03: 42574
Number of columns with variance > 0.05: 10335
Number of columns with variance > 0.1: 236
 ***** Descriptive statistics of column variances *****
Mean: 0.0
Median: 0.0033206939697265625
Min: 7.748603820800781e-07
Max: 0.22021484375
Standard deviation: 0.0
Variance: 0.0
Saving train data with variance > 0.01 ...


  return dtype.type(n)
  return dtype.type(n)


Saving test data with variance > 0.01 ...
Saving train data with variance > 0.03 ...
Saving test data with variance > 0.03 ...
Saving train data with variance > 0.05 ...
Saving test data with variance > 0.05 ...
Saving train data with variance > 0.1 ...
Saving test data with variance > 0.1 ...
Saving train and test metadata with labels ...


In [5]:
config['hvcpg_selection']

{'input_dir': '../data/v2_HM450/',
 'output_dir': '../data/dimension_reduction/highly_variable_features/',
 'thresholds': [0.01, 0.03, 0.05, 0.1]}

In [6]:
output_dir = config['hvcpg_selection']['output_dir']
threshold = config['hvcpg_selection']['thresholds'][3]
numerical_data_filtered = pd.read_csv(f'{output_dir}numerical_data_filtered_{threshold}.csv', index_col=0)


In [7]:
numerical_data_filtered

Unnamed: 0,cg00011616,cg00015530,cg00017461,cg00025044,cg00025496,cg00032912,cg00035969,cg00041401,cg00041575,cg00041666,...,rs5931272,rs6546473,rs7660805,rs7746156,rs798149,rs877309,rs9292570,rs9363764,rs939290,rs951295
GSM2947470,0.3940,0.9380,0.5786,0.7980,0.03625,0.6340,0.7466,0.83060,0.8960,0.14360,...,,,,,,,,,,
GSM2947471,0.4387,0.8980,0.6143,0.7320,0.04556,0.6406,0.7910,0.89750,0.8975,0.12520,...,,,,,,,,,,
GSM2947472,0.3990,0.9280,0.5557,0.8203,0.04270,0.6304,0.7227,0.82000,0.8650,0.16750,...,,,,,,,,,,
GSM2947473,0.3733,0.8790,0.6070,0.7935,0.04065,0.6123,0.7476,0.78200,0.8850,0.17580,...,,,,,,,,,,
GSM2947474,0.6990,0.8520,0.5910,0.8330,0.06420,0.4587,0.7450,0.78560,0.8460,0.16960,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM4565216,0.5493,0.9780,0.9510,0.9595,0.95850,0.9805,0.6860,0.07450,0.3690,0.03195,...,0.9800,0.5180,0.6900,0.9800,0.9830,0.5464,0.5140,0.9700,0.5260,0.2108
GSM4565217,0.4927,0.9700,0.9697,0.9670,0.95900,0.9810,0.7030,0.04517,0.3542,0.02908,...,0.9810,0.5410,0.7114,0.9800,0.9844,0.5560,0.4907,0.9670,0.5234,0.1937
GSM4565218,0.4453,0.9785,0.9620,0.9673,0.95800,0.9760,0.7160,0.07090,0.3208,0.02728,...,0.9820,0.5790,0.6816,0.9800,0.9860,0.5790,0.4968,0.9650,0.5825,0.1909
GSM4565219,0.4722,0.9746,0.9580,0.9650,0.94630,0.9785,0.6885,0.08190,0.3500,0.03912,...,0.9824,0.5684,0.7310,0.9824,0.9834,0.5884,0.5100,0.9697,0.5800,0.2238


In [8]:
df_metadata = pd.read_csv(f'{output_dir}metadata_with_labels.csv', index_col=0)


  df_metadata = pd.read_csv(f'{output_dir}metadata_with_labels.csv', index_col=0)


In [9]:
df_metadata['labels_encoded']

0         0
1         0
2         0
3         6
4         6
         ..
37062    17
37063    17
37064    17
37065    17
37066    17
Name: labels_encoded, Length: 37067, dtype: int64

In [10]:
from methylVA.data_processing.split_train_test import split_train_test

# df_metadata['labels_encoded'].values
numerical_data_filtered
split_train_test(numerical_data_filtered, df_metadata['labels_encoded'].values

array([ 0,  0,  0, ..., 17, 17, 17])

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    numerical_data_filtered, df_metadata, test_size=0.1, random_state=42, stratify=df_metadata['labels_encoded']
    )

In [16]:
X_train

Unnamed: 0,cg00011616,cg00015530,cg00017461,cg00025044,cg00025496,cg00032912,cg00035969,cg00041401,cg00041575,cg00041666,...,rs5931272,rs6546473,rs7660805,rs7746156,rs798149,rs877309,rs9292570,rs9363764,rs939290,rs951295
GSM2808906,0.4226,0.89550,0.97800,0.65330,0.04987,0.26300,0.95360,0.94730,0.91600,0.0800,...,0.02790,0.02315,0.97460,0.97360,0.021770,0.02954,0.97500,0.51860,0.55960,0.53500
GSM3398158,0.9844,0.06220,0.01863,0.04214,0.90700,0.08310,0.07380,0.04110,0.09480,0.9260,...,0.01491,0.50700,0.54350,0.98140,0.984400,0.98500,0.97850,0.57860,0.02307,0.53170
GSM3090665,0.4946,0.96400,0.95650,0.88040,0.04530,0.94400,0.52300,0.77640,0.86500,0.1907,...,0.98440,0.98340,0.53800,0.98440,0.985000,0.55220,0.97300,0.53660,0.97950,0.53000
GSM4315821,0.9795,0.06903,0.01624,0.05984,0.81600,0.10156,0.08320,0.03340,0.07214,0.8687,...,0.02042,0.50440,0.54000,0.50800,0.983400,0.98440,0.51370,0.53400,0.57670,0.04034
GSM2864268,0.8940,0.92140,0.82280,0.84800,0.81540,0.72360,0.75700,0.82600,0.77150,0.2980,...,0.02231,0.02610,0.47830,0.02669,0.022320,0.02396,0.52150,0.03842,0.49700,0.96880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM4316293,0.9653,,0.02208,,,0.05652,0.07056,0.03770,,0.7407,...,0.61670,0.73440,0.95950,0.96340,0.023500,0.81640,0.97360,0.94800,0.04758,0.57000
GSM2941420,0.2810,0.90700,0.10706,0.39970,0.12067,0.80370,0.46200,0.36770,0.82200,0.2415,...,0.03105,0.97750,0.96300,0.97000,0.974600,0.02625,0.02373,0.55100,0.56400,0.51030
GSM3112935,0.9854,0.04898,0.02722,0.06510,0.86900,0.07825,0.07153,0.04126,0.07320,0.9240,...,,,,,,,,,,
GSM3814424,0.7583,0.92300,0.66300,0.88670,0.08410,0.58800,0.76950,0.82900,0.91650,0.2305,...,0.01814,0.98500,0.02853,0.48460,0.015236,0.56150,0.52100,0.54200,0.02904,0.97360


In [17]:
y_train

Unnamed: 0,id,geo_accession,title,sex,age,race,tissue,geo_platform,inferred_age_Hannum,inferred_age_SkinBlood,inferred_age_Horvath353,disease,inferred_sex,inferred_age_MepiClock,labels_encoded
2048,116419,GSM2808906,Dnr19_M_71_CTRL_Neuron,male,71.0,,brain,GPL13534,31.630000,16.220000,56.050000,control,M,58.090000,6
32962,187371,GSM3398158,Genomic DNA from twin A of pair 13,female,22.0,,,GPL13534,,,,no_label,,,17
7558,80851,GSM3090665,JMICSM081_DC_ESAM,male,0.0,,buccal epithelial cell,GPL13534,26.885593,0.083097,0.409117,no_label,M,0.672317,17
8473,113367,GSM4315821,3998888069_R06C02,male,,,blood,GPL13534,81.530000,77.100000,83.080000,control,M,85.780000,6
26032,174098,GSM2864268,Prostate_tumour_CPCG0462_rep1 [re-analysis],,,,,GPL13534,,,,no_label,,,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8717,113839,GSM4316293,101070190122_R01C01,female,,,blood,GPL13534,63.990000,53.420000,66.310000,Parkinson's disease,F,62.430000,4
17607,196226,GSM2941420,"DMG, K27, sample 696 [validation set]",,,,,GPL13534,59.970000,61.570000,86.200000,no_label,M,77.050000,17
2290,98921,GSM3112935,3998998186_R05C02,male,45.0,,,GPL13534,49.873068,40.984756,46.847845,no_label,M,44.329345,17
19213,234996,GSM3814424,H43_UMB5176_ASD/Autism,male,,,,GPL13534,11.320000,9.370000,31.090000,no_label,M,28.830000,17
