## 1.Download the datasets

### 1.1 Download the multi-omics data

* parse the multiomics data from the synapse website of the dataset ROSMAP https://www.synapse.org/#!Synapse:syn23446022
* Genome Variants https://www.synapse.org/#!Synapse:syn26263118
* Methylation https://www.synapse.org/#!Synapse:syn3168763
* RNA sequence https://www.synapse.org/#!Synapse:syn3505720
* Proteomics https://www.synapse.org/#!Synapse:syn21266454    

### 1.2 Download the clinical data

* parse the clinical data https://www.synapse.org/#!Synapse:syn3191087

## 2.Read the files

In [1]:
import pandas as pd
from pyensembl import EnsemblRelease

### 2.1 Read DNA methylation data

In [2]:
methylation_value = pd.read_csv("./ROSMAP-raw/Methylation/DNA-methylation-array/AMP-AD_ROSMAP_Rush-Broad_IlluminaHumanMethylation450_740_imputed.tsv",sep='\t')

In [3]:
methylation_value

Unnamed: 0,TargetID,TBI-AUTO73325-PT-3149,PT-BZHL,PT-BZCH,PT-BY9H,TBI-AUTO73307-PT-314I,TBI-AUTO73043-PT-35BD,PT-BZI5,PT-BZ1A,PT-318X,...,TBI-AUTO72955-PT-35OC,TBI-AUTO73291-PT-35OD,PT-BZD7,PT-BZG8,PT-C1N8,PT-BYJP,PT-BZHV,PT-BZI2,TBI-AUTO73257-PT-35OE,PT-BZD3
0,cg00000165,0.231359,0.157857,0.127105,0.149988,0.130532,0.174451,0.170026,0.165900,0.157745,...,0.124635,0.168002,0.152200,0.230482,0.177287,0.195611,0.151338,0.151508,0.167960,0.136220
1,cg00000363,0.140664,0.114399,0.140580,0.145805,0.122833,0.117144,0.155559,0.131309,0.157749,...,0.129301,0.126910,0.135973,0.162883,0.122399,0.112684,0.125054,0.118550,0.114618,0.133814
2,cg00000957,0.776220,0.818279,0.630316,0.849261,0.861136,0.751543,0.778917,0.859892,0.787279,...,0.735520,0.740686,0.748494,0.797072,0.793081,0.692638,0.785597,0.712386,0.839168,0.888304
3,cg00001349,0.865488,0.919570,0.882256,0.902912,0.907610,0.897496,0.939212,0.919514,0.805626,...,0.885292,0.900684,0.906257,0.756173,0.948341,0.904996,0.927201,0.856637,0.917827,0.943320
4,cg00001364,0.772899,0.711099,0.694639,0.651986,0.748538,0.706625,0.743491,0.712513,0.722169,...,0.720420,0.724226,0.694462,0.776165,0.758569,0.733207,0.750224,0.737402,0.685959,0.770543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420127,ch.22.772318F,0.233366,0.198370,0.176775,0.141996,0.167610,0.252353,0.153457,0.124837,0.224565,...,0.253596,0.184639,0.175109,0.183692,0.000000,0.193311,0.149942,0.135414,0.123727,0.121499
420128,ch.22.43177094F,0.173744,0.199095,0.189069,0.226845,0.193875,0.194734,0.194701,0.207784,0.183201,...,0.225843,0.172531,0.234507,0.169050,0.177599,0.205192,0.176225,0.191057,0.204723,0.179136
420129,ch.22.909671F,0.409177,0.463476,0.307656,0.302488,0.426867,0.425683,0.335775,0.361516,0.385871,...,0.431509,0.390838,0.402118,0.382906,0.410532,0.472419,0.340013,0.436682,0.413712,0.447053
420130,ch.22.46830341F,0.225070,0.279521,0.224084,0.286071,0.272743,0.322463,0.271097,0.268207,0.250027,...,0.267995,0.279551,0.333601,0.305278,0.280626,0.344297,0.251353,0.233328,0.280114,0.306628


### 2.2 Read Platform annotations for 450k methylation 

In [4]:
# Reading and processing basic data
try:
    annotation = pd.read_table('./ROSMAP-raw/Methylation/DNA-methylation-array/GPL16304-47833.txt', delimiter='\t')
    annotation['Distance_closest_TSS'] = annotation['Distance_closest_TSS'].astype(int)
    annotation = annotation[~annotation['Closest_TSS'].apply(lambda x: len(str(x).split(';')) > 1)]
except ValueError as e:
    print(f"Unable to convert 'Closest_TSS' column to integer: {e}")
    problematic_rows = annotation['Distance_closest_TSS'].apply(lambda x: not str(x).isnumeric())
    print("Problematic rows:")
    print(annotation.loc[problematic_rows])

annotation

  annotation = pd.read_table('./ROSMAP-raw/Methylation/DNA-methylation-array/GPL16304-47833.txt', delimiter='\t')


Unnamed: 0,ID,MAPINFO-1,MAPINFO+1,Probe_start,Probe_end,Target CpG SNP,n_target CpG SNP,SNPprobe,n_SNPprobe,HIL_CpG_class,...,AlleleA_Hits,AlleleB_Hits,XY_Hits,Autosomal_Hits,Closest_TSS,Closest_TSS_1,Distance_closest_TSS,Closest_TSS_gene_name,Closest_TSS_Transcript,SPOT_ID
0,cg00000029,53468111,53468113,53468112,53468162,,,,,HC,...,1,0,XY_NO,A_NO,53468350,53468351,-238,RBL2,NM_005611,cg00000029
1,cg00000108,37459205,37459207,37459206,37459256,,,rs9857774,1.0,LC,...,1,0,XY_NO,A_NO,37458757,37458758,449,C3orf35,CCDS46792,cg00000108
2,cg00000109,171916036,171916038,171916037,171916087,,,rs9864492,1.0,LC,...,1,0,XY_NO,A_NO,171851260,171851261,64777,FNDC3B,AY358367,cg00000109
3,cg00000165,91194673,91194675,91194624,91194674,,,rs76771611,1.0,ICshore,...,1,0,XY_NO,A_NO,91182793,91182794,-11880,BARHL2,NM_020063,cg00000165
4,cg00000236,42263293,42263295,42263244,42263294,,,,,IC,...,1,0,XY_NO,A_NO,42251727,42251728,11567,VDAC3,CCDS47850,cg00000236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485507,ch.X.97129969R,97243312,97243314,97243313,97243363,,,,,LC,...,1,0,XY_NO,A_NO,97607750,97607751,-364437,Mir_340,.,ch.X.97129969R
485508,ch.X.97133160R,97246503,97246505,97246504,97246554,,,,,LC,...,2,0,XY_NO,A_YES,97607750,97607751,-361246,Mir_340,.,ch.X.97133160R
485509,ch.X.97651759F,97765102,97765104,97765103,97765153,,,,,LC,...,1,0,XY_NO,A_NO,97607750,97607751,157353,Mir_340,.,ch.X.97651759F
485510,ch.X.97737721F,97851064,97851066,97851065,97851115,,,,,LC,...,21,0,XY_NO,A_YES,97607750,97607751,243315,Mir_340,.,ch.X.97737721F


In [5]:
map_annotation= annotation[['ID', 'Closest_TSS','Closest_TSS_gene_name', 'Distance_closest_TSS']]
map_annotation

Unnamed: 0,ID,Closest_TSS,Closest_TSS_gene_name,Distance_closest_TSS
0,cg00000029,53468350,RBL2,-238
1,cg00000108,37458757,C3orf35,449
2,cg00000109,171851260,FNDC3B,64777
3,cg00000165,91182793,BARHL2,-11880
4,cg00000236,42251727,VDAC3,11567
...,...,...,...,...
485507,ch.X.97129969R,97607750,Mir_340,-364437
485508,ch.X.97133160R,97607750,Mir_340,-361246
485509,ch.X.97651759F,97607750,Mir_340,157353
485510,ch.X.97737721F,97607750,Mir_340,243315


### 2.3 Read gene expression data, mapping information and substitue the the gene name

In [6]:
gene_expression = pd.read_csv('./ROSMAP-raw/RNASeq/ROSMAP_RNAseq_FPKM_gene.tsv', sep='\t')
gene_expression

Unnamed: 0,tracking_id,gene_id,525_120515_0,383_120503_0,93_120417_0,610_120523_0,560_120517_0,492_120515_0,576_120521_0,150_120419_0,...,831_130725_8,901_131010_8,894_130923_8,938_131101_8,942_131101_8,939_131101_8,895_130923_8,829_130725_8,944_131107_8,775_130528_8
0,ENSG00000167578.11,ENSG00000167578.11,60.84,65.45,69.18,51.60,48.61,55.65,51.18,60.63,...,44.85,45.02,33.71,38.30,28.27,51.02,31.87,37.14,41.77,44.01
1,ENSG00000242268.1,ENSG00000242268.1,0.08,0.05,0.08,0.08,0.10,0.08,0.06,0.11,...,0.00,0.10,0.20,0.00,0.00,0.00,0.00,0.00,0.09,0.10
2,ENSG00000078237.4,ENSG00000078237.4,4.39,4.49,2.51,2.90,2.67,5.50,3.36,3.43,...,8.15,4.05,6.06,6.03,4.34,5.16,2.70,5.00,3.82,3.69
3,ENSG00000263642.1,ENSG00000263642.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSG00000225275.4,ENSG00000225275.4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55884,ENSG00000265520.1,ENSG00000265520.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
55885,ENSG00000231119.2,ENSG00000231119.2,0.20,0.20,0.24,0.21,0.09,0.14,0.42,0.26,...,0.22,0.11,0.18,0.24,0.14,0.28,0.24,0.15,0.23,0.15
55886,ENSG00000105063.14,ENSG00000105063.14,40.59,39.26,34.70,36.74,39.85,35.28,24.78,31.14,...,28.82,24.08,24.10,24.81,27.30,22.24,20.08,23.68,17.91,26.37
55887,ENSG00000123685.4,ENSG00000123685.4,4.46,4.51,5.27,5.11,3.77,4.31,4.63,4.40,...,3.03,5.06,2.05,4.01,2.89,2.69,3.42,4.25,5.46,3.65


In [7]:
import pandas as pd
from pyensembl import EnsemblRelease

# Function to fetch the requested information for a gene ID
def fetch_gene_info(gene_id_with_version, ensembl_data, attributes):
    # Split the gene_id to exclude the version number
    gene_id = gene_id_with_version.split('.')[0]
    info = {}
    try:
        gene = ensembl_data.gene_by_id(gene_id)
        for attribute in attributes:
            value = None
            if hasattr(gene, attribute):
                value = getattr(gene, attribute)
            elif attribute == 'transcript_names':
                value = ','.join([t.name for t in gene.transcripts])
            elif attribute == 'transcript_ids':
                value = ','.join([t.transcript_id for t in gene.transcripts])
            elif attribute == 'exon_ids':
                exons = [exon.exon_id for transcript in gene.transcripts for exon in transcript.exons]
                value = ','.join(exons)
            info[attribute] = value
    except ValueError:
        # If gene ID is not found, fill all attributes with "ValueError"
        for attribute in attributes:
            info[attribute] = "ValueError"
    return info

# Main function to update the CSV file with the requested gene information
def update_csv_with_gene_attributes(csv_path, attributes):
    # Load the CSV file
    data = pd.read_csv(csv_path, sep='\t')

    # Initialize the Ensembl data
    ensembl_data = EnsemblRelease()
    ensembl_data.download()
    ensembl_data.index()

    # Fetch and add the requested gene-related information
    for attribute in attributes:
        # Apply the fetch_gene_info function for each gene_id and extract the attribute
        data[attribute] = data['gene_id'].apply(lambda gene_id: fetch_gene_info(gene_id, ensembl_data, attributes)[attribute])

    # Save the updated CSV file
    updated_csv_path = csv_path.replace('.tsv', '_with_attributes.tsv')
    data.to_csv(updated_csv_path, sep='\t', index=False)
    return updated_csv_path

In [8]:
# input the file path and additional attributes you want:
csv_file_path = './ROSMAP-raw/RNASeq/ROSMAP_RNAseq_FPKM_gene.tsv' 
attributes_to_add = ['gene_name'] 

# Call the function with the CSV path and the list of attributes
updated_file_path = update_csv_with_gene_attributes(csv_file_path, attributes_to_add)
print(f"Updated CSV file is saved at {updated_file_path}")

INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\hemingzhang\AppData\Local\pyensembl\GRCh38\ensembl109\pyensembl\GRCh38\ensembl109\Cache\Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\hemingzhang\AppData\Local\pyensembl\GRCh38\ensembl109\pyensembl\GRCh38\ensembl109\Cache\Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\hemingzhang\AppData\Local\pyensembl\GRCh38\ensembl109\pyensembl\GRCh38\ensembl109\Cache\Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


Updated CSV file is saved at ./ROSMAP-raw/RNASeq/ROSMAP_RNAseq_FPKM_gene_with_attributes.tsv


In [9]:
#Read the new gene_expression with gene_name
gene_expression = pd.read_csv('./ROSMAP-raw/RNASeq/ROSMAP_RNAseq_FPKM_gene_with_attributes.tsv', sep='\t')
gene_expression

Unnamed: 0,tracking_id,gene_id,525_120515_0,383_120503_0,93_120417_0,610_120523_0,560_120517_0,492_120515_0,576_120521_0,150_120419_0,...,901_131010_8,894_130923_8,938_131101_8,942_131101_8,939_131101_8,895_130923_8,829_130725_8,944_131107_8,775_130528_8,gene_name
0,ENSG00000167578.11,ENSG00000167578.11,60.84,65.45,69.18,51.60,48.61,55.65,51.18,60.63,...,45.02,33.71,38.30,28.27,51.02,31.87,37.14,41.77,44.01,RAB4B
1,ENSG00000242268.1,ENSG00000242268.1,0.08,0.05,0.08,0.08,0.10,0.08,0.06,0.11,...,0.10,0.20,0.00,0.00,0.00,0.00,0.00,0.09,0.10,LINC02082
2,ENSG00000078237.4,ENSG00000078237.4,4.39,4.49,2.51,2.90,2.67,5.50,3.36,3.43,...,4.05,6.06,6.03,4.34,5.16,2.70,5.00,3.82,3.69,TIGAR
3,ENSG00000263642.1,ENSG00000263642.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,MIR4802
4,ENSG00000225275.4,ENSG00000225275.4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,NUP210P2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55884,ENSG00000265520.1,ENSG00000265520.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,MIR548V
55885,ENSG00000231119.2,ENSG00000231119.2,0.20,0.20,0.24,0.21,0.09,0.14,0.42,0.26,...,0.11,0.18,0.24,0.14,0.28,0.24,0.15,0.23,0.15,
55886,ENSG00000105063.14,ENSG00000105063.14,40.59,39.26,34.70,36.74,39.85,35.28,24.78,31.14,...,24.08,24.10,24.81,27.30,22.24,20.08,23.68,17.91,26.37,PPP6R1
55887,ENSG00000123685.4,ENSG00000123685.4,4.46,4.51,5.27,5.11,3.77,4.31,4.63,4.40,...,5.06,2.05,4.01,2.89,2.69,3.42,4.25,5.46,3.65,BATF3


In [10]:
#Check if any gene names have not been found
count = gene_expression[gene_expression['gene_name'] == 'Gene name not found'].shape[0]
count

0

In [11]:
# set gene to the first column
cols = ['gene_name'] + [col for col in gene_expression if col != 'gene_name']
gene_expression_new = gene_expression[cols]
gene_expression.drop(columns=['tracking_id', 'gene_id'], inplace=True)
#ensures that there is no ambiguity about whether the operations are performed on a copy or on the original DataFrame
gene_name_column = gene_expression.pop('gene_name')  # This removes and returns the 'gene_name' column
gene_expression.insert(0, 'gene_name', gene_name_column)  # This inserts the 'gene_name' column at the first position


In [12]:
gene_expression

Unnamed: 0,gene_name,525_120515_0,383_120503_0,93_120417_0,610_120523_0,560_120517_0,492_120515_0,576_120521_0,150_120419_0,416_120503_0,...,831_130725_8,901_131010_8,894_130923_8,938_131101_8,942_131101_8,939_131101_8,895_130923_8,829_130725_8,944_131107_8,775_130528_8
0,RAB4B,60.84,65.45,69.18,51.60,48.61,55.65,51.18,60.63,62.38,...,44.85,45.02,33.71,38.30,28.27,51.02,31.87,37.14,41.77,44.01
1,LINC02082,0.08,0.05,0.08,0.08,0.10,0.08,0.06,0.11,0.05,...,0.00,0.10,0.20,0.00,0.00,0.00,0.00,0.00,0.09,0.10
2,TIGAR,4.39,4.49,2.51,2.90,2.67,5.50,3.36,3.43,2.72,...,8.15,4.05,6.06,6.03,4.34,5.16,2.70,5.00,3.82,3.69
3,MIR4802,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,NUP210P2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55884,MIR548V,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
55885,,0.20,0.20,0.24,0.21,0.09,0.14,0.42,0.26,0.22,...,0.22,0.11,0.18,0.24,0.14,0.28,0.24,0.15,0.23,0.15
55886,PPP6R1,40.59,39.26,34.70,36.74,39.85,35.28,24.78,31.14,40.02,...,28.82,24.08,24.10,24.81,27.30,22.24,20.08,23.68,17.91,26.37
55887,BATF3,4.46,4.51,5.27,5.11,3.77,4.31,4.63,4.40,5.49,...,3.03,5.06,2.05,4.01,2.89,2.69,3.42,4.25,5.46,3.65


### 2.4 Read clinical data

In [13]:
survival = pd.read_csv('./ROSMAP-raw/ROSMAP_clinical/ROSMAP_clinical.csv', sep=',')
survival

Unnamed: 0,projid,Study,msex,educ,race,spanish,apoe_genotype,age_at_visit_max,age_first_ad_dx,age_death,cts_mmse30_first_ad_dx,cts_mmse30_lv,pmi,braaksc,ceradsc,cogdx,dcfdx_lv,individualID
0,10101589,ROS,1.0,20.0,1.0,2.0,34.0,90+,90+,90+,18.0,5.0,9.916667,4.0,2.0,4.0,4.0,R6939144
1,86767530,MAP,0.0,10.0,1.0,2.0,33.0,90+,90+,90+,18.0,10.0,6.500000,4.0,2.0,4.0,4.0,R3893503
2,9650662,MAP,0.0,15.0,1.0,2.0,23.0,90+,90+,90+,0.0,0.0,3.850000,3.0,2.0,4.0,4.0,R8937093
3,50402855,MAP,0.0,21.0,1.0,2.0,33.0,90+,,,,27.0,,,,,1.0,R7139444
4,20544321,ROS,0.0,16.0,1.0,2.0,23.0,90+,90+,,13.0,14.0,,,,,4.0,R4971237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3579,22207815,ROS,0.0,18.0,2.0,2.0,23.0,57.653661875427787,,,,29.0,,,,,1.0,R5306025
3580,22207941,ROS,0.0,16.0,2.0,2.0,34.0,56.651608487337441,,,,27.0,,,,,1.0,R6142763
3581,49333806,MAP,0.0,12.0,2.0,2.0,,56.599589322381931,,,,30.0,,,,,1.0,R4468842
3582,59720188,MAP,0.0,13.0,1.0,1.0,,54.622861054072551,,,,29.0,,,,,1.0,R9446033


### 2.5 Read proteomics data

In [14]:
protein = pd.read_csv("./ROSMAP-raw/Proteomics/C2.median_polish_corrected_log2(abundanceRatioCenteredOnMedianOfBatchMediansPerProtein)-8817x400.csv",sep=',')
protein

Unnamed: 0.1,Unnamed: 0,b01.127C,b01.127N,b01.128C,b01.128N,b01.129C,b01.129N,b01.130C,b01.130N,b02.127C,...,b49.130C,b49.130N,b50.127C,b50.127N,b50.128C,b50.128N,b50.129C,b50.129N,b50.130C,b50.130N
0,VAMP1|P23763,-0.278255,0.410722,-0.112329,-0.162641,-0.071647,0.117440,-0.042157,0.187085,-0.835022,...,-0.073641,0.081353,0.084443,-0.220764,0.231834,-0.076819,-0.130791,-0.173961,-0.043709,-0.054343
1,KCTD13|Q8WZ19,,,,,,,,,,...,,,0.225794,0.019362,0.393129,0.238450,-0.039415,0.313539,0.335718,0.197204
2,TXNDC12|O95881,-0.366508,-0.498610,-0.276421,-0.354865,-0.238212,-0.321546,-0.213956,-0.532379,-0.742539,...,-0.521676,-0.289618,-0.295169,-0.506661,-0.268946,-0.403568,-0.320117,-0.466931,-0.289946,-0.373653
3,PDHX|O00330,-0.144993,0.155336,0.050900,0.079455,-0.021996,-0.071422,0.081380,0.089962,0.058278,...,-0.132236,-0.095648,-0.145319,0.016944,0.065491,0.135842,0.147856,-0.020226,0.083475,0.047506
4,APIP|Q96GX9,0.145339,-0.417854,-0.843172,0.577196,-1.468350,0.221551,-0.136442,-0.323159,0.976332,...,0.713746,-0.831854,0.236361,-0.814508,-0.009136,-1.053596,-0.009197,-1.010743,-0.467762,0.403568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8812,S1PR5|Q9H228,0.505338,0.391576,-0.219065,-0.239153,0.426485,-0.055620,-0.359028,0.013233,,...,0.560319,0.031164,-0.026439,0.217259,0.358087,-0.165559,-0.033307,-0.419110,-0.010325,-0.054859
8813,GIGYF2|Q6Y7W6,-0.449736,-0.411664,-0.432484,-0.488819,-0.458550,-0.421006,-0.526388,-0.527799,-0.554216,...,-0.482969,-0.416386,-0.507107,-0.374713,-0.455986,-0.407432,-0.529108,-0.483092,-0.439813,-0.469659
8814,YME1L1|Q96TA2,0.462210,0.035486,0.035512,-0.117947,0.494874,-0.018295,-0.081191,-0.002421,-0.047577,...,0.277916,-0.021786,0.104153,0.092986,0.206298,-0.137310,-0.037395,-0.114052,0.053101,-0.077162
8815,PPP1R3F|Q6ZSY5,-0.177028,-0.080174,-0.145297,0.049689,-0.064137,-2.274703,0.017258,-0.120344,0.063776,...,-0.264418,-0.035055,-0.124822,-0.035162,-0.159635,-0.145910,-0.093056,-0.001350,-0.129813,-0.086022


In [15]:
protein.iloc[:, 0] = protein.iloc[:, 0].str.split('|').str[0]
protein.columns = ['gene_name'] + protein.columns[1:].tolist()
protein

Unnamed: 0,gene_name,b01.127C,b01.127N,b01.128C,b01.128N,b01.129C,b01.129N,b01.130C,b01.130N,b02.127C,...,b49.130C,b49.130N,b50.127C,b50.127N,b50.128C,b50.128N,b50.129C,b50.129N,b50.130C,b50.130N
0,VAMP1,-0.278255,0.410722,-0.112329,-0.162641,-0.071647,0.117440,-0.042157,0.187085,-0.835022,...,-0.073641,0.081353,0.084443,-0.220764,0.231834,-0.076819,-0.130791,-0.173961,-0.043709,-0.054343
1,KCTD13,,,,,,,,,,...,,,0.225794,0.019362,0.393129,0.238450,-0.039415,0.313539,0.335718,0.197204
2,TXNDC12,-0.366508,-0.498610,-0.276421,-0.354865,-0.238212,-0.321546,-0.213956,-0.532379,-0.742539,...,-0.521676,-0.289618,-0.295169,-0.506661,-0.268946,-0.403568,-0.320117,-0.466931,-0.289946,-0.373653
3,PDHX,-0.144993,0.155336,0.050900,0.079455,-0.021996,-0.071422,0.081380,0.089962,0.058278,...,-0.132236,-0.095648,-0.145319,0.016944,0.065491,0.135842,0.147856,-0.020226,0.083475,0.047506
4,APIP,0.145339,-0.417854,-0.843172,0.577196,-1.468350,0.221551,-0.136442,-0.323159,0.976332,...,0.713746,-0.831854,0.236361,-0.814508,-0.009136,-1.053596,-0.009197,-1.010743,-0.467762,0.403568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8812,S1PR5,0.505338,0.391576,-0.219065,-0.239153,0.426485,-0.055620,-0.359028,0.013233,,...,0.560319,0.031164,-0.026439,0.217259,0.358087,-0.165559,-0.033307,-0.419110,-0.010325,-0.054859
8813,GIGYF2,-0.449736,-0.411664,-0.432484,-0.488819,-0.458550,-0.421006,-0.526388,-0.527799,-0.554216,...,-0.482969,-0.416386,-0.507107,-0.374713,-0.455986,-0.407432,-0.529108,-0.483092,-0.439813,-0.469659
8814,YME1L1,0.462210,0.035486,0.035512,-0.117947,0.494874,-0.018295,-0.081191,-0.002421,-0.047577,...,0.277916,-0.021786,0.104153,0.092986,0.206298,-0.137310,-0.037395,-0.114052,0.053101,-0.077162
8815,PPP1R3F,-0.177028,-0.080174,-0.145297,0.049689,-0.064137,-2.274703,0.017258,-0.120344,0.063776,...,-0.264418,-0.035055,-0.124822,-0.035162,-0.159635,-0.145910,-0.093056,-0.001350,-0.129813,-0.086022


In [16]:
#calculate the NaN proportion of each row
nan_proportions = protein.isna().mean(axis=1)
# Display the results
print(nan_proportions)

0       0.000000
1       0.458853
2       0.000000
3       0.000000
4       0.019950
          ...   
8812    0.159601
8813    0.000000
8814    0.000000
8815    0.000000
8816    0.438903
Length: 8817, dtype: float64


In [17]:
protein = protein[nan_proportions <= 1/3]

# Fill NaN values with 0 in the remaining rows
protein = protein.fillna(0)
protein

Unnamed: 0,gene_name,b01.127C,b01.127N,b01.128C,b01.128N,b01.129C,b01.129N,b01.130C,b01.130N,b02.127C,...,b49.130C,b49.130N,b50.127C,b50.127N,b50.128C,b50.128N,b50.129C,b50.129N,b50.130C,b50.130N
0,VAMP1,-0.278255,0.410722,-0.112329,-0.162641,-0.071647,0.117440,-0.042157,0.187085,-0.835022,...,-0.073641,0.081353,0.084443,-0.220764,0.231834,-0.076819,-0.130791,-0.173961,-0.043709,-0.054343
2,TXNDC12,-0.366508,-0.498610,-0.276421,-0.354865,-0.238212,-0.321546,-0.213956,-0.532379,-0.742539,...,-0.521676,-0.289618,-0.295169,-0.506661,-0.268946,-0.403568,-0.320117,-0.466931,-0.289946,-0.373653
3,PDHX,-0.144993,0.155336,0.050900,0.079455,-0.021996,-0.071422,0.081380,0.089962,0.058278,...,-0.132236,-0.095648,-0.145319,0.016944,0.065491,0.135842,0.147856,-0.020226,0.083475,0.047506
4,APIP,0.145339,-0.417854,-0.843172,0.577196,-1.468350,0.221551,-0.136442,-0.323159,0.976332,...,0.713746,-0.831854,0.236361,-0.814508,-0.009136,-1.053596,-0.009197,-1.010743,-0.467762,0.403568
5,CIAO1,-0.186780,-0.163890,-0.161632,-0.144342,-0.240011,-0.196930,-0.288192,-0.061229,-0.207954,...,-0.167275,-0.312318,-0.209734,-0.118838,-0.177729,-0.205076,-0.173406,-0.281418,-0.190299,-0.091933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8811,FGD4,-0.029756,0.118771,0.031205,0.135610,0.128260,0.038545,0.107212,0.135619,-0.001359,...,0.193069,0.233360,0.043709,0.160643,0.297046,0.034585,0.103630,0.025273,0.104994,0.162066
8812,S1PR5,0.505338,0.391576,-0.219065,-0.239153,0.426485,-0.055620,-0.359028,0.013233,0.000000,...,0.560319,0.031164,-0.026439,0.217259,0.358087,-0.165559,-0.033307,-0.419110,-0.010325,-0.054859
8813,GIGYF2,-0.449736,-0.411664,-0.432484,-0.488819,-0.458550,-0.421006,-0.526388,-0.527799,-0.554216,...,-0.482969,-0.416386,-0.507107,-0.374713,-0.455986,-0.407432,-0.529108,-0.483092,-0.439813,-0.469659
8814,YME1L1,0.462210,0.035486,0.035512,-0.117947,0.494874,-0.018295,-0.081191,-0.002421,-0.047577,...,0.277916,-0.021786,0.104153,0.092986,0.206298,-0.137310,-0.037395,-0.114052,0.053101,-0.077162


### 2.6 Read Genome Variants data

In [18]:
cnv_data = pd.read_csv('./ROSMAP-raw/GenomeVariants/ROSMAP.CNV.Matrix.txt',sep='\t')
cnv_data

Unnamed: 0,CHROM,START,END,NAME,SVTYPE,Quality,SM-CTDSC,SM-CJGMZ,SM-CJFM3,SM-CTEET,...,SM-CTDVK,SM-CTED3,SM-CJK4V,SM-CJK5F,SM-CJIY9,SM-CTDQZ,SM-CJIY3,SM-CTEN7,SM-CJGH9,SM-CTEFP
0,1,766593,769113,mCNV39,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
1,1,776769,791879,mCNV40,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,1,830676,834492,mCNV42,mCNV,Consensus.III,2,2,2,2,...,2,2,1,2,2,2,2,2,2,2
3,1,1139154,1140550,mCNV53,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,1,1238076,1239491,mCNV57,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9897,22,49765259,49767008,mCNV48272,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,1,2,2
9898,22,49780301,49781900,DUP48274,DUP,Consensus.III,3,3,4,3,...,3,4,3,2,4,3,4,3,3,3
9899,22,49840021,49908322,mCNV48276,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
9900,22,51082701,51121100,mCNV48291,mCNV,Consensus.III,2,2,2,2,...,2,2,1,2,2,2,2,2,2,2


In [19]:
ensembl_data = EnsemblRelease()
ensembl_data.download()
ensembl_data.index()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\hemingzhang\AppData\Local\pyensembl\GRCh38\ensembl109\pyensembl\GRCh38\ensembl109\Cache\Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\hemingzhang\AppData\Local\pyensembl\GRCh38\ensembl109\pyensembl\GRCh38\ensembl109\Cache\Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\hemingzhang\AppData\Local\pyensembl\GRCh38\ensembl109\pyensembl\GRCh38\ensembl109\Cache\Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


In [20]:
# use pyensemb to map the gene name
def get_gene_names(chromosome, start, end):
    genes = ensembl_data.genes_at_locus(contig=chromosome, position=start, end=end)
    gene_names = [gene.gene_name for gene in genes]
    return ', '.join(gene_names) if gene_names else None

# Apply the function to each row in the DataFrame
cnv_data['Gene_Names'] = cnv_data.apply(
    lambda row: get_gene_names(str(row['CHROM']), row['START'], row['END']), axis=1
)
cnv_data

Unnamed: 0,CHROM,START,END,NAME,SVTYPE,Quality,SM-CTDSC,SM-CJGMZ,SM-CJFM3,SM-CTEET,...,SM-CTED3,SM-CJK4V,SM-CJK5F,SM-CJIY9,SM-CTDQZ,SM-CJIY3,SM-CTEN7,SM-CJGH9,SM-CTEFP,Gene_Names
0,1,766593,769113,mCNV39,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,", ,"
1,1,776769,791879,mCNV40,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,", LINC01409, ,"
2,1,830676,834492,mCNV42,mCNV,Consensus.III,2,2,2,2,...,2,1,2,2,2,2,2,2,2,LINC01128
3,1,1139154,1140550,mCNV53,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,LINC01342
4,1,1238076,1239491,mCNV57,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9897,22,49765259,49767008,mCNV48272,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,1,2,2,
9898,22,49780301,49781900,DUP48274,DUP,Consensus.III,3,3,4,3,...,4,3,2,4,3,4,3,3,3,BRD1
9899,22,49840021,49908322,mCNV48276,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,"ZBED4, ALG12, , , Metazoa_SRP,"
9900,22,51082701,51121100,mCNV48291,mCNV,Consensus.III,2,2,2,2,...,2,1,2,2,2,2,2,2,2,


In [21]:
# Function to clean gene names
def clean_gene_names(gene_names):
    if gene_names:
        # Split the string by comma, strip whitespace and periods, then filter out empty strings
        genes = [gene.strip(' .') for gene in gene_names.split(',') if gene.strip(' .')]
        # Join the cleaned gene names with a comma
        return ', '.join(genes)
    else:
        # If gene_names is None or empty, return a placeholder
        return 'No gene found'

# Apply the function to clean up the gene names
cnv_data['Gene_Names'] = cnv_data['Gene_Names'].apply(clean_gene_names)
cnv_data

Unnamed: 0,CHROM,START,END,NAME,SVTYPE,Quality,SM-CTDSC,SM-CJGMZ,SM-CJFM3,SM-CTEET,...,SM-CTED3,SM-CJK4V,SM-CJK5F,SM-CJIY9,SM-CTDQZ,SM-CJIY3,SM-CTEN7,SM-CJGH9,SM-CTEFP,Gene_Names
0,1,766593,769113,mCNV39,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,
1,1,776769,791879,mCNV40,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,LINC01409
2,1,830676,834492,mCNV42,mCNV,Consensus.III,2,2,2,2,...,2,1,2,2,2,2,2,2,2,LINC01128
3,1,1139154,1140550,mCNV53,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,LINC01342
4,1,1238076,1239491,mCNV57,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,No gene found
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9897,22,49765259,49767008,mCNV48272,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,1,2,2,No gene found
9898,22,49780301,49781900,DUP48274,DUP,Consensus.III,3,3,4,3,...,4,3,2,4,3,4,3,3,3,BRD1
9899,22,49840021,49908322,mCNV48276,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,"ZBED4, ALG12, Metazoa_SRP"
9900,22,51082701,51121100,mCNV48291,mCNV,Consensus.III,2,2,2,2,...,2,1,2,2,2,2,2,2,2,No gene found


In [22]:
cnv_data['Gene_Names'] = cnv_data['Gene_Names'].str.split(', ')
cnv_data_exploded = cnv_data.explode('Gene_Names')
cnv_data_exploded

Unnamed: 0,CHROM,START,END,NAME,SVTYPE,Quality,SM-CTDSC,SM-CJGMZ,SM-CJFM3,SM-CTEET,...,SM-CTED3,SM-CJK4V,SM-CJK5F,SM-CJIY9,SM-CTDQZ,SM-CJIY3,SM-CTEN7,SM-CJGH9,SM-CTEFP,Gene_Names
0,1,766593,769113,mCNV39,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,
1,1,776769,791879,mCNV40,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,LINC01409
2,1,830676,834492,mCNV42,mCNV,Consensus.III,2,2,2,2,...,2,1,2,2,2,2,2,2,2,LINC01128
3,1,1139154,1140550,mCNV53,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,LINC01342
4,1,1238076,1239491,mCNV57,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,No gene found
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9899,22,49840021,49908322,mCNV48276,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,ZBED4
9899,22,49840021,49908322,mCNV48276,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,ALG12
9899,22,49840021,49908322,mCNV48276,mCNV,Consensus.III,2,2,2,2,...,2,2,2,2,2,2,2,2,2,Metazoa_SRP
9900,22,51082701,51121100,mCNV48291,mCNV,Consensus.III,2,2,2,2,...,2,1,2,2,2,2,2,2,2,No gene found


In [23]:
# Filter out rows with empty 'Gene_Names' or 'No gene found'
cnv_data_exploded = cnv_data_exploded[(cnv_data_exploded['Gene_Names'] != '') & (cnv_data_exploded['Gene_Names'] != 'No gene found')]

sample_columns = cnv_data_exploded.columns[6:-1] 
cnv_aggregated = cnv_data_exploded.groupby(['Gene_Names','SVTYPE'])[sample_columns].sum().reset_index()
# cnv_aggregated = cnv_data_exploded.groupby('Gene_Names')[sample_columns].sum().reset_index()
cnv_aggregated

Unnamed: 0,Gene_Names,SVTYPE,SM-CTDSC,SM-CJGMZ,SM-CJFM3,SM-CTEET,SM-CJK4L,SM-CJIZ4,SM-CJK3I,SM-CJGN4,...,SM-CTDVK,SM-CTED3,SM-CJK4V,SM-CJK5F,SM-CJIY9,SM-CTDQZ,SM-CJIY3,SM-CTEN7,SM-CJGH9,SM-CTEFP
0,5S_rRNA,DEL,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
1,5S_rRNA,DUP,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,A2ML1-AS1,DEL,2,2,2,2,2,2,2,2,...,2,2,1,2,2,2,2,2,2,2
3,A4GALT,mCNV,2,3,2,3,2,3,2,3,...,2,3,2,2,2,2,2,1,3,3
4,AAAS,mCNV,2,2,2,2,2,2,2,2,...,2,2,1,2,2,2,1,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7247,ZSWIM6,DEL,2,2,2,2,2,2,2,2,...,2,2,2,1,2,2,2,2,2,2
7248,ZSWIM9,mCNV,6,6,6,6,6,6,6,5,...,6,6,3,3,5,6,3,6,6,6
7249,ZUP1,mCNV,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
7250,ZZZ3,DEL,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [24]:
cnv_aggregated_del = cnv_aggregated[cnv_aggregated['SVTYPE'] == 'DEL']
cnv_aggregated_dup = cnv_aggregated[cnv_aggregated['SVTYPE'] == 'DUP']
cnv_aggregated_mcnv = cnv_aggregated[cnv_aggregated['SVTYPE'] == 'mCNV']
print(f"Shape of cnv_aggregated_del: {cnv_aggregated_del.shape}")
print(f"Shape of cnv_aggregated_dup: {cnv_aggregated_dup.shape}")
print(f"Shape of cnv_aggregated_mcnv: {cnv_aggregated_mcnv.shape}")

Shape of cnv_aggregated_del: (2466, 1129)
Shape of cnv_aggregated_dup: (1082, 1129)
Shape of cnv_aggregated_mcnv: (3704, 1129)


In [25]:
# Create a DataFrame from the union set of all gene names
cnv_all_genes = pd.DataFrame({'Gene_Names': list(set(cnv_aggregated_del['Gene_Names']).union(set(cnv_aggregated_dup['Gene_Names']), set(cnv_aggregated_mcnv['Gene_Names'])))})

# Merge each aggregated DataFrame with the all genes DataFrame
# This will align all genes across all DataFrames, and fill missing entries with 0
cnv_aggregated_del = pd.merge(cnv_all_genes, cnv_aggregated_del, on='Gene_Names', how='outer').fillna(0)
cnv_aggregated_dup = pd.merge(cnv_all_genes, cnv_aggregated_dup, on='Gene_Names', how='outer').fillna(0)
cnv_aggregated_mcnv = pd.merge(cnv_all_genes, cnv_aggregated_mcnv, on='Gene_Names', how='outer').fillna(0)

# Drop the 'SVTYPE' column from each merged DataFrame
cnv_aggregated_del.drop(columns='SVTYPE', inplace=True)
cnv_aggregated_dup.drop(columns='SVTYPE', inplace=True)
cnv_aggregated_mcnv.drop(columns='SVTYPE', inplace=True)
print(f"Shape of cnv_aggregated_del: {cnv_aggregated_del.shape}")
print(f"Shape of cnv_aggregated_dup: {cnv_aggregated_dup.shape}")
print(f"Shape of cnv_aggregated_mcnv: {cnv_aggregated_mcnv.shape}")

Shape of cnv_aggregated_del: (6373, 1128)
Shape of cnv_aggregated_dup: (6373, 1128)
Shape of cnv_aggregated_mcnv: (6373, 1128)


In [26]:
cnv_aggregated_del

Unnamed: 0,Gene_Names,SM-CTDSC,SM-CJGMZ,SM-CJFM3,SM-CTEET,SM-CJK4L,SM-CJIZ4,SM-CJK3I,SM-CJGN4,SM-CTDSM,...,SM-CTDVK,SM-CTED3,SM-CJK4V,SM-CJK5F,SM-CJIY9,SM-CTDQZ,SM-CJIY3,SM-CTEN7,SM-CJGH9,SM-CTEFP
0,KDM2A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,WDR18,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
2,CHIAP2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SETD4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CABP5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6368,OPCML,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6369,AP4E1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6370,APBB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6371,MIR181D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# cnv_aggregated.iloc[:, 1:] = cnv_aggregated.iloc[:, 1:].where(cnv_aggregated.iloc[:, 1:] == 0, 1)
# cnv_aggregated

In [28]:
# snphead = pd.read_csv('./ROSMAP-raw/GenomeVariants/SNP-Array-raw/AMP-AD_ROSMAP_Rush-Broad_AffymetrixGenechip6_Imputed.fam', sep='\t',header=None)
# snphead

In [29]:
# survival_gwas_id = pd.DataFrame(survival.iloc[:, 1].astype(str) + survival.iloc[:, 0].astype(str), columns=['gwas_id'])
# survival_gwas_id['individualID'] = survival['individualID']

# # Displaying the new DataFrame
# survival_gwas_id

In [30]:
# gwas_id_list = survival_gwas_id['gwas_id'].tolist()

# snphead['extracted_gwas_id'] = snphead.iloc[:, 0].apply(
#     lambda x: next((gwas_id for gwas_id in gwas_id_list if gwas_id in x), np.nan)
# )

# # Perform a left merge with 'survival_gwas_id' on the extracted 'gwas_id' to keep all rows from 'snphead'
# # and to get the corresponding 'individualID' where there is a match
# merged_result = pd.merge(
#     snphead, 
#     survival_gwas_id, 
#     how='left', 
#     left_on='extracted_gwas_id', 
#     right_on='gwas_id'
# )

# # Select only the 'gwas_id' and 'individualID' columns for the final result
# # Fill missing values with 'NA' to indicate unmatched rows
# snphead_updated = merged_result[['gwas_id', 'individualID']].fillna('NA')

# # Display the head of the final DataFrame
# snphead_updated

In [31]:
# snphead_updated_withoutNA = snphead_updated[snphead_updated.iloc[:, 0] != "NA"]
# snphead_updated_withoutNA

## 3.Methylation data process

### 3.1 Define methylation region

In [32]:
import pandas as pd
import numpy as np

#Define vectorized area determination function for methylation data
def vectorized_determine_region(distances):
    regions = ['Upstream', 'Distal Promoter', 'Proximal Promoter', 'Core Promoter', 'Downstream']
    conditions = [
        (-6000 <= distances) & (distances < -3000),
        (-3000 <= distances) & (distances < -250),
        (-250 <= distances) & (distances < -50),
        (-50 <= distances) & (distances <= 0),
        (0 < distances) & (distances <= 3000)
    ]
    return np.select(conditions, regions, default=None)

### 3.2 Merge the annotation files to the methylation data and apply region function

In [33]:
# Merging basic data and methylation data
methylation_merged_df = pd.merge(map_annotation, methylation_value, left_on='ID', right_on='TargetID', how='right')

# Determining the region for each row outside the loop
methylation_merged_df['Region'] = vectorized_determine_region(methylation_merged_df['Distance_closest_TSS'])

methylation_merged_df = methylation_merged_df.dropna(subset=['Region'])  # Remove rows without a region

# Initializing a dictionary to store data for each region
regions_data = {region: pd.DataFrame() for region in ["Upstream", "Distal Promoter", "Proximal Promoter", "Core Promoter", "Downstream"]}

In [34]:
methylation_merged_df

Unnamed: 0,ID,Closest_TSS,Closest_TSS_gene_name,Distance_closest_TSS,TargetID,TBI-AUTO73325-PT-3149,PT-BZHL,PT-BZCH,PT-BY9H,TBI-AUTO73307-PT-314I,...,TBI-AUTO73291-PT-35OD,PT-BZD7,PT-BZG8,PT-C1N8,PT-BYJP,PT-BZHV,PT-BZI2,TBI-AUTO73257-PT-35OE,PT-BZD3,Region
3,cg00001349,166958518,MAEL,-79.0,cg00001349,0.865488,0.919570,0.882256,0.902912,0.907610,...,0.900684,0.906257,0.756173,0.948341,0.904996,0.927201,0.856637,0.917827,0.943320,Proximal Promoter
5,cg00001446,43833698,ELOVL1,2658.0,cg00001446,0.842025,0.831457,0.835075,0.834157,0.839395,...,0.847344,0.825179,0.887574,0.881636,0.852642,0.833056,0.842108,0.841747,0.833336,Downstream
7,cg00001583,200011716,NR5A2,70.0,cg00001583,0.099888,0.076240,0.075861,0.101837,0.071947,...,0.084641,0.092234,0.066651,0.040554,0.059607,0.116837,0.069391,0.061016,0.035535,Downstream
8,cg00001593,170489886,AK096329,-547.0,cg00001593,0.918081,0.955305,0.896318,0.941423,0.940628,...,0.918949,0.929937,0.952355,0.968486,0.972349,0.944855,0.924879,0.885508,0.939249,Distal Promoter
9,cg00002028,20959947,PINK1,63.0,cg00002028,0.026821,0.035839,0.020016,0.014101,0.031390,...,0.021462,0.033637,0.024296,0.033715,0.019535,0.011352,0.020888,0.035047,0.026995,Downstream
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420108,cg27652464,48885287,FAM19A5,1488.0,cg27652464,0.077791,0.081391,0.066050,0.076101,0.081930,...,0.082855,0.078940,0.033035,0.029362,0.041915,0.059740,0.060660,0.040806,0.044582,Downstream
420111,cg27657537,20862338,MED15,1424.0,cg27657537,0.246730,0.273854,0.250933,0.297172,0.256656,...,0.296850,0.295459,0.266729,0.301941,0.265106,0.263730,0.268427,0.252280,0.243976,Downstream
420112,cg27662611,38599026,MAFF,-45.0,cg27662611,0.091869,0.098040,0.066290,0.094694,0.069012,...,0.080595,0.067712,0.072670,0.069177,0.084736,0.071235,0.072670,0.083982,0.062013,Core Promoter
420113,cg27665648,30116343,CABP7,-3940.0,cg27665648,0.680242,0.683700,0.662226,0.711880,0.699262,...,0.606769,0.694081,0.700518,0.710467,0.697145,0.675443,0.671625,0.682511,0.718816,Upstream


In [35]:
# Delete the 'sample' column
methylation_merged_df = methylation_merged_df.drop('TargetID', axis=1)
# Delete the 'ID' column
methylation_merged_df = methylation_merged_df.drop('ID', axis=1)
# Delete the 'Distance_closest_TSS' column
methylation_merged_df = methylation_merged_df.drop('Distance_closest_TSS', axis=1)

In [36]:
methylation_merged_df

Unnamed: 0,Closest_TSS,Closest_TSS_gene_name,TBI-AUTO73325-PT-3149,PT-BZHL,PT-BZCH,PT-BY9H,TBI-AUTO73307-PT-314I,TBI-AUTO73043-PT-35BD,PT-BZI5,PT-BZ1A,...,TBI-AUTO73291-PT-35OD,PT-BZD7,PT-BZG8,PT-C1N8,PT-BYJP,PT-BZHV,PT-BZI2,TBI-AUTO73257-PT-35OE,PT-BZD3,Region
3,166958518,MAEL,0.865488,0.919570,0.882256,0.902912,0.907610,0.897496,0.939212,0.919514,...,0.900684,0.906257,0.756173,0.948341,0.904996,0.927201,0.856637,0.917827,0.943320,Proximal Promoter
5,43833698,ELOVL1,0.842025,0.831457,0.835075,0.834157,0.839395,0.842799,0.850811,0.842074,...,0.847344,0.825179,0.887574,0.881636,0.852642,0.833056,0.842108,0.841747,0.833336,Downstream
7,200011716,NR5A2,0.099888,0.076240,0.075861,0.101837,0.071947,0.036712,0.041233,0.083998,...,0.084641,0.092234,0.066651,0.040554,0.059607,0.116837,0.069391,0.061016,0.035535,Downstream
8,170489886,AK096329,0.918081,0.955305,0.896318,0.941423,0.940628,0.930346,0.944459,0.977162,...,0.918949,0.929937,0.952355,0.968486,0.972349,0.944855,0.924879,0.885508,0.939249,Distal Promoter
9,20959947,PINK1,0.026821,0.035839,0.020016,0.014101,0.031390,0.022777,0.031833,0.014578,...,0.021462,0.033637,0.024296,0.033715,0.019535,0.011352,0.020888,0.035047,0.026995,Downstream
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420108,48885287,FAM19A5,0.077791,0.081391,0.066050,0.076101,0.081930,0.035654,0.070314,0.065970,...,0.082855,0.078940,0.033035,0.029362,0.041915,0.059740,0.060660,0.040806,0.044582,Downstream
420111,20862338,MED15,0.246730,0.273854,0.250933,0.297172,0.256656,0.273096,0.278133,0.267989,...,0.296850,0.295459,0.266729,0.301941,0.265106,0.263730,0.268427,0.252280,0.243976,Downstream
420112,38599026,MAFF,0.091869,0.098040,0.066290,0.094694,0.069012,0.100561,0.103787,0.059369,...,0.080595,0.067712,0.072670,0.069177,0.084736,0.071235,0.072670,0.083982,0.062013,Core Promoter
420113,30116343,CABP7,0.680242,0.683700,0.662226,0.711880,0.699262,0.738775,0.720822,0.667222,...,0.606769,0.694081,0.700518,0.710467,0.697145,0.675443,0.671625,0.682511,0.718816,Upstream


In [37]:
methylation_merged_df['Closest_TSS'] = methylation_merged_df['Closest_TSS'].astype(int)
methylation_merged_df['Closest_TSS_gene_name'] = methylation_merged_df['Closest_TSS_gene_name'].astype(str)
methylation_merged_df['Region'] = methylation_merged_df['Region'].astype(str)

In [38]:
print(methylation_merged_df[['Closest_TSS', 'Closest_TSS_gene_name', 'Region']].dtypes)

Closest_TSS               int32
Closest_TSS_gene_name    object
Region                   object
dtype: object


### 3.3 Calculate the average methylation value of five regions

In [39]:
# obtain all regions
regions = methylation_merged_df['Region'].unique()
regions

array(['Proximal Promoter', 'Downstream', 'Distal Promoter',
       'Core Promoter', 'Upstream'], dtype=object)

In [40]:
import pandas as pd
# Initialize empty DataFrames for each region
Upstream_df = pd.DataFrame()
Distal_Promoter_df = pd.DataFrame()
Proximal_Promoter_df = pd.DataFrame()
Core_Promoter_df = pd.DataFrame()
Downstream_df = pd.DataFrame()

# Operate on each region
for region in regions:
    # Get all data for this region
    region_data = methylation_merged_df[methylation_merged_df['Region'] == region]
    
    # Group and calculate the average for each (TSS, Region) combination
    grouped = region_data.groupby(['Closest_TSS_gene_name', 'Region'], as_index=False).mean()
    
    # Since we split the data into different files based on Region, we can delete this column
    grouped = grouped.drop(columns=['Region'])
    
    # Print the shape of the grouped data
    print(f"Shape of {region}: {grouped.shape}")
    
    # Assign the grouped data to the respective DataFrame
    if region == 'Upstream':
        Upstream_df = grouped
    elif region == 'Distal Promoter':
        Distal_Promoter_df = grouped
    elif region == 'Proximal Promoter':
        Proximal_Promoter_df = grouped
    elif region == 'Core Promoter':
        Core_Promoter_df = grouped
    elif region == 'Downstream':
        Downstream_df = grouped

    # Optionally, save the data for this region to a new csv file
    # grouped.to_csv(f"{region}_averaged_tss_data.csv", index=False)


Shape of Proximal Promoter: (15812, 742)
Shape of Downstream: (20578, 742)
Shape of Distal Promoter: (18239, 742)
Shape of Core Promoter: (9871, 742)
Shape of Upstream: (7128, 742)


### 3.4 Unify gene and TSS for five methylation value files

In [41]:
import pandas as pd

# from methylation files above DataFrame 
dfs = [Upstream_df, Distal_Promoter_df, Proximal_Promoter_df, Core_Promoter_df, Downstream_df]

# merge those files to find all combos 
all_genes_tss = pd.concat(dfs)['Closest_TSS_gene_name'].drop_duplicates()

In [42]:
all_genes_tss

0            1/2-SBSRNA4
1                  40969
2                  40970
3                  40971
4                  40972
              ...       
20460             ZNF791
20498             ZNF862
20507              ZNF91
20529             ZRANB2
20573    hsa-miR-3194-3p
Name: Closest_TSS_gene_name, Length: 23281, dtype: object

In [43]:
# Merge unique combinations back into each DataFrame and fill NaN values with 0
# Upstream
Upstream_df = pd.merge(all_genes_tss, Upstream_df, on=['Closest_TSS_gene_name'], how='outer').fillna(0)
print(f"Shape of Upstream_df: {Upstream_df.shape}")

# Distal Promoter
Distal_Promoter_df = pd.merge(all_genes_tss, Distal_Promoter_df, on=['Closest_TSS_gene_name'], how='outer').fillna(0)
print(f"Shape of Distal_Promoter_df: {Distal_Promoter_df.shape}")

# Proximal Promoter
Proximal_Promoter_df = pd.merge(all_genes_tss, Proximal_Promoter_df, on=['Closest_TSS_gene_name'], how='outer').fillna(0)
print(f"Shape of Proximal_Promoter_df: {Proximal_Promoter_df.shape}")

# Core Promoter
Core_Promoter_df = pd.merge(all_genes_tss, Core_Promoter_df, on=['Closest_TSS_gene_name'], how='outer').fillna(0)
print(f"Shape of Core_Promoter_df: {Core_Promoter_df.shape}")

# Downstream
Downstream_df = pd.merge(all_genes_tss, Downstream_df, on=['Closest_TSS_gene_name'], how='outer').fillna(0)
print(f"Shape of Downstream_df: {Downstream_df.shape}")

Shape of Upstream_df: (23281, 742)
Shape of Distal_Promoter_df: (23281, 742)
Shape of Proximal_Promoter_df: (23281, 742)
Shape of Core_Promoter_df: (23281, 742)
Shape of Downstream_df: (23281, 742)


## 4.Unify genes and patient samples within datasets

### 4.1 Unify gene and TSS for methylation, copynumer, and gene expression data

In [44]:
gene_expression

Unnamed: 0,gene_name,525_120515_0,383_120503_0,93_120417_0,610_120523_0,560_120517_0,492_120515_0,576_120521_0,150_120419_0,416_120503_0,...,831_130725_8,901_131010_8,894_130923_8,938_131101_8,942_131101_8,939_131101_8,895_130923_8,829_130725_8,944_131107_8,775_130528_8
0,RAB4B,60.84,65.45,69.18,51.60,48.61,55.65,51.18,60.63,62.38,...,44.85,45.02,33.71,38.30,28.27,51.02,31.87,37.14,41.77,44.01
1,LINC02082,0.08,0.05,0.08,0.08,0.10,0.08,0.06,0.11,0.05,...,0.00,0.10,0.20,0.00,0.00,0.00,0.00,0.00,0.09,0.10
2,TIGAR,4.39,4.49,2.51,2.90,2.67,5.50,3.36,3.43,2.72,...,8.15,4.05,6.06,6.03,4.34,5.16,2.70,5.00,3.82,3.69
3,MIR4802,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,NUP210P2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55884,MIR548V,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
55885,,0.20,0.20,0.24,0.21,0.09,0.14,0.42,0.26,0.22,...,0.22,0.11,0.18,0.24,0.14,0.28,0.24,0.15,0.23,0.15
55886,PPP6R1,40.59,39.26,34.70,36.74,39.85,35.28,24.78,31.14,40.02,...,28.82,24.08,24.10,24.81,27.30,22.24,20.08,23.68,17.91,26.37
55887,BATF3,4.46,4.51,5.27,5.11,3.77,4.31,4.63,4.40,5.49,...,3.03,5.06,2.05,4.01,2.89,2.69,3.42,4.25,5.46,3.65


In [45]:
Upstream_df

Unnamed: 0,Closest_TSS_gene_name,Closest_TSS,TBI-AUTO73325-PT-3149,PT-BZHL,PT-BZCH,PT-BY9H,TBI-AUTO73307-PT-314I,TBI-AUTO73043-PT-35BD,PT-BZI5,PT-BZ1A,...,TBI-AUTO72955-PT-35OC,TBI-AUTO73291-PT-35OD,PT-BZD7,PT-BZG8,PT-C1N8,PT-BYJP,PT-BZHV,PT-BZI2,TBI-AUTO73257-PT-35OE,PT-BZD3
0,1/2-SBSRNA4,110354972.0,0.931928,0.922927,0.921658,0.934680,0.950280,0.915567,0.895290,0.942072,...,0.929474,0.932108,0.909341,0.921341,0.930176,0.916491,0.913546,0.936391,0.905927,0.942917
1,40969,220960038.0,0.718230,0.698030,0.712573,0.705048,0.734309,0.771367,0.782897,0.704986,...,0.766211,0.675562,0.720371,0.689035,0.618610,0.764157,0.576607,0.754224,0.745078,0.710471
2,40970,220921675.0,0.868322,0.845299,0.849537,0.892048,0.881078,0.867949,0.886519,0.902767,...,0.874877,0.851768,0.870165,0.878390,0.893037,0.901645,0.876906,0.870745,0.852633,0.877366
3,40971,126366439.0,0.914631,0.890200,0.893712,0.904816,0.934809,0.901929,0.918363,0.889949,...,0.907466,0.905140,0.898941,0.857999,0.960562,0.964220,0.896651,0.901460,0.907702,0.936499
4,40972,217236749.0,0.689192,0.726004,0.691986,0.669473,0.690793,0.725609,0.711018,0.717188,...,0.822901,0.727588,0.706725,0.726323,0.695048,0.800269,0.685743,0.758808,0.726784,0.716149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23276,ZNF791,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
23277,ZNF862,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
23278,ZNF91,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
23279,ZRANB2,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [46]:
# Convert the gene name columns from each DataFrame to sets
gene_expression_genes = set(gene_expression['gene_name'])
methylation_genes = set(Upstream_df['Closest_TSS_gene_name'])
protein_genes = set(protein['gene_name'])
cnv_aggregated_genes = set(cnv_aggregated_del['Gene_Names'])
# Find the intersection of these sets
common_genes =  gene_expression_genes & methylation_genes & protein_genes & cnv_aggregated_genes

# Convert the intersection back to a list, if needed
common_genes_list = list(common_genes)

# Print the number of common genes
print(f"Number of common genes: {len(common_genes)}")

Number of common genes: 1748


In [47]:
# common_genes_left = common_genes - common_genes_cnv_with_others
# print(f"Number of common genes: {len(common_genes_left)}")

In [48]:
# cnv_aggregated_filtered = pd.DataFrame(columns=cnv_aggregated.columns)

# for gene in common_genes_cnv_with_others:
#     if gene in cnv_aggregated['Gene_Names'].values:
#         gene_row = cnv_aggregated[cnv_aggregated['Gene_Names'] == gene]
#         cnv_aggregated_filtered = pd.concat([cnv_aggregated_filtered, gene_row])

# for gene in common_genes_left:
#     zero_data = [0] * (len(cnv_aggregated.columns) - 1)
#     zero_row = pd.DataFrame([[gene] + zero_data], columns=cnv_aggregated.columns)
#     cnv_aggregated_filtered = pd.concat([cnv_aggregated_filtered, zero_row])

# cnv_aggregated_filtered.reset_index(drop=True, inplace=True)

In [49]:
# cnv_aggregated_filtered

#### 4.1.1  Intersecting genes with various databases

In [50]:
import pandas as pd
# Add the gene names from databases like [KEGG / BioGRID] to intersect with the common genes
# KEGG
kegg_pathway_df = pd.read_csv('./Regulatory-network-data/KEGG/full_kegg_pathway_list.csv')
kegg_pathway_df = kegg_pathway_df[['source', 'target', 'pathway_name']]
kegg_df = kegg_pathway_df[kegg_pathway_df['pathway_name'].str.contains('signaling pathway|signaling pathways', case=False)]
print(kegg_df['pathway_name'].value_counts())
kegg_df = kegg_df.rename(columns={'source': 'src', 'target': 'dest'})
src_list = list(kegg_df['src'])
dest_list = list(kegg_df['dest'])
path_list = list(kegg_df['pathway_name'])
# ADJUST ALL GENES TO UPPERCASE
up_src_list = []
for src in src_list:
    up_src = src.upper()
    up_src_list.append(up_src)
up_dest_list = []
for dest in dest_list:
    up_dest = dest.upper()
    up_dest_list.append(up_dest)
up_kegg_conn_dict = {'src': up_src_list, 'dest': up_dest_list}
up_kegg_df = pd.DataFrame(up_kegg_conn_dict)
up_kegg_df = up_kegg_df.drop_duplicates()
up_kegg_df.to_csv('./Regulatory-network-data/KEGG/up_kegg.csv', index=False, header=True)
kegg_gene_list = list(set(list(up_kegg_df['src']) + list(up_kegg_df['dest'])))
print('----- NUMBER OF GENES IN KEGG: ' + str(len(kegg_gene_list)) + ' -----')
print(up_kegg_df.shape)

up_kegg_path_conn_dict = {'src': up_src_list, 'dest': up_dest_list, 'path': path_list}
up_kegg_path_df = pd.DataFrame(up_kegg_path_conn_dict)
up_kegg_path_df = up_kegg_path_df.drop_duplicates()
up_kegg_path_df.to_csv('./Regulatory-network-data/KEGG/up_kegg_path.csv', index=False, header=True)
kegg_path_gene_list = list(set(list(up_kegg_path_df['src']) + list(up_kegg_path_df['dest'])))
print('----- NUMBER OF GENES IN KEGG PATH: ' + str(len(kegg_path_gene_list)) + ' -----')
print(up_kegg_path_df.shape)

PI3K-Akt signaling pathway                                  3992
JAK-STAT signaling pathway                                  3280
Chemokine signaling pathway                                 2766
MAPK signaling pathway                                      2002
cAMP signaling pathway                                      1680
Ras signaling pathway                                       1640
Rap1 signaling pathway                                      1304
Calcium signaling pathway                                   1260
Apelin signaling pathway                                    1011
Wnt signaling pathway                                        935
mTOR signaling pathway                                       836
Hippo signaling pathway                                      830
Insulin signaling pathway                                    721
Glucagon signaling pathway                                   707
Relaxin signaling pathway                                    684
Phospholipase D signaling

In [51]:
# BioGRID
biogrid_df = pd.read_table('./Regulatory-network-data/BioGrid/BIOGRID-ALL-3.5.174.mitab.Symbol.txt', delimiter = '\t')
eh_list = list(biogrid_df['e_h'])
et_list = list(biogrid_df['e_t'])
# ADJUST ALL GENES TO UPPERCASE
up_eh_list = []
for eh in eh_list:
    up_eh = eh.upper()
    up_eh_list.append(up_eh)
up_et_list = []
for et in et_list:
    up_et = et.upper()
    up_et_list.append(up_et)
up_biogrid_conn_dict = {'src': up_eh_list, 'dest': up_et_list}
up_biogrid_df = pd.DataFrame(up_biogrid_conn_dict)
print(up_biogrid_df)
print(up_biogrid_df.shape)
up_biogrid_df.to_csv('./Regulatory-network-data/BioGrid/up_biogrid.csv', index = False, header = True)
up_biogrid_gene_list = list(set(list(up_biogrid_df['src']) + list(up_biogrid_df['dest'])))
print('----- NUMBER OF GENES IN BioGRID: ' + str(len(up_biogrid_gene_list)) + ' -----')

           src    dest
0       MAP2K4    FLNC
1         MYPN   ACTN2
2        ACVR1    FNTA
3        GATA2     PML
4         RPA2   STAT3
...        ...     ...
472638   USP18  SAMHD1
472639   USP18    SKP2
472640  SAMHD1   USP18
472641  SAMHD1   CCNA2
472642  SAMHD1    CDK1

[472643 rows x 2 columns]
(472643, 2)
----- NUMBER OF GENES IN BioGRID: 19349 -----


In [52]:
# STRING
string_df = pd.read_csv('./Regulatory-network-data/STRING/9606.protein.links.detailed.v11.0_sym.csv', low_memory=False)
src_list = list(string_df['Source'])
tar_list = list(string_df['Target'])
# ADJUST ALL GENES TO UPPERCASE
up_src_list = []
for src in src_list:
    up_src = src.upper()
    up_src_list.append(up_src)
up_tar_list = []
for tar in tar_list:
    up_tar = tar.upper()
    up_tar_list.append(up_tar)
up_string_conn_dict = {'src': up_src_list, 'dest': up_tar_list}
up_string_df = pd.DataFrame(up_string_conn_dict)
print(up_string_df)
up_string_df.to_csv('./Regulatory-network-data/STRING/up_string.csv', index = False, header = True)
up_string_gene_list = list(set(list(up_string_df['src']) + list(up_string_df['dest'])))
print('----- NUMBER OF GENES IN STRING: ' + str(len(up_string_gene_list)) + ' -----')

          src    dest
0        ARF5  SPTBN2
1        ARF5  KIF13B
2        ARF5   AP1B1
3        ARF5  KIF21A
4        ARF5   TMED7
...       ...     ...
841063  OR6Q1   REEP1
841064  OR6Q1   REEP4
841065  OR6Q1    GNB1
841066  OR6Q1    RTP3
841067  OR6Q1   REEP2

[841068 rows x 2 columns]
----- NUMBER OF GENES IN STRING: 17179 -----


In [53]:
# intersect the [common genes] with the genes in the different databases [KEGG / BioGRID / STRING]
selected_database = 'KEGG'
# selected_database = 'BioGRID'
# selected_database = 'STRING'
if selected_database == 'KEGG':
    edge_common_genes = list(set(common_genes) & set(kegg_gene_list))
    print('----- NUMBER OF INTERSECTED GENES IN KEGG: ' + str(len(edge_common_genes)) + ' -----')
elif selected_database == 'BioGRID':
    edge_common_genes = list(set(common_genes) & set(up_biogrid_gene_list))
    print('----- NUMBER OF INTERSECTED GENES IN BioGRID: ' + str(len(edge_common_genes)) + ' -----')
elif selected_database == 'STRING':
    edge_common_genes = list(set(common_genes) & set(up_string_gene_list))
    print('----- NUMBER OF INTERSECTED GENES IN STRING: ' + str(len(edge_common_genes)) + ' -----')

# filter the genes in the different databases [KEGG / BioGRID / STRING] with the [common genes]
if selected_database == 'KEGG':
    filtered_up_kegg_df = up_kegg_df[up_kegg_df['src'].isin(edge_common_genes) & up_kegg_df['dest'].isin(edge_common_genes)]
    filtered_up_kegg_df = filtered_up_kegg_df.drop_duplicates()
    filtered_up_kegg_df = filtered_up_kegg_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
    print('----- NEW KEGG EDGE CONNECTIONS: ' + str(len(filtered_up_kegg_df)) + ' -----')
    filtered_up_kegg_path_df = up_kegg_path_df[up_kegg_path_df['src'].isin(edge_common_genes) & up_kegg_path_df['dest'].isin(edge_common_genes)]
    filtered_up_kegg_path_df = filtered_up_kegg_path_df.drop_duplicates()
    filtered_up_kegg_path_df = filtered_up_kegg_path_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
    print('----- NEW KEGG PATHWAY CONNECTIONS: ' + str(len(filtered_up_kegg_path_df)) + ' -----')
elif selected_database == 'BioGRID':
    filtered_up_biogrid_df = up_biogrid_df[up_biogrid_df['src'].isin(edge_common_genes) & up_biogrid_df['dest'].isin(edge_common_genes)]
    filtered_up_biogrid_df = filtered_up_biogrid_df.drop_duplicates()
    filtered_up_biogrid_df = filtered_up_biogrid_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
    print('----- NEW BioGRID EDGE CONNECTIONS: ' + str(len(filtered_up_biogrid_df)) + ' -----')
elif selected_database == 'STRING':
    filtered_up_string_df = up_string_df[up_string_df['src'].isin(edge_common_genes) & up_string_df['dest'].isin(edge_common_genes)]
    filtered_up_string_df = filtered_up_string_df.drop_duplicates()
    filtered_up_string_df = filtered_up_string_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
    print('----- NEW STRING EDGE CONNECTIONS: ' + str(len(filtered_up_string_df)) + ' -----')

----- NUMBER OF INTERSECTED GENES IN KEGG: 282 -----
----- NEW KEGG EDGE CONNECTIONS: 521 -----
----- NEW KEGG PATHWAY CONNECTIONS: 812 -----


In [54]:
if selected_database == 'KEGG':
    display(filtered_up_kegg_df)
    display(filtered_up_kegg_path_df)
elif selected_database == 'BioGRID':
    display(filtered_up_biogrid_df)
elif selected_database == 'STRING':
    display(filtered_up_string_df)

Unnamed: 0,src,dest
0,ADRA1B,GNA11
1,ADRA1B,GNAQ
2,ADRA2A,GNA11
3,ADRA2A,GNAQ
4,AKT2,FOXO3
...,...,...
516,WDR24,NPRL3
517,WDR59,DEPDC5
518,WDR59,NPRL2
519,WDR59,NPRL3


Unnamed: 0,src,dest,path
0,ADRA1B,GNA11,Calcium signaling pathway
1,ADRA1B,GNA11,cGMP-PKG signaling pathway
2,ADRA1B,GNAQ,Calcium signaling pathway
3,ADRA1B,GNAQ,cGMP-PKG signaling pathway
4,ADRA2A,GNA11,cGMP-PKG signaling pathway
...,...,...,...
807,WDR24,NPRL3,mTOR signaling pathway
808,WDR59,DEPDC5,mTOR signaling pathway
809,WDR59,NPRL2,mTOR signaling pathway
810,WDR59,NPRL3,mTOR signaling pathway


In [55]:
# select common genes in gene expression data
gene_expression_filtered = gene_expression.loc[gene_expression['gene_name'].isin(edge_common_genes)]

In [56]:
gene_expression_filtered

Unnamed: 0,gene_name,525_120515_0,383_120503_0,93_120417_0,610_120523_0,560_120517_0,492_120515_0,576_120521_0,150_120419_0,416_120503_0,...,831_130725_8,901_131010_8,894_130923_8,938_131101_8,942_131101_8,939_131101_8,895_130923_8,829_130725_8,944_131107_8,775_130528_8
610,THEM4,6.37,9.40,9.82,8.40,9.24,9.70,4.52,9.43,10.28,...,13.35,8.95,11.83,12.09,10.10,11.38,8.13,12.56,12.27,12.38
946,ADCY2,42.26,23.32,17.63,36.59,29.92,35.56,51.30,36.02,30.43,...,36.96,30.92,25.82,54.58,39.21,22.54,41.60,29.79,14.29,21.16
958,PDE1C,4.39,5.01,3.33,4.82,4.89,7.75,6.75,6.05,3.35,...,7.13,8.09,9.04,6.49,12.12,5.60,5.39,4.44,8.88,12.15
1162,TRIM25,3.88,5.28,3.06,3.64,3.25,4.07,4.15,4.03,3.71,...,4.08,3.70,4.18,9.95,4.26,5.20,4.67,3.77,3.93,3.67
1413,TBC1D4,3.08,3.24,3.14,3.06,3.63,4.79,3.15,4.67,3.82,...,7.01,4.78,5.42,5.51,5.37,6.43,5.59,5.50,4.84,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54889,ADCY9,6.51,5.20,4.27,4.07,5.21,6.63,5.66,5.52,5.29,...,5.86,5.17,4.53,4.46,4.78,4.14,3.66,4.77,3.41,4.37
55240,LLGL2,3.16,2.48,4.24,3.32,4.56,1.89,4.64,6.77,3.62,...,2.71,1.37,2.61,1.98,4.22,1.29,3.12,2.24,3.08,4.78
55245,MAP3K7,7.99,7.45,5.54,6.31,5.08,9.15,10.64,10.27,6.56,...,15.50,13.99,12.03,12.85,11.64,10.95,7.73,9.61,10.55,11.21
55529,EIF4B,47.77,28.31,25.03,30.25,27.86,58.48,46.89,44.14,35.55,...,55.87,50.26,45.64,57.85,37.17,45.64,34.89,41.26,36.48,39.63


In [57]:
gene_expression_filtered['gene_name'].nunique()

282

In [58]:
gene_expression_filtered = gene_expression_filtered.groupby('gene_name', as_index=False).mean()
gene_expression_filtered

Unnamed: 0,gene_name,525_120515_0,383_120503_0,93_120417_0,610_120523_0,560_120517_0,492_120515_0,576_120521_0,150_120419_0,416_120503_0,...,831_130725_8,901_131010_8,894_130923_8,938_131101_8,942_131101_8,939_131101_8,895_130923_8,829_130725_8,944_131107_8,775_130528_8
0,ABL1,8.84,8.55,7.78,10.04,11.92,7.72,11.81,9.18,8.44,...,7.36,6.76,6.97,7.13,9.99,6.59,8.27,6.87,8.75,10.48
1,ABL2,5.48,4.72,3.95,4.09,4.91,4.87,4.78,5.79,5.07,...,6.69,6.70,6.26,7.27,7.20,5.98,5.14,6.71,8.11,6.38
2,ACOX1,10.02,6.91,9.08,9.37,8.00,9.26,14.77,9.13,9.37,...,11.73,12.51,10.69,11.77,10.50,10.50,11.50,9.95,8.15,8.01
3,ACOX2,1.63,2.78,2.44,3.64,3.30,1.89,2.07,2.42,2.65,...,2.05,1.92,3.03,2.96,2.94,3.17,4.76,3.02,3.76,2.00
4,ACSBG1,36.44,16.49,24.62,35.44,31.74,20.34,44.74,16.83,24.71,...,25.98,23.13,16.32,24.88,29.64,17.70,24.50,18.67,11.92,17.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,7.99,7.99,6.13,6.01,5.10,11.97,7.02,8.52,6.15,...,15.98,13.82,10.38,13.19,8.05,8.82,5.99,10.45,7.67,10.05
278,VAV3,1.20,1.18,0.98,0.95,1.53,1.77,0.97,1.15,0.79,...,2.35,1.81,1.56,1.54,3.48,2.55,1.75,1.17,1.29,1.14
279,WDR24,9.01,9.23,6.83,6.92,7.59,7.81,6.56,8.96,9.14,...,5.51,6.09,4.52,4.89,5.01,6.05,3.45,5.09,5.14,4.68
280,WDR59,18.20,14.75,15.50,18.52,17.34,14.70,20.13,21.20,18.53,...,16.46,17.07,21.96,17.79,25.63,17.70,19.37,17.71,23.08,19.77


In [59]:
protein_filtered = protein.loc[protein['gene_name'].isin(edge_common_genes)]
protein_filtered

Unnamed: 0,gene_name,b01.127C,b01.127N,b01.128C,b01.128N,b01.129C,b01.129N,b01.130C,b01.130N,b02.127C,...,b49.130C,b49.130N,b50.127C,b50.127N,b50.128C,b50.128N,b50.129C,b50.129N,b50.130C,b50.130N
20,PIP5K1A,0.178313,0.341864,0.028784,0.160739,0.183682,0.236123,0.090067,0.186331,0.318075,...,0.242292,0.177952,0.196724,-0.015791,0.127133,0.232012,0.096085,0.212372,0.147912,0.237355
39,AKT2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.105931,...,0.307514,0.062060,0.160037,0.128148,0.157055,0.100353,0.183790,0.040994,0.129260,0.095798
106,PARD6B,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.917102,...,-0.255236,-0.007149,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
121,TLN2,0.051904,-0.057848,-0.012410,0.026899,-0.085752,-0.015643,-0.048042,-0.104717,0.214088,...,-0.133747,-0.026273,-0.130651,-0.132338,-0.080822,0.031302,0.010045,-0.059629,-0.021415,0.009894
138,SOS1,0.086330,0.209750,0.038917,0.100709,0.090624,0.041626,0.054894,0.073253,0.165085,...,0.151001,0.074282,0.023096,0.213818,0.139749,0.003861,0.066514,-0.068570,0.075709,0.089524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8685,NOTCH2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.888597,...,0.070047,0.209882,0.100821,0.169877,0.254273,-0.031092,0.116303,-0.101832,0.114146,-0.000504
8693,ECSIT,0.300293,0.308493,-0.402215,-0.289328,0.023840,-0.091529,-0.279219,0.322110,0.325842,...,0.053910,-0.112191,0.058098,-0.056180,-0.132315,-0.028867,-0.096218,-0.000382,0.135265,-0.096495
8753,MAP4K2,-0.168412,0.173810,-0.080564,0.115536,-0.052832,-0.028030,-0.037669,-0.116189,-0.251972,...,-0.077508,0.009067,-0.146521,-0.035258,0.516013,-0.039903,-0.181673,-0.068631,-0.013004,-0.067956
8773,RPS6KB1,0.024752,0.108101,0.012416,-0.022313,0.077919,0.040058,-0.024371,-0.056199,-0.881026,...,-0.238451,0.044346,-0.035123,0.103621,-0.106608,0.041581,0.100258,-0.103901,0.036144,-0.016336


In [60]:
protein_filtered['gene_name'].nunique()
protein_filtered = protein_filtered.groupby('gene_name', as_index=False).mean()
protein_filtered

Unnamed: 0,gene_name,b01.127C,b01.127N,b01.128C,b01.128N,b01.129C,b01.129N,b01.130C,b01.130N,b02.127C,...,b49.130C,b49.130N,b50.127C,b50.127N,b50.128C,b50.128N,b50.129C,b50.129N,b50.130C,b50.130N
0,ABL1,-0.103582,0.015336,-0.060367,-0.301662,-0.176597,-0.168287,-0.083194,-0.200645,-0.648434,...,-0.286807,-0.162889,-0.183929,-0.135078,0.057625,-0.115102,-0.273137,-0.154150,-0.212377,-0.080477
1,ABL2,0.013649,0.120403,-0.160178,0.060566,0.062877,0.029663,0.029067,0.176011,0.115946,...,-0.054777,0.024013,-0.099587,0.306420,-0.147124,0.145804,0.025654,0.047217,-0.050547,0.074370
2,ACOX1,-0.231938,-0.234615,-0.098837,-0.137983,-0.203026,-0.369375,0.079500,-0.323854,-0.232612,...,-0.179384,-0.060935,0.107517,-0.532369,0.328355,-0.513306,-0.329483,-0.122840,-0.085473,-0.430132
3,ACOX2,0.229814,0.698708,0.264218,0.305458,0.133755,0.242651,0.325068,0.222209,0.043442,...,0.095562,0.305927,0.468961,-0.041950,0.946098,-0.086123,0.229065,0.445173,0.260445,0.214540
4,ACSBG1,0.112325,-0.125347,-0.145176,-0.165873,-0.089858,-0.291286,0.017044,-0.194700,0.129824,...,-0.202923,0.105410,-0.103723,-0.475825,0.048453,-0.313050,-0.083820,-0.184159,-0.001842,-0.287780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.048064,0.064028,0.053987,0.111727,0.216005,0.156309,0.113332,0.112448,0.063420,...,0.146471,0.094181,0.092233,0.099662,0.107154,0.085431,-0.021249,0.142780,0.135491,0.170657
278,VAV3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.213189,0.444728,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
279,WDR24,0.103879,0.115408,0.191761,0.215023,0.288794,0.223932,0.251621,0.202675,0.224688,...,0.165323,0.233864,0.266288,0.306279,0.079271,0.169097,0.209893,0.190447,0.216066,0.010723
280,WDR59,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.058411,0.046777,0.191445,0.172955,-0.067315,0.000403,0.000672,0.089002,0.060449,0.034606


In [61]:
# select common genes in methylation data
Upstream_df_filtered = Upstream_df.loc[Upstream_df['Closest_TSS_gene_name'].isin(edge_common_genes)]
Distal_Promoter_df_filtered = Distal_Promoter_df.loc[Distal_Promoter_df['Closest_TSS_gene_name'].isin(edge_common_genes)]
Proximal_Promoter_df_filtered = Proximal_Promoter_df.loc[Proximal_Promoter_df['Closest_TSS_gene_name'].isin(edge_common_genes)]
Core_Promoter_df_filtered = Core_Promoter_df.loc[Core_Promoter_df['Closest_TSS_gene_name'].isin(edge_common_genes)]
Downstream_df_filtered = Downstream_df.loc[Downstream_df['Closest_TSS_gene_name'].isin(edge_common_genes)]

In [62]:
Upstream_df_filtered

Unnamed: 0,Closest_TSS_gene_name,Closest_TSS,TBI-AUTO73325-PT-3149,PT-BZHL,PT-BZCH,PT-BY9H,TBI-AUTO73307-PT-314I,TBI-AUTO73043-PT-35BD,PT-BZI5,PT-BZ1A,...,TBI-AUTO72955-PT-35OC,TBI-AUTO73291-PT-35OD,PT-BZD7,PT-BZG8,PT-C1N8,PT-BYJP,PT-BZHV,PT-BZI2,TBI-AUTO73257-PT-35OE,PT-BZD3
44,ABL1,133710830.0,0.866411,0.850319,0.850696,0.875926,0.853794,0.848104,0.876568,0.876418,...,0.872650,0.885689,0.877116,0.848954,0.873008,0.867169,0.880158,0.845706,0.833943,0.896733
122,ADCY2,7396342.0,0.416075,0.378824,0.426870,0.425817,0.350645,0.402792,0.445429,0.390155,...,0.391174,0.383262,0.424440,0.447257,0.394301,0.410102,0.385054,0.419917,0.388136,0.398047
123,ADCY3,25142054.0,0.638868,0.656564,0.671461,0.658125,0.637145,0.739025,0.690906,0.669437,...,0.701021,0.682131,0.685608,0.694760,0.714894,0.722603,0.624439,0.669764,0.679973,0.620078
124,ADCY5,123167391.0,0.769910,0.813722,0.742026,0.786115,0.783664,0.811388,0.764028,0.772730,...,0.577060,0.759150,0.788389,0.828399,0.778853,0.866111,0.793008,0.803425,0.777613,0.833527
127,ADCY9,4166185.0,0.894040,0.895229,0.838150,0.870131,0.885601,0.869973,0.896978,0.904201,...,0.900253,0.847817,0.860108,0.908653,0.913245,0.926032,0.887049,0.876566,0.856698,0.875721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22894,PLCD3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
22933,PTEN,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
23010,SIPA1L1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
23015,SLC27A4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [63]:
cnv_aggregated_filtered_del = cnv_aggregated_del.loc[cnv_aggregated_del['Gene_Names'].isin(edge_common_genes)]
cnv_aggregated_filtered_dup = cnv_aggregated_dup.loc[cnv_aggregated_dup['Gene_Names'].isin(edge_common_genes)]
cnv_aggregated_filtered_mcnv = cnv_aggregated_mcnv.loc[cnv_aggregated_mcnv['Gene_Names'].isin(edge_common_genes)]
cnv_aggregated_filtered_mcnv

Unnamed: 0,Gene_Names,SM-CTDSC,SM-CJGMZ,SM-CJFM3,SM-CTEET,SM-CJK4L,SM-CJIZ4,SM-CJK3I,SM-CJGN4,SM-CTDSM,...,SM-CTDVK,SM-CTED3,SM-CJK4V,SM-CJK5F,SM-CJIY9,SM-CTDQZ,SM-CJIY3,SM-CTEN7,SM-CJGH9,SM-CTEFP
7,PLAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,PIK3CB,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
83,GFAP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,JUND,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
110,IL6ST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6232,ADCY8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6250,DEPDC5,4.0,4.0,4.0,3.0,4.0,4.0,4.0,2.0,4.0,...,4.0,4.0,2.0,2.0,2.0,4.0,2.0,4.0,2.0,4.0
6290,CAMK1D,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0,3.0,3.0
6326,PLCG1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4.2 Make all data use a unified patient ID

In [64]:
ROSMAP_biospecimen = pd.read_csv('./ROSMAP-raw/Meta-Data/ROSMAP_biospecimen_metadata.csv', sep=',')
ROSMAP_biospecimen

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,cellType,fastingState,isPostMortem,samplingAge,samplingAgeUnits,visitNumber,assay,exclude,excludeReason,samplingDate
0,R1743384,190403-B4-A_R1743384,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
1,R2670295,190403-B4-A_R2670295,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
2,R4119160,190403-B4-A_R4119160,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,True,RNA genotype discordant with WGS,
3,R4641987,190403-B4-A_R4641987,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
4,R5693901,190403-B4-A_R5693901,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,True,Duplicated donor,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13230,R2757104,DUKE-08135,Human,brain,dorsolateral prefrontal cortex,,,,10.0,,,False,True,,,,Metabolon,,,
13231,R9354381,DUKE-08136,Human,brain,dorsolateral prefrontal cortex,,,,10.0,,,False,True,,,,Metabolon,,,
13232,R2711188,DUKE-08137,Human,brain,dorsolateral prefrontal cortex,,,,10.0,,,False,True,,,,Metabolon,,,
13233,R9047934,DUKE-08138,Human,brain,dorsolateral prefrontal cortex,,,,10.0,,,False,True,,,,Metabolon,,,


In [65]:
# Split the 'specimenID' and construct the matching ID
ROSMAP_biospecimen['matching_id'] = ROSMAP_biospecimen['specimenID'].apply(lambda x: '.'.join(x.split('.')[2:4]))

# Update the dictionary mapping with the new matching IDs
matching_id_to_individual_protein = dict(zip(ROSMAP_biospecimen['matching_id'], ROSMAP_biospecimen['individualID']))

# Replace the column names in the protein DataFrame if they are in the matching_id_to_individual_protein mapping
protein_filtered.columns = [matching_id_to_individual_protein.get(col, col) for col in protein_filtered.columns]
protein_filtered

Unnamed: 0,gene_name,R8316516,R1822146,R3512949,R3143439,R4590536,R9679238,R6879714,R8963331,R4182458,...,R9680160,R7015250,R5676537,R2087569,R2421650,R1498848,R9794121,R7298955,R6084846,R9798367
0,ABL1,-0.103582,0.015336,-0.060367,-0.301662,-0.176597,-0.168287,-0.083194,-0.200645,-0.648434,...,-0.286807,-0.162889,-0.183929,-0.135078,0.057625,-0.115102,-0.273137,-0.154150,-0.212377,-0.080477
1,ABL2,0.013649,0.120403,-0.160178,0.060566,0.062877,0.029663,0.029067,0.176011,0.115946,...,-0.054777,0.024013,-0.099587,0.306420,-0.147124,0.145804,0.025654,0.047217,-0.050547,0.074370
2,ACOX1,-0.231938,-0.234615,-0.098837,-0.137983,-0.203026,-0.369375,0.079500,-0.323854,-0.232612,...,-0.179384,-0.060935,0.107517,-0.532369,0.328355,-0.513306,-0.329483,-0.122840,-0.085473,-0.430132
3,ACOX2,0.229814,0.698708,0.264218,0.305458,0.133755,0.242651,0.325068,0.222209,0.043442,...,0.095562,0.305927,0.468961,-0.041950,0.946098,-0.086123,0.229065,0.445173,0.260445,0.214540
4,ACSBG1,0.112325,-0.125347,-0.145176,-0.165873,-0.089858,-0.291286,0.017044,-0.194700,0.129824,...,-0.202923,0.105410,-0.103723,-0.475825,0.048453,-0.313050,-0.083820,-0.184159,-0.001842,-0.287780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.048064,0.064028,0.053987,0.111727,0.216005,0.156309,0.113332,0.112448,0.063420,...,0.146471,0.094181,0.092233,0.099662,0.107154,0.085431,-0.021249,0.142780,0.135491,0.170657
278,VAV3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.213189,0.444728,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
279,WDR24,0.103879,0.115408,0.191761,0.215023,0.288794,0.223932,0.251621,0.202675,0.224688,...,0.165323,0.233864,0.266288,0.306279,0.079271,0.169097,0.209893,0.190447,0.216066,0.010723
280,WDR59,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.058411,0.046777,0.191445,0.172955,-0.067315,0.000403,0.000672,0.089002,0.060449,0.034606


In [66]:
mirna_ids = protein.columns.tolist()[1:]
individual_ids = protein_filtered.columns.tolist()[1:]

protein_map = pd.DataFrame({
    'individualID': individual_ids,
    'mirna_id': mirna_ids
})

protein_map

Unnamed: 0,individualID,mirna_id
0,R8316516,b01.127C
1,R1822146,b01.127N
2,R3512949,b01.128C
3,R3143439,b01.128N
4,R4590536,b01.129C
...,...,...
395,R1498848,b50.128N
396,R9794121,b50.129C
397,R7298955,b50.129N
398,R6084846,b50.130C


In [67]:
matching_id_to_individual = dict(zip(ROSMAP_biospecimen['specimenID'], ROSMAP_biospecimen['individualID']))
Upstream_df_filtered.columns = [matching_id_to_individual.get(col, col) for col in Upstream_df_filtered.columns]
Distal_Promoter_df_filtered.columns = [matching_id_to_individual.get(col, col) for col in Distal_Promoter_df_filtered.columns]
Proximal_Promoter_df_filtered.columns = [matching_id_to_individual.get(col, col) for col in Proximal_Promoter_df_filtered.columns]
Core_Promoter_df_filtered.columns = [matching_id_to_individual.get(col, col) for col in Core_Promoter_df_filtered.columns]
Downstream_df_filtered.columns = [matching_id_to_individual.get(col, col) for col in Downstream_df_filtered.columns]
Upstream_df_filtered

Unnamed: 0,Closest_TSS_gene_name,Closest_TSS,R3978789,R8140052,R7881801,R6108690,R9662437,R9936070,R4119160,R6284795,...,R5508487,R7503784,R4887967,R5130901,R9380629,R6016948,R5259690,R1977848,R6536689,R6759986
44,ABL1,133710830.0,0.866411,0.850319,0.850696,0.875926,0.853794,0.848104,0.876568,0.876418,...,0.872650,0.885689,0.877116,0.848954,0.873008,0.867169,0.880158,0.845706,0.833943,0.896733
122,ADCY2,7396342.0,0.416075,0.378824,0.426870,0.425817,0.350645,0.402792,0.445429,0.390155,...,0.391174,0.383262,0.424440,0.447257,0.394301,0.410102,0.385054,0.419917,0.388136,0.398047
123,ADCY3,25142054.0,0.638868,0.656564,0.671461,0.658125,0.637145,0.739025,0.690906,0.669437,...,0.701021,0.682131,0.685608,0.694760,0.714894,0.722603,0.624439,0.669764,0.679973,0.620078
124,ADCY5,123167391.0,0.769910,0.813722,0.742026,0.786115,0.783664,0.811388,0.764028,0.772730,...,0.577060,0.759150,0.788389,0.828399,0.778853,0.866111,0.793008,0.803425,0.777613,0.833527
127,ADCY9,4166185.0,0.894040,0.895229,0.838150,0.870131,0.885601,0.869973,0.896978,0.904201,...,0.900253,0.847817,0.860108,0.908653,0.913245,0.926032,0.887049,0.876566,0.856698,0.875721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22894,PLCD3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
22933,PTEN,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
23010,SIPA1L1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
23015,SLC27A4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [68]:
mwas_id = methylation_value.columns.tolist()[1:]
individual_ids = Upstream_df_filtered.columns.tolist()[2:]

methylation_map = pd.DataFrame({
    'individualID': individual_ids,
    'mwas_id': mwas_id
})

methylation_map

Unnamed: 0,individualID,mwas_id
0,R3978789,TBI-AUTO73325-PT-3149
1,R8140052,PT-BZHL
2,R7881801,PT-BZCH
3,R6108690,PT-BY9H
4,R9662437,TBI-AUTO73307-PT-314I
...,...,...
735,R6016948,PT-BYJP
736,R5259690,PT-BZHV
737,R1977848,PT-BZI2
738,R6536689,TBI-AUTO73257-PT-35OE


In [69]:
def process_gene_column_name(col_name):
    parts = col_name.split('_')
    return '_'.join(parts[:2]) if len(parts) > 1 else col_name
gene_expression_filtered.columns = [process_gene_column_name(col) for col in gene_expression_filtered.columns]
matching_id_to_individual_gene = dict(zip(ROSMAP_biospecimen['specimenID'], ROSMAP_biospecimen['individualID']))
gene_expression_filtered.columns = [matching_id_to_individual_gene.get(col, col) for col in gene_expression_filtered.columns]
gene_expression_filtered

Unnamed: 0,gene_name,R1743384,R6862468,R5415701,R1407047,R2197944,R5693901,R9210731,R6911631,R5636935,...,R7868788,R2901804,R4956716,R5924065,R3914030,R1133959,R2975126,R1224782,R3211474,R3341095
0,ABL1,8.84,8.55,7.78,10.04,11.92,7.72,11.81,9.18,8.44,...,7.36,6.76,6.97,7.13,9.99,6.59,8.27,6.87,8.75,10.48
1,ABL2,5.48,4.72,3.95,4.09,4.91,4.87,4.78,5.79,5.07,...,6.69,6.70,6.26,7.27,7.20,5.98,5.14,6.71,8.11,6.38
2,ACOX1,10.02,6.91,9.08,9.37,8.00,9.26,14.77,9.13,9.37,...,11.73,12.51,10.69,11.77,10.50,10.50,11.50,9.95,8.15,8.01
3,ACOX2,1.63,2.78,2.44,3.64,3.30,1.89,2.07,2.42,2.65,...,2.05,1.92,3.03,2.96,2.94,3.17,4.76,3.02,3.76,2.00
4,ACSBG1,36.44,16.49,24.62,35.44,31.74,20.34,44.74,16.83,24.71,...,25.98,23.13,16.32,24.88,29.64,17.70,24.50,18.67,11.92,17.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,7.99,7.99,6.13,6.01,5.10,11.97,7.02,8.52,6.15,...,15.98,13.82,10.38,13.19,8.05,8.82,5.99,10.45,7.67,10.05
278,VAV3,1.20,1.18,0.98,0.95,1.53,1.77,0.97,1.15,0.79,...,2.35,1.81,1.56,1.54,3.48,2.55,1.75,1.17,1.29,1.14
279,WDR24,9.01,9.23,6.83,6.92,7.59,7.81,6.56,8.96,9.14,...,5.51,6.09,4.52,4.89,5.01,6.05,3.45,5.09,5.14,4.68
280,WDR59,18.20,14.75,15.50,18.52,17.34,14.70,20.13,21.20,18.53,...,16.46,17.07,21.96,17.79,25.63,17.70,19.37,17.71,23.08,19.77


In [70]:
mrna_id = gene_expression.columns.tolist()[1:]
individual_ids = gene_expression_filtered.columns.tolist()[1:]

gene_expression_map = pd.DataFrame({
    'individualID': individual_ids,
    'mrna_id': mrna_id
})

gene_expression_map

Unnamed: 0,individualID,mrna_id
0,R1743384,525_120515_0
1,R6862468,383_120503_0
2,R5415701,93_120417_0
3,R1407047,610_120523_0
4,R2197944,560_120517_0
...,...,...
635,R1133959,939_131101_8
636,R2975126,895_130923_8
637,R1224782,829_130725_8
638,R3211474,944_131107_8


In [71]:
cnv_aggregated_filtered_del.columns = [matching_id_to_individual.get(col, col) for col in cnv_aggregated_filtered_del.columns]
cnv_aggregated_filtered_dup.columns = [matching_id_to_individual.get(col, col) for col in cnv_aggregated_filtered_dup.columns]
cnv_aggregated_filtered_mcnv.columns = [matching_id_to_individual.get(col, col) for col in cnv_aggregated_filtered_mcnv.columns]
cnv_aggregated_filtered_del

Unnamed: 0,Gene_Names,R3978789,R6108690,R6013281,R2367199,R4323608,R2577726,R9489952,R8125311,R1234575,...,R1724172,R9323873,R4859661,R1074668,R6739300,R3008520,R3151599,R7993799,R9907075,R5026720
7,PLAT,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
48,PIK3CB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,GFAP,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
97,JUND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110,IL6ST,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6232,ADCY8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6250,DEPDC5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6290,CAMK1D,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,...,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
6326,PLCG1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [72]:
cnvdata_id = cnv_aggregated_del.columns.tolist()[1:]
individual_ids = cnv_aggregated_filtered_del.columns.tolist()[1:]

cnv_aggregated_map = pd.DataFrame({
    'individualID': individual_ids,
    'cnvdata_id': cnvdata_id
})

cnv_aggregated_map

Unnamed: 0,individualID,cnvdata_id
0,R3978789,SM-CTDSC
1,R6108690,SM-CJGMZ
2,R6013281,SM-CJFM3
3,R2367199,SM-CTEET
4,R4323608,SM-CJK4L
...,...,...
1122,R3008520,SM-CTDQZ
1123,R3151599,SM-CJIY3
1124,R7993799,SM-CTEN7
1125,R9907075,SM-CJGH9


In [73]:
combined_individualID = pd.concat([
    protein_map['individualID'],
    methylation_map['individualID'],
    gene_expression_map['individualID'],
    # snphead_updated_withoutNA['individualID'],
    cnv_aggregated_map['individualID'],
    survival['individualID']
]).unique()

# Now we create a new DataFrame that contains all unique 'individualID' and their corresponding values
# from the other columns in the original DataFrames.
# We perform an outer merge to ensure all unique 'individualID' are included.
union_map = pd.DataFrame(combined_individualID, columns=['individualID'])

# Merge with each original DataFrame
union_map = union_map.merge(protein_map, on='individualID', how='outer')
union_map = union_map.merge(methylation_map, on='individualID', how='outer')
union_map = union_map.merge(gene_expression_map, on='individualID', how='outer')
union_map = union_map.merge(cnv_aggregated_map, on='individualID', how='outer')
union_map = union_map.merge(survival[['individualID', 'projid', 'Study']], on='individualID', how='outer')

# Display the merged DataFrame
union_map

Unnamed: 0,individualID,mirna_id,mwas_id,mrna_id,cnvdata_id,projid,Study
0,R8316516,b01.127C,TBI-AUTO73025-PT-319J,279_120430_3,,33477756.0,MAP
1,R1822146,b01.127N,PT-M5AF,575_120521_2,SM-CTDQQ,20271359.0,ROS
2,R3512949,b01.128C,,,SM-CTDVR,10100736.0,ROS
3,R3143439,b01.128N,PT-BZBT,502_120515_1,SM-CTEMJ,38967303.0,MAP
4,R4590536,b01.129C,PT-BZLT,,SM-CJFNQ,46291609.0,MAP
...,...,...,...,...,...,...,...
3628,R2268751,,,,,75861964.0,MAP
3629,R5306025,,,,,22207815.0,ROS
3630,R6142763,,,,,22207941.0,ROS
3631,R4468842,,,,,49333806.0,MAP


In [74]:
union_map_cleaned = union_map.dropna()
union_map_cleaned.reset_index(drop=True, inplace=True)
union_map_cleaned

Unnamed: 0,individualID,mirna_id,mwas_id,mrna_id,cnvdata_id,projid,Study
0,R1822146,b01.127N,PT-M5AF,575_120521_2,SM-CTDQQ,20271359.0,ROS
1,R3143439,b01.128N,PT-BZBT,502_120515_1,SM-CTEMJ,38967303.0,MAP
2,R6879714,b01.130C,PT-3PTN,660_120530_1,SM-CTEEU,65736039.0,MAP
3,R8963331,b01.130N,PT-M5GT,607_120523_2,SM-CJEKL,20197364.0,ROS
4,R5656511,b02.128C,PT-BYE5,551_120517_2,SM-CTEFZ,67185070.0,MAP
...,...,...,...,...,...,...,...
133,R9781891,b48.130N,PT-BZGE,522_120515_1,SM-CTEFJ,83034844.0,MAP
134,R6759986,b49.127N,PT-BZD3,489_120515_1,SM-CTDRI,98953007.0,MAP
135,R2516394,b49.128C,PT-BZBN,398_120503_0,SM-CJEJG,50403446.0,MAP
136,R9286126,b49.128N,PT-M5Q5,589_120522_6,SM-CJEK7,20584923.0,ROS


### 4.3 Unify patient samples within methylation, copynumer,  gene expression, clinical, proteomics, molecular subtype, and sample type, primary disease datasets

In [75]:
#clinical data
survival

Unnamed: 0,projid,Study,msex,educ,race,spanish,apoe_genotype,age_at_visit_max,age_first_ad_dx,age_death,cts_mmse30_first_ad_dx,cts_mmse30_lv,pmi,braaksc,ceradsc,cogdx,dcfdx_lv,individualID
0,10101589,ROS,1.0,20.0,1.0,2.0,34.0,90+,90+,90+,18.0,5.0,9.916667,4.0,2.0,4.0,4.0,R6939144
1,86767530,MAP,0.0,10.0,1.0,2.0,33.0,90+,90+,90+,18.0,10.0,6.500000,4.0,2.0,4.0,4.0,R3893503
2,9650662,MAP,0.0,15.0,1.0,2.0,23.0,90+,90+,90+,0.0,0.0,3.850000,3.0,2.0,4.0,4.0,R8937093
3,50402855,MAP,0.0,21.0,1.0,2.0,33.0,90+,,,,27.0,,,,,1.0,R7139444
4,20544321,ROS,0.0,16.0,1.0,2.0,23.0,90+,90+,,13.0,14.0,,,,,4.0,R4971237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3579,22207815,ROS,0.0,18.0,2.0,2.0,23.0,57.653661875427787,,,,29.0,,,,,1.0,R5306025
3580,22207941,ROS,0.0,16.0,2.0,2.0,34.0,56.651608487337441,,,,27.0,,,,,1.0,R6142763
3581,49333806,MAP,0.0,12.0,2.0,2.0,,56.599589322381931,,,,30.0,,,,,1.0,R4468842
3582,59720188,MAP,0.0,13.0,1.0,1.0,,54.622861054072551,,,,29.0,,,,,1.0,R9446033


In [76]:
#proteomics data
protein_filtered

Unnamed: 0,gene_name,R8316516,R1822146,R3512949,R3143439,R4590536,R9679238,R6879714,R8963331,R4182458,...,R9680160,R7015250,R5676537,R2087569,R2421650,R1498848,R9794121,R7298955,R6084846,R9798367
0,ABL1,-0.103582,0.015336,-0.060367,-0.301662,-0.176597,-0.168287,-0.083194,-0.200645,-0.648434,...,-0.286807,-0.162889,-0.183929,-0.135078,0.057625,-0.115102,-0.273137,-0.154150,-0.212377,-0.080477
1,ABL2,0.013649,0.120403,-0.160178,0.060566,0.062877,0.029663,0.029067,0.176011,0.115946,...,-0.054777,0.024013,-0.099587,0.306420,-0.147124,0.145804,0.025654,0.047217,-0.050547,0.074370
2,ACOX1,-0.231938,-0.234615,-0.098837,-0.137983,-0.203026,-0.369375,0.079500,-0.323854,-0.232612,...,-0.179384,-0.060935,0.107517,-0.532369,0.328355,-0.513306,-0.329483,-0.122840,-0.085473,-0.430132
3,ACOX2,0.229814,0.698708,0.264218,0.305458,0.133755,0.242651,0.325068,0.222209,0.043442,...,0.095562,0.305927,0.468961,-0.041950,0.946098,-0.086123,0.229065,0.445173,0.260445,0.214540
4,ACSBG1,0.112325,-0.125347,-0.145176,-0.165873,-0.089858,-0.291286,0.017044,-0.194700,0.129824,...,-0.202923,0.105410,-0.103723,-0.475825,0.048453,-0.313050,-0.083820,-0.184159,-0.001842,-0.287780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.048064,0.064028,0.053987,0.111727,0.216005,0.156309,0.113332,0.112448,0.063420,...,0.146471,0.094181,0.092233,0.099662,0.107154,0.085431,-0.021249,0.142780,0.135491,0.170657
278,VAV3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.213189,0.444728,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
279,WDR24,0.103879,0.115408,0.191761,0.215023,0.288794,0.223932,0.251621,0.202675,0.224688,...,0.165323,0.233864,0.266288,0.306279,0.079271,0.169097,0.209893,0.190447,0.216066,0.010723
280,WDR59,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.058411,0.046777,0.191445,0.172955,-0.067315,0.000403,0.000672,0.089002,0.060449,0.034606


In [77]:
# Extract column names starting with 'R' from methylation datasets
R_columns_upstream = [col for col in Upstream_df_filtered.columns if col.startswith('R')]
R_columns_distal = [col for col in Distal_Promoter_df_filtered.columns if col.startswith('R')]
R_columns_proximal = [col for col in Proximal_Promoter_df_filtered.columns if col.startswith('R')]
R_columns_core = [col for col in Core_Promoter_df_filtered.columns if col.startswith('R')]
R_columns_downstream = [col for col in Downstream_df_filtered.columns if col.startswith('R')]
R_columns_cnv =[col for col in cnv_aggregated_filtered_del.columns if col.startswith('R')]
# Extract 'R' columns from other datasets
R_columns_gene_expression = [col for col in gene_expression_filtered.columns if col != 'gene_name']
R_columns_survival = [col for col in survival['individualID'] if col.startswith('R')]
R_columns_protein = [col for col in protein_filtered.columns if col.startswith('R')]
# Find the intersection of R column names across all DataFrames
common_R_columns = set(R_columns_upstream) & set(R_columns_distal) & set(R_columns_proximal) & set(R_columns_core) & set(R_columns_downstream) &set(R_columns_gene_expression) &set(R_columns_survival) &set(R_columns_protein) & set(R_columns_cnv)
# Convert the intersection back to a list, if needed
common_R_columns_list = list(common_R_columns)

# Print the number and the list of common R columns
print(f"Number of common R columns: {len(common_R_columns)}")


Number of common R columns: 138


In [78]:
# Define columns to keep along with common R columns
additional_cols_methylation = ['Closest_TSS_gene_name']

# Filter each methylation DataFrame
Upstream_df_filtered = Upstream_df_filtered[additional_cols_methylation + common_R_columns_list]
Distal_Promoter_df_filtered = Distal_Promoter_df_filtered[additional_cols_methylation + common_R_columns_list]
Proximal_Promoter_df_filtered = Proximal_Promoter_df_filtered[additional_cols_methylation + common_R_columns_list]
Core_Promoter_df_filtered = Core_Promoter_df_filtered[additional_cols_methylation + common_R_columns_list]
Downstream_df_filtered = Downstream_df_filtered[additional_cols_methylation + common_R_columns_list]

In [79]:
Core_Promoter_df_filtered

Unnamed: 0,Closest_TSS_gene_name,R5927382,R8155560,R2079629,R2880377,R1822146,R5369295,R2731764,R1710143,R3143439,...,R3477250,R3757880,R7796947,R9072353,R5546461,R8937093,R7325259,R9489952,R2716798,R5541746
44,ABL1,0.090762,0.086789,0.066612,0.079865,0.079401,0.066825,0.07532,0.085521,0.075098,...,0.082297,0.08838,0.078789,0.071615,0.104562,0.075345,0.082585,0.06277,0.075095,0.082938
122,ADCY2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
123,ADCY3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
124,ADCY5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
127,ADCY9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22894,PLCD3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
22933,PTEN,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
23010,SIPA1L1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
23015,SLC27A4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000


In [80]:
#sort by gene name and reset index
Upstream_df_filtered = Upstream_df_filtered.sort_values(by='Closest_TSS_gene_name').reset_index(drop=True)
Distal_Promoter_df_filtered = Distal_Promoter_df_filtered.sort_values(by='Closest_TSS_gene_name').reset_index(drop=True)
Proximal_Promoter_df_filtered = Proximal_Promoter_df_filtered.sort_values(by='Closest_TSS_gene_name').reset_index(drop=True)
Core_Promoter_df_filtered = Core_Promoter_df_filtered.sort_values(by='Closest_TSS_gene_name').reset_index(drop=True)
Downstream_df_filtered = Downstream_df_filtered.sort_values(by='Closest_TSS_gene_name').reset_index(drop=True)

In [81]:
Upstream_df_filtered

Unnamed: 0,Closest_TSS_gene_name,R5927382,R8155560,R2079629,R2880377,R1822146,R5369295,R2731764,R1710143,R3143439,...,R3477250,R3757880,R7796947,R9072353,R5546461,R8937093,R7325259,R9489952,R2716798,R5541746
0,ABL1,0.862893,0.961223,0.896742,0.960634,0.851359,0.833128,0.936416,0.801535,0.870916,...,0.871093,0.809780,0.860427,0.859633,0.794250,0.826059,0.919310,0.888154,0.886747,0.844326
1,ABL2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ACOX1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ACOX2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ACSBG1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.858358,0.911624,0.950902,0.920440,0.917589,0.970906,0.948738,0.910852,0.914787,...,0.920179,0.925457,0.963111,0.933035,0.941268,0.929853,0.911902,0.916943,0.878775,0.944074
278,VAV3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
279,WDR24,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
280,WDR59,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [82]:
# Define columns to keep along with common R columns
additional_cols_gene_expression = ['gene_name']

# Filter the gene expression DataFrame
gene_expression_filtered = gene_expression_filtered[additional_cols_gene_expression + common_R_columns_list]
gene_expression_filtered

Unnamed: 0,gene_name,R5927382,R8155560,R2079629,R2880377,R1822146,R5369295,R2731764,R1710143,R3143439,...,R3477250,R3757880,R7796947,R9072353,R5546461,R8937093,R7325259,R9489952,R2716798,R5541746
0,ABL1,9.24,9.39,14.25,11.71,7.87,10.44,9.02,9.69,7.38,...,12.54,11.21,8.99,10.14,9.49,8.80,11.62,9.52,9.95,12.53
1,ABL2,4.88,3.84,5.12,5.46,4.30,6.31,4.04,4.90,3.81,...,4.55,4.94,4.40,5.44,3.77,6.95,4.46,4.87,4.01,4.38
2,ACOX1,9.90,7.01,10.28,10.11,9.21,7.75,8.90,9.66,8.69,...,12.30,8.77,9.07,9.01,9.41,10.06,12.54,9.37,7.40,9.56
3,ACOX2,3.14,2.53,6.46,3.35,1.41,5.87,3.16,2.41,1.55,...,5.06,3.46,3.49,1.63,1.93,2.61,3.29,2.44,4.15,6.01
4,ACSBG1,29.41,23.85,23.42,32.56,20.04,13.56,35.23,25.44,14.01,...,41.47,26.60,27.27,22.45,36.74,21.76,40.35,29.84,20.82,33.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,5.19,3.74,4.71,8.75,7.32,5.70,7.73,7.26,8.18,...,4.81,6.26,5.45,7.18,4.77,8.17,4.62,5.93,1.80,4.44
278,VAV3,0.82,0.65,1.48,1.83,1.25,1.01,1.83,1.80,1.04,...,2.24,1.21,1.08,0.95,1.00,1.00,1.17,1.05,0.88,1.06
279,WDR24,9.68,4.95,6.25,7.47,7.90,11.98,8.15,7.88,9.70,...,8.26,9.62,6.42,8.79,5.39,8.56,8.50,10.04,4.44,6.05
280,WDR59,16.06,14.57,25.68,18.99,13.39,23.82,17.49,16.04,14.96,...,23.58,19.41,17.39,19.72,15.20,18.83,20.26,17.46,17.74,20.95


In [83]:
# Define columns to keep along with common R columns
additional_cols_protein = ['gene_name']

# Filter the protein DataFrame
protein_filtered = protein_filtered[additional_cols_protein + common_R_columns_list]
protein_filtered

Unnamed: 0,gene_name,R5927382,R8155560,R2079629,R2880377,R1822146,R5369295,R2731764,R1710143,R3143439,...,R3477250,R3757880,R7796947,R9072353,R5546461,R8937093,R7325259,R9489952,R2716798,R5541746
0,ABL1,-0.215796,-0.689430,-0.032314,-0.121898,0.015336,-0.109440,0.007971,-0.143041,-0.301662,...,0.000000,-0.201871,-0.162702,-0.313186,-0.109878,-0.251237,0.119256,-0.139748,-0.285659,-0.160914
1,ABL2,0.020905,0.001840,0.016257,0.029596,0.120403,-0.065788,0.116878,0.005893,0.060566,...,0.054011,0.057517,0.144138,-0.050747,0.035889,0.035736,0.047177,0.031035,0.023666,0.079076
2,ACOX1,-0.184647,-0.212776,-0.182046,-0.326527,-0.234615,-0.209310,-0.252623,-0.318910,-0.137983,...,-0.107535,-0.258996,-0.242742,-0.057990,-0.131041,-0.297375,-0.152688,-0.241783,-0.158526,-0.181190
3,ACOX2,0.225248,0.168701,0.287659,0.159172,0.698708,0.206575,0.075594,0.199177,0.305458,...,0.000000,0.000000,0.268256,0.237922,0.358503,0.305901,0.382171,0.185479,0.372229,0.030072
4,ACSBG1,-0.164102,-0.191091,-0.112015,-0.131441,-0.125347,-0.230008,-0.156400,-0.044198,-0.165873,...,-0.005110,-0.213308,-0.148772,0.093946,0.035349,-0.148445,-0.067463,0.003858,-0.168837,-0.140486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.026582,0.108734,0.064147,0.070146,0.064028,0.116045,0.190231,0.073453,0.111727,...,0.117025,0.091333,0.111318,0.064232,0.180962,0.071204,0.101895,0.120864,0.050285,0.180350
278,VAV3,0.092232,0.000000,0.078495,0.000000,0.000000,0.101004,0.003591,0.376939,0.000000,...,0.000000,0.000000,-0.058140,0.212792,0.000000,0.253988,0.382294,0.429864,0.247476,0.291035
279,WDR24,0.084714,0.165778,0.157664,0.340589,0.115408,0.095999,0.313422,0.280123,0.215023,...,0.191302,0.216025,0.185378,0.183803,0.229071,0.278024,0.078946,0.097724,0.105562,0.088174
280,WDR59,-0.016871,-0.355890,-0.006772,0.043462,0.000000,-0.009242,0.113705,0.008535,0.000000,...,0.147665,0.274498,0.027902,-0.013833,0.056924,-0.039715,-0.033925,0.048828,0.042541,0.054622


In [84]:
# Define columns to keep along with common R columns
additional_cols_protein = ['Gene_Names']

# Filter the cnv DataFrame
cnv_aggregated_filtered_del = cnv_aggregated_filtered_del[additional_cols_protein + common_R_columns_list]
cnv_aggregated_filtered_del.reset_index(drop=True, inplace=True)

cnv_aggregated_filtered_dup = cnv_aggregated_filtered_dup[additional_cols_protein + common_R_columns_list]
cnv_aggregated_filtered_dup.reset_index(drop=True, inplace=True)

cnv_aggregated_filtered_mcnv = cnv_aggregated_filtered_mcnv[additional_cols_protein + common_R_columns_list]
cnv_aggregated_filtered_mcnv.reset_index(drop=True, inplace=True)

cnv_aggregated_filtered_del

Unnamed: 0,Gene_Names,R5927382,R8155560,R2079629,R2880377,R1822146,R5369295,R2731764,R1710143,R3143439,...,R3477250,R3757880,R7796947,R9072353,R5546461,R8937093,R7325259,R9489952,R2716798,R5541746
0,PLAT,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1,PIK3CB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,GFAP,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,JUND,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,IL6ST,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,ADCY8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278,DEPDC5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,CAMK1D,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,1.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0
280,PLCG1,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [85]:
survival_filtered = survival[survival['individualID'].isin(common_R_columns_list)]
survival_filtered

Unnamed: 0,projid,Study,msex,educ,race,spanish,apoe_genotype,age_at_visit_max,age_first_ad_dx,age_death,cts_mmse30_first_ad_dx,cts_mmse30_lv,pmi,braaksc,ceradsc,cogdx,dcfdx_lv,individualID
2,9650662,MAP,0.0,15.0,1.0,2.0,23.0,90+,90+,90+,0.000000,0.000000,3.850000,3.0,2.0,4.0,4.0,R8937093
29,50108200,MAP,0.0,13.0,1.0,2.0,23.0,90+,90+,90+,8.181818,8.275862,5.833333,4.0,2.0,4.0,4.0,R5449861
40,2525608,MAP,0.0,14.0,1.0,2.0,33.0,90+,,90+,,26.000000,4.833333,3.0,4.0,1.0,1.0,R6108690
58,20280666,ROS,0.0,18.0,1.0,2.0,33.0,90+,,90+,,27.600000,4.400000,4.0,4.0,1.0,1.0,R3368249
62,20233939,ROS,0.0,20.0,1.0,2.0,33.0,90+,,90+,,25.000000,3.833333,4.0,2.0,2.0,3.0,R7791442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3251,10246987,ROS,1.0,20.0,1.0,2.0,33.0,74.557152635181382,,74.984257357973988,,28.000000,5.000000,3.0,2.0,1.0,1.0,R5177066
3302,20646778,ROS,0.0,18.0,1.0,2.0,34.0,73.560574948665291,,75.433264887063658,,30.000000,7.500000,1.0,4.0,2.0,2.0,R8774534
3306,2899847,MAP,1.0,14.0,1.0,2.0,33.0,73.489390828199859,,74.450376454483234,,27.000000,7.016667,2.0,2.0,3.0,3.0,R9936070
3364,51668135,MAP,1.0,22.0,1.0,2.0,33.0,72.093086926762496,,72.720054757015745,,30.000000,12.433333,1.0,3.0,1.0,1.0,R9904978


In [86]:
survival_nan_column_proportions = survival_filtered.isna().mean()

# Display the results
print(survival_nan_column_proportions)

projid                    0.000000
Study                     0.000000
msex                      0.000000
educ                      0.000000
race                      0.000000
spanish                   0.000000
apoe_genotype             0.000000
age_at_visit_max          0.000000
age_first_ad_dx           0.753623
age_death                 0.000000
cts_mmse30_first_ad_dx    0.753623
cts_mmse30_lv             0.000000
pmi                       0.000000
braaksc                   0.000000
ceradsc                   0.000000
cogdx                     0.000000
dcfdx_lv                  0.000000
individualID              0.000000
dtype: float64


In [87]:
# Calculate the proportion of NaN values in each column
survival_nan_column_proportions = survival_filtered.isna().mean()

# Identify columns to be dropped (where proportion of NaN values is greater than 1/3)
columns_to_drop = survival_nan_column_proportions[survival_nan_column_proportions > 1/3].index.tolist()

# Drop these columns from the DataFrame
survival_filtered = survival_filtered.drop(columns=columns_to_drop)

# List of columns that were dropped
print("Columns dropped:", columns_to_drop)

Columns dropped: ['age_first_ad_dx', 'cts_mmse30_first_ad_dx']


In [88]:
survival_filtered

Unnamed: 0,projid,Study,msex,educ,race,spanish,apoe_genotype,age_at_visit_max,age_death,cts_mmse30_lv,pmi,braaksc,ceradsc,cogdx,dcfdx_lv,individualID
2,9650662,MAP,0.0,15.0,1.0,2.0,23.0,90+,90+,0.000000,3.850000,3.0,2.0,4.0,4.0,R8937093
29,50108200,MAP,0.0,13.0,1.0,2.0,23.0,90+,90+,8.275862,5.833333,4.0,2.0,4.0,4.0,R5449861
40,2525608,MAP,0.0,14.0,1.0,2.0,33.0,90+,90+,26.000000,4.833333,3.0,4.0,1.0,1.0,R6108690
58,20280666,ROS,0.0,18.0,1.0,2.0,33.0,90+,90+,27.600000,4.400000,4.0,4.0,1.0,1.0,R3368249
62,20233939,ROS,0.0,20.0,1.0,2.0,33.0,90+,90+,25.000000,3.833333,4.0,2.0,2.0,3.0,R7791442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3251,10246987,ROS,1.0,20.0,1.0,2.0,33.0,74.557152635181382,74.984257357973988,28.000000,5.000000,3.0,2.0,1.0,1.0,R5177066
3302,20646778,ROS,0.0,18.0,1.0,2.0,34.0,73.560574948665291,75.433264887063658,30.000000,7.500000,1.0,4.0,2.0,2.0,R8774534
3306,2899847,MAP,1.0,14.0,1.0,2.0,33.0,73.489390828199859,74.450376454483234,27.000000,7.016667,2.0,2.0,3.0,3.0,R9936070
3364,51668135,MAP,1.0,22.0,1.0,2.0,33.0,72.093086926762496,72.720054757015745,30.000000,12.433333,1.0,3.0,1.0,1.0,R9904978


In [89]:
cols = ['individualID'] + [col for col in survival_filtered.columns if col != 'individualID']
survival_filtered = survival_filtered[cols]
survival_filtered.reset_index(drop=True, inplace=True)
survival_filtered

Unnamed: 0,individualID,projid,Study,msex,educ,race,spanish,apoe_genotype,age_at_visit_max,age_death,cts_mmse30_lv,pmi,braaksc,ceradsc,cogdx,dcfdx_lv
0,R8937093,9650662,MAP,0.0,15.0,1.0,2.0,23.0,90+,90+,0.000000,3.850000,3.0,2.0,4.0,4.0
1,R5449861,50108200,MAP,0.0,13.0,1.0,2.0,23.0,90+,90+,8.275862,5.833333,4.0,2.0,4.0,4.0
2,R6108690,2525608,MAP,0.0,14.0,1.0,2.0,33.0,90+,90+,26.000000,4.833333,3.0,4.0,1.0,1.0
3,R3368249,20280666,ROS,0.0,18.0,1.0,2.0,33.0,90+,90+,27.600000,4.400000,4.0,4.0,1.0,1.0
4,R7791442,20233939,ROS,0.0,20.0,1.0,2.0,33.0,90+,90+,25.000000,3.833333,4.0,2.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,R5177066,10246987,ROS,1.0,20.0,1.0,2.0,33.0,74.557152635181382,74.984257357973988,28.000000,5.000000,3.0,2.0,1.0,1.0
134,R8774534,20646778,ROS,0.0,18.0,1.0,2.0,34.0,73.560574948665291,75.433264887063658,30.000000,7.500000,1.0,4.0,2.0,2.0
135,R9936070,2899847,MAP,1.0,14.0,1.0,2.0,33.0,73.489390828199859,74.450376454483234,27.000000,7.016667,2.0,2.0,3.0,3.0
136,R9904978,51668135,MAP,1.0,22.0,1.0,2.0,33.0,72.093086926762496,72.720054757015745,30.000000,12.433333,1.0,3.0,1.0,1.0


## 5.Gene name/patient samples

In [90]:
gene_list = gene_expression_filtered['gene_name']
gene_list

0        ABL1
1        ABL2
2       ACOX1
3       ACOX2
4      ACSBG1
        ...  
277     USP25
278      VAV3
279     WDR24
280     WDR59
281     YWHAG
Name: gene_name, Length: 282, dtype: object

In [91]:
patient_sample_list = pd.DataFrame(common_R_columns,columns=['sample'])
patient_sample_list

Unnamed: 0,sample
0,R5927382
1,R8155560
2,R2079629
3,R2880377
4,R1822146
...,...
133,R8937093
134,R7325259
135,R9489952
136,R2716798


## 6.Save processed datasets

### 6.1 Keep the consistency for dataframes on genes and samples

In [92]:
# [gene_list]
# gene
sorted_gene_list = gene_list.sort_values()
sorted_gene = sorted_gene_list.tolist()
sorted_gene_df = pd.DataFrame(sorted_gene, columns=['Gene'])
display(sorted_gene_df)
# gene-meth
sorted_gene_methy = [gene + '-METH' for gene in sorted_gene]
sorted_gene_methy_df = pd.DataFrame(sorted_gene_methy, columns=['Gene'])
display(sorted_gene_methy_df)
# # gene-protein
sorted_gene_protein = [gene + '-PROT' for gene in sorted_gene]
sorted_gene_protein_df = pd.DataFrame(sorted_gene_protein, columns=['Gene'])
display(sorted_gene_protein_df)
# all-gene
sorted_gene_all = sorted_gene + sorted_gene_methy + sorted_gene_protein
sorted_all_gene_df = pd.DataFrame(sorted_gene_all, columns=['Gene'])
display(sorted_all_gene_df)

Unnamed: 0,Gene
0,ABL1
1,ABL2
2,ACOX1
3,ACOX2
4,ACSBG1
...,...
277,USP25
278,VAV3
279,WDR24
280,WDR59


Unnamed: 0,Gene
0,ABL1-METH
1,ABL2-METH
2,ACOX1-METH
3,ACOX2-METH
4,ACSBG1-METH
...,...
277,USP25-METH
278,VAV3-METH
279,WDR24-METH
280,WDR59-METH


Unnamed: 0,Gene
0,ABL1-PROT
1,ABL2-PROT
2,ACOX1-PROT
3,ACOX2-PROT
4,ACSBG1-PROT
...,...
277,USP25-PROT
278,VAV3-PROT
279,WDR24-PROT
280,WDR59-PROT


Unnamed: 0,Gene
0,ABL1
1,ABL2
2,ACOX1
3,ACOX2
4,ACSBG1
...,...
841,USP25-PROT
842,VAV3-PROT
843,WDR24-PROT
844,WDR59-PROT


In [93]:
# [patient-sample-list]
sorted_patient_sample_list = patient_sample_list.sort_values(by='sample')['sample'].tolist()
print(sorted_patient_sample_list)
sorted_patient_sample_df = patient_sample_list.sort_values(by='sample').reset_index(drop=True)
display(sorted_patient_sample_df)

['R1042011', 'R1105988', 'R1218460', 'R1262106', 'R1287407', 'R1407047', 'R1489314', 'R1531359', 'R1617674', 'R1660726', 'R1687970', 'R1710143', 'R1743384', 'R1822146', 'R1924801', 'R2045909', 'R2079629', 'R2111091', 'R2237803', 'R2373966', 'R2455930', 'R2494273', 'R2516394', 'R2543886', 'R2575548', 'R2624931', 'R2645096', 'R2678902', 'R2716798', 'R2721311', 'R2731324', 'R2731764', 'R2803490', 'R2880377', 'R2881301', 'R3008520', 'R3143439', 'R3176125', 'R3257830', 'R3368249', 'R3408213', 'R3477250', 'R3535957', 'R3739042', 'R3740754', 'R3741788', 'R3757880', 'R3811781', 'R3874626', 'R3922205', 'R3978789', 'R4119160', 'R4147686', 'R4260171', 'R4276053', 'R4527133', 'R4531942', 'R4641987', 'R4917253', 'R5177066', 'R5211056', 'R5234179', 'R5334541', 'R5369295', 'R5374583', 'R5449861', 'R5541746', 'R5546461', 'R5656511', 'R5739959', 'R5766881', 'R5789564', 'R5850046', 'R5907586', 'R5927382', 'R5965031', 'R5973191', 'R6001210', 'R6058053', 'R6108690', 'R6114572', 'R6132310', 'R6231758', 'R6

Unnamed: 0,sample
0,R1042011
1,R1105988
2,R1218460
3,R1262106
4,R1287407
...,...
133,R9598418
134,R9680160
135,R9781891
136,R9904978


In [94]:
Upstream_df_filtered = Upstream_df_filtered[['Closest_TSS_gene_name'] + sorted_patient_sample_list]
Upstream_df_filtered

Unnamed: 0,Closest_TSS_gene_name,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,0.840924,0.828570,0.968392,0.934158,0.920154,0.842128,0.863738,0.890959,0.887796,...,0.883518,0.833930,0.853269,0.888154,0.871024,0.887854,0.796570,0.890607,0.857498,0.848104
1,ABL2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ACOX1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ACOX2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ACSBG1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.912923,0.940930,0.956627,0.947480,0.958004,0.927244,0.924689,0.936878,0.935826,...,0.947889,0.905735,0.915803,0.916943,0.913990,0.943330,0.918655,0.972261,0.946215,0.916269
278,VAV3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
279,WDR24,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
280,WDR59,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [95]:
Distal_Promoter_df_filtered = Distal_Promoter_df_filtered[['Closest_TSS_gene_name'] + sorted_patient_sample_list]
Distal_Promoter_df_filtered

Unnamed: 0,Closest_TSS_gene_name,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,0.089715,0.079023,0.083782,0.095226,0.079727,0.089414,0.095794,0.097683,0.106171,...,0.086522,0.090599,0.084719,0.083871,0.090557,0.083483,0.087874,0.098657,0.090671,0.092750
1,ABL2,0.361083,0.402920,0.384848,0.348869,0.330690,0.381684,0.359238,0.325574,0.372634,...,0.364614,0.372977,0.365313,0.350353,0.379052,0.383266,0.346888,0.380569,0.338561,0.342576
2,ACOX1,0.279569,0.283096,0.275482,0.285610,0.226329,0.294882,0.286413,0.286388,0.257850,...,0.254181,0.288619,0.291166,0.247829,0.283675,0.284497,0.262019,0.266295,0.293556,0.290749
3,ACOX2,0.595101,0.675102,0.594466,0.608098,0.582309,0.592607,0.604587,0.569795,0.595909,...,0.559899,0.605573,0.593663,0.614390,0.540435,0.587574,0.612173,0.616614,0.606165,0.617456
4,ACSBG1,0.815428,0.766929,0.782087,0.825374,0.827599,0.800423,0.807970,0.790806,0.815863,...,0.820138,0.761951,0.802638,0.792526,0.796770,0.803631,0.759244,0.785801,0.788703,0.880316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.426166,0.456587,0.473598,0.476879,0.457649,0.448843,0.462163,0.444049,0.456681,...,0.468259,0.412133,0.458516,0.424559,0.446867,0.468077,0.462013,0.462780,0.458581,0.433060
278,VAV3,0.166763,0.182697,0.180251,0.178134,0.167716,0.175470,0.186390,0.170123,0.163993,...,0.175206,0.174137,0.174846,0.170087,0.180055,0.178980,0.178857,0.182111,0.180808,0.169633
279,WDR24,0.601463,0.598475,0.567834,0.588056,0.567538,0.594039,0.607669,0.577609,0.527634,...,0.595817,0.603631,0.626335,0.605309,0.601313,0.622469,0.650755,0.612076,0.605661,0.601984
280,WDR59,0.589094,0.562141,0.590537,0.601665,0.589571,0.582975,0.641102,0.574564,0.509480,...,0.608581,0.599518,0.597243,0.594005,0.621439,0.627023,0.603657,0.616209,0.568872,0.610645


In [96]:
Proximal_Promoter_df_filtered = Proximal_Promoter_df_filtered[['Closest_TSS_gene_name'] + sorted_patient_sample_list]
Proximal_Promoter_df_filtered

Unnamed: 0,Closest_TSS_gene_name,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,0.044368,0.062633,0.026291,0.072624,0.059300,0.049887,0.061818,0.074327,0.089316,...,0.052990,0.048473,0.057537,0.057871,0.061479,0.054830,0.063721,0.061217,0.069982,0.055289
1,ABL2,0.056039,0.048390,0.043231,0.038778,0.074825,0.072046,0.033834,0.054895,0.075274,...,0.051643,0.060297,0.071945,0.045461,0.048974,0.045429,0.056747,0.086576,0.059429,0.035000
2,ACOX1,0.062218,0.054027,0.050465,0.065108,0.044093,0.064772,0.050085,0.048818,0.054590,...,0.049322,0.059093,0.063238,0.053418,0.061633,0.051742,0.065676,0.060437,0.057181,0.046583
3,ACOX2,0.565847,0.621953,0.546532,0.604594,0.559679,0.534586,0.600774,0.573503,0.590366,...,0.588369,0.562800,0.574083,0.562713,0.495566,0.574266,0.555134,0.569172,0.586409,0.531689
4,ACSBG1,0.645191,0.605844,0.596880,0.609193,0.587833,0.629804,0.626459,0.648666,0.643335,...,0.653563,0.605318,0.618164,0.657296,0.642971,0.607863,0.561088,0.704000,0.636151,0.620276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
278,VAV3,0.206908,0.242413,0.209273,0.238575,0.178828,0.216909,0.191618,0.198186,0.216647,...,0.199099,0.227403,0.229036,0.181228,0.173853,0.250769,0.228539,0.200063,0.222036,0.207083
279,WDR24,0.051534,0.055213,0.055207,0.051952,0.053353,0.055276,0.050483,0.047329,0.048289,...,0.049047,0.046967,0.058844,0.040161,0.048496,0.062437,0.038922,0.050951,0.047347,0.045929
280,WDR59,0.042414,0.032748,0.028305,0.042473,0.029538,0.034286,0.029602,0.037961,0.029468,...,0.026818,0.033992,0.047527,0.032790,0.037695,0.031741,0.039175,0.041849,0.033136,0.036792


In [97]:
Core_Promoter_df_filtered = Core_Promoter_df_filtered[['Closest_TSS_gene_name'] + sorted_patient_sample_list]
Core_Promoter_df_filtered

Unnamed: 0,Closest_TSS_gene_name,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,0.063749,0.083006,0.078626,0.080111,0.079313,0.075052,0.094081,0.076834,0.071132,...,0.079611,0.072881,0.075733,0.062770,0.080510,0.075028,0.092450,0.074180,0.077276,0.078574
1,ABL2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ACOX1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,ACOX2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ACSBG1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
278,VAV3,0.848181,0.836759,0.804931,0.876005,0.877707,0.849264,0.868510,0.828925,0.867942,...,0.868437,0.812739,0.892631,0.844308,0.774650,0.832075,0.802959,0.865394,0.877187,0.794043
279,WDR24,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
280,WDR59,0.032753,0.045718,0.050815,0.069452,0.073916,0.068025,0.043035,0.054027,0.065731,...,0.042202,0.035667,0.065883,0.044396,0.080372,0.058206,0.048152,0.068930,0.050153,0.074434


In [98]:
Downstream_df_filtered = Downstream_df_filtered[['Closest_TSS_gene_name'] + sorted_patient_sample_list]
Downstream_df_filtered

Unnamed: 0,Closest_TSS_gene_name,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,0.192274,0.199518,0.189189,0.240213,0.171019,0.243995,0.197777,0.189323,0.211558,...,0.240568,0.205805,0.220485,0.192970,0.211803,0.232651,0.216349,0.224307,0.194131,0.223439
1,ABL2,0.190666,0.196916,0.199890,0.197673,0.191039,0.201994,0.205053,0.188914,0.189438,...,0.189366,0.183064,0.201461,0.186714,0.196832,0.188199,0.193339,0.209151,0.189358,0.193138
2,ACOX1,0.050739,0.071958,0.043231,0.057461,0.067936,0.057568,0.055619,0.080869,0.059850,...,0.051048,0.067357,0.070971,0.061172,0.063599,0.063973,0.061780,0.058098,0.056975,0.046393
3,ACOX2,0.622053,0.632751,0.569233,0.609195,0.578169,0.571030,0.631324,0.598472,0.606815,...,0.607991,0.629589,0.624581,0.639319,0.542820,0.614249,0.609025,0.605039,0.632324,0.618899
4,ACSBG1,0.571939,0.510617,0.521159,0.539002,0.552057,0.532050,0.564501,0.555669,0.544870,...,0.591923,0.585018,0.568528,0.581028,0.597445,0.580642,0.555774,0.586676,0.570299,0.567967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.108714,0.151796,0.127211,0.146896,0.145585,0.142772,0.121093,0.146306,0.119753,...,0.128068,0.144064,0.138829,0.129855,0.158501,0.142854,0.144145,0.156703,0.130744,0.126844
278,VAV3,0.277073,0.289257,0.283241,0.275732,0.270178,0.272430,0.293400,0.293917,0.281541,...,0.284747,0.281749,0.289236,0.267715,0.287134,0.272334,0.259927,0.289256,0.289475,0.289243
279,WDR24,0.498517,0.501182,0.487196,0.501547,0.492430,0.491359,0.499129,0.496870,0.494892,...,0.506524,0.503099,0.507429,0.510200,0.503407,0.503564,0.488619,0.508973,0.501086,0.505347
280,WDR59,0.106767,0.099539,0.077286,0.100141,0.078854,0.088959,0.097602,0.113811,0.106046,...,0.102192,0.111921,0.126943,0.090526,0.100884,0.123730,0.109538,0.122009,0.100629,0.086949


In [99]:
cnv_aggregated_filtered_del = cnv_aggregated_filtered_del[['Gene_Names'] + sorted_patient_sample_list].sort_values(by='Gene_Names').reset_index(drop=True)
cnv_aggregated_filtered_del

Unnamed: 0,Gene_Names,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABL2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ACOX1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,ACOX2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ACSBG1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278,VAV3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,WDR24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
280,WDR59,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [100]:
cnv_aggregated_filtered_dup = cnv_aggregated_filtered_dup[['Gene_Names'] + sorted_patient_sample_list].sort_values(by='Gene_Names').reset_index(drop=True)
cnv_aggregated_filtered_dup

Unnamed: 0,Gene_Names,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABL2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ACOX1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ACOX2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ACSBG1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278,VAV3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,WDR24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
280,WDR59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
cnv_aggregated_filtered_mcnv = cnv_aggregated_filtered_mcnv[['Gene_Names'] + sorted_patient_sample_list].sort_values(by='Gene_Names').reset_index(drop=True)
cnv_aggregated_filtered_mcnv

Unnamed: 0,Gene_Names,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,4.0,4.0,4.0,4.0,3.0,4.0,3.0,2.0,4.0,...,2.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,3.0,2.0
1,ABL2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,ACOX1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ACOX2,2.0,2.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,...,1.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0
4,ACSBG1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
278,VAV3,6.0,6.0,6.0,6.0,6.0,6.0,6.0,7.0,6.0,...,6.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
279,WDR24,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
280,WDR59,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0


In [102]:
gene_expression_filtered = gene_expression_filtered[['gene_name'] + sorted_patient_sample_list].sort_values(by='gene_name').reset_index(drop=True)
gene_expression_filtered

Unnamed: 0,gene_name,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,9.55,8.59,9.99,9.38,7.60,10.04,12.94,8.11,11.25,...,10.45,12.04,8.62,9.52,7.84,8.78,10.01,8.69,7.80,9.17
1,ABL2,4.46,4.92,5.50,4.01,5.14,4.09,2.60,5.36,3.27,...,3.93,3.88,4.70,4.87,3.79,4.67,4.83,6.52,4.77,3.71
2,ACOX1,10.39,5.02,8.66,9.11,10.94,9.37,9.00,10.47,9.96,...,11.57,9.80,10.06,9.37,7.02,10.82,9.12,8.46,6.49,7.75
3,ACOX2,2.40,1.62,2.90,4.37,3.32,3.64,4.36,2.40,4.24,...,3.08,3.89,2.39,2.44,1.13,2.11,2.90,0.91,2.44,2.97
4,ACSBG1,31.60,6.44,19.45,36.13,18.28,35.44,41.35,20.46,33.00,...,44.28,36.26,13.89,29.84,17.88,15.19,22.31,9.42,15.91,34.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,6.54,4.30,10.06,4.19,5.75,6.01,2.21,4.08,2.65,...,5.21,4.56,7.62,5.93,5.86,3.06,6.88,12.52,4.68,4.32
278,VAV3,1.40,0.48,0.97,1.09,0.93,0.95,1.33,0.81,1.14,...,2.13,0.96,0.83,1.05,0.57,1.44,0.91,0.94,0.55,1.05
279,WDR24,7.98,5.93,8.24,4.68,6.72,6.92,4.37,7.93,5.82,...,6.56,7.48,9.59,10.04,6.19,5.65,8.21,8.77,6.20,5.65
280,WDR59,16.94,17.41,17.24,14.13,20.19,18.52,17.68,16.56,14.74,...,14.42,19.16,15.18,17.46,18.86,19.82,17.73,13.53,13.41,16.14


In [103]:
protein_filtered = protein_filtered[['gene_name'] + sorted_patient_sample_list].sort_values(by='gene_name').reset_index(drop=True)
protein_filtered

Unnamed: 0,gene_name,R1042011,R1105988,R1218460,R1262106,R1287407,R1407047,R1489314,R1531359,R1617674,...,R9330569,R9354381,R9419876,R9489952,R9596785,R9598418,R9680160,R9781891,R9904978,R9936070
0,ABL1,-0.152318,-0.133348,-0.094704,0.000000,-0.207882,-0.207884,-0.215809,-0.057245,-0.203775,...,-0.122148,-0.154417,-0.249470,-0.139748,-0.184267,-0.160928,-0.286807,-0.192470,-0.213141,-0.032982
1,ABL2,0.359930,0.059332,0.086033,0.090225,0.014475,-0.032457,0.034013,0.098259,0.045561,...,0.048972,0.027514,0.019108,0.031035,-0.012989,0.044827,-0.054777,0.084112,0.091079,-0.031734
2,ACOX1,-0.321386,-0.317109,-0.070037,-0.120788,-0.021767,-0.038656,-0.207960,-0.199075,-0.191772,...,-0.127654,-0.560158,-0.264584,-0.241783,-0.055325,-0.095912,-0.179384,-0.252255,-0.138068,-0.214482
3,ACOX2,0.390535,0.324678,0.146437,0.040889,0.462729,0.063497,0.150657,-0.101771,0.250777,...,0.282836,0.245258,0.216197,0.185479,0.220832,0.319665,0.095562,0.361658,0.218263,0.297130
4,ACSBG1,-0.143728,-0.222351,-0.176261,-0.142229,0.053243,0.112320,-0.014588,-0.135944,0.006285,...,-0.090924,-0.134429,-0.115169,0.003858,-0.223631,-0.139431,-0.202923,-0.327051,-0.086037,-0.124788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,USP25,0.156867,0.093152,0.082959,0.151999,0.006022,0.072775,0.106766,0.123087,0.114449,...,0.123289,0.060556,0.132568,0.120864,0.084480,0.114544,0.146471,0.100396,0.106885,0.116240
278,VAV3,0.736285,0.205829,0.151068,0.000000,0.310667,0.231630,0.000000,0.000000,0.203358,...,0.206399,0.153735,0.239625,0.429864,0.000000,0.284478,0.213189,0.000000,0.000000,0.000000
279,WDR24,0.155282,0.172706,0.216306,0.279249,0.348792,0.197577,0.161222,0.164100,0.217241,...,0.335740,0.204590,0.250283,0.097724,0.194178,0.250853,0.165323,0.269706,0.184638,0.186956
280,WDR59,0.246538,0.032793,0.146729,0.058569,0.079975,0.124078,-0.102047,0.065351,-0.031437,...,0.187531,0.049790,0.111510,0.048828,0.091561,0.163101,-0.058411,0.009037,0.251649,0.000000


In [104]:
survival_filtered = survival_filtered.sort_values(by='individualID').reset_index(drop=True)
survival_filtered

Unnamed: 0,individualID,projid,Study,msex,educ,race,spanish,apoe_genotype,age_at_visit_max,age_death,cts_mmse30_lv,pmi,braaksc,ceradsc,cogdx,dcfdx_lv
0,R1042011,20532115,ROS,0.0,17.0,1.0,2.0,23.0,89.505817932922653,89.727583846680361,23.793103,5.266667,2.0,4.0,4.0,2.0
1,R1105988,50108886,MAP,0.0,12.0,1.0,2.0,33.0,90+,90+,19.200000,7.283333,4.0,2.0,1.0,2.0
2,R1218460,20105242,ROS,0.0,18.0,1.0,2.0,33.0,86.483230663928822,87.285420944558524,29.000000,7.500000,3.0,4.0,1.0,1.0
3,R1262106,35286551,MAP,0.0,12.0,1.0,2.0,33.0,90+,90+,26.000000,11.750000,4.0,1.0,2.0,2.0
4,R1287407,20311676,ROS,0.0,18.0,1.0,2.0,34.0,89.100616016427111,90+,30.000000,4.583333,5.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,R9598418,91921829,MAP,0.0,16.0,1.0,2.0,33.0,88.57494866529774,89.303216974674882,29.000000,4.750000,3.0,3.0,1.0,1.0
134,R9680160,20501668,ROS,0.0,18.0,1.0,2.0,34.0,90+,90+,6.000000,3.833333,5.0,1.0,4.0,4.0
135,R9781891,83034844,MAP,0.0,15.0,1.0,2.0,34.0,78.143737166324442,78.444900752908964,30.000000,7.450000,3.0,2.0,1.0,1.0
136,R9904978,51668135,MAP,1.0,22.0,1.0,2.0,33.0,72.093086926762496,72.720054757015745,30.000000,12.433333,1.0,3.0,1.0,1.0


### 6.2 create output folder and save processed datasets

In [105]:
import os

# outputfile name
output_folder = 'ROSMAP-process'
# create folder if not exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

In [106]:
# DataFrame needed to be saved
dataframes = {
    'gene-list.csv': sorted_gene_df,
    'gene-methy-list.csv': sorted_gene_methy_df,
    # 'gene-protein-list.csv': sorted_gene_protein_df,
    'gene-all-list.csv': sorted_all_gene_df,
    'gene-kegg-edge-list.csv': filtered_up_kegg_df,
    'gene-kegg-path-edge-list.csv': filtered_up_kegg_path_df,
    # 'gene-biogrid-edge-list.csv': filtered_up_biogrid_df,
    # 'gene-string-edge-list.csv': filtered_up_string_df,
    'patient-sample-list.csv': sorted_patient_sample_df,
    # 'phenotype-lists.csv': phenotype_lists,
    'processed-genotype-methy-Upstream.csv': Upstream_df_filtered,
    'processed-genotype-methy-Distal-Promoter.csv': Distal_Promoter_df_filtered,
    'processed-genotype-methy-Proximal-Promoter.csv': Proximal_Promoter_df_filtered,
    'processed-genotype-methy-Core-Promoter.csv': Core_Promoter_df_filtered,
    'processed-genotype-methy-Downstream.csv': Downstream_df_filtered,
    'processed-genotype-cnv_del.csv': cnv_aggregated_filtered_del,
    'processed-genotype-cnv_dup.csv': cnv_aggregated_filtered_dup,
    'processed-genotype-cnv_mcnv.csv': cnv_aggregated_filtered_mcnv,
    'processed-genotype-gene-expression.csv': gene_expression_filtered,
    'processed-genotype-proteomics.csv': protein_filtered,
    # 'processed-phenotype-immune-subtype-transposed.csv': immune_subtype_filtered,
    'processed-phenotype-survival-transposed.csv': survival_filtered,
    # 'processed-phenotype-dense-transposed.csv': dense_filtered,
    # 'processed-phenotype-cellsub-transposed.csv': cellsub_filtered
    'processed-mapping-idmap.csv': union_map,
    'processed-mapping-idmap-withoutnull.csv': union_map_cleaned
}

# save to output folder
for file_name, df in dataframes.items():
    df.to_csv(os.path.join(output_folder, file_name), index=False)

## 7.Convert the processed data into node dictionary

In [107]:
# load processed data
import pandas as pd
import os

# read the file names under the folder
# Define the path to the output folder where CSV files are stored
output_folder = 'ROSMAP-process'

# List of file names you saved earlier
file_names = [
    'gene-list', 'gene-methy-list', 'gene-all-list', 
    'gene-kegg-edge-list', 'gene-kegg-path-edge-list', 
    # 'gene-biogrid-edge-list', 
    # 'gene-string-edge-list',
    'patient-sample-list', 'processed-genotype-methy-Upstream', 
    'processed-genotype-methy-Distal-Promoter', 
    'processed-genotype-methy-Proximal-Promoter', 
    'processed-genotype-methy-Core-Promoter', 'processed-genotype-methy-Downstream', 
    'processed-genotype-cnv_del', 'processed-genotype-cnv_dup', 'processed-genotype-cnv_mcnv', 
    'processed-genotype-gene-expression', 
    'processed-genotype-proteomics',
    'processed-phenotype-survival-transposed'
]

# Dictionary to hold the dataframes
dataframes = {}

# Read each file and assign to a dataframe
for file_name in file_names:
    full_path = os.path.join(output_folder, file_name + '.csv')
    dataframes[file_name] = pd.read_csv(full_path)

In [108]:
# Assign each dataframe to a variable
sorted_gene_df = dataframes['gene-list']
sorted_gene_methy_df = dataframes['gene-methy-list']
# sorted_gene_protein_df = dataframes['gene-protein-list']
sorted_all_gene_df = dataframes['gene-all-list']
filtered_up_kegg_df = dataframes['gene-kegg-edge-list']
filtered_up_kegg_path_df = dataframes['gene-kegg-path-edge-list']
# filtered_up_biogrid_df = dataframes['gene-biogrid-edge-list']
# filtered_up_string_df = dataframes['gene-string-edge-list']
sorted_patient_sample_df = dataframes['patient-sample-list']
# phenotype_lists = dataframes['phenotype-lists']
Upstream_df_filtered = dataframes['processed-genotype-methy-Upstream']
Distal_Promoter_df_filtered = dataframes['processed-genotype-methy-Distal-Promoter']
Proximal_Promoter_df_filtered = dataframes['processed-genotype-methy-Proximal-Promoter']
Core_Promoter_df_filtered = dataframes['processed-genotype-methy-Core-Promoter']
Downstream_df_filtered = dataframes['processed-genotype-methy-Downstream']
copynumber_filtered_del = dataframes['processed-genotype-cnv_del']
copynumber_filtered_dup = dataframes['processed-genotype-cnv_dup']
copynumber_filtered_mcnv = dataframes['processed-genotype-cnv_mcnv']
gene_expression_filtered = dataframes['processed-genotype-gene-expression']
protein_filtered = dataframes['processed-genotype-proteomics']
# immune_subtype_filtered = dataframes['processed-phenotype-immune-subtype-transposed']
survival_filtered = dataframes['processed-phenotype-survival-transposed']
# dense_filtered = dataframes['processed-phenotype-dense-transposed']
# cellsub_filtered = dataframes['processed-phenotype-cellsub-transposed']

In [109]:
# outputfile name
graph_output_folder = 'ROSMAP-graph-data'
# create folder if not exist
if not os.path.exists(graph_output_folder):
    os.makedirs(graph_output_folder)

### 7.1 Make nodes dictionary

In [110]:
sorted_all_gene_dict = sorted_all_gene_df['Gene'].to_dict()
sorted_all_gene_name_dict = {value: key for key, value in sorted_all_gene_dict.items()}
num_gene = sorted_gene_df.shape[0]
num_gene_protein = sorted_gene_protein_df.shape[0]
nodetype_list = ['Gene'] * num_gene + ['Gene-METH'] * num_gene + ['Gene-PROT'] * num_gene_protein
map_all_gene_df = pd.DataFrame({'Gene_num': sorted_all_gene_dict.keys(), 'Gene_name': sorted_all_gene_dict.values(), 'NodeType': nodetype_list})
display(map_all_gene_df)
map_all_gene_df.to_csv(os.path.join(graph_output_folder, 'map-all-gene.csv'), index=False)

Unnamed: 0,Gene_num,Gene_name,NodeType
0,0,ABL1,Gene
1,1,ABL2,Gene
2,2,ACOX1,Gene
3,3,ACOX2,Gene
4,4,ACSBG1,Gene
...,...,...,...
841,841,USP25-PROT,Gene-PROT
842,842,VAV3-PROT,Gene-PROT
843,843,WDR24-PROT,Gene-PROT
844,844,WDR59-PROT,Gene-PROT


### 7.2 Create the edges connection between promoter methylations and proteins

In [111]:
# [Gene-METH - Gene]
sorted_gene_methy = sorted_gene_methy_df['Gene'].tolist()
sorted_gene_list = sorted_gene_df['Gene'].tolist()
sorted_gene_protein = sorted_gene_protein_df['Gene'].tolist()
sorted_intersection = [gene_protein.replace('-PROT', '') for gene_protein in sorted_gene_protein]
gene_meth_edge_df = pd.DataFrame({'src': sorted_gene_methy, 'dest': sorted_gene_list})
display(gene_meth_edge_df)
# [Gene - Gene-PROT]
gene_protein_edge_df = pd.DataFrame({'src': sorted_intersection, 'dest': sorted_gene_protein})
display(gene_protein_edge_df)

Unnamed: 0,src,dest
0,ABL1-METH,ABL1
1,ABL2-METH,ABL2
2,ACOX1-METH,ACOX1
3,ACOX2-METH,ACOX2
4,ACSBG1-METH,ACSBG1
...,...,...
277,USP25-METH,USP25
278,VAV3-METH,VAV3
279,WDR24-METH,WDR24
280,WDR59-METH,WDR59


Unnamed: 0,src,dest
0,ABL1,ABL1-PROT
1,ABL2,ABL2-PROT
2,ACOX1,ACOX1-PROT
3,ACOX2,ACOX2-PROT
4,ACSBG1,ACSBG1-PROT
...,...,...
277,USP25,USP25-PROT
278,VAV3,VAV3-PROT
279,WDR24,WDR24-PROT
280,WDR59,WDR59-PROT


In [112]:
sorted_all_gene_name_dict['ABL1-METH']

282

In [113]:
# replace gene name with gene number
gene_meth_num_edge_df = gene_meth_edge_df.copy()
gene_meth_num_edge_df['src'] = gene_meth_edge_df['src'].map(sorted_all_gene_name_dict)
gene_meth_num_edge_df['dest'] = gene_meth_edge_df['dest'].map(sorted_all_gene_name_dict)
display(gene_meth_num_edge_df)
gene_protein_num_edge_df = gene_protein_edge_df.copy()
gene_protein_num_edge_df['src'] = gene_protein_edge_df['src'].map(sorted_all_gene_name_dict)
gene_protein_num_edge_df['dest'] = gene_protein_edge_df['dest'].map(sorted_all_gene_name_dict)
display(gene_protein_num_edge_df)

Unnamed: 0,src,dest
0,282,0
1,283,1
2,284,2
3,285,3
4,286,4
...,...,...
277,559,277
278,560,278
279,561,279
280,562,280


Unnamed: 0,src,dest
0,0,564
1,1,565
2,2,566
3,3,567
4,4,568
...,...,...
277,277,841
278,278,842
279,279,843
280,280,844


### 7.3 Concat all edges

In [114]:
if selected_database == 'KEGG':
    filtered_up_num_df = filtered_up_kegg_df.copy()
elif selected_database == 'BioGRID':
    filtered_up_num_df = filtered_up_biogrid_df.copy()
elif selected_database == 'STRING':
    filtered_up_num_df = filtered_up_string_df.copy()

filtered_up_num_df['src'] = filtered_up_num_df['src'].map(sorted_all_gene_name_dict)
filtered_up_num_df['dest'] = filtered_up_num_df['dest'].map(sorted_all_gene_name_dict)
display(filtered_up_num_df)
all_gene_edge_num_df = pd.concat([filtered_up_num_df, gene_meth_num_edge_df, gene_protein_num_edge_df])
display(all_gene_edge_num_df)

num_gene_edge = filtered_up_num_df.shape[0]
num_gene_meth_edge = gene_meth_num_edge_df.shape[0]
num_gene_protein_edge = gene_protein_num_edge_df.shape[0]
edgetype_list = ['Gene-Gene'] * num_gene_edge + ['Gene-Gene-METH'] * num_gene_meth_edge + ['Gene-Gene-PROT'] * num_gene_protein_edge
all_gene_edge_num_df['EdgeType'] = edgetype_list
all_gene_edge_num_df = all_gene_edge_num_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
display(all_gene_edge_num_df)
all_gene_edge_num_df.to_csv(os.path.join(graph_output_folder, 'all-gene-edge-num.csv'), index=False)

Unnamed: 0,src,dest
0,10,81
1,10,86
2,11,81
3,11,86
4,12,75
...,...,...
516,279,157
517,280,56
518,280,156
519,280,157


Unnamed: 0,src,dest
0,10,81
1,10,86
2,11,81
3,11,86
4,12,75
...,...,...
277,277,841
278,278,842
279,279,843
280,280,844


Unnamed: 0,src,dest,EdgeType
0,0,564,Gene-Gene-PROT
1,1,565,Gene-Gene-PROT
2,2,566,Gene-Gene-PROT
3,3,567,Gene-Gene-PROT
4,4,568,Gene-Gene-PROT
...,...,...,...
1080,559,277,Gene-Gene-METH
1081,560,278,Gene-Gene-METH
1082,561,279,Gene-Gene-METH
1083,562,280,Gene-Gene-METH


In [115]:
# gene edge interactions without map
all_gene_edge_df = all_gene_edge_num_df.copy()
all_gene_edge_df = all_gene_edge_df.replace(sorted_all_gene_dict)

num_gene_edge = filtered_up_num_df.shape[0]
num_gene_meth_edge = gene_meth_edge_df.shape[0]
num_gene_protein_edge = gene_protein_edge_df.shape[0]
# all_gene_edge_df = all_gene_edge_df.sort_values(by=['src', 'dest']).reset_index(drop=True)
all_gene_edge_df.to_csv(os.path.join(graph_output_folder, 'all-gene-edge.csv'), index=False)
display(all_gene_edge_df)

Unnamed: 0,src,dest,EdgeType
0,ABL1,ABL1-PROT,Gene-Gene-PROT
1,ABL2,ABL2-PROT,Gene-Gene-PROT
2,ACOX1,ACOX1-PROT,Gene-Gene-PROT
3,ACOX2,ACOX2-PROT,Gene-Gene-PROT
4,ACSBG1,ACSBG1-PROT,Gene-Gene-PROT
...,...,...,...
1080,USP25-METH,USP25,Gene-Gene-METH
1081,VAV3-METH,VAV3,Gene-Gene-METH
1082,WDR24-METH,WDR24,Gene-Gene-METH
1083,WDR59-METH,WDR59,Gene-Gene-METH


## 8.Load data into graph format

### 8.1 Form up the input samples

recommends the use of the ceradsc as the classfication for AD types

* ceradsc
* cogdx

In [116]:
survival_filtered

Unnamed: 0,individualID,projid,Study,msex,educ,race,spanish,apoe_genotype,age_at_visit_max,age_death,cts_mmse30_lv,pmi,braaksc,ceradsc,cogdx,dcfdx_lv
0,R1042011,20532115,ROS,0.0,17.0,1.0,2.0,23.0,89.505817932922653,89.727583846680361,23.793103,5.266667,2.0,4.0,4.0,2.0
1,R1105988,50108886,MAP,0.0,12.0,1.0,2.0,33.0,90+,90+,19.200000,7.283333,4.0,2.0,1.0,2.0
2,R1218460,20105242,ROS,0.0,18.0,1.0,2.0,33.0,86.483230663928822,87.285420944558524,29.000000,7.500000,3.0,4.0,1.0,1.0
3,R1262106,35286551,MAP,0.0,12.0,1.0,2.0,33.0,90+,90+,26.000000,11.750000,4.0,1.0,2.0,2.0
4,R1287407,20311676,ROS,0.0,18.0,1.0,2.0,34.0,89.100616016427111,90+,30.000000,4.583333,5.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,R9598418,91921829,MAP,0.0,16.0,1.0,2.0,33.0,88.57494866529774,89.303216974674882,29.000000,4.750000,3.0,3.0,1.0,1.0
134,R9680160,20501668,ROS,0.0,18.0,1.0,2.0,34.0,90+,90+,6.000000,3.833333,5.0,1.0,4.0,4.0
135,R9781891,83034844,MAP,0.0,15.0,1.0,2.0,34.0,78.143737166324442,78.444900752908964,30.000000,7.450000,3.0,2.0,1.0,1.0
136,R9904978,51668135,MAP,1.0,22.0,1.0,2.0,33.0,72.093086926762496,72.720054757015745,30.000000,12.433333,1.0,3.0,1.0,1.0


In [117]:
survival_filtered_feature_df = survival_filtered.copy()
survival_filtered_feature_df = survival_filtered_feature_df[['individualID', 'ceradsc']]
display(survival_filtered_feature_df)

nan_counts = survival_filtered_feature_df.isna().sum()  # or df.isnull()
print(survival_filtered_feature_df['ceradsc'].unique())
survival_filtered_feature_df.to_csv(os.path.join(graph_output_folder, 'survival-label.csv'), index=False)

Unnamed: 0,individualID,ceradsc
0,R1042011,4.0
1,R1105988,2.0
2,R1218460,4.0
3,R1262106,1.0
4,R1287407,1.0
...,...,...
133,R9598418,3.0
134,R9680160,1.0
135,R9781891,2.0
136,R9904978,3.0


[4. 2. 1. 3.]


### 8.3 Randomize the input label

In [118]:
# Randomize the survival label
def input_random(randomized, graph_output_folder):
    randomized = True
    if randomized == True:
        random_survival_filtered_feature_df = survival_filtered_feature_df.sample(frac = 1).reset_index(drop=True)
        random_survival_filtered_feature_df.to_csv(os.path.join(graph_output_folder, 'random-survival-label.csv'), index=False)
    else:
        random_survival_filtered_feature_df = pd.read_csv(os.path.join(graph_output_folder, 'random-survival-label.csv'))
    display(random_survival_filtered_feature_df)

input_random(randomized=False, graph_output_folder=graph_output_folder)

Unnamed: 0,individualID,ceradsc
0,R8466373,3.0
1,R8457289,1.0
2,R2543886,4.0
3,R5965031,1.0
4,R5973191,3.0
...,...,...
133,R7483736,3.0
134,R5927382,4.0
135,R3008520,2.0
136,R9781891,2.0


### 8.3 Split the randomized input into K-fold

In [119]:
# Split deep learning input into training and test
def split_k_fold(k, graph_output_folder):
    random_survival_filtered_feature_df = pd.read_csv(os.path.join(graph_output_folder, 'random-survival-label.csv'))
    num_points = random_survival_filtered_feature_df.shape[0]
    num_div = int(num_points / k)
    num_div_list = [i * num_div for i in range(0, k)]
    num_div_list.append(num_points)
    # Split [random_survival_filtered_feature_df] into [k] folds
    for place_num in range(k):
        low_idx = num_div_list[place_num]
        high_idx = num_div_list[place_num + 1]
        print('\n--------TRAIN-TEST SPLIT WITH TEST FROM ' + str(low_idx) + ' TO ' + str(high_idx) + '--------')
        split_input_df = random_survival_filtered_feature_df[low_idx : high_idx]
        split_input_df.to_csv(os.path.join(graph_output_folder, 'split-random-survival-label-' + str(place_num + 1) + '.csv'), index=False)
        print(split_input_df.shape)

split_k_fold(k=5, graph_output_folder=graph_output_folder)


--------TRAIN-TEST SPLIT WITH TEST FROM 0 TO 27--------
(27, 2)

--------TRAIN-TEST SPLIT WITH TEST FROM 27 TO 54--------
(27, 2)

--------TRAIN-TEST SPLIT WITH TEST FROM 54 TO 81--------
(27, 2)

--------TRAIN-TEST SPLIT WITH TEST FROM 81 TO 108--------
(27, 2)

--------TRAIN-TEST SPLIT WITH TEST FROM 108 TO 138--------
(30, 2)


### 8.4 Reprocess the edge_index file after loading

In [120]:
import os
import numpy as np
import pandas as pd

graph_output_folder = 'ROSMAP-graph-data'
form_data_path = './' + graph_output_folder + '/form_data'
edge_index = np.load(form_data_path + '/edge_index.npy')
# Convert the 2D array into a DataFrame
edge_index_df = pd.DataFrame(edge_index.T, columns=['src', 'dest'])

gene_edge_num_df = pd.read_csv(os.path.join(graph_output_folder, 'all-gene-edge-num.csv'))
src_gene_list = list(gene_edge_num_df['src'])
dest_gene_list = list(gene_edge_num_df['dest'])
edgetype_list = list(gene_edge_num_df['EdgeType'])
gene_edge_num_reverse_df = pd.DataFrame({'src': dest_gene_list, 'dest': src_gene_list, 'EdgeType': edgetype_list})
gene_edge_num_all_df = pd.concat([gene_edge_num_df, gene_edge_num_reverse_df]).drop_duplicates().sort_values(by=['src', 'dest']).reset_index(drop=True)

display(edge_index_df)
display(gene_edge_num_all_df)
merged_gene_edge_num_all_df = pd.merge(gene_edge_num_all_df, edge_index_df, on=['src', 'dest'], how='inner')
display(merged_gene_edge_num_all_df)
merged_gene_edge_num_all_df.to_csv(os.path.join(graph_output_folder, 'merged-gene-edge-num-all.csv'), index=False)

merged_gene_edge_name_all_df = merged_gene_edge_num_all_df.replace(sorted_all_gene_dict)
display(merged_gene_edge_name_all_df)
merged_gene_edge_name_all_df.to_csv(os.path.join(graph_output_folder, 'merged-gene-edge-name-all.csv'), index=False)

FileNotFoundError: [Errno 2] No such file or directory: './ROSMAP-graph-data/form_data/edge_index.npy'