This script imports the clock data from the original supplementary file of Lu et al. 2023 (sup_data_2_43587_2023_462_MOESM3_ESM.xlsx) into pandas data frames. The data is then used to plot the clock data for the different species.

### 1. Importing Pan-Mammal Clocks Results from Lu et al. 2023

In [1]:
# Import required libraries
import pandas as pd

In [2]:
### Read the three clock tables (Lu et al. 2023) from the Excel supplement ###

# Workbook that holds the epigenetic clock results
clocks_excel_file = "sup_data_2_43587_2023_462_MOESM3_ESM.xlsx"

# Zero-based sheet positions 15-17, i.e. the 16th to 18th sheets (tables S3.1-S3.3)
sheets_to_import = [15, 16, 17]

# Read one data frame per sheet. skiprows takes zero-based row numbers, so the
# 1st, 2nd and 4th rows of every sheet are dropped (legend and intercept rows,
# according to the original author's note).
clocks_dfs = []
for sheet in sheets_to_import:
    clocks_dfs.append(
        pd.read_excel(clocks_excel_file, sheet_name=sheet, skiprows=[0, 1, 3])
    )

# Name the individual tables (clocks 1, 2 and 3, in sheet order)
clock_1_df, clock_2_df, clock_3_df = clocks_dfs

# Genes shared by clocks 1 and 2. The inner join on 'Gene' can return several
# rows for the same gene; this intermediate table is NOT de-duplicated.
overlap_1_2_df = clock_1_df.merge(clock_2_df, how='inner', on='Gene')

# Genes shared by clocks 1, 2 and 3, reduced to one row per gene
# (drop_duplicates removes repeated ROWS and keeps the first one for each gene)
overlap_1_2_3_df = (
    overlap_1_2_df
    .merge(clock_3_df, how='inner', on='Gene')
    .drop_duplicates(subset='Gene')
)

# Genes shared by clocks 2 and 3, again reduced to one row per gene
overlap_2_3_df = (
    clock_2_df
    .merge(clock_3_df, how='inner', on='Gene')
    .drop_duplicates(subset='Gene')
)



In [3]:
# Report the (rows, columns) shape of every data frame
frame_shapes = {
    "Clock 1": clock_1_df.shape,
    "Clock 2": clock_2_df.shape,
    "Clock 3": clock_3_df.shape,
    "Overlap 1 and 2": overlap_1_2_df.shape,
    "Overlap 2 and 3": overlap_2_3_df.shape,  # expected row count: 401
    "Overlap 1, 2 and 3": overlap_1_2_3_df.shape,  # expected row count: 140
}

for label, shape in frame_shapes.items():
    print(f"{label}: {shape}")

Clock 1: (335, 14)
Clock 2: (816, 14)
Clock 3: (760, 14)
Overlap 1 and 2: (435, 27)
Overlap 2 and 3: (401, 27)
Overlap 1, 2 and 3: (140, 40)


In [4]:
# Preview the first five rows of the clock 1 table
clock_1_df.head(n=5)

Unnamed: 0,index,var,beta_clock1,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
0,2,cg00249943,0.152786,5,136274770,TRPC7,TRPC7,57113,conserved gene and region in mouse and human,TRPC7_Exon,"Exon (ENST00000513104.5/57113, exon 4 of 12)",13,56831383.0,Trpc7
1,3,cg00250826,0.00277,7,128754670,CALU,CALU,813,conserved gene but different region in human a...,CALU_Exon,"Exon (ENST00000542996.6/813, exon 4 of 8)",6,29361701.0,Calu
2,4,cg00292639,-0.129891,3,52393402,DNAH1,DNAH1,25981,conserved gene and region in mouse and human,DNAH1_Exon,"Exon (ENST00000420323.6/25981, exon 66 of 78)",14,31265961.0,Dnah1
3,5,cg00362836,0.04863,17,12473058,LINC00670,LINC00670,284034,"mapped in human, not mapped in mouse",LINC00670_Intergenic_upstream,Distal Intergenic,11,65435676.0,Myocd
4,6,cg00411555,-0.031366,17,44206588,UBTF,UBTF,7343,"mapped in human, not mapped in mouse",UBTF_threeUTR,3' UTR,11,102305986.0,Ubtf


In [5]:
# Preview the first five rows of the clock 2 table
clock_2_df.head(n=5)

Unnamed: 0,index,var,beta_clock2,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
0,2,cg00020468,-0.002385,10,60070043,ANK3,ANK3,288,"mapped in human, not mapped in mouse",ANK3_Exon,"Exon (ENST00000280772.7/288, exon 37 of 44)",10,69992409.0,Ank3
1,3,cg00096922,-0.009445,7,97024873,DLX5,DLX5,1749,conserved gene but different region in human a...,DLX5_fiveUTR,Promoter (<=1kb),6,6882127.0,Dlx5
2,4,cg00098422,0.055805,1,224389815,WDR26,WDR26,80232,conserved gene and region in mouse and human,WDR26_threeUTR,3' UTR,1,181177786.0,Wdr26
3,5,cg00106940,-0.059145,3,181712691,SOX2-OT,SOX2-OT,347689,mapped to different gene in human and mouse,SOX2-OT_Exon,"Exon (ENST00000325404.3/6657, exon 1 of 1)",3,34650752.0,Sox2
4,6,cg00195533,-0.172659,7,128832266,FLNC,FLNC,2318,conserved gene and region in mouse and human,FLNC_Intron,"Intron (ENST00000325888.12/2318, intron 1 of 47)",6,29435072.0,Flnc


In [6]:
# Preview the first five rows of the clock 3 table
clock_3_df.head(n=5)

Unnamed: 0,index,var,beta_clock3,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
0,2,cg00101675,-0.101354,14,63453806,PPP2R5E,PPP2R5E,5529,conserved gene and region in mouse and human,PPP2R5E_Exon,"Exon (ENST00000337537.8/5529, exon 3 of 14)",12,75515719.0,Ppp2r5e
1,3,cg06259996,0.045149,5,93585317,NR2F1-AS1,NR2F1,441094,mapped to different gene in human and mouse,NR2F1-AS1_Exon,Promoter (<=1kb),13,78198288.0,Nr2f1
2,4,cg08938156,0.663272,3,147409417,LOC440982,ZIC1,440982,mapped to different gene in human and mouse,LOC440982_fiveUTR,5' UTR,9,91365746.0,Zic1
3,5,cg15168457,0.766234,12,111516347,ATXN2,ATXN2,6311,conserved gene and region in mouse and human,ATXN2_Exon,"Exon (ENST00000642389.2/6311, exon 10 of 27)",5,121781348.0,Atxn2
4,6,cg20370622,0.549191,4,41748222,PHOX2B,PHOX2B,8929,conserved gene and region in mouse and human,PHOX2B_Promoter,Promoter (<=1kb),5,67098553.0,Phox2b


In [7]:
# Preview the first five rows of the clock 1/2/3 gene overlap
overlap_1_2_3_df.head(n=5)

Unnamed: 0,index_x,var_x,beta_clock1,CHR_x,bp_hg38_x,Gene,Gene.hg19_x,ENTREZID_x,conservationInMouse_x,GeneRegionID_x,...,CHR,bp_hg38,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
0,3,cg00250826,0.00277,7,128754670,CALU,CALU,813,conserved gene but different region in human a...,CALU_Exon,...,7,128754286,CALU,813,"mapped in human, not mapped in mouse",CALU_Exon,"Exon (ENST00000535011.6/813, exon 3 of 6)",6,29361318.0,Calu
2,7,cg00513357,0.091426,11,10806046,EIF4G2,EIF4G2,1982,conserved gene and region in mouse and human,EIF4G2_Exon,...,11,10806872,EIF4G2,1982,conserved gene and region in mouse and human,EIF4G2_Exon,"Exon (ENST00000526148.5/1982, exon 3 of 22)",7,111080951.0,Eif4g2
3,8,cg00587168,0.055299,2,172093840,DLX1,DLX1,1745,mapped to different gene in human and mouse,DLX1_Intergenic_downstream,...,2,172085716,DLX1,1745,mapped to different gene in human and mouse,DLX1_Exon,Promoter (<=1kb),2,71530032.0,Dlx1
5,11,cg00694357,-0.808471,1,155004171,ZBTB7B,ZBTB7B,51043,conserved gene but different region in human a...,ZBTB7B_Intron,...,1,155007795,ZBTB7B,51043,"mapped in human, not mapped in mouse",ZBTB7B_Intron,"Intron (ENST00000417934.6/51043, intron 1 of 4)",3,89388223.0,Zbtb7b
14,14,cg01009870,0.101085,10,129869867,EBF3,EBF3,253738,conserved gene and region in mouse and human,EBF3_Intron,...,10,129873560,EBF3,253738,conserved gene and region in mouse and human,EBF3_Exon,"Exon (ENST00000368648.7/253738, exon 8 of 16)",7,137231272.0,Ebf3


In [8]:
# Preview the first five rows of the clock 2/3 gene overlap
overlap_2_3_df.head(n=5)

Unnamed: 0,index_x,var_x,beta_clock2,CHR_x,bp_hg38_x,Gene,Gene.hg19_x,ENTREZID_x,conservationInMouse_x,GeneRegionID_x,...,CHR_y,bp_hg38_y,Gene.hg19_y,ENTREZID_y,conservationInMouse_y,GeneRegionID_y,annotation_y,CHR_mm10_y,bp_mm10_y,Gene_mm10_y
0,3,cg00096922,-0.009445,7,97024873,DLX5,DLX5,1749,conserved gene but different region in human a...,DLX5_fiveUTR,...,7,97024873,DLX5,1749,conserved gene but different region in human a...,DLX5_fiveUTR,Promoter (<=1kb),6,6882127.0,Dlx5
1,4,cg00098422,0.055805,1,224389815,WDR26,WDR26,80232,conserved gene and region in mouse and human,WDR26_threeUTR,...,1,224431495,WDR26,80232,conserved gene and region in mouse and human,WDR26_Exon,"Exon (ENST00000414423.8/80232, exon 3 of 14)",1,181209021.0,Wdr26
2,5,cg00106940,-0.059145,3,181712691,SOX2-OT,SOX2-OT,347689,mapped to different gene in human and mouse,SOX2-OT_Exon,...,3,181712998,SOX2-OT,347689,mapped to different gene in human and mouse,SOX2-OT_Exon,"Exon (ENST00000325404.3/6657, exon 1 of 1)",3,34651059.0,Sox2
3,6,cg00195533,-0.172659,7,128832266,FLNC,FLNC,2318,conserved gene and region in mouse and human,FLNC_Intron,...,7,128832266,FLNC,2318,conserved gene and region in mouse and human,FLNC_Intron,"Intron (ENST00000325888.12/2318, intron 1 of 47)",6,29435072.0,Flnc
4,7,cg00250826,0.007619,7,128754670,CALU,CALU,813,conserved gene but different region in human a...,CALU_Exon,...,7,128754286,CALU,813,"mapped in human, not mapped in mouse",CALU_Exon,"Exon (ENST00000535011.6/813, exon 3 of 6)",6,29361318.0,Calu


### 2. Ordering the data frames by beta coefficient (a measure of the strength and direction of the relationship between methylation and biological age)

These coefficients represent the effect of each CpG site on the DNA methylation age predicted by the model. The values can be positive or negative and do not directly correspond to methylation levels (which are typically within the 0-1 range). The coefficients indicate the direction and strength of the relationship between each CpG site and the predicted DNA methylation age, as determined by the regression analysis.

The beta coefficients indicate the magnitude and direction of the effect that each predictor variable has on the outcome variable.

For an epigenetic clock model like this one, where the outcome is a measure of biological age based on epigenetic markers, the beta coefficients in the beta_clock1 column represent the weights assigned to each CpG site (variables listed in the var column) in the model. These weights determine the contribution of each CpG site to the overall prediction of biological age.

The sign of the beta coefficient indicates the direction of the relationship:

    Positive beta: An increase in the methylation level of the CpG site is associated with an increase in the predicted biological age.
    Negative beta: An increase in the methylation level of the CpG site is associated with a decrease in the predicted biological age.

The magnitude of the beta coefficient reflects the strength of the association. Larger absolute values indicate a stronger impact on the predicted outcome.



In [9]:
# Rank every table from the largest to the smallest beta coefficient

# Each clock is ordered by its own coefficient column (beta_clock1/2/3)
clock_1_df, clock_2_df, clock_3_df = (
    frame.sort_values(by=f"beta_clock{number}", ascending=False)
    for number, frame in enumerate((clock_1_df, clock_2_df, clock_3_df), start=1)
)

# The overlaps are ordered by the coefficient of their lowest-numbered clock
overlap_2_3_df = overlap_2_3_df.sort_values(by='beta_clock2', ascending=False)
overlap_1_2_3_df = overlap_1_2_3_df.sort_values(by='beta_clock1', ascending=False)

In [10]:
# First five rows of clock 1 after ordering by descending beta_clock1
clock_1_df.head(n=5)

Unnamed: 0,index,var,beta_clock1,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
141,143,cg11728741,0.928874,8,41896469,ANK1,ANK1,286,conserved gene and region in mouse and human,ANK1_Exon,Promoter (<=1kb),8,22975146.0,Ank1
295,297,cg24352905,0.853268,5,77645452,OTP,OTP,23440,conserved gene and region in mouse and human,OTP_Promoter,Promoter (5-6kb),13,94869014.0,Otp
96,98,cg08938156,0.843749,3,147409417,LOC440982,ZIC1,440982,mapped to different gene in human and mouse,LOC440982_fiveUTR,5' UTR,9,91365746.0,Zic1
158,160,cg13058338,0.63866,12,54173598,SMUG1,SMUG1,23583,mapped to different gene in human and mouse,SMUG1_Intron,"Intron (ENST00000635234.1/23583, intron 2 of 4)",15,103146996.0,Smug1
314,316,cg26067250,0.510613,2,172085721,DLX1,DLX1,1745,mapped to different gene in human and mouse,DLX1_Exon,Promoter (<=1kb),2,71530037.0,Dlx1


In [11]:
# First five rows of clock 2 after ordering by descending beta_clock2
clock_2_df.head(n=5)

Unnamed: 0,index,var,beta_clock2,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
140,142,cg05474883,0.952198,4,82354926,HNRNPD,HNRNPD,3184,conserved gene and region in mouse and human,HNRNPD_threeUTR,3' UTR,5.0,99962129.0,Hnrnpd
270,272,cg09710440,0.877027,7,104328932,LHFPL3,LHFPL3,375612,"mapped in human, not mapped in mouse",LHFPL3_Exon,Promoter (<=1kb),,,
781,783,cg26512254,0.69306,7,27200144,HOTTIP,HOXA13,100316868,mapped to different gene in human and mouse,HOTTIP_Intron,"Intron (ENST00000421733.1/100316868, intron 1 ...",6.0,52260841.0,Hoxa13
236,238,cg08681110,0.647719,6,113857322,MARCKS,MARCKS,4082,conserved gene and region in mouse and human,MARCKS_Promoter,Promoter (<=1kb),10.0,37138948.0,Marcks
251,253,cg09227056,0.615235,2,176075721,EVX2,EVX2,344191,conserved gene and region in mouse and human,EVX2_Intergenic_downstream,Distal Intergenic,2.0,74651253.0,Evx2


In [12]:
# First five rows of clock 3 after ordering by descending beta_clock3
clock_3_df.head(n=5)

Unnamed: 0,index,var,beta_clock3,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
491,493,cg26512254,2.638299,7,27200144,HOTTIP,HOXA13,100316868,mapped to different gene in human and mouse,HOTTIP_Intron,"Intron (ENST00000421733.1/100316868, intron 1 ...",6.0,52260841.0,Hoxa13
97,99,cg09710440,1.854689,7,104328932,LHFPL3,LHFPL3,375612,"mapped in human, not mapped in mouse",LHFPL3_Exon,Promoter (<=1kb),,,
32,34,cg18418719,1.806028,3,70972083,FOXP1,FOXP1,27086,mapped to different gene in human and mouse,FOXP1_threeUTR,3' UTR,6.0,98941067.0,Foxp1
132,134,cg24352905,1.738218,5,77645452,OTP,OTP,23440,conserved gene and region in mouse and human,OTP_Promoter,Promoter (5-6kb),13.0,94869014.0,Otp
574,576,cg05474883,1.463495,4,82354926,HNRNPD,HNRNPD,3184,conserved gene and region in mouse and human,HNRNPD_threeUTR,3' UTR,5.0,99962129.0,Hnrnpd


In [13]:
# First five rows of the clock 1/2/3 overlap after ordering by descending beta_clock1
overlap_1_2_3_df.head(n=5)

Unnamed: 0,index_x,var_x,beta_clock1,CHR_x,bp_hg38_x,Gene,Gene.hg19_x,ENTREZID_x,conservationInMouse_x,GeneRegionID_x,...,CHR,bp_hg38,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
310,98,cg08938156,0.843749,3,147409417,LOC440982,ZIC1,440982,mapped to different gene in human and mouse,LOC440982_fiveUTR,...,3,147409417,ZIC1,440982,mapped to different gene in human and mouse,LOC440982_fiveUTR,5' UTR,9,91365746.0,Zic1
899,329,cg27201382,0.444645,11,27720483,BDNF,BDNF,627,conserved gene and region in mouse and human,BDNF_fiveUTR,...,11,27720483,BDNF,627,conserved gene and region in mouse and human,BDNF_fiveUTR,Promoter (<=1kb),2,109676257.0,Bdnf
593,144,cg11904056,0.341791,5,93583204,NR2F1-AS1,NR2F1,441094,mapped to different gene in human and mouse,NR2F1-AS1_Intron,...,5,93585317,NR2F1,441094,mapped to different gene in human and mouse,NR2F1-AS1_Exon,Promoter (<=1kb),13,78198288.0,Nr2f1
171,44,cg03820088,0.337644,7,23522229,TRA2A,TRA2A,29896,conserved gene but different region in human a...,TRA2A_fiveUTR,...,7,23522229,TRA2A,29896,conserved gene but different region in human a...,TRA2A_fiveUTR,5' UTR,6,49252905.0,Tra2a
208,58,cg04998737,0.336275,5,77645403,OTP,OTP,23440,conserved gene and region in mouse and human,OTP_Promoter,...,5,77645452,OTP,23440,conserved gene and region in mouse and human,OTP_Promoter,Promoter (5-6kb),13,94869014.0,Otp


In [14]:
# First five rows of the clock 2/3 overlap after ordering by descending beta_clock2
overlap_2_3_df.head(n=5)

Unnamed: 0,index_x,var_x,beta_clock2,CHR_x,bp_hg38_x,Gene,Gene.hg19_x,ENTREZID_x,conservationInMouse_x,GeneRegionID_x,...,CHR_y,bp_hg38_y,Gene.hg19_y,ENTREZID_y,conservationInMouse_y,GeneRegionID_y,annotation_y,CHR_mm10_y,bp_mm10_y,Gene_mm10_y
341,142,cg05474883,0.952198,4,82354926,HNRNPD,HNRNPD,3184,conserved gene and region in mouse and human,HNRNPD_threeUTR,...,4,82354926,HNRNPD,3184,conserved gene and region in mouse and human,HNRNPD_threeUTR,3' UTR,5,99962129.0,Hnrnpd
993,783,cg26512254,0.69306,7,27200144,HOTTIP,HOXA13,100316868,mapped to different gene in human and mouse,HOTTIP_Intron,...,7,27200144,HOXA13,100316868,mapped to different gene in human and mouse,HOTTIP_Intron,"Intron (ENST00000421733.1/100316868, intron 1 ...",6,52260841.0,Hoxa13
539,238,cg08681110,0.647719,6,113857322,MARCKS,MARCKS,4082,conserved gene and region in mouse and human,MARCKS_Promoter,...,6,113857322,MARCKS,4082,conserved gene and region in mouse and human,MARCKS_Promoter,Promoter (<=1kb),10,37138948.0,Marcks
1000,803,cg27201382,0.523618,11,27720483,BDNF,BDNF,627,conserved gene and region in mouse and human,BDNF_fiveUTR,...,11,27720483,BDNF,627,conserved gene and region in mouse and human,BDNF_fiveUTR,Promoter (<=1kb),2,109676257.0,Bdnf
899,599,cg19927064,0.499713,12,48818919,CACNB3,CACNB3,784,conserved gene and region in mouse and human,CACNB3_fiveUTR,...,12,48818919,CACNB3,784,conserved gene and region in mouse and human,CACNB3_fiveUTR,5' UTR,15,98635158.0,Cacnb3


### 3. Check for missing values in the Gene and ENTREZ ID columns

In [13]:
# Data frames to check, grouped by kind
clock_dfs = [clock_1_df, clock_2_df, clock_3_df]
overlap_dfs = [overlap_2_3_df, overlap_1_2_3_df]
# Report labels for the overlap frames, in the same order as overlap_dfs
overlap_labels = ["2 and 3", "1, 2 and 3"]

# Count missing gene symbols and ENTREZ IDs in each clock table
for i, df in enumerate(clock_dfs, start=1):
    gene_missing = df['Gene'].isnull().sum()
    entrezid_missing = df['ENTREZID'].isnull().sum()
    print(f"CLOCK {i}: Missing Genes: {gene_missing}, Missing ENTREZ IDs: {entrezid_missing}")

# Same check for the overlap tables, each reported under its own label
for label, df in zip(overlap_labels, overlap_dfs):
    gene_missing = df['Gene'].isnull().sum()
    # The merged frames hold one ENTREZID column per contributing clock
    # (ENTREZID_x, ENTREZID_y and, for the three-way overlap, ENTREZID),
    # so missing values are counted across all of them.
    entrezid_missing = df.filter(like='ENTREZID').isnull().sum().sum()
    print(f"OVERLAP {label}: Missing Genes: {gene_missing}, Missing ENTREZ IDs: {entrezid_missing}")

CLOCK 1: Missing Genes: 0, Missing ENTREZ IDs: 0
CLOCK 2: Missing Genes: 0, Missing ENTREZ IDs: 0
CLOCK 3: Missing Genes: 0, Missing ENTREZ IDs: 0
OOVERLAP 1, 2 and 3: Missing Genes: 0, Missing ENTREZsIDs: 0
OOVERLAP 1, 2 and 3: Missing Genes: 0, Missing ENTREZsIDs: 0


### 3. Create and export individual csv files of each of the clocks

In [32]:
# Write each complete table (ordered by beta values) to its own CSV file
ordered_exports = [
    (clock_1_df, 'clock_1_ordered.csv'),                # complete clock 1
    (clock_2_df, 'clock_2_ordered.csv'),                # complete clock 2
    (clock_3_df, 'clock_3_ordered.csv'),                # complete clock 3
    (overlap_2_3_df, 'overlap_2_3_ordered.csv'),        # overlap of clocks 2 and 3
    (overlap_1_2_3_df, 'overlap_1_2_3_ordered.csv'),    # overlap of clocks 1, 2 and 3
]

for frame, csv_name in ordered_exports:
    # index=False keeps the data frame's row index out of the file
    frame.to_csv(csv_name, index=False)

### 4. Create and export csv files for the top 25 CpG sites with the highest beta coefficients for each clock

In [16]:
# Write the first 25 rows of clock_1_df (its top 25 CpGs) to CSV, without the row index
clock_1_df.iloc[:25].to_csv('top_25_cpgs_clock_1.csv', index=False)

In [17]:
# Write the first 25 rows of clock_2_df (its top 25 CpGs) to CSV, without the row index
clock_2_df.iloc[:25].to_csv('top_25_cpgs_clock_2.csv', index=False)

In [18]:
# Write the first 25 rows of clock_3_df (its top 25 CpGs) to CSV, without the row index
clock_3_df.iloc[:25].to_csv('top_25_cpgs_clock_3.csv', index=False)

In [19]:
# Write the first 25 rows of overlap_1_2_3_df (its top 25 CpGs) to CSV, without the row index
overlap_1_2_3_df.iloc[:25].to_csv('top_25_cpgs_overlap_1_2_3.csv', index=False)

In [20]:
# Write the first 25 rows of overlap_2_3_df (its top 25 CpGs) to CSV, without the row index
overlap_2_3_df.iloc[:25].to_csv('top_25_cpgs_overlap_2_3.csv', index=False)

### 5. Create and export csv files for the top 100 CpG sites with the highest beta coefficients for each clock

In [21]:
# Write the first 100 rows of clock_1_df (its top 100 CpGs) to CSV, without the row index
clock_1_df.iloc[:100].to_csv('top_100_cpgs_clock_1.csv', index=False)

In [22]:
# Write the first 100 rows of clock_2_df (its top 100 CpGs) to CSV, without the row index
clock_2_df.iloc[:100].to_csv('top_100_cpgs_clock_2.csv', index=False)

In [23]:
# Write the first 100 rows of clock_3_df (its top 100 CpGs) to CSV, without the row index
clock_3_df.iloc[:100].to_csv('top_100_cpgs_clock_3.csv', index=False)

In [24]:
# Write the first 100 rows of overlap_1_2_3_df (its top 100 CpGs) to CSV, without the row index
overlap_1_2_3_df.iloc[:100].to_csv('top_100_cpgs_overlap_1_2_3.csv', index=False)

In [25]:
# Write the first 100 rows of overlap_2_3_df (its top 100 CpGs) to CSV, without the row index
overlap_2_3_df.iloc[:100].to_csv('top_100_cpgs_overlap_2_3.csv', index=False)