### Gene Expression Data Import
Author: Catia Antunes   
Date: 04/03/2024

In [1]:
# Imports
import requests
import pandas as pd

In [27]:
# Function to get the data

def get_geo_data(gene_list, timeout=30):
    """Search NCBI Entrez Gene for the given symbols and print the GEO page of the first hit.

    The symbols are sent to the E-utilities ``esearch`` endpoint (``db=gene``,
    JSON output). The first ID in the returned ``idlist`` is then inserted into
    a GEO series URL, and the body of that page is printed when the request
    succeeds. Every failure (no usable input, network error, non-200 status,
    malformed JSON, no IDs found) is reported with ``print``.

    Parameters
    ----------
    gene_list : iterable of str, str, or None
        Gene symbols to search for. A single string is treated as one symbol.
        Entries that are not non-empty strings (e.g. NaN values coming from a
        pandas column) and repeated symbols are ignored.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    None
        All results and errors are printed rather than returned.
    """
    if gene_list is None:
        gene_list = []
    elif isinstance(gene_list, str):
        # A bare string would otherwise be iterated character by character
        gene_list = [gene_list]

    # Keep only usable symbols, de-duplicated in first-seen order
    symbols = list(dict.fromkeys(
        gene.strip()
        for gene in gene_list
        if isinstance(gene, str) and gene.strip()
    ))
    if not symbols:
        print("No gene symbols supplied; nothing to query.")
        return

    # Let requests URL-encode the query. Spaces between symbols are encoded
    # as '+', so plain symbols produce the same query string as before.
    # NOTE(review): space/'+'-separated terms are presumably combined with AND
    # by Entrez, which would require one record to match every symbol -
    # confirm whether ' OR ' (or one query per gene) is what is intended.
    esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    search_params = {"db": "gene", "term": " ".join(symbols), "retmode": "json"}

    try:
        response = requests.get(esearch_url, params=search_params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error accessing GEO API: {exc}")
        return

    if response.status_code != 200:
        print(f"Error accessing GEO API: {response.status_code}")
        return

    try:
        gene_ids = response.json().get('esearchresult', {}).get('idlist', [])
    except ValueError:
        # response.json() raises a ValueError subclass on a non-JSON body
        print("Error accessing GEO API: response was not valid JSON")
        return

    # An empty idlist previously crashed with IndexError on gene_ids[0]
    if not gene_ids:
        print("No Entrez Gene IDs found for the supplied genes.")
        return

    # NOTE(review): this inserts an Entrez Gene ID into a GEO series/GDS path;
    # confirm that this URL pattern actually resolves to expression data.
    first_id = gene_ids[0]
    geo_query_url = f"https://www.ncbi.nlm.nih.gov/geo/series/{first_id}/gds{first_id}/"

    try:
        geo_response = requests.get(geo_query_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error accessing GEO data: {exc}")
        return

    if geo_response.status_code == 200:
        # Print the raw page body returned by GEO
        print("Gene Expression Data:")
        print(geo_response.text)
    else:
        print(f"Error accessing GEO data: {geo_response.status_code}")

In [23]:
# Read the top-25 CpG table for clock 1 from CSV and preview its first five rows
Clock1_top25 = pd.read_csv(
    filepath_or_buffer='../../FunctionalAnalysis/top_25_cpgs_clock_1.csv',
)
Clock1_top25.head(n=5)


Unnamed: 0,index,var,beta_clock1,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
0,143,cg11728741,0.928874,8,41896469,ANK1,ANK1,286,conserved gene and region in mouse and human,ANK1_Exon,Promoter (<=1kb),8.0,22975146.0,Ank1
1,297,cg24352905,0.853268,5,77645452,OTP,OTP,23440,conserved gene and region in mouse and human,OTP_Promoter,Promoter (5-6kb),13.0,94869014.0,Otp
2,98,cg08938156,0.843749,3,147409417,LOC440982,ZIC1,440982,mapped to different gene in human and mouse,LOC440982_fiveUTR,5' UTR,9.0,91365746.0,Zic1
3,160,cg13058338,0.63866,12,54173598,SMUG1,SMUG1,23583,mapped to different gene in human and mouse,SMUG1_Intron,"Intron (ENST00000635234.1/23583, intron 2 of 4)",15.0,103146996.0,Smug1
4,316,cg26067250,0.510613,2,172085721,DLX1,DLX1,1745,mapped to different gene in human and mouse,DLX1_Exon,Promoter (<=1kb),2.0,71530037.0,Dlx1


In [20]:
# Read the top-25 CpG table for clock 2 from CSV and preview its first five rows
Clock2_top25 = pd.read_csv(
    filepath_or_buffer='../../FunctionalAnalysis/top_25_cpgs_clock_2.csv',
)
Clock2_top25.head(n=5)

Unnamed: 0,index,var,beta_clock2,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
0,142,cg05474883,0.952198,4,82354926,HNRNPD,HNRNPD,3184,conserved gene and region in mouse and human,HNRNPD_threeUTR,3' UTR,5.0,99962129.0,Hnrnpd
1,272,cg09710440,0.877027,7,104328932,LHFPL3,LHFPL3,375612,"mapped in human, not mapped in mouse",LHFPL3_Exon,Promoter (<=1kb),,,
2,783,cg26512254,0.69306,7,27200144,HOTTIP,HOXA13,100316868,mapped to different gene in human and mouse,HOTTIP_Intron,"Intron (ENST00000421733.1/100316868, intron 1 ...",6.0,52260841.0,Hoxa13
3,238,cg08681110,0.647719,6,113857322,MARCKS,MARCKS,4082,conserved gene and region in mouse and human,MARCKS_Promoter,Promoter (<=1kb),10.0,37138948.0,Marcks
4,253,cg09227056,0.615235,2,176075721,EVX2,EVX2,344191,conserved gene and region in mouse and human,EVX2_Intergenic_downstream,Distal Intergenic,2.0,74651253.0,Evx2


In [28]:
# Read the top-25 CpG table for clock 3 from CSV and preview its first five rows
Clock3_top25 = pd.read_csv(
    filepath_or_buffer='../../FunctionalAnalysis/top_25_cpgs_clock_3.csv',
)
Clock3_top25.head(n=5)

Unnamed: 0,index,var,beta_clock3,CHR,bp_hg38,Gene,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
0,493,cg26512254,2.638299,7,27200144,HOTTIP,HOXA13,100316868,mapped to different gene in human and mouse,HOTTIP_Intron,"Intron (ENST00000421733.1/100316868, intron 1 ...",6.0,52260841.0,Hoxa13
1,99,cg09710440,1.854689,7,104328932,LHFPL3,LHFPL3,375612,"mapped in human, not mapped in mouse",LHFPL3_Exon,Promoter (<=1kb),,,
2,34,cg18418719,1.806028,3,70972083,FOXP1,FOXP1,27086,mapped to different gene in human and mouse,FOXP1_threeUTR,3' UTR,6.0,98941067.0,Foxp1
3,134,cg24352905,1.738218,5,77645452,OTP,OTP,23440,conserved gene and region in mouse and human,OTP_Promoter,Promoter (5-6kb),13.0,94869014.0,Otp
4,576,cg05474883,1.463495,4,82354926,HNRNPD,HNRNPD,3184,conserved gene and region in mouse and human,HNRNPD_threeUTR,3' UTR,5.0,99962129.0,Hnrnpd


In [25]:
# Read the top-25 CpG table for the clock 2 / clock 3 overlap and preview its first five rows
Clock_overlap_2_3_top25 = pd.read_csv(
    filepath_or_buffer='../../FunctionalAnalysis/top_25_cpgs_overlap_2_3.csv',
)
Clock_overlap_2_3_top25.head(n=5)

Unnamed: 0,index_x,var_x,beta_clock2,CHR_x,bp_hg38_x,Gene,Gene.hg19_x,ENTREZID_x,conservationInMouse_x,GeneRegionID_x,...,CHR_y,bp_hg38_y,Gene.hg19_y,ENTREZID_y,conservationInMouse_y,GeneRegionID_y,annotation_y,CHR_mm10_y,bp_mm10_y,Gene_mm10_y
0,142,cg05474883,0.952198,4,82354926,HNRNPD,HNRNPD,3184,conserved gene and region in mouse and human,HNRNPD_threeUTR,...,4,82354926,HNRNPD,3184,conserved gene and region in mouse and human,HNRNPD_threeUTR,3' UTR,5,99962129.0,Hnrnpd
1,783,cg26512254,0.69306,7,27200144,HOTTIP,HOXA13,100316868,mapped to different gene in human and mouse,HOTTIP_Intron,...,7,27200144,HOXA13,100316868,mapped to different gene in human and mouse,HOTTIP_Intron,"Intron (ENST00000421733.1/100316868, intron 1 ...",6,52260841.0,Hoxa13
2,238,cg08681110,0.647719,6,113857322,MARCKS,MARCKS,4082,conserved gene and region in mouse and human,MARCKS_Promoter,...,6,113857322,MARCKS,4082,conserved gene and region in mouse and human,MARCKS_Promoter,Promoter (<=1kb),10,37138948.0,Marcks
3,803,cg27201382,0.523618,11,27720483,BDNF,BDNF,627,conserved gene and region in mouse and human,BDNF_fiveUTR,...,11,27720483,BDNF,627,conserved gene and region in mouse and human,BDNF_fiveUTR,Promoter (<=1kb),2,109676257.0,Bdnf
4,599,cg19927064,0.499713,12,48818919,CACNB3,CACNB3,784,conserved gene and region in mouse and human,CACNB3_fiveUTR,...,12,48818919,CACNB3,784,conserved gene and region in mouse and human,CACNB3_fiveUTR,5' UTR,15,98635158.0,Cacnb3


In [26]:
# Read the top-25 CpG table for the clock 1 / 2 / 3 overlap and preview its first five rows
Clock_overlap_1_2_3_top25 = pd.read_csv(
    filepath_or_buffer='../../FunctionalAnalysis/top_25_cpgs_overlap_1_2_3.csv',
)
Clock_overlap_1_2_3_top25.head(n=5)

Unnamed: 0,index_x,var_x,beta_clock1,CHR_x,bp_hg38_x,Gene,Gene.hg19_x,ENTREZID_x,conservationInMouse_x,GeneRegionID_x,...,CHR,bp_hg38,Gene.hg19,ENTREZID,conservationInMouse,GeneRegionID,annotation,CHR_mm10,bp_mm10,Gene_mm10
0,98,cg08938156,0.843749,3,147409417,LOC440982,ZIC1,440982,mapped to different gene in human and mouse,LOC440982_fiveUTR,...,3,147409417,ZIC1,440982,mapped to different gene in human and mouse,LOC440982_fiveUTR,5' UTR,9.0,91365746.0,Zic1
1,329,cg27201382,0.444645,11,27720483,BDNF,BDNF,627,conserved gene and region in mouse and human,BDNF_fiveUTR,...,11,27720483,BDNF,627,conserved gene and region in mouse and human,BDNF_fiveUTR,Promoter (<=1kb),2.0,109676257.0,Bdnf
2,144,cg11904056,0.341791,5,93583204,NR2F1-AS1,NR2F1,441094,mapped to different gene in human and mouse,NR2F1-AS1_Intron,...,5,93585317,NR2F1,441094,mapped to different gene in human and mouse,NR2F1-AS1_Exon,Promoter (<=1kb),13.0,78198288.0,Nr2f1
3,44,cg03820088,0.337644,7,23522229,TRA2A,TRA2A,29896,conserved gene but different region in human a...,TRA2A_fiveUTR,...,7,23522229,TRA2A,29896,conserved gene but different region in human a...,TRA2A_fiveUTR,5' UTR,6.0,49252905.0,Tra2a
4,58,cg04998737,0.336275,5,77645403,OTP,OTP,23440,conserved gene and region in mouse and human,OTP_Promoter,...,5,77645452,OTP,23440,conserved gene and region in mouse and human,OTP_Promoter,Promoter (5-6kb),13.0,94869014.0,Otp


In [29]:
# Extract the 'Gene' column of every loaded table as a plain Python list,
# one list per clock / overlap table (same order on both sides of the assignment)
(
    Clock1_genes,
    Clock2_genes,
    Clock3_genes,
    Clock_overlap_2_3_genes,
    Clock_overlap_1_2_3_genes,
) = (
    table['Gene'].tolist()
    for table in (
        Clock1_top25,
        Clock2_top25,
        Clock3_top25,
        Clock_overlap_2_3_top25,
        Clock_overlap_1_2_3_top25,
    )
)

In [31]:
# Show the clock 1 gene list as the cell output (the list is not de-duplicated)
Clock1_genes

['ANK1',
 'OTP',
 'LOC440982',
 'SMUG1',
 'DLX1',
 'BDNF',
 'COL4A1',
 'EVX2',
 'NR2F1-AS1',
 'TRA2A',
 'OTP',
 'IGF2BP3',
 'MGMT',
 'LCOR',
 'KDM2A',
 'ZNF326',
 'LHFPL3',
 'IQCM',
 'FMO3',
 'IKZF1',
 'CELF6',
 'ZIC2',
 'CELF6',
 'KCNC4',
 'RORA']

In [None]:
# Call the function to get the data: pass the clock 1 gene list to get_geo_data,
# which prints either the retrieved page or an error message
get_geo_data(Clock1_genes)