In [1]:
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm


In [2]:
# the user should have downloaded the GTEx v8 eQTL data
# eqtl_dir: the directory contains the 'XXX.v8.signif_variant_gene_pairs.txt.gz' files
eqtl_dir = r"/path/to/your/GTEx/v8/eQTL"
tissue_frames = []
# os.listdir returns entries in arbitrary order; sorting keeps the tissue order
# (and therefore the row order of df_all) reproducible across machines
for filename in sorted(os.listdir(eqtl_dir)):
    if ".signif_variant_gene_pairs." not in filename:
        continue
    eqtl_path = os.path.join(eqtl_dir, filename)
    if not os.path.isfile(eqtl_path):
        continue
    # the tissue name is the part of the file name before the first dot,
    # e.g. 'Adipose_Subcutaneous.v8....' -> 'Adipose Subcutaneous'
    tissue = filename.split(".", maxsplit=1)[0].replace("_", " ")
    df_eqtl = pd.read_csv(
        eqtl_path,
        delimiter="\t",
        compression="gzip",
        usecols=["variant_id", "gene_id"],
    )
    # keep one row per eQTL-eGene pair within the tissue
    df_eqtl = df_eqtl.drop_duplicates()
    df_eqtl.insert(0, "tissue", tissue)
    tissue_frames.append(df_eqtl)
if not tissue_frames:
    # fail with an actionable message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(
        f"no '*.signif_variant_gene_pairs.*' files found in {eqtl_dir!r}"
    )
df_all = pd.concat(tissue_frames, axis=0, sort=False, ignore_index=True)
print(df_all.head())
print(df_all.shape)


                 tissue           variant_id            gene_id
0  Adipose Subcutaneous   chr1_64764_C_T_b38  ENSG00000227232.5
1  Adipose Subcutaneous  chr1_665098_G_A_b38  ENSG00000227232.5
2  Adipose Subcutaneous  chr1_666028_G_A_b38  ENSG00000227232.5
3  Adipose Subcutaneous  chr1_108826_G_C_b38  ENSG00000269981.1
4  Adipose Subcutaneous  chr1_126108_G_A_b38  ENSG00000269981.1
(71478479, 3)


In [3]:
# Load the gene annotation distributed with GTEx.
# gtf_path: the path of 'gencode.v26.GRCh38.genes.gtf' file given by GTEx
gtf_path = "/path/to/your/GTEx/v8/gencode.v26.GRCh38.genes.gtf"
# Only six GTF fields are read: 0, 2, 3, 4, 6 and the attribute field 8.
df_gtf = pd.read_csv(
    gtf_path,
    sep="\t",
    comment="#",
    header=None,
    usecols=[0, 2, 3, 4, 6, 8],
)
# Pull the gene ID out of the free-text attribute field, then discard that field.
gene_id_pattern = re.compile(r'gene_id "(.+?)"')
attributes = df_gtf.pop(8)
df_gtf["gene_id"] = [
    gene_id_pattern.search(attribute).group(1) for attribute in attributes
]
df_gtf.columns = ["chromosome", "type", "start", "end", "strand", "gene_id"]
# Keep only the gene and exon records.
df_gtf = df_gtf.loc[df_gtf["type"].isin(["exon", "gene"])]
print(df_gtf.head())
print(df_gtf.shape)


  chromosome  type  start    end strand            gene_id
0       chr1  gene  11869  14403      +  ENSG00000223972.5
2       chr1  exon  11869  12227      +  ENSG00000223972.5
3       chr1  exon  12613  12721      +  ENSG00000223972.5
4       chr1  exon  12975  13052      +  ENSG00000223972.5
5       chr1  exon  13221  14403      +  ENSG00000223972.5
(384871, 6)


In [5]:
# calculate the distances of eQTLs to eGenes
def _in_any_interval(positions, starts, ends):
    """Flag the positions covered by at least one closed interval [start, end].

    Args:
        positions: 1-D integer array of query coordinates.
        starts: 1-D integer array of interval starts.
        ends: 1-D integer array of interval ends, aligned with ``starts``.

    Returns:
        Boolean array with one entry per position; all False when there are
        no intervals.
    """
    if len(starts) == 0:
        return np.zeros(len(positions), dtype=bool)
    order = np.argsort(starts, kind="stable")
    sorted_starts = starts[order]
    # furthest end reached by any interval that starts at or before each sorted start
    furthest_end = np.maximum.accumulate(ends[order])
    # index of the last interval whose start is <= position (-1 when there is none)
    last = np.searchsorted(sorted_starts, positions, side="right") - 1
    return (last >= 0) & (furthest_end[np.maximum(last, 0)] >= positions)


# variant_id is '<chromosome>_<position>_...', e.g. 'chr1_64764_C_T_b38'
variant_parts = df_all["variant_id"].str.split("_", n=2, expand=True)
df_all["chromosome"] = variant_parts[0]
df_all["position"] = variant_parts[1].astype("int64")
del variant_parts

# Index the annotation once instead of scanning the whole GTF table for every gene.
gene_records = df_gtf[df_gtf["type"] == "gene"].drop_duplicates(subset="gene_id")
gene_info = {
    gene_id: (start, end, strand)
    for gene_id, start, end, strand in zip(
        gene_records["gene_id"],
        gene_records["start"],
        gene_records["end"],
        gene_records["strand"],
    )
}
exon_bounds = {
    gene_id: (exons["start"].to_numpy(), exons["end"].to_numpy())
    for gene_id, exons in df_gtf[df_gtf["type"] == "exon"].groupby("gene_id")
}
no_exons = (np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64))

df_dist = []
for gene_id, df_temp in tqdm(df_all.groupby("gene_id")):
    if gene_id not in gene_info:
        raise KeyError(f"gene {gene_id!r} has no 'gene' record in the GTF annotation")
    gene_start, gene_end, strand = gene_info[gene_id]
    position = df_temp["position"].to_numpy()
    before_gene = position < gene_start
    after_gene = position > gene_end
    in_exon = _in_any_interval(position, *exon_bounds.get(gene_id, no_exons))
    if strand == "+":
        # forward strand: smaller coordinates are upstream, distance inside the
        # gene is measured from the gene start
        upstream, downstream = before_gene, after_gene
        within_distance = position - gene_start
    else:
        # reverse strand: larger coordinates are upstream, distance inside the
        # gene is measured from the gene end
        upstream, downstream = after_gene, before_gene
        within_distance = gene_end - position
    # outside the gene body the label is up/downstream; inside it is exon or intron
    variant_position = np.where(
        upstream,
        "upstream",
        np.where(downstream, "downstream", np.where(in_exon, "exon", "intron")),
    )
    # outside the gene body the distance is measured to the nearest gene boundary
    variant_distance = np.where(
        before_gene,
        gene_start - position,
        np.where(after_gene, position - gene_end, within_distance),
    )
    # drop() returns a new frame, so the groupby slice is never modified in place
    df_dist.append(
        df_temp.drop(columns=["chromosome", "position"]).assign(
            variant_position=variant_position,
            variant_distance=variant_distance,
        )
    )
df_dist = pd.concat(df_dist, axis=0, sort=False, ignore_index=True)
print(df_dist.head())
print(df_dist.shape)
df_dist.to_csv(
    "eQTL_distance.txt.gz",
    sep="\t",
    compression="gzip",
    index=False,
)
del df_dist


100%|██████████| 34548/34548 [1:41:50<00:00,  5.65it/s]   


                 tissue              variant_id             gene_id  \
0  Adipose Subcutaneous  chrX_100548181_A_G_b38  ENSG00000000003.14   
1  Adipose Subcutaneous  chrX_100573255_C_T_b38  ENSG00000000003.14   
2  Adipose Subcutaneous  chrX_100592861_G_A_b38  ENSG00000000003.14   
3  Adipose Subcutaneous  chrX_100594020_G_A_b38  ENSG00000000003.14   
4  Adipose Subcutaneous  chrX_100594054_T_C_b38  ENSG00000000003.14   

  variant_position  variant_distance  
0       downstream             78928  
1       downstream             53854  
2       downstream             34248  
3       downstream             33089  
4       downstream             33055  
(71478479, 5)


In [6]:
# Reload the distance table written above. The tissue column is not read,
# so the same eQTL-eGene pair appears once per tissue until de-duplicated.
df = pd.read_csv(
    "eQTL_distance.txt.gz",
    sep="\t",
    compression="gzip",
    usecols=["gene_id", "variant_id", "variant_position", "variant_distance"],
)
print(df.shape)
# Keep the first occurrence of every eQTL-eGene pair.
df = df.drop_duplicates(subset=["gene_id", "variant_id"])
print(df.head())
print(df.shape)


(71478479, 4)
               variant_id             gene_id variant_position  \
0  chrX_100548181_A_G_b38  ENSG00000000003.14       downstream   
1  chrX_100573255_C_T_b38  ENSG00000000003.14       downstream   
2  chrX_100592861_G_A_b38  ENSG00000000003.14       downstream   
3  chrX_100594020_G_A_b38  ENSG00000000003.14       downstream   
4  chrX_100594054_T_C_b38  ENSG00000000003.14       downstream   

   variant_distance  
0             78928  
1             53854  
2             34248  
3             33089  
4             33055  
(13791909, 4)


In [8]:
# Gene length for every eGene: end - start + 1 of its 'gene' record.
genes = set(df["gene_id"])
is_egene_record = (df_gtf["type"] == "gene") & df_gtf["gene_id"].isin(genes)
df_gene_len = df_gtf.loc[is_egene_record, ["gene_id", "start", "end"]].copy()
df_gene_len["gene_length"] = df_gene_len["end"] - df_gene_len["start"] + 1
df_gene_len = df_gene_len.drop(columns=["start", "end"])
print(df_gene_len.head())
print(df_gene_len.shape)


              gene_id  gene_length
0   ENSG00000223972.5         2535
6   ENSG00000227232.5        15144
39  ENSG00000186092.4          918
42  ENSG00000238009.6        39929
52  ENSG00000233750.3         3812
(34548, 2)


In [9]:
# Exon length per eGene: the number of distinct positions covered by its exon
# records, so positions shared by overlapping exons are counted once.
is_egene_exon = (df_gtf["type"] == "exon") & df_gtf["gene_id"].isin(genes)
df_exon_len = df_gtf.loc[is_egene_exon, ["gene_id", "start", "end"]].copy()
exon_length_by_gene = {}
for gene_id, exons in tqdm(df_exon_len.groupby("gene_id")):
    covered = set()
    for exon_start, exon_end in zip(exons["start"], exons["end"]):
        # exon coordinates are treated as inclusive on both ends
        covered.update(range(exon_start, exon_end + 1))
    exon_length_by_gene[gene_id] = len(covered)
gene_ids = list(exon_length_by_gene)
lengths = list(exon_length_by_gene.values())
df_exon_len = pd.DataFrame({"gene_id": gene_ids, "exon_length": lengths})
print(df_exon_len.head())
print(df_exon_len.shape)


100%|██████████| 34548/34548 [00:16<00:00, 2144.94it/s]

              gene_id  exon_length
0  ENSG00000000003.14         4535
1   ENSG00000000005.5         1610
2  ENSG00000000419.12         1207
3  ENSG00000000457.13         5586
4  ENSG00000000460.16         4579
(34548, 2)





In [11]:
# Density of eQTLs on exons = total exonic eQTL count / total exon length.
# Count the eQTLs that fall on exons, per gene.
is_genic = df["variant_position"].isin(["exon", "intron"])
df_gene = df[is_genic]
df_exon = df_gene.loc[df_gene["variant_position"] == "exon"]
df_nvariants = df_exon.groupby("gene_id")[["variant_id"]].count()
# Genes without any exonic eQTL are added with an explicit zero count.
genes_novar = sorted(genes.difference(df_nvariants.index))
zero_counts = pd.DataFrame(
    {"variant_id": [0] * len(genes_novar)}, index=genes_novar
)
df_nvariants = pd.concat(
    [df_nvariants, zero_counts], axis=0, sort=False, ignore_index=False
)
# Attach the exon length of each gene and divide the two totals.
df_nvariants = df_nvariants.merge(df_exon_len, left_index=True, right_on="gene_id")
total_exon_variants = df_nvariants["variant_id"].sum()
total_exon_length = df_nvariants["exon_length"].sum()
density_exon = total_exon_variants / total_exon_length
density_exon


0.0012718772280142147

In [12]:
# Density of eQTLs on introns = total intronic eQTL count / total intron length.
# Count the eQTLs that fall on introns, per gene.
df_intron = df_gene.loc[df_gene["variant_position"] == "intron"]
df_nvariants = df_intron.groupby("gene_id")[["variant_id"]].count()
# Genes without any intronic eQTL are added with an explicit zero count.
genes_novar = sorted(genes.difference(df_nvariants.index))
zero_counts = pd.DataFrame(
    {"variant_id": [0] * len(genes_novar)}, index=genes_novar
)
df_nvariants = pd.concat(
    [df_nvariants, zero_counts], axis=0, sort=False, ignore_index=False
)
# Attach gene and exon lengths; the intron length is the gene length minus the exon length.
df_nvariants = df_nvariants.merge(
    df_gene_len, left_index=True, right_on="gene_id"
).merge(df_exon_len, on="gene_id")
df_nvariants["intron_length"] = (
    df_nvariants["gene_length"] - df_nvariants["exon_length"]
)
total_intron_variants = df_nvariants["variant_id"].sum()
total_intron_length = df_nvariants["intron_length"].sum()
density_intron = total_intron_variants / total_intron_length
density_intron


0.0011245422861891033

In [13]:
# Density of eQTLs within the median distance upstream/downstream of genes.
# formula: The number of all up/downstream eQTLs * 0.5 / ((Dup + Ddown) * number of genes)
is_flanking = df["variant_position"].isin(["upstream", "downstream"])
df_nogene = df[is_flanking]
# total number of upstream + downstream eQTLs
nvariants = df_nogene.shape[0]
# median distance on each side
dist_up = df_nogene.loc[
    df_nogene["variant_position"] == "upstream", "variant_distance"
].median()
dist_down = df_nogene.loc[
    df_nogene["variant_position"] == "downstream", "variant_distance"
].median()
print(dist_up, dist_down)
# number of eGenes
ngenes = len(genes)
density_up_down_stream = nvariants * 0.5 / (dist_up + dist_down) / ngenes
print(density_up_down_stream)


121925.0 121388.0
0.0007142395971031626


In [14]:
# Exon and intron eQTL densities, each relative to the up/downstream density.
ratio_exon = density_exon / density_up_down_stream
ratio_intron = density_intron / density_up_down_stream
print(ratio_exon)
print(ratio_intron)


1.780743091215802
1.5744608542428344
