## Script for displaying information from the publicly released data

Pass a single SNP ID or a list of SNP IDs to `get_all_tables` to display every table that contains them.  
For QTLs, we find that a good p-value cutoff is 0.0001, which corresponds to roughly 10% FDR. Check the permuted lists for a more accurate FDR-corrected p-value.

In [2]:
import pandas as pd
import numpy as np
import gc

# Root URL of the publicly released dataset. It deliberately has no trailing
# slash: every consumer builds its path as f"{base_dir}/...", so a trailing
# slash here would put an empty "//" segment into each request URL.
base_dir = "http://bartzabel.ls.manchester.ac.uk/orozcolab/SNP2Mechanism/PsA_cleaned_analysis"


In [3]:
# Build `gene_mapper` (unversioned gene_id -> gene_name) from the GENCODE
# annotation, then release the large intermediate frames.
#
# NOTE(review): this unpickles a file fetched over plain http. Unpickling can
# execute arbitrary code, so only run this against a source you trust.
gtf_annotation_df = pd.read_pickle(f"{base_dir}/gencode_gtf.pickle")

# Keep protein-coding transcript records and drop columns that are entirely NaN.
gtf_transcripts = gtf_annotation_df[
    (gtf_annotation_df["feature"] == "transcript")
    & (gtf_annotation_df["transcript_type"] == "protein_coding")
].dropna(axis=1, how='all')

# Strip the ".N" suffix so the IDs are unversioned.
gtf_transcripts["gene_id"] = gtf_transcripts["gene_id"].str.split(".").str[0]
gtf_transcripts["transcript_id"] = gtf_transcripts["transcript_id"].str.split(".").str[0]

# TSS is `start` on the "+" strand and `end` otherwise. Computed column-wise
# rather than with a per-row Python lambda, which is far slower.
gtf_transcripts["TSS_start"] = (
    gtf_transcripts["start"]
    .where(gtf_transcripts["strand"] == "+", gtf_transcripts["end"])
    .astype("int64")
)

# If a gene_id occurs on several transcripts, the last gene_name seen wins.
gene_mapper = dict(zip(gtf_transcripts['gene_id'], gtf_transcripts['gene_name']))

# Only gene_mapper is needed from here on; free the annotation frames.
del gtf_annotation_df
del gtf_transcripts
gc.collect()

55

In [4]:
# Columns kept from every nominal QTL results file.
_NOMINAL_COLUMNS = [
    "phe_id",
    "phe_chr",
    "phe_from",
    "phe_to",
    "dist_phe_var",
    "var_id",
    "var_from",
    "nom_pval",
    "r_squared",
    "slope",
    "best_hit",
]


def _load_nominal(assay_dir, phenotype, cell_type):
    """Read one space-separated nominal QTL table from the release.

    assay_dir is the folder under QTL_analysis/ ("RNA", "ATAC" or "HiC"),
    phenotype is the file-name prefix ("RNA", "ATAC", "ins" or "loop") and
    cell_type is "CD4" or "CD8".
    """
    url = f"{base_dir}/QTL_analysis/{assay_dir}/output_final/{phenotype}_nominal_{cell_type}_merged.txt"
    return pd.read_csv(url, sep=" ", usecols=_NOMINAL_COLUMNS)


# Nominal QTL tables: expression (RNA), chromatin accessibility (ATAC), and
# the two Hi-C derived phenotypes (ins, loop), each for CD4 and CD8.
RNA_nominal_CD4 = _load_nominal("RNA", "RNA", "CD4")
RNA_nominal_CD8 = _load_nominal("RNA", "RNA", "CD8")
ATAC_nominal_CD8 = _load_nominal("ATAC", "ATAC", "CD8")
ATAC_nominal_CD4 = _load_nominal("ATAC", "ATAC", "CD4")
ins_nominal_CD8 = _load_nominal("HiC", "ins", "CD8")
ins_nominal_CD4 = _load_nominal("HiC", "ins", "CD4")
loop_nominal_CD8 = _load_nominal("HiC", "loop", "CD8")
loop_nominal_CD4 = _load_nominal("HiC", "loop", "CD4")

# Hi-C allelic-imbalance tables; the first CSV column is used as the index.
output_dataframe_CD8 = pd.read_csv(
    f"{base_dir}/HiC_allelic_imbalance/output_dataframe_CD8.csv",
    index_col=0,
)
output_dataframe_CD4 = pd.read_csv(
    f"{base_dir}/HiC_allelic_imbalance/output_dataframe_CD4.csv",
    index_col=0,
)

# ATAC allelic-imbalance table, minus the columns that are not displayed.
# NOTE(review): this unpickles a file fetched over plain http; unpickling can
# execute arbitrary code, so only run it against a trusted source.
o = f"{base_dir}/ATAC_allelic_imbalance/combined_p_vals_files/"
all_SNPs_all = pd.read_pickle(o + "all_SNPs_all.pkl").drop(
    columns=["eQTLgen_gene", "snp", "hsc_genes", "tcell_genes", "all_genes"]
)

In [5]:
# Add a gene_name column to both eQTL tables by looking each phe_id up in
# gene_mapper; IDs missing from the mapper end up as NaN.
for _eqtl_table in (RNA_nominal_CD4, RNA_nominal_CD8):
    _eqtl_table["gene_name"] = _eqtl_table["phe_id"].map(gene_mapper)
del _eqtl_table

In [6]:
def get_all_tables(vars):
    """Display the rows of every loaded table that match the given variant(s).

    For each table, a title is printed and the rows whose variant-ID column
    is in ``vars`` are shown with the notebook's ``display``. The QTL tables
    are matched on ``var_id``, the ATAC allelic-imbalance table on ``ID`` and
    the loop allelic-imbalance tables on ``rsID``.

    Parameters
    ----------
    vars : str or list-like of str
        A single variant ID (e.g. ``"rs4409785"``) or any list-like
        collection of them (list, tuple, set, NumPy array, pandas Series).
        The parameter name shadows the ``vars`` builtin; it is kept so that
        existing keyword calls continue to work.

    Returns
    -------
    None
        The tables are rendered as a side effect.
    """
    # The old `type(vars) == list` check wrapped tuples, sets, arrays and
    # Series in a one-element list, so `isin` silently matched nothing.
    # is_list_like is False for strings, so a lone ID is still wrapped.
    if pd.api.types.is_list_like(vars):
        wanted = list(vars)
    else:
        wanted = [vars]

    # (printed title, table, column holding the variant ID), in display order.
    tables = (
        ("eQTL CD4", RNA_nominal_CD4, "var_id"),
        ("eQTL CD8", RNA_nominal_CD8, "var_id"),
        ("caQTL CD4", ATAC_nominal_CD4, "var_id"),
        ("caQTL CD8", ATAC_nominal_CD8, "var_id"),
        ("loopQTL CD4", loop_nominal_CD4, "var_id"),
        ("loopQTL CD8", loop_nominal_CD8, "var_id"),
        ("insQTL CD4", ins_nominal_CD4, "var_id"),
        ("insQTL CD8", ins_nominal_CD8, "var_id"),
        ("allelic imbalance", all_SNPs_all, "ID"),
        ("loops with allelic imbalance CD4", output_dataframe_CD4, "rsID"),
        ("loops with allelic imbalance CD8", output_dataframe_CD8, "rsID"),
    )

    for title, table, id_column in tables:
        print(title)
        display(table[table[id_column].isin(wanted)])

In [8]:
# Example: query a single variant ID; the tables that follow are its output.
get_all_tables("rs4409785")

eQTL CD4


Unnamed: 0,phe_id,phe_chr,phe_from,phe_to,dist_phe_var,var_id,var_from,nom_pval,r_squared,slope,best_hit,gene_name


eQTL CD8


Unnamed: 0,phe_id,phe_chr,phe_from,phe_to,dist_phe_var,var_id,var_from,nom_pval,r_squared,slope,best_hit,gene_name


caQTL CD4


Unnamed: 0,phe_id,phe_chr,phe_from,phe_to,dist_phe_var,var_id,var_from,nom_pval,r_squared,slope,best_hit
5241669,18941,chr11,94786466,94786965,791293,rs4409785,95578258,0.00175586,0.189787,-0.704744,1
5242633,18965,chr11,95213587,95214086,364172,rs4409785,95578258,0.00705785,0.144491,-0.61492,0
5243047,18972,chr11,95263313,95263812,314446,rs4409785,95578258,0.00229941,0.18112,-0.688463,0
5243258,18977,chr11,95329634,95330133,248125,rs4409785,95578258,0.00292194,0.173371,-0.673575,0
5243562,18985,chr11,95547920,95548419,29839,rs4409785,95578258,0.00425772,0.161107,-0.649314,0
5243591,18986,chr11,95578037,95578536,0,rs4409785,95578258,7.21404e-09,0.512963,1.15862,1
5243745,18989,chr11,95605192,95605691,-26934,rs4409785,95578258,0.0072654,0.143534,0.61288,0
5243872,18993,chr11,95674583,95675082,-96325,rs4409785,95578258,0.00125237,0.20056,0.72447,0
5243893,18994,chr11,95680358,95680857,-102100,rs4409785,95578258,0.00743416,0.142775,0.611257,0
5248264,19073,chr11,96561061,96561560,-982803,rs4409785,95578258,0.00888568,0.136871,-0.598484,0


caQTL CD8


Unnamed: 0,phe_id,phe_chr,phe_from,phe_to,dist_phe_var,var_id,var_from,nom_pval,r_squared,slope,best_hit
5622405,18952,chr11,95089538,95090037,488221,rs4409785,95578258,0.00343752,0.127971,-0.606787,0
5623909,18986,chr11,95578037,95578536,0,rs4409785,95578258,2.58439e-10,0.472266,1.16566,1
5624122,18992,chr11,95660004,95660503,-81746,rs4409785,95578258,0.00489258,0.118989,0.585104,0
5627965,19072,chr11,96555776,96556275,-977518,rs4409785,95578258,0.00629423,0.112558,-0.569073,0
5628048,19074,chr11,96567112,96567611,-988854,rs4409785,95578258,0.00797856,0.106492,-0.553528,0


loopQTL CD4


Unnamed: 0,phe_id,phe_chr,phe_from,phe_to,dist_phe_var,var_id,var_from,nom_pval,r_squared,slope,best_hit
4660457,65112,chr11,95152501,95707500,0,rs4409785,95578258,0.000367159,0.26341,-1.02131,0
4660702,65117,chr11,95182501,95710000,0,rs4409785,95578258,5.82038e-05,0.322417,-1.12992,1
4660899,65121,chr11,95227501,96392500,0,rs4409785,95578258,0.0018154,0.208894,0.909502,0
4660976,65122,chr11,95230001,95690000,0,rs4409785,95578258,0.00899126,0.151604,-0.774812,0
4661873,65140,chr11,95580001,95705000,-1743,rs4409785,95578258,1.19512e-10,0.631489,1.58133,1


loopQTL CD8


Unnamed: 0,phe_id,phe_chr,phe_from,phe_to,dist_phe_var,var_id,var_from,nom_pval,r_squared,slope,best_hit
4825771,65074,chr11,94927501,95792500,0,rs4409785,95578258,0.00948784,0.124726,-0.719906,0
4829545,65137,chr11,95437501,95675000,0,rs4409785,95578258,0.00114957,0.188811,-0.885749,1
4829646,65140,chr11,95580001,95705000,-1743,rs4409785,95578258,2.26243e-12,0.622531,1.60834,1


insQTL CD4


Unnamed: 0,phe_id,phe_chr,phe_from,phe_to,dist_phe_var,var_id,var_from,nom_pval,r_squared,slope,best_hit
8063817,76165,chr11,95325001,95350000,228258,rs4409785,95578258,0.007617,0.150982,-0.77308,0
8064654,76177,chr11,95625001,95650000,-46743,rs4409785,95578258,0.000181,0.275405,1.04411,1
8064881,76178,chr11,95650001,95675000,-71743,rs4409785,95578258,0.000228,0.268107,1.03019,1
8065141,76179,chr11,95675001,95700000,-96743,rs4409785,95578258,0.001551,0.205644,0.902235,0


insQTL CD8


Unnamed: 0,phe_id,phe_chr,phe_from,phe_to,dist_phe_var,var_id,var_from,nom_pval,r_squared,slope,best_hit
8547177,76177,chr11,95625001,95650000,-46743,rs4409785,95578258,0.000648,0.20568,0.92447,0
8547301,76178,chr11,95650001,95675000,-71743,rs4409785,95578258,0.00035,0.223567,0.963829,1


allelic imbalance


Unnamed: 0,CHROM,POS,ID,REF,ALT,combined_p_val_greater,combined_p_val_less,tot_REF,tot_ALT,ratio,n_pat,corrected_p_val_greater,corrected_p_val_less,TF_remap,TF_JASPAR,eQTLgen_symbol,eQTLgen_pval,CD4_lowest_allele_specific,CD8_lowest_allele_specific,ATAC_hic_corr_score
264902,chr11,95578258,rs4409785,T,C,0.999995,4.4e-05,3.0,23.0,7.666667,1.0,1.0,0.001012,"{SMC1, TRIM22, TP63, NFATC2, STAG2, CTCF, RAD2...","{Neurod2, PRDM9, GRHL2, ZFP42, TP63, SIX1, ZNF...","[SESN3, RP11-712B9.2]","[SESN3: 61.3, RP11-712B9.2: 7.1]",,,0.06


loops with allelic imbalance CD4


Unnamed: 0,chrA,startA,endA,chrB,startB,endB,loopID,loopScore,rsID,rsCoord,combined_Pval_greater,corrected_Pval_greater,combined_Pval_less,corrected_Pval_less,tot_REF,tot_ALT,n_pat


loops with allelic imbalance CD8


Unnamed: 0,chrA,startA,endA,chrB,startB,endB,loopID,loopScore,rsID,rsCoord,combined_Pval_greater,corrected_Pval_greater,combined_Pval_less,corrected_Pval_less,tot_REF,tot_ALT,n_pat
