In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# This script is to zip together potential SNPs and genes which appear "dosage-compensated". First, we identify 
# potential "dosage compensated" genes and load them in. Then, we filter the GTEx database to only these genes
# across all tissues. Next, we identify SNPS that differ between Ethan, Dad, Mom, and Eric. After, we tie these SNPs together to the genes in the curated GTEx list. We assess the predicted
# genotype fold change based on the results of GTEx, and ask whether our calculated FC is
# significantly different. 

# One issue I'm not addressing here is whether or not non-canonical SNPs have an effect. Only SNPs with an rs number
# Not considering all different allele possibilities at a SNP either, unless these are already quantified within GTeX
# (e.g., if the possible allele combos are AA, AT, AG, TT, TG, GG, but only A/T is entered in GTeX,
# I only consider the alleles that have an entry in GTeX)

In [2]:
# Step 1: Load in genes of interest and the calculated FC between Ethan and Eric. The file format will end up being:
# Refseq_ID   Common_ID  BaseMean  Log2_FC  Log2_FC_SE  
fc_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/dosagecompensated_genes_norepeats.csv')
fc_df = fc_df.iloc[:,[1,2,3,4]]
common_id = pd.read_csv('/Users/sahu0957/backup/genome_files/hg38_refseq_to_common_id.txt', header=None)
common_id = common_id[0].str.rsplit("_",n=1, expand=True)
common_id.columns =['Row.names', 'Common_ID']

fc_df = pd.merge(fc_df,common_id,indicator=False, how='left')
fc_df = fc_df.drop_duplicates(keep='last')

fc_df.sort_values(fc_df.columns[-1], ascending = False)

Unnamed: 0,Row.names,baseMean,log2FoldChange,lfcSE,Common_ID
18,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
2,NM_001256370,6735.897125,0.011107,0.326691,SAMSN1
24,NR_145819,11666.98557,0.098365,0.260152,RNA45SN1
4,NM_001282934,1500.029164,0.091793,0.178404,PRDM15
12,NM_001348241,240.161311,-0.412931,0.375144,PCBP3
11,NM_001348240,277.710296,-0.4292,0.381264,PCBP3
14,NM_003489,2735.321866,-0.119453,0.298161,NRIP1
13,NM_001352596,142.67113,0.128563,0.279754,NCAM2
15,NM_015358,1627.511088,0.106304,0.169085,MORC3
7,NM_001320445,1627.609446,0.10657,0.169128,MORC3


In [72]:
#FIXME: I got frustrated with pandas and hard-coded everything. Come back and automate!

# Step 2: Load in Family E SNP VCF.
famE_snps_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/FamE.vcf',header=236,sep='\t')

# Get only canonical SNPs on chr21 (we're assuming no trans-acting enhancers here!)
famE_snps_df = famE_snps_df[famE_snps_df['#CHROM'].str.contains("chr21")]
famE_snps_df = famE_snps_df[famE_snps_df['ID'].str.contains("rs")]

# Can remove unhelpful SNP calls (Mom and Dad have the same alleles) if needed. Done later in this notebook
# For now we're just gonna assume the SNP matches reference if it's an error. Depth filter likely will remove these anyway. 
# We should later change this to match Mendelian inheritance if the allele is available in both parents 
# (phasing info would be even better but more difficult to implement)

famE_snps_df['Eli_allele1'] = famE_snps_df['Eli'].str[0]
famE_snps_df['Eli_allele2'] = famE_snps_df['Eli'].str[2]

famE_snps_df.loc[famE_snps_df['Eli_allele1'] == ".", 'Eli_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eli_allele2'] == ".", 'Eli_allele2'] = 0

famE_snps_df['Eliz_allele1'] = famE_snps_df['Elizabeth'].str[0]
famE_snps_df['Eliz_allele2'] = famE_snps_df['Elizabeth'].str[2]

famE_snps_df.loc[famE_snps_df['Eliz_allele1'] == ".", 'Eliz_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eliz_allele2'] == ".", 'Eliz_allele2'] = 0

famE_snps_df['Eric_allele1'] = famE_snps_df['Eric'].str[0]
famE_snps_df['Eric_allele2'] = famE_snps_df['Eric'].str[2]

famE_snps_df.loc[famE_snps_df['Eric_allele1'] == ".", 'Eric_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eric_allele2'] == ".", 'Eric_allele2'] = 0

famE_snps_df['Ethan_allele1'] = famE_snps_df['Ethan'].str[0]
famE_snps_df['Ethan_allele2'] = famE_snps_df['Ethan'].str[2]
famE_snps_df['Ethan_allele3'] = famE_snps_df['Ethan'].str[4]

famE_snps_df.loc[famE_snps_df['Ethan_allele1'] == ".", 'Ethan_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Ethan_allele2'] == ".", 'Ethan_allele2'] = 0
famE_snps_df.loc[famE_snps_df['Ethan_allele3'] == ".", 'Ethan_allele3'] = 0

In [73]:
# Extremely annoyingly, VCF files bury their depth info within the info column... with no order on the other data. So we have
# to do some extremely tedious cleanup to fetch that data. There's for sure a better way to do this.
info_df = famE_snps_df['INFO'].str.split(';',expand=True)
mask = info_df.apply(lambda col: col.str.contains('DP=').any(),
                axis=0)
sub_df = info_df.loc[: , mask]
sub_df = sub_df.replace(r'^(?!.*DP).*$', '', regex=True)
sub_df['Depth'] = sub_df.iloc[:,0] + sub_df.iloc[:,1]
sub_df['Depth'] = sub_df['Depth'].str.replace('DP=','').astype(int)
famE_snps_df['Depth'] = sub_df['Depth']

# From here you can apply a depth filter, then remove columns which are no longer useful
# 
famE_snps_df = famE_snps_df[famE_snps_df['Depth'] >= 100]
famE_snps_df = famE_snps_df.iloc[:,[2,13,14,15,16,17,18,19,20,21]]
famE_snps_df



Unnamed: 0,ID,Eli_allele1,Eli_allele2,Eliz_allele1,Eliz_allele2,Eric_allele1,Eric_allele2,Ethan_allele1,Ethan_allele2,Ethan_allele3
6422064,rs796574937,0,1,0,1,0,1,0,0,1
6422066,rs796473723,0,1,0,1,0,1,0,0,1
6422071,rs796625648,0,1,0,1,0,1,0,0,1
6422072,rs796888640,0,1,0,1,0,1,0,0,1
6422079,rs749596923,0,1,0,1,0,1,0,0,1
6422086,rs796830124,0,1,0,1,0,1,0,1,1
6422089,rs369998822,0,1,0,1,0,0,0,0,0
6422090,rs372700624,0,1,0,1,0,1,0,0,1
6422091,rs373368732,0,1,0,1,0,1,0,0,1
6422094,rs796604716,0,1,0,1,0,1,0,0,1


In [76]:
# The combined set of all relevant eQTLs from GTeX
eqtl_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/all_eqtls.txt',sep='\t')
pd.set_option("display.max_columns", None)
eqtl_df

Unnamed: 0,gene_id,gene_name,gene_chr,gene_start,gene_end,strand,num_var,beta_shape1,beta_shape2,true_df,pval_true_df,variant_id,tss_distance,chr,variant_pos,ref,alt,num_alt_per_site,rs_id_dbSNP151_GRCh38p7,minor_allele_samples,minor_allele_count,maf,ref_factor,pval_nominal,slope,slope_se,pval_perm,pval_beta,qval,pval_nominal_threshold,log2_aFC,log2_aFC_lower,log2_aFC_upper
0,ENSG00000227232.5,WASH7P,chr1,14410,29553,-,1364,1.029840,294.487,455.958,6.290630e-08,chr1_64764_C_T_b38,35211,chr1,64764,C,T,1,rs769952832,70,71,0.061101,1,1.016610e-08,0.586346,0.100677,0.000100,1.321120e-05,1.011410e-05,0.000506,0.584194,0.435298,0.744545
1,ENSG00000268903.1,RP11-34P13.15,chr1,135141,135895,-,1863,1.048720,330.017,441.174,8.888300e-04,chr1_103147_C_T_b38,-32748,chr1,103147,C,T,1,rs866355763,18,18,0.015491,1,3.473320e-04,-0.612097,0.169958,0.241904,2.337000e-01,8.107420e-02,0.000473,-1.823931,-4.015491,-0.676333
2,ENSG00000269981.1,RP11-34P13.16,chr1,137682,137965,-,1868,1.049400,358.230,449.483,3.087250e-04,chr1_108826_G_C_b38,-29139,chr1,108826,G,C,1,rs62642117,40,40,0.034423,1,1.195250e-04,0.431229,0.111226,0.092691,9.179110e-02,3.754340e-02,0.000436,0.771338,0.523984,1.052938
3,ENSG00000241860.6,RP11-34P13.13,chr1,141474,173862,-,2066,1.036650,389.634,449.333,4.797330e-07,chr1_14677_G_A_b38,-159185,chr1,14677,G,A,1,rs201327123,60,60,0.051635,1,7.919480e-08,0.700658,0.128601,0.000400,1.343010e-04,9.226750e-05,0.000389,1.395702,1.081218,1.959566
4,ENSG00000279457.4,RP11-34P13.18,chr1,185217,195411,-,2234,1.052690,393.374,442.596,1.040420e-06,chr1_599167_G_A_b38,403756,chr1,599167,G,A,1,rs188376087,50,51,0.043890,1,1.536940e-07,-0.687794,0.129230,0.000200,2.650810e-04,1.756370e-04,0.000400,-0.813716,-1.236065,-0.532370
5,ENSG00000228463.9,AP006222.2,chr1,257864,297502,-,2799,1.056990,499.496,449.371,4.953790e-18,chr1_280550_G_A_b38,-16952,chr1,280550,G,A,1,rs1206875823,17,17,0.014630,1,2.720510e-20,-1.822030,0.189149,0.000100,3.548800e-16,6.268480e-16,0.000319,-6.643856,-6.643856,-6.643856
6,ENSG00000237094.11,RP4-669L17.10,chr1,366053,501617,-,3777,1.037320,590.739,448.560,1.383450e-03,chr1_877371_T_C_b38,375754,chr1,877371,T,C,1,rs4246500,23,27,0.023236,-1,6.409790e-04,0.640737,0.186537,0.551515,5.413000e-01,1.546330e-01,0.000257,1.481367,0.792237,2.497856
7,ENSG00000225972.1,MTND1P23,chr1,629062,629433,+,4328,1.038980,643.234,445.852,9.659700e-06,chr1_1543591_G_T_b38,914529,chr1,1543591,G,T,1,rs112703955,20,22,0.018933,1,2.166520e-06,-0.700075,0.146089,0.004200,4.995590e-03,2.787860e-03,0.000237,-6.643856,-6.643856,-6.643856
8,ENSG00000225630.1,MTND2P28,chr1,629640,630683,+,4331,1.036600,662.489,448.809,2.605700e-04,chr1_1328954_C_T_b38,699314,chr1,1328954,C,T,1,rs141109567,21,22,0.018933,1,9.751220e-05,0.711447,0.181132,0.144403,1.461210e-01,5.548710e-02,0.000229,0.868160,0.204390,1.469105
9,ENSG00000237973.1,MTCO1P12,chr1,631074,632616,+,4336,1.044480,684.929,450.306,3.385640e-05,chr1_1451550_C_T_b38,820476,chr1,1451550,C,T,1,rs150873804,15,18,0.015491,1,1.002190e-05,0.722942,0.162043,0.016998,1.901150e-02,9.436900e-03,0.000225,1.493728,0.330813,2.855416


In [77]:
# Step 3: Load in the GTEx concatenated eQTL file, and filter to only the genes in the FC list. Filter again to only
# the SNPs identified in Step 2. Load this all into a single file. Resulting file should be
# Gene_ID   Real_FC  Real_FC_SE  SNP_Names(list)  SNP_Distance(list)   Eric_genotype Ethan_genotype  Eli_genotype  Eliz_genotype log2_aFC(list) log2_aFC_lower(list) log2_aFC_upper(list)     
eqtl_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/all_eqtls.txt',sep='\t')
eqtl_df = eqtl_df[eqtl_df['gene_name'].isin(fc_df['Common_ID'])]
eqtl_df = eqtl_df[eqtl_df['rs_id_dbSNP151_GRCh38p7'].isin(famE_snps_df['ID'])]

# Some other optional filtering... like TSS distance within 10kb, qval, low sample count
#eqtl_df = eqtl_df.loc[(eqtl_df['tss_distance'] < 10000) | (eqtl_df['tss_distance'] > -10000)]
eqtl_df = eqtl_df.loc[(eqtl_df['qval'] < .001)]

# Use Log2_aFC for now, but it might be less error-prone to just use the slope
genes_and_snps = eqtl_df[['rs_id_dbSNP151_GRCh38p7','gene_name']]
eqtl_df = eqtl_df[['rs_id_dbSNP151_GRCh38p7','tss_distance','log2_aFC','log2_aFC_lower','log2_aFC_upper']]
eqtl_df = eqtl_df.groupby('rs_id_dbSNP151_GRCh38p7', as_index=False).mean()
eqtl_df = pd.merge(eqtl_df,genes_and_snps, indicator=False, how='outer')
eqtl_df = eqtl_df.drop_duplicates(subset='rs_id_dbSNP151_GRCh38p7',keep='last')
all_info_df = pd.merge(eqtl_df,famE_snps_df, indicator=False, how='inner',right_on='ID',left_on='rs_id_dbSNP151_GRCh38p7')
all_info_df = pd.merge(all_info_df,fc_df, indicator=False, how='inner',right_on='Common_ID',left_on='gene_name')
#all_info_df.sort_values(all_info_df.columns[0], ascending = False)

In [78]:
# Step 4: For each gene, evaluate the accuracy of the slope with our trisomy data. In short, for each SNP for the gene we
# identify in the real data, we'll evaluate their relative contribution to the FC.. 
# I'm hoping we only have a couple SNPs per gene, or this could get REALLY messy...

# So what we're gonna do is just multiply the alleles columns by the Log2_aFC column, and then add up everything. 
# We'll get a FC per allele in this case, and can easily calculate the genotypic fold change from there

# Just gonna hard code all of this for now. Will loop this through a function if everything works
all_info_df["Eli_allele1"] = all_info_df["Eli_allele1"].astype(str).astype(int)
all_info_df["Eli_allele2"] = all_info_df["Eli_allele2"].astype(str).astype(int)
all_info_df["Eliz_allele1"] = all_info_df["Eliz_allele1"].astype(str).astype(int)
all_info_df["Eliz_allele2"] = all_info_df["Eliz_allele2"].astype(str).astype(int)
all_info_df["Eric_allele1"] = all_info_df["Eric_allele1"].astype(str).astype(int)
all_info_df["Eric_allele2"] = all_info_df["Eric_allele2"].astype(str).astype(int)
all_info_df["Ethan_allele1"] = all_info_df["Ethan_allele1"].astype(str).astype(int)
all_info_df["Ethan_allele2"] = all_info_df["Ethan_allele2"].astype(str).astype(int)
all_info_df["Ethan_allele3"] = all_info_df["Ethan_allele3"].astype(str).astype(int)

all_info_df['Eli_allele1_pFC'] = all_info_df['Eli_allele1'] * all_info_df['log2_aFC']
all_info_df['Eli_allele2_pFC'] = all_info_df['Eli_allele2'] * all_info_df['log2_aFC']
all_info_df['Eric_allele1_pFC'] = all_info_df['Eric_allele1'] * all_info_df['log2_aFC']
all_info_df['Eric_allele2_pFC'] = all_info_df['Eric_allele2'] * all_info_df['log2_aFC']
all_info_df['Eliz_allele1_pFC'] = all_info_df['Eliz_allele1'] * all_info_df['log2_aFC']
all_info_df['Eliz_allele2_pFC'] = all_info_df['Eliz_allele2'] * all_info_df['log2_aFC']
all_info_df['Ethan_allele1_pFC'] = all_info_df['Ethan_allele1'] * all_info_df['log2_aFC']
all_info_df['Ethan_allele2_pFC'] = all_info_df['Ethan_allele2'] * all_info_df['log2_aFC']
all_info_df['Ethan_allele3_pFC'] = all_info_df['Ethan_allele3'] * all_info_df['log2_aFC']

all_info_df['Eli_allele1_pFC_lower'] = all_info_df['Eli_allele1'] * all_info_df['log2_aFC_lower']
all_info_df['Eli_allele2_pFC_lower'] = all_info_df['Eli_allele2'] * all_info_df['log2_aFC_lower']
all_info_df['Eric_allele1_pFC_lower'] = all_info_df['Eric_allele1'] * all_info_df['log2_aFC_lower']
all_info_df['Eric_allele2_pFC_lower'] = all_info_df['Eric_allele2'] * all_info_df['log2_aFC_lower']
all_info_df['Eliz_allele1_pFC_lower'] = all_info_df['Eliz_allele1'] * all_info_df['log2_aFC_lower']
all_info_df['Eliz_allele2_pFC_lower'] = all_info_df['Eliz_allele2'] * all_info_df['log2_aFC_lower']
all_info_df['Ethan_allele1_pFC_lower'] = all_info_df['Ethan_allele1'] * all_info_df['log2_aFC_lower']
all_info_df['Ethan_allele2_pFC_lower'] = all_info_df['Ethan_allele2'] * all_info_df['log2_aFC_lower']
all_info_df['Ethan_allele3_pFC_lower'] = all_info_df['Ethan_allele3'] * all_info_df['log2_aFC_lower']

all_info_df['Eli_allele1_pFC_upper'] = all_info_df['Eli_allele1'] * all_info_df['log2_aFC_upper']
all_info_df['Eli_allele2_pFC_upper'] = all_info_df['Eli_allele2'] * all_info_df['log2_aFC_upper']
all_info_df['Eric_allele1_pFC_upper'] = all_info_df['Eric_allele1'] * all_info_df['log2_aFC_upper']
all_info_df['Eric_allele2_pFC_upper'] = all_info_df['Eric_allele2'] * all_info_df['log2_aFC_upper']
all_info_df['Eliz_allele1_pFC_upper'] = all_info_df['Eliz_allele1'] * all_info_df['log2_aFC_upper']
all_info_df['Eliz_allele2_pFC_upper'] = all_info_df['Eliz_allele2'] * all_info_df['log2_aFC_upper']
all_info_df['Ethan_allele1_pFC_upper'] = all_info_df['Ethan_allele1'] * all_info_df['log2_aFC_upper']
all_info_df['Ethan_allele2_pFC_upper'] = all_info_df['Ethan_allele2'] * all_info_df['log2_aFC_upper']
all_info_df['Ethan_allele3_pFC_upper'] = all_info_df['Ethan_allele3'] * all_info_df['log2_aFC_upper']

predicted_fc = all_info_df.groupby(all_info_df['Row.names']).sum()
predicted_fc = predicted_fc.rename_axis("Row.names").reset_index()
predicted_fc = pd.merge(predicted_fc,fc_df,right_on='Row.names',left_on='Row.names',indicator=False)
predicted_fc['Eli_allele1_pFC'] = 2**predicted_fc['Eli_allele1_pFC']
predicted_fc['Eli_allele2_pFC'] = 2**predicted_fc['Eli_allele2_pFC']
predicted_fc['Eric_allele1_pFC'] = 2**predicted_fc['Eric_allele1_pFC']
predicted_fc['Eric_allele2_pFC'] = 2**predicted_fc['Eric_allele2_pFC']
predicted_fc['Eliz_allele1_pFC'] = 2**predicted_fc['Eliz_allele1_pFC']
predicted_fc['Eliz_allele2_pFC'] = 2**predicted_fc['Eliz_allele2_pFC']
predicted_fc['Ethan_allele1_pFC'] = 2**predicted_fc['Ethan_allele1_pFC']
predicted_fc['Ethan_allele2_pFC'] = 2**predicted_fc['Ethan_allele2_pFC']
predicted_fc['Ethan_allele3_pFC'] = 2**predicted_fc['Ethan_allele3_pFC']

predicted_fc['Eli_allele1_pFC_lower'] = 2**predicted_fc['Eli_allele1_pFC_lower']
predicted_fc['Eli_allele2_pFC_lower'] = 2**predicted_fc['Eli_allele2_pFC_lower']
predicted_fc['Eric_allele1_pFC_lower'] = 2**predicted_fc['Eric_allele1_pFC_lower']
predicted_fc['Eric_allele2_pFC_lower'] = 2**predicted_fc['Eric_allele2_pFC_lower']
predicted_fc['Eliz_allele1_pFC_lower'] = 2**predicted_fc['Eliz_allele1_pFC_lower']
predicted_fc['Eliz_allele2_pFC_lower'] = 2**predicted_fc['Eliz_allele2_pFC_lower']
predicted_fc['Ethan_allele1_pFC_lower'] = 2**predicted_fc['Ethan_allele1_pFC_lower']
predicted_fc['Ethan_allele2_pFC_lower'] = 2**predicted_fc['Ethan_allele2_pFC_lower']
predicted_fc['Ethan_allele3_pFC_lower'] = 2**predicted_fc['Ethan_allele3_pFC_lower']

predicted_fc['Eli_allele1_pFC_upper'] = 2**predicted_fc['Eli_allele1_pFC_upper']
predicted_fc['Eli_allele2_pFC_upper'] = 2**predicted_fc['Eli_allele2_pFC_upper']
predicted_fc['Eric_allele1_pFC_upper'] = 2**predicted_fc['Eric_allele1_pFC_upper']
predicted_fc['Eric_allele2_pFC_upper'] = 2**predicted_fc['Eric_allele2_pFC_upper']
predicted_fc['Eliz_allele1_pFC_upper'] = 2**predicted_fc['Eliz_allele1_pFC_upper']
predicted_fc['Eliz_allele2_pFC_upper'] = 2**predicted_fc['Eliz_allele2_pFC_upper']
predicted_fc['Ethan_allele1_pFC_upper'] = 2**predicted_fc['Ethan_allele1_pFC_upper']
predicted_fc['Ethan_allele2_pFC_upper'] = 2**predicted_fc['Ethan_allele2_pFC_upper']
predicted_fc['Ethan_allele3_pFC_upper'] = 2**predicted_fc['Ethan_allele3_pFC_upper']


# This is kind of hacky, but simply finding a FC between the lower and upper bounds doesn't make sense, because if
# we do a ratio of the signal, the medians might not be bounded by the predicted FC bounds. So instead we do a 
# FC ratio of the upper signal for Eric and the lower signal for Ethan (the lower FC ratio bound) and the reverse for the
# upper ratio bound

predicted_fc['Predicted Ethan/Eric'] = (predicted_fc['Ethan_allele3_pFC'] + predicted_fc['Ethan_allele2_pFC'] + predicted_fc['Ethan_allele1_pFC'])/(predicted_fc['Eric_allele1_pFC'] + predicted_fc['Eric_allele2_pFC'])
predicted_fc['Predicted Ethan/Eric_lower'] = (predicted_fc['Ethan_allele3_pFC_lower'] + predicted_fc['Ethan_allele2_pFC_lower'] + predicted_fc['Ethan_allele1_pFC_lower'])/(predicted_fc['Eric_allele1_pFC_upper'] + predicted_fc['Eric_allele2_pFC_upper'])
predicted_fc['Predicted Ethan/Eric_upper'] = (predicted_fc['Ethan_allele3_pFC_upper'] + predicted_fc['Ethan_allele2_pFC_upper'] + predicted_fc['Ethan_allele1_pFC_upper'])/(predicted_fc['Eric_allele1_pFC_lower'] + predicted_fc['Eric_allele2_pFC_lower'])

predicted_fc['Predicted Ethan/Eli'] = (predicted_fc['Ethan_allele3_pFC'] + predicted_fc['Ethan_allele2_pFC'] + predicted_fc['Ethan_allele1_pFC'])/(predicted_fc['Eric_allele1_pFC'] + predicted_fc['Eric_allele2_pFC'])
predicted_fc['Predicted Ethan/Eli_lower'] = (predicted_fc['Ethan_allele3_pFC_lower'] + predicted_fc['Ethan_allele2_pFC_lower'] + predicted_fc['Ethan_allele1_pFC_lower'])/(predicted_fc['Eli_allele1_pFC_upper'] + predicted_fc['Eli_allele2_pFC_upper'])
predicted_fc['Predicted Ethan/Eli_upper'] = (predicted_fc['Ethan_allele3_pFC_upper'] + predicted_fc['Ethan_allele2_pFC_upper'] + predicted_fc['Ethan_allele1_pFC_upper'])/(predicted_fc['Eli_allele1_pFC_lower'] + predicted_fc['Eli_allele2_pFC_lower'])


predicted_fc['Real Ethan/Eli'] = 2**(predicted_fc['log2FoldChange_y'])


predicted_fc
#fc_df

Unnamed: 0,Row.names,tss_distance,log2_aFC,log2_aFC_lower,log2_aFC_upper,Eli_allele1,Eli_allele2,Eliz_allele1,Eliz_allele2,Eric_allele1,Eric_allele2,Ethan_allele1,Ethan_allele2,Ethan_allele3,baseMean_x,log2FoldChange_x,lfcSE_x,Eli_allele1_pFC,Eli_allele2_pFC,Eric_allele1_pFC,Eric_allele2_pFC,Eliz_allele1_pFC,Eliz_allele2_pFC,Ethan_allele1_pFC,Ethan_allele2_pFC,Ethan_allele3_pFC,Eli_allele1_pFC_lower,Eli_allele2_pFC_lower,Eric_allele1_pFC_lower,Eric_allele2_pFC_lower,Eliz_allele1_pFC_lower,Eliz_allele2_pFC_lower,Ethan_allele1_pFC_lower,Ethan_allele2_pFC_lower,Ethan_allele3_pFC_lower,Eli_allele1_pFC_upper,Eli_allele2_pFC_upper,Eric_allele1_pFC_upper,Eric_allele2_pFC_upper,Eliz_allele1_pFC_upper,Eliz_allele2_pFC_upper,Ethan_allele1_pFC_upper,Ethan_allele2_pFC_upper,Ethan_allele3_pFC_upper,baseMean_y,log2FoldChange_y,lfcSE_y,Common_ID,Predicted Ethan/Eric,Predicted Ethan/Eric_lower,Predicted Ethan/Eric_upper,Predicted Ethan/Eli,Predicted Ethan/Eli_lower,Predicted Ethan/Eli_upper,Real Ethan/Eli
0,NM_001007246,4681,-0.097162,-0.121193,-0.051786,1,1,1,1,1,1,1,1,1,916.762288,-0.171856,0.176205,0.93487,0.93487,0.93487,0.93487,0.93487,0.93487,0.93487,0.93487,0.93487,0.919427,0.919427,0.919427,0.919427,0.919427,0.919427,0.919427,0.919427,0.919427,0.964741,0.964741,0.964741,0.964741,0.964741,0.964741,0.964741,0.964741,0.964741,916.762288,-0.171856,0.176205,BRWD1,1.5,1.429544,1.573928,1.5,1.429544,1.573928,0.8877
1,NM_001146077,-215987,1.571982,1.107747,2.440477,1,1,1,1,0,2,0,1,2,3651.001975,0.166445,0.331703,1.579258,1.579258,1.0,2.973129,1.882611,1.882611,1.0,1.882611,2.973129,1.390869,1.390869,1.0,2.155088,1.549454,1.549454,1.0,1.549454,2.155088,1.854968,1.854968,1.0,5.428212,2.926311,2.926311,1.0,2.926311,5.428212,1825.500987,0.083222,0.165851,CLDN14,1.473836,0.731859,2.9649,1.473836,1.268093,3.362833,1.059382
2,NM_001271534,-1329430,3.984666,3.441828,4.365604,0,2,2,2,0,2,0,2,2,308.907697,0.235145,0.544564,1.0,15.830841,1.0,15.830841,15.830841,15.830841,1.0,15.830841,15.830841,1.0,10.866595,1.0,10.866595,10.866595,10.866595,1.0,10.866595,10.866595,1.0,20.614735,1.0,20.614735,20.614735,20.614735,1.0,20.614735,20.614735,154.453849,0.117573,0.272282,DSCAM,1.940585,1.051745,3.558685,1.940585,1.051745,3.558685,1.084908
3,NM_001282934,-125301,1.755536,1.442643,2.102984,1,5,2,2,2,2,2,2,2,7500.145818,0.458965,0.892018,1.834771,3.376517,1.520699,1.520699,1.520699,1.520699,1.520699,1.520699,1.520699,1.719245,2.718185,1.363845,1.363845,1.363845,1.363845,1.363845,1.363845,1.363845,1.93275,4.295971,1.709484,1.709484,1.709484,1.709484,1.709484,1.709484,1.709484,1500.029164,0.091793,0.178404,PRDM15,1.5,1.196717,1.880144,1.5,0.656882,1.155726,1.065694
4,NM_001286462,-52208,0.515471,0.320341,0.701785,1,4,3,3,1,4,1,3,4,1143.788518,0.152457,1.01505,1.205276,1.42946,1.205276,1.42946,1.632472,1.632472,1.205276,1.632472,1.42946,1.166472,1.248626,1.166472,1.248626,1.476769,1.476769,1.166472,1.476769,1.248626,1.241299,1.626515,1.241299,1.626515,1.800243,1.800243,1.241299,1.800243,1.626515,285.94713,0.038114,0.253763,C21orf58,1.619596,1.357084,1.932865,1.619596,1.357084,1.932865,1.026771
5,NM_001317009,240474,10.182811,5.831642,14.168398,1,14,2,12,11,11,11,11,11,13823.398664,-5.204998,2.870935,1.679103,674.04908,38.176162,38.176162,5.676911,65.83129,38.176162,38.176162,38.176162,1.454569,35.943849,3.027634,3.027634,3.720976,4.797092,3.027634,3.027634,3.027634,1.896794,9613.012093,389.862658,389.862658,9.187339,746.732114,389.862658,389.862658,389.862658,921.559911,-0.347,0.191396,CLIC6,1.5,0.011649,193.152115,1.5,0.000945,31.273728,0.786217
6,NM_001320618,-401213,3.624438,2.623904,4.426304,0,3,1,1,1,2,1,1,1,1896.500407,-0.150634,0.798791,1.0,12.332881,1.607731,5.573675,2.212702,2.212702,2.212702,2.212702,2.212702,1.0,6.164159,1.378372,3.986026,1.546442,1.546442,1.546442,1.546442,1.546442,1.0,21.500585,1.843008,7.710642,2.78843,2.78843,2.78843,2.78843,2.78843,632.166802,-0.050211,0.266264,GRIK1,0.924346,0.485608,1.559409,0.924346,0.206187,1.167658,0.965795
7,NM_001320630,-401213,3.624438,2.623904,4.426304,0,3,1,1,1,2,1,1,1,769.713492,0.239226,0.83562,1.0,12.332881,1.607731,5.573675,2.212702,2.212702,2.212702,2.212702,2.212702,1.0,6.164159,1.378372,3.986026,1.546442,1.546442,1.546442,1.546442,1.546442,1.0,21.500585,1.843008,7.710642,2.78843,2.78843,2.78843,2.78843,2.78843,256.571164,0.079742,0.27854,GRIK1,0.924346,0.485608,1.559409,0.924346,0.206187,1.167658,1.056829
8,NM_001331011,580801,0.638925,0.470703,0.815024,0,0,1,5,0,5,0,5,5,2263.114869,-4.592094,1.246667,1.0,1.0,1.0,1.557168,1.090461,1.557168,1.0,1.557168,1.557168,1.0,1.0,1.0,1.385785,1.069048,1.385785,1.0,1.385785,1.385785,1.0,1.0,1.0,1.759327,1.107952,1.759327,1.0,1.759327,1.759327,452.622974,-0.918419,0.249333,ITSN1,1.608942,1.366844,1.893991,1.608942,1.885785,2.259327,0.529089
9,NM_001348240,262107,0.976327,0.501507,1.447962,2,3,1,3,0,3,0,1,3,1388.551481,-2.146001,1.906318,2.260209,2.974718,1.0,2.054461,0.90897,0.870473,1.0,0.90897,2.054461,1.961368,2.409236,1.0,1.739069,0.886661,0.721788,1.0,0.886661,1.739069,2.602723,3.716321,1.0,2.448803,0.940862,1.048219,1.0,0.940862,2.448803,277.710296,-0.4292,0.381264,PCBP3,1.297588,1.051301,1.602612,1.297588,0.573778,1.004361,0.742673


In [83]:
# Hmm... that didn't work at all! Let's try a more basic strategy. We'll only ask whether there is the potential
# for explaining FC by using the slope. First we filter out all SNPs where either mom or dad is heterozygous, and where
# mom and dad are unmatching homozygous
famE_snps_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/FamE.vcf',header=236,sep='\t')

# Get only canonical SNPs on chr21
famE_snps_df = famE_snps_df[famE_snps_df['#CHROM'].str.contains("chr21")]
famE_snps_df = famE_snps_df[famE_snps_df['ID'].str.contains("rs")]

# Remove unhelpful SNP calls (Mom and Dad have the same alleles)
famE_snps_df['EliGT'] = famE_snps_df['Eli'].str[0:3]
famE_snps_df['ElizGT'] = famE_snps_df['Elizabeth'].str[0:3]
famE_snps_df['EricGT'] = famE_snps_df['Eric'].str[0:3]
famE_snps_df['EthanGT'] = famE_snps_df['Ethan'].str[0:5]

# keep only heterozygous or unmatching homozygous in the parent
removal_df = famE_snps_df.loc[(famE_snps_df['EliGT'].str[0] == famE_snps_df['EliGT'].str[2]) & (famE_snps_df['ElizGT'].str[0] == famE_snps_df['ElizGT'].str[2]) & (famE_snps_df['EliGT'].str[0] == famE_snps_df['ElizGT'].str[0])]
famE_snps_df = pd.merge(famE_snps_df,removal_df, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)


# FIXME: Once again, extremely ugly hard coding. 
famE_snps_df['Eli_allele1'] = famE_snps_df['Eli'].str[0]
famE_snps_df['Eli_allele2'] = famE_snps_df['Eli'].str[2]

famE_snps_df.loc[famE_snps_df['Eli_allele1'] == ".", 'Eli_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eli_allele2'] == ".", 'Eli_allele2'] = 0

famE_snps_df['Eliz_allele1'] = famE_snps_df['Elizabeth'].str[0]
famE_snps_df['Eliz_allele2'] = famE_snps_df['Elizabeth'].str[2]

famE_snps_df.loc[famE_snps_df['Eliz_allele1'] == ".", 'Eliz_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eliz_allele2'] == ".", 'Eliz_allele2'] = 0

famE_snps_df['Eric_allele1'] = famE_snps_df['Eric'].str[0]
famE_snps_df['Eric_allele2'] = famE_snps_df['Eric'].str[2]

famE_snps_df.loc[famE_snps_df['Eric_allele1'] == ".", 'Eric_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eric_allele2'] == ".", 'Eric_allele2'] = 0

famE_snps_df['Ethan_allele1'] = famE_snps_df['Ethan'].str[0]
famE_snps_df['Ethan_allele2'] = famE_snps_df['Ethan'].str[2]
famE_snps_df['Ethan_allele3'] = famE_snps_df['Ethan'].str[4]

famE_snps_df.loc[famE_snps_df['Ethan_allele1'] == ".", 'Ethan_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Ethan_allele2'] == ".", 'Ethan_allele2'] = 0
famE_snps_df.loc[famE_snps_df['Ethan_allele3'] == ".", 'Ethan_allele3'] = 0


# For now we're just gonna assume the SNP matches reference if it's an error. We should later change this to match
# Mendelian inheritance (phasing info would be even better but more difficult to implement)




In [84]:
# VCF files bury their depth info within the info column... with no order on the other data. So we have
# to do some cleanup to fetch that data. There's for sure a better way to do this.
info_df = famE_snps_df['INFO'].str.split(';',expand=True)
mask = info_df.apply(lambda col: col.str.contains('DP=').any(),
                axis=0)
sub_df = info_df.loc[: , mask]
sub_df = sub_df.replace(r'^(?!.*DP).*$', '', regex=True)
sub_df['Depth'] = sub_df.iloc[:,0] + sub_df.iloc[:,1]
sub_df['Depth'] = sub_df['Depth'].str.replace('DP=','').astype(int)
famE_snps_df['Depth'] = sub_df['Depth']

# From here you can apply a depth filter, then remove columns which are no longer useful
# 
famE_snps_df = famE_snps_df[famE_snps_df['Depth'] >= 100]
famE_snps_df = famE_snps_df.iloc[:,[2,17,18,19,20,21,22,23,24,25]]
famE_snps_df

Unnamed: 0,ID,Eli_allele1,Eli_allele2,Eliz_allele1,Eliz_allele2,Eric_allele1,Eric_allele2,Ethan_allele1,Ethan_allele2,Ethan_allele3
0,rs796574937,0,1,0,1,0,1,0,0,1
1,rs796473723,0,1,0,1,0,1,0,0,1
2,rs796625648,0,1,0,1,0,1,0,0,1
3,rs796888640,0,1,0,1,0,1,0,0,1
4,rs749596923,0,1,0,1,0,1,0,0,1
5,rs796830124,0,1,0,1,0,1,0,1,1
6,rs369998822,0,1,0,1,0,0,0,0,0
7,rs372700624,0,1,0,1,0,1,0,0,1
8,rs373368732,0,1,0,1,0,1,0,0,1
9,rs796604716,0,1,0,1,0,1,0,0,1


In [85]:
# Step 3: Load in the GTEx concatenated eQTL file, and filter to only the genes in the FC list. Filter again to only
# the SNPs identified in Step 2. Load this all into a single file. Resulting file should be
# Gene_ID   Real_FC  Real_FC_SE  SNP_Names(list)  SNP_Distance(list)   Eric_genotype Ethan_genotype  Eli_genotype  Eliz_genotype log2_aFC(list) log2_aFC_lower(list) log2_aFC_upper(list)     
eqtl_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/all_eqtls.txt',sep='\t')
eqtl_df = eqtl_df[eqtl_df['gene_name'].isin(fc_df['Common_ID'])]
eqtl_df = eqtl_df[eqtl_df['rs_id_dbSNP151_GRCh38p7'].isin(famE_snps_df['ID'])]

# Some other optional filtering... like TSS distance within 10kb, qval, low sample count
#eqtl_df = eqtl_df.loc[(eqtl_df['tss_distance'] < 10000) & (eqtl_df['tss_distance'] > -10000)]
eqtl_df = eqtl_df.loc[(eqtl_df['qval'] < .001)]

# Use Log2_aFC for now, but it might be less error-prone to just use the slope. Abandons the GLM though
genes_and_snps = eqtl_df[['rs_id_dbSNP151_GRCh38p7','gene_name']]
eqtl_df = eqtl_df[['rs_id_dbSNP151_GRCh38p7','tss_distance','slope','slope_se']]
eqtl_df = eqtl_df.groupby('rs_id_dbSNP151_GRCh38p7', as_index=False).mean()
eqtl_df = pd.merge(eqtl_df,genes_and_snps, indicator=False, how='outer')
eqtl_df = eqtl_df.drop_duplicates(subset='rs_id_dbSNP151_GRCh38p7',keep='last')
all_info_df = pd.merge(eqtl_df,famE_snps_df, indicator=False, how='inner',right_on='ID',left_on='rs_id_dbSNP151_GRCh38p7')
all_info_df = pd.merge(all_info_df,fc_df, indicator=False, how='inner',right_on='Common_ID',left_on='gene_name')

all_info_df.sort_values(all_info_df.columns[-1], ascending = False)

Unnamed: 0,rs_id_dbSNP151_GRCh38p7,tss_distance,slope,slope_se,gene_name,ID,Eli_allele1,Eli_allele2,Eliz_allele1,Eliz_allele2,Eric_allele1,Eric_allele2,Ethan_allele1,Ethan_allele2,Ethan_allele3,Row.names,baseMean,log2FoldChange,lfcSE,Common_ID
64,rs884982,69575,-0.677644,0.063263,SLC37A1,rs884982,1,1,0,1,1,1,1,1,1,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
59,rs6586325,26131,0.25178,0.029987,SLC37A1,rs6586325,0,1,0,1,0,0,0,0,0,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
65,rs9974527,-30567,-0.120717,0.022442,SLC37A1,rs9974527,0,0,0,1,0,0,0,0,0,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
63,rs875060,28113,0.229003,0.036983,SLC37A1,rs875060,0,1,0,1,0,1,0,0,1,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
62,rs8132907,20023,-0.280732,0.052981,SLC37A1,rs8132907,0,0,0,1,0,1,0,1,1,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
61,rs8129779,26733,0.284163,0.041018,SLC37A1,rs8129779,0,1,0,0,0,1,0,0,1,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
60,rs8127895,26684,0.169639,0.030686,SLC37A1,rs8127895,1,1,0,0,0,1,0,0,1,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
56,rs1878069,58451,-0.339445,0.031198,SLC37A1,rs1878069,1,1,0,0,0,1,0,0,1,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
58,rs401809,101834,-0.191801,0.032508,SLC37A1,rs401809,1,1,0,1,1,1,1,1,1,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
57,rs228097,65823,-0.358462,0.036115,SLC37A1,rs228097,1,1,0,1,0,1,0,0,1,NM_018964,2581.739509,0.077787,0.158898,SLC37A1


In [86]:
# Okay, another unconvincing result... let's zoom out even further. How many of the genes on chr21 even HAVE
# associated SNPs in the family? And of those, how many of our dozen or so FC 1.0x genes have a SNP? 
# Step 1: Load in all genes
genome_df = pd.read_csv('/scratch/Shares/dowell/genomes/hg38/hg38_refseq.bed',sep="\t",header=None)
genome_df = genome_df.iloc[:,[0,1,2,3,4,5]]
genome_df.columns =["chr","start","stop",'Row.names','score',"strand"]



common_id = pd.read_csv('/Users/sahu0957/backup/genome_files/hg38_refseq_to_common_id.txt', header=None)
common_id = common_id[0].str.rsplit("_",n=1, expand=True)
common_id.columns =['Row.names', 'Common_ID']
#common_id
genome_df = pd.merge(genome_df,common_id,indicator=False, how='left')
genome_df = genome_df.drop_duplicates(keep='last')

genome_df.sort_values(genome_df.columns[-1], ascending = False)
genome_df

Unnamed: 0,chr,start,stop,Row.names,score,strand,Common_ID
0,chr1,11873,14409,NR_046018,0,+,DDX11L1
1,chr1,14361,29370,NR_024540,0,-,WASH7P
5,chr1,17368,17436,NR_106918,0,-,MIR6859-1
9,chr1,17368,17436,NR_107062,0,-,MIR6859-2
13,chr1,17368,17436,NR_107063,0,-,MIR6859-3
17,chr1,17368,17436,NR_128720,0,-,MIR6859-4
21,chr1,30365,30503,NR_036268,0,+,MIR1302-11
25,chr1,30365,30503,NR_036267,0,+,MIR1302-10
29,chr1,30365,30503,NR_036266,0,+,MIR1302-9
33,chr1,30365,30503,NR_036051,0,+,MIR1302-2


In [87]:
# Step 2: Load in Family E SNP VCF.
# There should be fewer shifts in Ethan/Eric if the genotypes of Mom and Dad are homozygous and match. 
famE_snps_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/FamE.vcf',header=236,sep='\t')

# Get only canonical SNPs on chr21
famE_snps_df = famE_snps_df[famE_snps_df['#CHROM'].str.contains("chr21")]
famE_snps_df = famE_snps_df[famE_snps_df['ID'].str.contains("rs")]

# If the SNP isn't assigned, just assume it matches reference. These are filtered out later anyway
# Later we could include a guess based on Mendelian inheritance

famE_snps_df['Eli_allele1'] = famE_snps_df['Eli'].str[0]
famE_snps_df['Eli_allele2'] = famE_snps_df['Eli'].str[2]

famE_snps_df.loc[famE_snps_df['Eli_allele1'] == ".", 'Eli_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eli_allele2'] == ".", 'Eli_allele2'] = 0

famE_snps_df['Eliz_allele1'] = famE_snps_df['Elizabeth'].str[0]
famE_snps_df['Eliz_allele2'] = famE_snps_df['Elizabeth'].str[2]

famE_snps_df.loc[famE_snps_df['Eliz_allele1'] == ".", 'Eliz_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eliz_allele2'] == ".", 'Eliz_allele2'] = 0

famE_snps_df['Eric_allele1'] = famE_snps_df['Eric'].str[0]
famE_snps_df['Eric_allele2'] = famE_snps_df['Eric'].str[2]

famE_snps_df.loc[famE_snps_df['Eric_allele1'] == ".", 'Eric_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Eric_allele2'] == ".", 'Eric_allele2'] = 0

famE_snps_df['Ethan_allele1'] = famE_snps_df['Ethan'].str[0]
famE_snps_df['Ethan_allele2'] = famE_snps_df['Ethan'].str[2]
famE_snps_df['Ethan_allele3'] = famE_snps_df['Ethan'].str[4]

famE_snps_df.loc[famE_snps_df['Ethan_allele1'] == ".", 'Ethan_allele1'] = 0
famE_snps_df.loc[famE_snps_df['Ethan_allele2'] == ".", 'Ethan_allele2'] = 0
famE_snps_df.loc[famE_snps_df['Ethan_allele3'] == ".", 'Ethan_allele3'] = 0


#removal_df = famE_snps_df.loc[(famE_snps_df['EliGT'].str[0] == famE_snps_df['EliGT'].str[2]) & (famE_snps_df['ElizGT'].str[0] == famE_snps_df['ElizGT'].str[2]) & (famE_snps_df['EliGT'].str[0] == famE_snps_df['ElizGT'].str[0])]
#famE_snps_df = pd.merge(famE_snps_df,removal_df, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)

famE_snps_df = famE_snps_df.iloc[:,[2,13,14,15,16,17,18,19,20,21]]

#famE_snps_df = famE_snps_df[famE_snps_df['ID'].str]



In [88]:
eqtl_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/all_eqtls.txt',sep='\t')
eqtl_df = eqtl_df[eqtl_df['gene_name'].isin(genome_df['Common_ID'])]
chr21_genome_df = genome_df[genome_df['chr'].str.contains("chr21")]

chr21_eqtl_withgenes_df = pd.merge(eqtl_df,chr21_genome_df, indicator=False)
chr21_eqtl_withgenes_df = chr21_eqtl_withgenes_df.drop_duplicates(subset='Row.names',keep='last')
chr21_eqtl_withgenes_df

Unnamed: 0,gene_id,gene_name,gene_chr,gene_start,gene_end,strand,num_var,beta_shape1,beta_shape2,true_df,pval_true_df,variant_id,tss_distance,chr,variant_pos,ref,alt,num_alt_per_site,rs_id_dbSNP151_GRCh38p7,minor_allele_samples,minor_allele_count,maf,ref_factor,pval_nominal,slope,slope_se,pval_perm,pval_beta,qval,pval_nominal_threshold,log2_aFC,log2_aFC_lower,log2_aFC_upper,start,stop,Row.names,score,Common_ID
2115851,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,5130870,5154658,NR_160695,0,LOC102724159
2115886,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,7816676,7829632,NM_001330065,0,KCNE1B
2115887,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,7816676,7829632,NM_001369869,0,KCNE1B
2115888,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,9068355,9129761,NR_038327,0,TEKT4P2
2115889,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,9068355,9129761,NR_038328,0,TEKT4P2
2115890,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,9076416,9129761,NR_038329,0,TEKT4P2
2115891,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,9781848,9821061,NR_038377,0,LINC01667
2115892,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,13406383,13406460,NR_036164,0,MIR3156-3
2115893,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,13644774,13644850,NR_036061,0,MIR3118-1
2115894,ENSG00000160307.9,S100B,chr21,46598962,46605208,-,4800,1.03056,413.429,509.634,4.723540e-57,chr21_46540580_T_C_b38,-64628,chr21,46540580,T,C,1,rs9306156,390,488,0.364179,1,7.468270e-67,0.758301,0.038550,0.0001,4.399050e-56,3.761220e-55,0.000300,2.153710,1.973207,2.362372,13843132,13848364,NR_026755,0,CYP4F29P


In [90]:
# Step 3: Load in the GTEx concatenated eQTL file, and filter to only the genes in the FC list. Filter again to only
# the SNPs identified in Step 2. Load this all into a single file. Resulting file should be
# Gene_ID   Real_FC  Real_FC_SE  SNP_Names(list)  SNP_Distance(list)   Eric_genotype Ethan_genotype  Eli_genotype  Eliz_genotype log2_aFC(list) log2_aFC_lower(list) log2_aFC_upper(list)     
eqtl_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/all_eqtls.txt',sep='\t')
eqtl_df = eqtl_df[eqtl_df['gene_name'].isin(genome_df['Common_ID'])]
eqtl_df = eqtl_df[eqtl_df['rs_id_dbSNP151_GRCh38p7'].isin(famE_snps_df['ID'])]

# Some other optional filtering... like TSS distance within 10kb, qval, low sample count
#eqtl_df = eqtl_df.loc[(eqtl_df['tss_distance'] < 10000) | (eqtl_df['tss_distance'] > -10000)]
eqtl_df = eqtl_df.loc[(eqtl_df['qval'] < .001)]

# Use Log2_aFC for now, but it might be less error-prone to just use the slope...
genes_and_snps = eqtl_df[['rs_id_dbSNP151_GRCh38p7','gene_name']]
eqtl_df = eqtl_df[['rs_id_dbSNP151_GRCh38p7','tss_distance','log2_aFC','log2_aFC_lower','log2_aFC_upper']]
eqtl_df = eqtl_df.groupby('rs_id_dbSNP151_GRCh38p7', as_index=False).mean()
eqtl_df = pd.merge(eqtl_df,genes_and_snps, indicator=False, how='outer')
eqtl_df = eqtl_df.drop_duplicates(subset='rs_id_dbSNP151_GRCh38p7',keep='last')

# Make a DF which incorporates our curated SNPs info from the family
all_info_df = pd.merge(eqtl_df,famE_snps_df, indicator=False, how='inner',right_on='ID',left_on='rs_id_dbSNP151_GRCh38p7')
all_info_df = pd.merge(all_info_df,genome_df, indicator=False, how='inner',right_on='Common_ID',left_on='gene_name')
#all_info_df.sort_values(all_info_df.columns[0], ascending = False)
all_info_df["Eli_allele1"] = all_info_df["Eli_allele1"].astype(str).astype(int)
all_info_df["Eli_allele2"] = all_info_df["Eli_allele2"].astype(str).astype(int)
all_info_df["Eliz_allele1"] = all_info_df["Eliz_allele1"].astype(str).astype(int)
all_info_df["Eliz_allele2"] = all_info_df["Eliz_allele2"].astype(str).astype(int)
all_info_df["Eric_allele1"] = all_info_df["Eric_allele1"].astype(str).astype(int)
all_info_df["Eric_allele2"] = all_info_df["Eric_allele2"].astype(str).astype(int)
all_info_df["Ethan_allele1"] = all_info_df["Ethan_allele1"].astype(str).astype(int)
all_info_df["Ethan_allele2"] = all_info_df["Ethan_allele2"].astype(str).astype(int)
all_info_df["Ethan_allele3"] = all_info_df["Ethan_allele3"].astype(str).astype(int)

all_info_df['Eli_has_eQTL'] = np.where((all_info_df['Eli_allele1'] + all_info_df['Eli_allele2']) > 0, 1, 0)
all_info_df['Eliz_has_eQTL'] = np.where((all_info_df['Eliz_allele1'] + all_info_df['Eliz_allele2']) > 0, 1, 0)
all_info_df['Eric_has_eQTL'] = np.where((all_info_df['Eric_allele1'] + all_info_df['Eric_allele2']) > 0, 1, 0)
all_info_df['Ethan_has_eQTL'] = np.where((all_info_df['Ethan_allele1'] + all_info_df['Ethan_allele2'] + all_info_df['Ethan_allele3']) > 0, 1, 0)
all_info_df

Unnamed: 0,rs_id_dbSNP151_GRCh38p7,tss_distance,log2_aFC,log2_aFC_lower,log2_aFC_upper,gene_name,ID,Eli_allele1,Eli_allele2,Eliz_allele1,Eliz_allele2,Eric_allele1,Eric_allele2,Ethan_allele1,Ethan_allele2,Ethan_allele3,chr,start,stop,Row.names,score,strand,Common_ID,Eli_has_eQTL,Eliz_has_eQTL,Eric_has_eQTL,Ethan_has_eQTL
0,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073183,37203118,NM_001001894,0,+,TTC3,0,1,0,0
1,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073253,37203107,NM_001353937,0,+,TTC3,0,1,0,0
2,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073253,37203107,NM_001353936,0,+,TTC3,0,1,0,0
3,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073253,37203107,NM_001320704,0,+,TTC3,0,1,0,0
4,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073253,37203107,NM_001330681,0,+,TTC3,0,1,0,0
5,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073253,37203107,NM_001330683,0,+,TTC3,0,1,0,0
6,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073253,37203107,NM_001330682,0,+,TTC3,0,1,0,0
7,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073253,37203107,NM_001353938,0,+,TTC3,0,1,0,0
8,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37073253,37203107,NM_001320703,0,+,TTC3,0,1,0,0
9,rs1003721,47131.000000,0.182547,0.152388,0.225333,TTC3,rs1003721,0,0,0,1,0,0,0,0,0,chr21,37082946,37203118,NM_003316,0,+,TTC3,0,1,0,0


In [None]:
# The above has info for SNPs in all 4. We can subset this list to those where Ethan/Eric have different SNPs, or where
# aFC predictions might suggest Ethan would look "dosage compensated" in comparison.

In [91]:
# Finally, as a spot check, How many genes in our "looks dosage-compensated" list have an eQTL at all? (In Ethan) 
# We can just manually check each of these on the GTeX site too to make sure I haven't messed up anywhere
fc_df = pd.read_csv('/Users/sahu0957/ds_deseq_normalization/dosagecompensated_genes_norepeats.csv')
fc_df = fc_df.iloc[:,[1,2,3,4]]
common_id = pd.read_csv('/Users/sahu0957/backup/genome_files/hg38_refseq_to_common_id.txt', header=None)
common_id = common_id[0].str.rsplit("_",n=1, expand=True)
common_id.columns =['Row.names', 'Common_ID']

fc_df = pd.merge(fc_df,common_id,indicator=False, how='left')
fc_df = fc_df.drop_duplicates(keep='last')

fc_df.sort_values(fc_df.columns[-1], ascending = False)

Unnamed: 0,Row.names,baseMean,log2FoldChange,lfcSE,Common_ID
18,NM_018964,2581.739509,0.077787,0.158898,SLC37A1
2,NM_001256370,6735.897125,0.011107,0.326691,SAMSN1
24,NR_145819,11666.98557,0.098365,0.260152,RNA45SN1
4,NM_001282934,1500.029164,0.091793,0.178404,PRDM15
12,NM_001348241,240.161311,-0.412931,0.375144,PCBP3
11,NM_001348240,277.710296,-0.4292,0.381264,PCBP3
14,NM_003489,2735.321866,-0.119453,0.298161,NRIP1
13,NM_001352596,142.67113,0.128563,0.279754,NCAM2
15,NM_015358,1627.511088,0.106304,0.169085,MORC3
7,NM_001320445,1627.609446,0.10657,0.169128,MORC3


In [None]:
# This is still extremely rough... but from a spot check for these genes in GTEx, 
# what I can see, Most of the above list (like everything but the rRNA, BRWD1, and AATBC)
# have a SNP that would explain them, although I don't allways see that SNP in Ethan vs Eric.