# Notebook for co-eqtl CRISPRi analysis
Contains the following analysis and visualisation steps: <br>
- data cleaning of the Replogle CRISPRi dataset
- assigning crispr scores to co-eqtls
- enrichment analysis co-eqtls and permutation testing
- histogram visualisation of the relative position of eGenes within the distribution of genes affected by the CRISPR perturbation of each co-eGene <br><br>

Requirements: co-eqtls need to be correctly annotated (use annotate_egene_coegene.py)




In [1]:
import pandas as pd
import numpy as np
import re

import statistics as stats
import scipy.stats as sp

In [2]:
from bokeh.models import ColumnDataSource
import panel as pn
import hvplot.pandas
import holoviews as hv

from bokeh.plotting import figure
from bokeh.models import Range1d
from bokeh.models import HoverTool
from bokeh.palettes import Category10

In [3]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): several later cells insert or replace columns on filtered frames;
# with the warning disabled, a write that lands on a temporary copy goes unnoticed.
pd.options.mode.chained_assignment = None  # default='warn'


## Load in data

In [None]:
# Filtered subset of the co-eQTL results (~13 million rows), already annotated
# with `egene` and `coegene` columns.
filtered_coeqtl_CD4T = pd.read_csv(
    "data/coeqtl/egene_coegene_filtered_CD4T.tsv.gz",
    sep="\t",
)

In [None]:
# Complete co-eQTL results (~32 million rows), already annotated with `egene`
# and `coegene` columns.
coeqtl_CD4T = pd.read_csv(
    "data/coeqtl/egene_coegene.tsv.gz",
    sep="\t",
)

In [None]:
# Load the CRISPRi mean-expression matrix in a single pass.
# A callable `usecols` is evaluated against every column name while parsing, so the
# unlabelled "Unnamed: 1" column is skipped without first reading the header
# separately (and without leaving the header list in `cols`, a name that a later
# cell reuses for the duplicated gene symbols).
df_crispr = pd.read_csv(
    'replogle_crispr/clustered_mean_gene_expression_figs2-4.csv',
    sep=',',
    usecols=lambda column_name: column_name != "Unnamed: 1",
)

## Data inspection, preparation and cleaning
### Filtered coeqtls

In [45]:
# Preview the filtered co-eQTL table as loaded (pandas shows only the first and last rows).
filtered_coeqtl_CD4T

Unnamed: 0,feature_id,snp_id,p_value,sig_coeqtl,rb_gene,egene,coegene
0,APBB1IP_CWF19L1,10:100055816:G:A,0.851971,no,no,CWF19L1,APBB1IP
1,CWF19L1_ZNF584,10:100055816:G:A,0.334923,no,no,CWF19L1,ZNF584
2,CWF19L1_PPAN,10:100055816:G:A,0.737807,no,no,CWF19L1,PPAN
3,CWF19L1_IFT88,10:100055816:G:A,0.270280,no,no,CWF19L1,IFT88
4,CWF19L1_MCMBP,10:100055816:G:A,0.389860,no,no,CWF19L1,MCMBP
...,...,...,...,...,...,...,...
13145177,INVS_SH3BP1,9:99971288:CTT:C,0.354876,no,no,INVS,SH3BP1
13145178,CCT3_INVS,9:99971288:CTT:C,0.095961,no,no,INVS,CCT3
13145179,INVS_MRNIP,9:99971288:CTT:C,0.460379,no,no,INVS,MRNIP
13145180,INVS_SYNJ2BP,9:99971288:CTT:C,0.793718,no,no,INVS,SYNJ2BP


In [46]:
# Preview the raw CRISPRi matrix: one column per perturbation, one row per measured gene.
# The first two rows ('cluster', 'gene_name') are metadata, not genes.
df_crispr

Unnamed: 0,gene_transcript,10005_ZBTB4_P1_ENSG00000174282,10006_ZBTB5_P1P2_ENSG00000168795,10015_ZBTB8OS_P1P2_ENSG00000176261,10020_ZC3H13_P1P2_ENSG00000123200,10023_ZC3H18_P1P2_ENSG00000158545,10024_ZC3H3_P1P2_ENSG00000014164,10025_ZC3H4_P1_ENSG00000130749,10039_ZCCHC8_P1P2_ENSG00000033030,10040_ZCCHC9_P1P2_ENSG00000131732,...,9925_YEATS2_P1P2_ENSG00000163872,9926_YEATS4_P1P2_ENSG00000127337,9940_YME1L1_P1P2_ENSG00000136758,995_C16orf72_P1_ENSG00000182831,9971_ZBTB14_P1P2_ENSG00000198081,9975_ZBTB17_P1P2_ENSG00000116809,998_C16orf87_P1P2_ENSG00000155330,9992_ZBTB39_P1P2_ENSG00000166860,9998_ZBTB44_P1P2_ENSG00000196323,9_AAR2_P1P2_ENSG00000131043
0,cluster,19.000000,-1.000000,-1.000000,4.000000,-1.000000,8.000000,-1.000000,-1.000000,57.000000,...,35.000000,6.000000,-1.000000,-1.000000,-1.000000,7.000000,2.000000,-1.000000,-1.000000,57.000000
1,gene_name,,,,,,,,,,...,,,,,,,,,,
2,CYP51A1,0.015538,-0.169297,-0.141619,-0.068998,0.019589,-0.017413,-0.341628,-0.054714,-0.090300,...,0.035929,-0.038241,-0.126964,0.047563,-0.457281,-0.156436,0.016374,0.320231,0.199086,0.074135
3,BAD,0.075532,-0.039656,-0.133556,0.208468,0.355862,0.134834,-0.133954,0.031147,0.342149,...,0.010820,-0.241763,0.330868,-0.036582,0.050507,0.053212,0.126165,-0.053543,0.008552,0.139625
4,CD99,-0.185194,0.067921,0.138278,0.081160,-0.203804,0.125460,-0.038213,-0.051570,0.038924,...,0.071017,-0.064191,0.189424,-0.044574,0.809881,0.129267,0.163671,-0.015874,0.244931,0.467501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2316,AC139493.2,-0.422495,-0.064950,-0.086410,-0.023713,-0.344314,-0.050482,-0.082852,-0.001369,0.881788,...,0.160054,-0.069895,-0.246135,-0.135752,0.440462,-0.197023,0.226833,-0.267401,-0.160511,0.465550
2317,POLR2J3,0.379631,0.000678,0.084222,-0.112965,0.352633,-0.123764,0.408237,-0.019059,0.038050,...,0.020901,-0.039629,0.151411,0.051497,-0.313755,0.285702,0.049606,-0.082782,0.012675,0.217903
2318,BX890604.2,0.060350,0.015746,0.098428,-0.192752,0.080785,0.083702,-0.067631,-0.002744,0.312789,...,-0.006933,-0.031753,-0.065019,-0.021804,-0.117480,-0.075904,-0.025401,0.063323,0.084166,0.247097
2319,AC016074.2,0.364247,-0.016822,0.285028,-0.185722,-0.163307,0.036043,0.028476,-0.108641,0.020184,...,0.251644,0.083346,0.315087,0.068939,-0.058699,-0.106195,-0.233496,0.645232,-0.079008,0.005066


In [47]:
# Keep only the rows flagged as significant co-eQTLs and display them.
is_significant = filtered_coeqtl_CD4T['sig_coeqtl'].eq('yes')
sig_filterd_coeqtl_cd4t = filtered_coeqtl_CD4T.loc[is_significant]
sig_filterd_coeqtl_cd4t

Unnamed: 0,feature_id,snp_id,p_value,sig_coeqtl,rb_gene,egene,coegene
8830,NDUFB8_TRAK1,10:100586333:T:TA,2.820624e-05,yes,no,NDUFB8,TRAK1
8839,NDUFB8_RAB2A,10:100586333:T:TA,2.834884e-05,yes,no,NDUFB8,RAB2A
9872,RPS12_SLF2,10:100881934:G:A,1.974963e-05,yes,yes,SLF2,RPS12
14861,MRPL43_RMDN1,10:100892134:CA:C,2.160934e-05,yes,yes,MRPL43,RMDN1
32543,SAR1A_WBP1L,10:102813260:G:T,1.474349e-05,yes,no,WBP1L,SAR1A
...,...,...,...,...,...,...,...
13078799,AUH_TIA1,9:91189751:G:A,1.511356e-05,yes,no,AUH,TIA1
13080015,AUH_TIA1,9:91212271:G:T,1.339602e-05,yes,no,AUH,TIA1
13127172,LYAR_NANS,9:98026030:T:C,3.819557e-07,yes,no,NANS,LYAR
13132730,ATG4B_CORO2A,9:98223106:A:G,2.919610e-05,yes,no,CORO2A,ATG4B


In [48]:
# Number of significant co-eQTL rows (i.e. SNPs) per egene/coegene pair.
sig_filterd_coeqtl_cd4t.groupby(['egene', 'coegene']).size().reset_index(name='count')

Unnamed: 0,egene,coegene,count
0,ABLIM1,TMEM141,1
1,ACP1,FAM153B,1
2,ACTR2,PITPNA-AS1,1
3,ACVR2A,BROX,2
4,ACYP1,RAB33B,1
...,...,...,...
4390,ZNF586,RSL24D1,1
4391,ZNF626,CMC1,1
4392,ZNF639,EMSY,1
4393,ZNF708,GNAS,1


In [None]:
# How many co-eQTL rows are flagged as involving a ribosomal gene ('yes') vs. not ('no').
filtered_coeqtl_CD4T.loc[:, 'rb_gene'].value_counts()

no     12502488
yes      642694
Name: rb_gene, dtype: int64

In [80]:
# Significant co-eQTLs that are not flagged as ribosomal.
filtered_coeqtl_CD4T.query("sig_coeqtl == 'yes' and rb_gene == 'no'")

Unnamed: 0,feature_id,snp_id,p_value,sig_coeqtl,rb_gene,egene,coegene
8830,NDUFB8_TRAK1,10:100586333:T:TA,2.820624e-05,yes,no,NDUFB8,TRAK1
8839,NDUFB8_RAB2A,10:100586333:T:TA,2.834884e-05,yes,no,NDUFB8,RAB2A
32543,SAR1A_WBP1L,10:102813260:G:T,1.474349e-05,yes,no,WBP1L,SAR1A
95061,BBIP1_TMEM109,10:110937483:C:T,1.666098e-05,yes,no,BBIP1,TMEM109
108305,MAP3K5_ZDHHC6,10:112373084:CA:C,2.001293e-05,yes,no,ZDHHC6,MAP3K5
...,...,...,...,...,...,...,...
13078799,AUH_TIA1,9:91189751:G:A,1.511356e-05,yes,no,AUH,TIA1
13080015,AUH_TIA1,9:91212271:G:T,1.339602e-05,yes,no,AUH,TIA1
13127172,LYAR_NANS,9:98026030:T:C,3.819557e-07,yes,no,NANS,LYAR
13132730,ATG4B_CORO2A,9:98223106:A:G,2.919610e-05,yes,no,CORO2A,ATG4B


In [6]:
# Drop the two metadata rows at the top of the matrix, whose first-column labels are
# 'cluster' and 'gene_name'. Selecting them by label instead of by position
# (`iloc[2:]`) makes the cell idempotent: re-running it no longer removes two
# real genes each time. The first column is addressed by position so the cell
# also works after that column has been renamed.
metadata_rows = df_crispr.iloc[:, 0].isin(['cluster', 'gene_name'])
df_crispr = df_crispr[~metadata_rows]

In [7]:
# Preview the matrix after removing the metadata rows; the first data row is now CYP51A1.
df_crispr

Unnamed: 0,gene_transcript,10005_ZBTB4_P1_ENSG00000174282,10006_ZBTB5_P1P2_ENSG00000168795,10015_ZBTB8OS_P1P2_ENSG00000176261,10020_ZC3H13_P1P2_ENSG00000123200,10023_ZC3H18_P1P2_ENSG00000158545,10024_ZC3H3_P1P2_ENSG00000014164,10025_ZC3H4_P1_ENSG00000130749,10039_ZCCHC8_P1P2_ENSG00000033030,10040_ZCCHC9_P1P2_ENSG00000131732,...,9925_YEATS2_P1P2_ENSG00000163872,9926_YEATS4_P1P2_ENSG00000127337,9940_YME1L1_P1P2_ENSG00000136758,995_C16orf72_P1_ENSG00000182831,9971_ZBTB14_P1P2_ENSG00000198081,9975_ZBTB17_P1P2_ENSG00000116809,998_C16orf87_P1P2_ENSG00000155330,9992_ZBTB39_P1P2_ENSG00000166860,9998_ZBTB44_P1P2_ENSG00000196323,9_AAR2_P1P2_ENSG00000131043
2,CYP51A1,0.015538,-0.169297,-0.141619,-0.068998,0.019589,-0.017413,-0.341628,-0.054714,-0.090300,...,0.035929,-0.038241,-0.126964,0.047563,-0.457281,-0.156436,0.016374,0.320231,0.199086,0.074135
3,BAD,0.075532,-0.039656,-0.133556,0.208468,0.355862,0.134834,-0.133954,0.031147,0.342149,...,0.010820,-0.241763,0.330868,-0.036582,0.050507,0.053212,0.126165,-0.053543,0.008552,0.139625
4,CD99,-0.185194,0.067921,0.138278,0.081160,-0.203804,0.125460,-0.038213,-0.051570,0.038924,...,0.071017,-0.064191,0.189424,-0.044574,0.809881,0.129267,0.163671,-0.015874,0.244931,0.467501
5,MAD1L1,0.011419,0.095523,-0.010855,0.282541,0.200241,0.031291,0.142003,-0.002545,0.375582,...,0.060587,-0.057528,0.120090,-0.075698,0.251953,-0.069809,0.046660,-0.149331,0.018375,0.250776
6,CFLAR,0.109648,-0.008776,0.172564,0.234939,0.110911,0.100156,0.310823,-0.018074,0.406680,...,0.107937,-0.035196,0.131583,-0.025158,0.128556,0.150333,-0.007144,0.210866,0.081446,0.027713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2316,AC139493.2,-0.422495,-0.064950,-0.086410,-0.023713,-0.344314,-0.050482,-0.082852,-0.001369,0.881788,...,0.160054,-0.069895,-0.246135,-0.135752,0.440462,-0.197023,0.226833,-0.267401,-0.160511,0.465550
2317,POLR2J3,0.379631,0.000678,0.084222,-0.112965,0.352633,-0.123764,0.408237,-0.019059,0.038050,...,0.020901,-0.039629,0.151411,0.051497,-0.313755,0.285702,0.049606,-0.082782,0.012675,0.217903
2318,BX890604.2,0.060350,0.015746,0.098428,-0.192752,0.080785,0.083702,-0.067631,-0.002744,0.312789,...,-0.006933,-0.031753,-0.065019,-0.021804,-0.117480,-0.075904,-0.025401,0.063323,0.084166,0.247097
2319,AC016074.2,0.364247,-0.016822,0.285028,-0.185722,-0.163307,0.036043,0.028476,-0.108641,0.020184,...,0.251644,0.083346,0.315087,0.068939,-0.058699,-0.106195,-0.233496,0.645232,-0.079008,0.005066


In [8]:
# The first column holds the measured gene's symbol, so label it accordingly.
df_crispr = df_crispr.rename({'gene_transcript': 'gene_name'}, axis='columns')

In [9]:
# Reduce every perturbation header of the form "<id>_<GENE>_<...>_<ENSG id>" to its
# gene symbol (the second "_"-separated token); the first column keeps 'gene_name'.
# Genes targeted by more than one guide end up sharing a label here; the duplicate
# check further down deals with those.
perturbed_genes = [header.split('_')[1] for header in df_crispr.columns[1:]]
col_names = ['gene_name'] + perturbed_genes
# set_axis returns a relabelled frame and leaves df_crispr's original headers intact,
# so no explicit .copy() is needed beforehand.
df_copy = df_crispr.set_axis(col_names, axis=1)

In [10]:
# Preview the relabelled matrix: columns are now plain gene symbols of the perturbed genes.
df_copy

Unnamed: 0,gene_name,ZBTB4,ZBTB5,ZBTB8OS,ZC3H13,ZC3H18,ZC3H3,ZC3H4,ZCCHC8,ZCCHC9,...,YEATS2,YEATS4,YME1L1,C16orf72,ZBTB14,ZBTB17,C16orf87,ZBTB39,ZBTB44,AAR2
2,CYP51A1,0.015538,-0.169297,-0.141619,-0.068998,0.019589,-0.017413,-0.341628,-0.054714,-0.090300,...,0.035929,-0.038241,-0.126964,0.047563,-0.457281,-0.156436,0.016374,0.320231,0.199086,0.074135
3,BAD,0.075532,-0.039656,-0.133556,0.208468,0.355862,0.134834,-0.133954,0.031147,0.342149,...,0.010820,-0.241763,0.330868,-0.036582,0.050507,0.053212,0.126165,-0.053543,0.008552,0.139625
4,CD99,-0.185194,0.067921,0.138278,0.081160,-0.203804,0.125460,-0.038213,-0.051570,0.038924,...,0.071017,-0.064191,0.189424,-0.044574,0.809881,0.129267,0.163671,-0.015874,0.244931,0.467501
5,MAD1L1,0.011419,0.095523,-0.010855,0.282541,0.200241,0.031291,0.142003,-0.002545,0.375582,...,0.060587,-0.057528,0.120090,-0.075698,0.251953,-0.069809,0.046660,-0.149331,0.018375,0.250776
6,CFLAR,0.109648,-0.008776,0.172564,0.234939,0.110911,0.100156,0.310823,-0.018074,0.406680,...,0.107937,-0.035196,0.131583,-0.025158,0.128556,0.150333,-0.007144,0.210866,0.081446,0.027713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2316,AC139493.2,-0.422495,-0.064950,-0.086410,-0.023713,-0.344314,-0.050482,-0.082852,-0.001369,0.881788,...,0.160054,-0.069895,-0.246135,-0.135752,0.440462,-0.197023,0.226833,-0.267401,-0.160511,0.465550
2317,POLR2J3,0.379631,0.000678,0.084222,-0.112965,0.352633,-0.123764,0.408237,-0.019059,0.038050,...,0.020901,-0.039629,0.151411,0.051497,-0.313755,0.285702,0.049606,-0.082782,0.012675,0.217903
2318,BX890604.2,0.060350,0.015746,0.098428,-0.192752,0.080785,0.083702,-0.067631,-0.002744,0.312789,...,-0.006933,-0.031753,-0.065019,-0.021804,-0.117480,-0.075904,-0.025401,0.063323,0.084166,0.247097
2319,AC016074.2,0.364247,-0.016822,0.285028,-0.185722,-0.163307,0.036043,0.028476,-0.108641,0.020184,...,0.251644,0.083346,0.315087,0.068939,-0.058699,-0.106195,-0.233496,0.645232,-0.079008,0.005066


In [11]:
# CRISPRi data that overlaps with the filtered co-eQTLs.
# Columns: keep the perturbed genes that also occur as a co-eGene in the co-eQTL table.
# NOTE(review): df_copy has repeated column labels (genes with two guides); how many
# columns this selection yields for them depends on pandas' handling of duplicates.
df_crispr_co_eqtl = df_copy[df_copy.columns.intersection(filtered_coeqtl_CD4T['coegene'])]
# Put the measured-gene label back as the first column of the selection.
df_crispr_co_eqtl.insert(0, 'gene_name', df_copy['gene_name'])
# Rows: keep the measured genes that also occur as an eGene in the co-eQTL table.
df_crispr_co_eqtl = df_crispr_co_eqtl[df_copy['gene_name'].isin(filtered_coeqtl_CD4T['egene'])]
df_crispr_co_eqtl


Unnamed: 0,gene_name,ZC3H13,ZCRB1,ZFC3H1,NDUFAF8,ZFR,ZMAT2,ZNF207,ZNF292,ZNF335,...,WDR75,WDR7,WDR82,WNK1,XPO1,XRCC5,XRN2,YBEY,YEATS2,YEATS4
5,MAD1L1,0.282541,-0.083874,-0.122115,0.065532,0.068533,0.124581,-0.028117,-0.044752,-0.052449,...,0.659348,0.112389,0.115523,0.075935,0.067429,-0.033156,0.347097,0.073877,0.060587,-0.057528
6,CFLAR,0.234939,-0.031453,0.251272,0.068093,-0.016753,0.105353,0.428431,-0.040985,-0.150139,...,-0.071532,0.283476,0.995953,0.151068,0.176876,0.202953,0.013178,-0.081765,0.107937,-0.035196
15,POLR2J,-0.076327,0.008678,-0.048413,-0.020213,-0.116076,0.019046,-0.108389,0.072924,0.061298,...,0.398138,-0.063683,0.197992,0.091389,0.153504,0.339866,0.152486,0.087480,0.206128,-0.064640
18,KMT2E,0.237011,0.012874,0.321401,0.054683,0.125881,0.080910,0.386595,-0.041969,0.000720,...,0.411928,0.195733,0.342194,0.140464,0.252648,0.308647,0.241205,-0.122916,0.125718,-0.105965
23,REX1BD,0.382004,0.002540,-0.050510,-0.055667,-0.239461,-0.200510,0.020040,0.045757,-0.107413,...,0.154902,0.216234,0.034814,-0.001574,-0.020215,0.148667,-0.559619,-0.071061,0.022612,-0.208348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2297,TAF15,-0.221373,-0.235591,0.091972,0.105411,-0.037433,-0.086524,-0.453667,0.030435,-0.014078,...,0.060628,-0.053304,-0.300383,0.123681,-0.209295,-0.045559,-0.100892,-0.172765,0.050330,0.002482
2299,GTF2H5,0.295310,-0.019330,0.113873,0.043798,0.188583,-0.045109,-0.281496,-0.095054,0.000092,...,0.108896,-0.102752,-0.166307,-0.097011,0.170226,-0.192021,0.068127,0.102516,-0.004004,-0.056317
2303,TAF9,-0.263036,-0.056961,-0.059916,0.022127,0.206254,-0.430608,0.033324,-0.031259,-0.186926,...,-0.057476,-0.220275,-0.249117,0.044083,-0.012190,-0.358441,-0.059710,0.039966,-0.095581,0.040918
2309,PSMB3,-0.330507,0.036862,-0.043329,-0.145916,-0.055789,0.049684,-0.032362,0.013826,0.283720,...,0.174855,-0.030821,0.065326,0.031505,-0.101525,-0.009689,0.079871,-0.119016,0.142266,-0.082137


In [55]:
# Restrict the co-eQTL table to pairs that can be scored from the CRISPRi matrix:
# the co-eGene must be a perturbation column and the eGene a measured (row) gene.
coegene_perturbed = filtered_coeqtl_CD4T['coegene'].isin(df_crispr_co_eqtl.columns[1:])
egene_measured = filtered_coeqtl_CD4T['egene'].isin(df_crispr_co_eqtl['gene_name'])
filtered_coeqtl_CD4T = filtered_coeqtl_CD4T.loc[coegene_perturbed & egene_measured]
filtered_coeqtl_CD4T

Unnamed: 0,feature_id,snp_id,p_value,sig_coeqtl,rb_gene,egene,coegene
7308,NDUFB8_NRF1,10:100586333:T:TA,0.494715,no,no,NDUFB8,NRF1
7309,NDUFB8_PSMD13,10:100586333:T:TA,0.608302,no,no,NDUFB8,PSMD13
7313,NDUFB8_UBE2E1,10:100586333:T:TA,0.401664,no,no,NDUFB8,UBE2E1
7317,ARPC1B_NDUFB8,10:100586333:T:TA,0.275487,no,no,NDUFB8,ARPC1B
7318,NDUFB8_SNRNP27,10:100586333:T:TA,0.583239,no,no,NDUFB8,SNRNP27
...,...,...,...,...,...,...,...
13126012,ANP32B_DHX36,9:98017781:T:G,0.445571,no,no,ANP32B,DHX36
13126028,ANP32B_RNGTT,9:98017781:T:G,0.827667,no,no,ANP32B,RNGTT
13126033,ANP32B_REST,9:98017781:T:G,0.693192,no,no,ANP32B,REST
13126035,ANP32B_COX10,9:98017781:T:G,0.632558,no,no,ANP32B,COX10


In [12]:
# Perturbation column labels (everything except the leading 'gene_name' column).
df_crispr_coeqtl_cols = df_crispr_co_eqtl.columns[1:].tolist()

In [13]:
#Look for duplicate column names
# Index.duplicated() flags every repeat of an earlier label in one pass, replacing a
# list.count() call per label (quadratic in the number of columns). unique() then
# reports each repeated gene once, in column order rather than arbitrary set order.
column_labels = pd.Index(df_crispr_coeqtl_cols)
cols = column_labels[column_labels.duplicated()].unique().tolist()
print(cols)

['FBXW7', 'CCNK', 'LSM5']


In [14]:
# Print the original CRISPRi headers behind each duplicated gene symbol.
# The gene symbol is the second "_"-separated token of a header, so that token is
# compared exactly. An unanchored re.search would also report headers that merely
# contain the symbol as a substring and would interpret the symbol as a regex.
for col in df_crispr.columns:
    header_tokens = col.split('_')
    if len(header_tokens) > 1 and header_tokens[1] in cols:
        print(col)

1332_CCNK_P2_ENSG00000090061
1333_CCNK_P1_ENSG00000090061
3038_FBXW7_ENST00000281708.4_ENSG00000109670
3039_FBXW7_ENST00000603548.1_ENSG00000109670
4694_LSM5_P2_ENSG00000106355
4695_LSM5_P1_ENSG00000106355


In [15]:
# Pull out every column that belongs to a gene targeted by more than one guide.
multi_guide_labels = df_crispr_co_eqtl.columns.intersection(cols)
guide_rna = df_crispr_co_eqtl[multi_guide_labels]
guide_rna

Unnamed: 0,CCNK,CCNK.1,FBXW7,FBXW7.1,LSM5,LSM5.1
5,0.088231,0.247356,0.025696,-0.010572,0.049533,-0.042965
6,0.022080,0.127541,-0.046144,-0.049525,0.023380,0.077777
15,0.032043,0.060906,0.027750,0.007349,-0.023254,0.142638
18,0.000195,-0.091613,-0.134265,-0.104079,0.096575,0.175774
23,-0.001420,-0.187352,-0.108585,-0.052885,-0.311082,-0.160836
...,...,...,...,...,...,...
2297,0.028145,-0.094733,0.003141,-0.034933,-0.321415,-0.428856
2299,-0.125278,-0.050078,0.044628,0.004234,-0.045669,-0.146704
2303,-0.019556,0.017164,-0.045655,0.054579,-0.143120,-0.155568
2309,0.048048,0.102457,-0.128599,-0.122911,0.018381,0.016783


In [60]:
# Give the paired guide columns unique names (<gene>_1, <gene>_2).
# NOTE(review): the names are hard-coded and assume guide_rna has exactly six columns
# in the order CCNK, CCNK, FBXW7, FBXW7, LSM5, LSM5 — confirm against the cell above.
guide_rna.columns = ['CCNK_1', 'CCNK_2', 'FBXW7_1', 'FBXW7_2', 'LSM5_1', 'LSM5_2']

In [126]:
# Number of rows in which the two CCNK guides agree in direction: the product of the
# two scores is >= 0 when both are positive or both are negative.
(guide_rna['CCNK_1'] * guide_rna['CCNK_2'] >= 0).value_counts()

True     672
False    330
dtype: int64

In [31]:
# Number of rows in which the two FBXW7 guides agree in direction (product of scores >= 0).
(guide_rna['FBXW7_1'] * guide_rna['FBXW7_2'] >= 0).value_counts()

True     689
False    313
dtype: int64

In [32]:
# Number of rows in which the two LSM5 guides agree in direction (product of scores >= 0).
(guide_rna['LSM5_1'] * guide_rna['LSM5_2'] >= 0).value_counts()

True     706
False    296
dtype: int64

In [16]:
# For every repeated gene label keep only its first column; the later guide
# columns of the same gene are discarded.
first_occurrence = ~df_crispr_co_eqtl.columns.duplicated()
df_crispr_coeqtl_drop_duplicated_col = df_crispr_co_eqtl.loc[:, first_occurrence].copy()

In [17]:
# From here on, work with the matrix that has one column per perturbed gene.
df_crispr_co_eqtl = df_crispr_coeqtl_drop_duplicated_col

In [18]:
# Preview the de-duplicated CRISPRi matrix restricted to genes present in the co-eQTL table.
df_crispr_co_eqtl

Unnamed: 0,gene_name,ZC3H13,ZCRB1,ZFC3H1,NDUFAF8,ZFR,ZMAT2,ZNF207,ZNF292,ZNF335,...,WDR75,WDR7,WDR82,WNK1,XPO1,XRCC5,XRN2,YBEY,YEATS2,YEATS4
5,MAD1L1,0.282541,-0.083874,-0.122115,0.065532,0.068533,0.124581,-0.028117,-0.044752,-0.052449,...,0.659348,0.112389,0.115523,0.075935,0.067429,-0.033156,0.347097,0.073877,0.060587,-0.057528
6,CFLAR,0.234939,-0.031453,0.251272,0.068093,-0.016753,0.105353,0.428431,-0.040985,-0.150139,...,-0.071532,0.283476,0.995953,0.151068,0.176876,0.202953,0.013178,-0.081765,0.107937,-0.035196
15,POLR2J,-0.076327,0.008678,-0.048413,-0.020213,-0.116076,0.019046,-0.108389,0.072924,0.061298,...,0.398138,-0.063683,0.197992,0.091389,0.153504,0.339866,0.152486,0.087480,0.206128,-0.064640
18,KMT2E,0.237011,0.012874,0.321401,0.054683,0.125881,0.080910,0.386595,-0.041969,0.000720,...,0.411928,0.195733,0.342194,0.140464,0.252648,0.308647,0.241205,-0.122916,0.125718,-0.105965
23,REX1BD,0.382004,0.002540,-0.050510,-0.055667,-0.239461,-0.200510,0.020040,0.045757,-0.107413,...,0.154902,0.216234,0.034814,-0.001574,-0.020215,0.148667,-0.559619,-0.071061,0.022612,-0.208348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2297,TAF15,-0.221373,-0.235591,0.091972,0.105411,-0.037433,-0.086524,-0.453667,0.030435,-0.014078,...,0.060628,-0.053304,-0.300383,0.123681,-0.209295,-0.045559,-0.100892,-0.172765,0.050330,0.002482
2299,GTF2H5,0.295310,-0.019330,0.113873,0.043798,0.188583,-0.045109,-0.281496,-0.095054,0.000092,...,0.108896,-0.102752,-0.166307,-0.097011,0.170226,-0.192021,0.068127,0.102516,-0.004004,-0.056317
2303,TAF9,-0.263036,-0.056961,-0.059916,0.022127,0.206254,-0.430608,0.033324,-0.031259,-0.186926,...,-0.057476,-0.220275,-0.249117,0.044083,-0.012190,-0.358441,-0.059710,0.039966,-0.095581,0.040918
2309,PSMB3,-0.330507,0.036862,-0.043329,-0.145916,-0.055789,0.049684,-0.032362,0.013826,0.283720,...,0.174855,-0.030821,0.065326,0.031505,-0.101525,-0.009689,0.079871,-0.119016,0.142266,-0.082137


In [64]:
#Assign scores to the coeqtl dataframe from the crispri matrix
# For every (egene, coegene) pair, look up the value in the eGene's row and the
# co-eGene's perturbation column. All pairs are resolved with one positional lookup
# instead of boolean-filtering the whole matrix once per co-eQTL row.
score_matrix = df_crispr_co_eqtl.drop_duplicates(subset='gene_name').set_index('gene_name')
# get_indexer needs unique labels; keep the first column per gene label
score_matrix = score_matrix.loc[:, ~score_matrix.columns.duplicated()]
row_pos = score_matrix.index.get_indexer(filtered_coeqtl_CD4T['egene'])
col_pos = score_matrix.columns.get_indexer(filtered_coeqtl_CD4T['coegene'])
# get_indexer marks unknown labels with -1, which numpy indexing would silently
# read as "last row/column" — fail loudly instead.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError(
        "some egene/coegene labels are missing from the CRISPRi matrix; "
        "restrict the co-eQTL table to the overlapping genes first"
    )
crispr_score_coegene = list(score_matrix.to_numpy()[row_pos, col_pos])
print(len(crispr_score_coegene))
# insert() raises when the column already exists, so overwrite it on a re-run
if 'crispr_score' in filtered_coeqtl_CD4T.columns:
    filtered_coeqtl_CD4T['crispr_score'] = crispr_score_coegene
else:
    filtered_coeqtl_CD4T.insert(7, 'crispr_score', crispr_score_coegene)

641838


In [65]:
# Significant vs. non-significant co-eQTL rows left after restricting to CRISPRi genes.
filtered_coeqtl_CD4T.loc[:, 'sig_coeqtl'].value_counts()

no     640581
yes      1257
Name: sig_coeqtl, dtype: int64

In [66]:
# Preview the co-eQTL table with the newly added `crispr_score` column.
filtered_coeqtl_CD4T

Unnamed: 0,feature_id,snp_id,p_value,sig_coeqtl,rb_gene,egene,coegene,crispr_score
7308,NDUFB8_NRF1,10:100586333:T:TA,0.494715,no,no,NDUFB8,NRF1,-0.706254
7309,NDUFB8_PSMD13,10:100586333:T:TA,0.608302,no,no,NDUFB8,PSMD13,-0.029283
7313,NDUFB8_UBE2E1,10:100586333:T:TA,0.401664,no,no,NDUFB8,UBE2E1,0.168135
7317,ARPC1B_NDUFB8,10:100586333:T:TA,0.275487,no,no,NDUFB8,ARPC1B,-0.064592
7318,NDUFB8_SNRNP27,10:100586333:T:TA,0.583239,no,no,NDUFB8,SNRNP27,-0.749481
...,...,...,...,...,...,...,...,...
13126012,ANP32B_DHX36,9:98017781:T:G,0.445571,no,no,ANP32B,DHX36,-0.399633
13126028,ANP32B_RNGTT,9:98017781:T:G,0.827667,no,no,ANP32B,RNGTT,-0.181897
13126033,ANP32B_REST,9:98017781:T:G,0.693192,no,no,ANP32B,REST,0.019119
13126035,ANP32B_COX10,9:98017781:T:G,0.632558,no,no,ANP32B,COX10,-0.179195


In [67]:
# Significant, non-ribosomal co-eQTLs together with their CRISPRi score.
filtered_coeqtl_CD4T.query("sig_coeqtl == 'yes' and rb_gene == 'no'")

Unnamed: 0,feature_id,snp_id,p_value,sig_coeqtl,rb_gene,egene,coegene,crispr_score
516011,EEF2_PPA1,10:70212808:T:C,1.142563e-05,yes,no,PPA1,EEF2,-0.096185
517118,NACA_PPA1,10:70212808:T:C,1.932820e-11,yes,no,PPA1,NACA,-0.532990
517785,EIF3G_PPA1,10:70212808:T:C,1.490956e-06,yes,no,PPA1,EIF3G,0.144026
518536,EEF1G_PPA1,10:70212808:T:C,7.115789e-17,yes,no,PPA1,EEF1G,-0.531213
518595,NOP53_PPA1,10:70212808:T:C,2.337835e-06,yes,no,PPA1,NOP53,-0.011278
...,...,...,...,...,...,...,...,...
11885151,NACA_TOMM7,7:22823103:C:T,5.978233e-10,yes,no,TOMM7,NACA,-0.148377
11885152,PTMA_TOMM7,7:22823103:C:T,3.630921e-06,yes,no,TOMM7,PTMA,0.056562
11885278,EEF1G_TOMM7,7:22823103:C:T,1.121069e-14,yes,no,TOMM7,EEF1G,-0.075221
11885322,TMSB10_TOMM7,7:22823103:C:T,6.512430e-10,yes,no,TOMM7,TMSB10,0.011621


In [68]:
# Recode the significance flag from 'no'/'yes' to 0/1.
# The recoded column is assigned back to the frame: an in-place replace on a selected
# column (`df[col].replace(..., inplace=True)`) operates on a temporary object under
# pandas Copy-on-Write and leaves the frame unchanged.
# Values other than 'no'/'yes' (including an already recoded 0/1) pass through
# untouched, so the cell can be re-run safely.
sig_codes = {'no': 0, 'yes': 1}
filtered_coeqtl_CD4T['sig_coeqtl'] = filtered_coeqtl_CD4T['sig_coeqtl'].map(
    lambda flag: sig_codes.get(flag, flag)
)
filtered_coeqtl_CD4T

Unnamed: 0,feature_id,snp_id,p_value,sig_coeqtl,rb_gene,egene,coegene,crispr_score
7308,NDUFB8_NRF1,10:100586333:T:TA,0.494715,0,no,NDUFB8,NRF1,-0.706254
7309,NDUFB8_PSMD13,10:100586333:T:TA,0.608302,0,no,NDUFB8,PSMD13,-0.029283
7313,NDUFB8_UBE2E1,10:100586333:T:TA,0.401664,0,no,NDUFB8,UBE2E1,0.168135
7317,ARPC1B_NDUFB8,10:100586333:T:TA,0.275487,0,no,NDUFB8,ARPC1B,-0.064592
7318,NDUFB8_SNRNP27,10:100586333:T:TA,0.583239,0,no,NDUFB8,SNRNP27,-0.749481
...,...,...,...,...,...,...,...,...
13126012,ANP32B_DHX36,9:98017781:T:G,0.445571,0,no,ANP32B,DHX36,-0.399633
13126028,ANP32B_RNGTT,9:98017781:T:G,0.827667,0,no,ANP32B,RNGTT,-0.181897
13126033,ANP32B_REST,9:98017781:T:G,0.693192,0,no,ANP32B,REST,0.019119
13126035,ANP32B_COX10,9:98017781:T:G,0.632558,0,no,ANP32B,COX10,-0.179195


In [69]:
# Recode the ribosomal-gene flag from 'no'/'yes' to 0/1.
# The recoded column is assigned back to the frame: an in-place replace on a selected
# column operates on a temporary object under pandas Copy-on-Write and leaves the
# frame unchanged. Other values (including an already recoded 0/1) pass through
# untouched, so the cell can be re-run safely.
rb_codes = {'no': 0, 'yes': 1}
filtered_coeqtl_CD4T['rb_gene'] = filtered_coeqtl_CD4T['rb_gene'].map(
    lambda flag: rb_codes.get(flag, flag)
)

In [71]:
# Use the same column name for the significance flag as the full co-eQTL table ('coeQTL').
filtered_coeqtl_CD4T = filtered_coeqtl_CD4T.rename(columns={'sig_coeqtl': 'coeQTL'})

In [72]:
# Preview the table after recoding both flags to 0/1 and renaming `sig_coeqtl` to `coeQTL`.
filtered_coeqtl_CD4T

Unnamed: 0,feature_id,snp_id,p_value,coeQTL,rb_gene,egene,coegene,crispr_score
7308,NDUFB8_NRF1,10:100586333:T:TA,0.494715,0,0,NDUFB8,NRF1,-0.706254
7309,NDUFB8_PSMD13,10:100586333:T:TA,0.608302,0,0,NDUFB8,PSMD13,-0.029283
7313,NDUFB8_UBE2E1,10:100586333:T:TA,0.401664,0,0,NDUFB8,UBE2E1,0.168135
7317,ARPC1B_NDUFB8,10:100586333:T:TA,0.275487,0,0,NDUFB8,ARPC1B,-0.064592
7318,NDUFB8_SNRNP27,10:100586333:T:TA,0.583239,0,0,NDUFB8,SNRNP27,-0.749481
...,...,...,...,...,...,...,...,...
13126012,ANP32B_DHX36,9:98017781:T:G,0.445571,0,0,ANP32B,DHX36,-0.399633
13126028,ANP32B_RNGTT,9:98017781:T:G,0.827667,0,0,ANP32B,RNGTT,-0.181897
13126033,ANP32B_REST,9:98017781:T:G,0.693192,0,0,ANP32B,REST,0.019119
13126035,ANP32B_COX10,9:98017781:T:G,0.632558,0,0,ANP32B,COX10,-0.179195


In [None]:
# Persist the scored table as a tab-separated file without the row index.
# NOTE(review): the inputs are read from "data/coeqtl/..." while this writes to
# "coeqtl/scores/...", and the reload hint further down points at a ".tsv.gz" file —
# confirm the intended directory and whether the file is compressed afterwards.
filtered_coeqtl_CD4T.to_csv("coeqtl/scores/filtered_coeqtl_CD4T_crispr.tsv", sep='\t', index=False)

In [27]:
# Subset of significant co-eQTLs (flag recoded to 1).
sig_coeqtls = filtered_coeqtl_CD4T.loc[filtered_coeqtl_CD4T['coeQTL'].eq(1)]

In [None]:
# Significant co-eQTLs ordered from the most negative to the most positive CRISPRi score.
sig_coeqtls.sort_values('crispr_score')

Unnamed: 0,feature_id,snp_id,p_value,coeQTL,rb_gene,egene,coegene,crispr_score
4313160,EIF5A_HSPE1,17:7304878:C:T,1.020061e-11,1,0,EIF5A,HSPE1,-1.185369
11262403,RPS18_RPS6,6:33270950:CAAA:C,7.522963e-08,1,1,RPS18,RPS6,-0.824024
5468388,RPL27A_RPS5,19:58387069:T:C,3.710483e-06,1,1,RPS5,RPL27A,-0.767426
5469242,RPL27A_RPS5,19:58387936:C:G,1.751014e-06,1,1,RPS5,RPL27A,-0.767426
4310764,EIF5A_LSM6,17:7304878:C:T,6.057012e-07,1,0,EIF5A,LSM6,-0.724677
...,...,...,...,...,...,...,...,...
12824887,RPL14_SNHG7,9:136792497:G:C,1.698333e-05,1,1,SNHG7,RPL14,0.681480
4492777,RPS26_VAMP2,17:8233498:A:G,2.311494e-05,1,1,VAMP2,RPS26,0.712894
12825266,RPL36_SNHG7,9:136792497:G:C,1.752243e-05,1,1,SNHG7,RPL36,0.728091
12825537,RPS12_SNHG7,9:136792497:G:C,8.308361e-06,1,1,SNHG7,RPS12,0.931788


In [None]:
# Number of significant co-eQTL rows per eGene.
vc = sig_coeqtls.loc[:, 'egene'].value_counts()

In [None]:
# eGenes with more than 5 significant co-eQTL rows.
vc.loc[vc > 5].index.tolist()

['SMDT1',
 'TOMM7',
 'SNHG8',
 'RPS26',
 'SERPINB6',
 'EIF5A',
 'RPL36AL',
 'RPS10',
 'RPS5',
 'RNASET2',
 'TUFM',
 'HLA-C',
 'RPS17',
 'CENPK',
 'PPA1',
 'VAMP2',
 'FAM118A',
 'ENO1',
 'RPS18']

In [None]:
# Significant co-eQTLs with a CRISPRi score above 0.25, in ascending score order.
sig_coeqtls.query('crispr_score > 0.25').sort_values('crispr_score')

Unnamed: 0,feature_id,snp_id,p_value,coeQTL,rb_gene,egene,coegene,crispr_score
374910,RPL27A_SMDT1,22:42108675:A:G,6.388560999999999e-79,1,1,SMDT1,RPL27A,0.644702
374666,RPL27A_SMDT1,22:42102200:C:T,6.16526e-79,1,1,SMDT1,RPL27A,0.644702
374488,RPL27A_SMDT1,22:42092902:G:A,5.69835e-79,1,1,SMDT1,RPL27A,0.644702
374455,RPL27A_SMDT1,22:42080750:A:C,5.8072e-79,1,1,SMDT1,RPL27A,0.644702
374248,RPL27A_SMDT1,22:42067810:C:T,3.5123699999999996e-78,1,1,SMDT1,RPL27A,0.644702
633488,RPL14_SNHG7,9:136792497:G:C,1.698333e-05,1,1,SNHG7,RPL14,0.68148
211312,RPS26_VAMP2,17:8233498:A:G,2.311494e-05,1,1,VAMP2,RPS26,0.712894
633574,RPL36_SNHG7,9:136792497:G:C,1.752243e-05,1,1,SNHG7,RPL36,0.728091
633633,RPS12_SNHG7,9:136792497:G:C,8.308361e-06,1,1,SNHG7,RPS12,0.931788
633477,RPS3_SNHG7,9:136792497:G:C,1.770546e-05,1,1,SNHG7,RPS3,1.310069


In [None]:
# Significant co-eQTLs with a CRISPRi score below -0.25, most negative first.
sig_coeqtls.query('crispr_score < -0.25').sort_values('crispr_score')

Unnamed: 0,feature_id,snp_id,p_value,coeQTL,rb_gene,egene,coegene,crispr_score
200680,EIF5A_HSPE1,17:7304878:C:T,1.020061e-11,1,0,EIF5A,HSPE1,-1.185369
559965,RPS18_RPS6,6:33270950:CAAA:C,7.522963e-08,1,1,RPS18,RPS6,-0.824024
258609,RPL27A_RPS5,19:58387936:C:G,1.751014e-06,1,1,RPS5,RPL27A,-0.767426
258452,RPL27A_RPS5,19:58387069:T:C,3.710483e-06,1,1,RPS5,RPL27A,-0.767426
200278,EIF5A_LSM6,17:7304878:C:T,6.057012e-07,1,0,EIF5A,LSM6,-0.724677
...,...,...,...,...,...,...,...,...
258637,RPL37A_RPS5,19:58387936:C:G,1.263152e-05,1,1,RPS5,RPL37A,-0.268026
200705,EIF5A_ISCU,17:7304878:C:T,1.437630e-11,1,0,EIF5A,ISCU,-0.263067
258591,RPL23A_RPS5,19:58387936:C:G,6.678722e-07,1,1,RPS5,RPL23A,-0.258010
258420,RPL23A_RPS5,19:58387069:T:C,1.100661e-06,1,1,RPS5,RPL23A,-0.258010


Examples for the genes UQCRFS1, EIF5A and RPS26

In [None]:
# Significant co-eQTLs in which UQCRFS1 is the co-eGene.
sig_coeqtls.loc[sig_coeqtls['coegene'].eq('UQCRFS1')]

Unnamed: 0,feature_id,snp_id,p_value,coeQTL,rb_gene,egene,coegene,crispr_score
4311851,EIF5A_UQCRFS1,17:7304878:C:T,5e-06,1,0,EIF5A,UQCRFS1,-0.536777


In [None]:
# The 20 significant co-eQTLs of eGene EIF5A with the lowest CRISPRi scores.
eif5a_pairs = sig_coeqtls.loc[sig_coeqtls['egene'].eq('EIF5A')]
eif5a_pairs.sort_values('crispr_score').head(20)

Unnamed: 0,feature_id,snp_id,p_value,coeQTL,rb_gene,egene,coegene,crispr_score
4313160,EIF5A_HSPE1,17:7304878:C:T,1.020061e-11,1,0,EIF5A,HSPE1,-1.185369
4310764,EIF5A_LSM6,17:7304878:C:T,6.057012e-07,1,0,EIF5A,LSM6,-0.724677
4313248,EIF3D_EIF5A,17:7304878:C:T,2.565892e-06,1,0,EIF5A,EIF3D,-0.655103
4312893,EIF5A_NRF1,17:7304878:C:T,2.009907e-06,1,0,EIF5A,NRF1,-0.617321
4312129,ARPC1B_EIF5A,17:7304878:C:T,3.2737529999999995e-19,1,0,EIF5A,ARPC1B,-0.614232
4311470,EIF5A_MRPL41,17:7304878:C:T,3.894383e-07,1,1,EIF5A,MRPL41,-0.61299
4311479,EIF5A_GSPT1,17:7304878:C:T,4.526013e-06,1,1,EIF5A,GSPT1,-0.586853
4311046,EIF5A_GBF1,17:7304878:C:T,5.09361e-07,1,0,EIF5A,GBF1,-0.556498
4311851,EIF5A_UQCRFS1,17:7304878:C:T,5.008545e-06,1,0,EIF5A,UQCRFS1,-0.536777
4311857,EIF5A_PPA1,17:7304878:C:T,8.996659e-06,1,0,EIF5A,PPA1,-0.521256


In [None]:
# The 15 significant co-eQTLs of eGene RPS26 with the highest CRISPRi scores.
rps26_pairs = sig_coeqtls.loc[sig_coeqtls['egene'].eq('RPS26')]
rps26_pairs.sort_values('crispr_score').tail(15)

Unnamed: 0,feature_id,snp_id,p_value,coeQTL,rb_gene,egene,coegene,crispr_score
2087251,RPS26_ZNF335,12:56007301:G:A,3.8132169999999996e-19,1,1,RPS26,ZNF335,0.105898
2086985,RPS26_SPEN,12:56007301:G:A,3.330157e-10,1,1,RPS26,SPEN,0.111168
2087419,RPS26_SH3GL1,12:56007301:G:A,3.087689e-12,1,1,RPS26,SH3GL1,0.111998
2087978,RPS26_TCOF1,12:56007301:G:A,5.914603e-06,1,1,RPS26,TCOF1,0.14912
2087320,RPS26_RSL24D1,12:56007301:G:A,7.427762e-72,1,1,RPS26,RSL24D1,0.154208
2087210,RPS26_SUMO1,12:56007301:G:A,5.903372e-10,1,1,RPS26,SUMO1,0.168965
2087767,RPL14_RPS26,12:56007301:G:A,2.020128e-220,1,1,RPS26,RPL14,0.171687
2087702,RPS26_USP36,12:56007301:G:A,2.376285e-28,1,1,RPS26,USP36,0.180346
2087514,RPLP0_RPS26,12:56007301:G:A,4.1005529999999996e-172,1,1,RPS26,RPLP0,0.182848
2087695,RPS26_TPP2,12:56007301:G:A,8.217917e-11,1,1,RPS26,TPP2,0.194368


***
### all coeqtls

In [45]:
# Preview the full co-eQTL table; here the significance flag is already a 0/1 `coeQTL` column.
coeqtl_CD4T

Unnamed: 0,feature_id,eqtl,co_eqtl,snp_id,coeQTL,egene,coegene
0,CWF19L1_PHACTR1,CWF19L1,PHACTR1,10:100055816:G:A,0,CWF19L1,PHACTR1
1,CWF19L1_TTC32,CWF19L1,TTC32,10:100055816:G:A,0,CWF19L1,TTC32
2,CWF19L1_ZNF600,CWF19L1,ZNF600,10:100055816:G:A,0,CWF19L1,ZNF600
3,CWF19L1_MPC1,CWF19L1,MPC1,10:100055816:G:A,0,CWF19L1,MPC1
4,CWF19L1_DHFR2,CWF19L1,DHFR2,10:100055816:G:A,0,CWF19L1,DHFR2
...,...,...,...,...,...,...,...
31497748,ARL6IP4_INVS,ARL6IP4,INVS,9:99971288:CTT:C,0,INVS,ARL6IP4
31497749,CAPZA2_INVS,CAPZA2,INVS,9:99971288:CTT:C,0,INVS,CAPZA2
31497750,INVS_PSMB4,INVS,PSMB4,9:99971288:CTT:C,0,INVS,PSMB4
31497751,INVS_SLC43A1,INVS,SLC43A1,9:99971288:CTT:C,0,INVS,SLC43A1


In [None]:
# CRISPRi data that overlaps with the full co-eQTL table.
# Columns: keep the perturbed genes that also occur as a co-eGene.
# NOTE(review): df_copy has repeated column labels (genes with two guides); how many
# columns this selection yields for them depends on pandas' handling of duplicates.
df_crispr_coeqtl = df_copy[df_copy.columns.intersection(coeqtl_CD4T['coegene'])]
# Put the measured-gene label back as the first column of the selection.
df_crispr_coeqtl.insert(0, 'gene_name', df_copy['gene_name'])
# Rows: keep the measured genes that also occur as an eGene.
df_crispr_coeqtl = df_crispr_coeqtl[df_copy['gene_name'].isin(coeqtl_CD4T['egene'])]
df_crispr_coeqtl


In [97]:
# Perturbation column labels (everything except the leading 'gene_name' column).
df_crispr_coeqtl_cols = df_crispr_coeqtl.columns[1:].tolist()

In [None]:
# Gene labels that occur on more than one perturbation column.
# Index.duplicated() flags every repeat of an earlier label in one pass, replacing a
# list.count() call per label (quadratic in the number of columns). unique() then
# reports each repeated gene once, in column order rather than arbitrary set order.
column_labels = pd.Index(df_crispr_coeqtl_cols)
cols = column_labels[column_labels.duplicated()].unique().tolist()
print(cols)

In [50]:
# Summary statistics of the columns belonging to genes with more than one guide.
multi_guide_labels = df_crispr_coeqtl.columns.intersection(cols)
df_crispr_coeqtl[multi_guide_labels].describe()

Unnamed: 0,CCNK,CCNK.1,FBXW7,FBXW7.1,LSM5,LSM5.1
count,1169.0,1169.0,1169.0,1169.0,1168.0,1168.0
mean,0.003358,0.009772,-0.026363,-0.010922,0.004102,-0.000269
std,0.069403,0.167905,0.110395,0.094457,0.127234,0.290513
min,-0.276254,-0.858342,-0.415633,-0.293096,-0.481884,-0.930126
25%,-0.040677,-0.060883,-0.097812,-0.071376,-0.074611,-0.18978
50%,0.003101,0.023052,-0.035198,-0.019595,0.001894,-0.019938
75%,0.044846,0.098404,0.029124,0.040448,0.080086,0.165993
max,0.351162,0.85707,0.497211,0.382102,0.592455,2.433475


In [None]:
# For every repeated gene label keep only its first column; the later guide
# columns of the same gene are discarded.
first_occurrence = ~df_crispr_coeqtl.columns.duplicated()
df_crispr_coeqtl_drop_duplicated_col = df_crispr_coeqtl.loc[:, first_occurrence].copy()

In [None]:
# Preview the matrix after dropping the repeated gene columns.
df_crispr_coeqtl_drop_duplicated_col

In [46]:
# From here on, work with the matrix that has one column per perturbed gene.
df_crispr_coeqtl = df_crispr_coeqtl_drop_duplicated_col

In [None]:
# Preview the de-duplicated CRISPRi matrix used for scoring the full co-eQTL table.
df_crispr_coeqtl

In [None]:
# Restrict the full co-eQTL table to pairs that can be scored from the CRISPRi matrix:
# the co-eGene must be a perturbation column and the eGene a measured (row) gene.
coegene_perturbed = coeqtl_CD4T['coegene'].isin(df_crispr_coeqtl.columns[1:])
egene_measured = coeqtl_CD4T['egene'].isin(df_crispr_coeqtl['gene_name'])
coeqtl_CD4T = coeqtl_CD4T.loc[coegene_perturbed & egene_measured]
coeqtl_CD4T

In [55]:
# For every (egene, coegene) pair of the full co-eQTL table, look up the value in the
# eGene's row and the co-eGene's perturbation column. All pairs are resolved with one
# positional lookup instead of boolean-filtering the whole matrix once per row.
score_matrix_cd4 = df_crispr_coeqtl.drop_duplicates(subset='gene_name').set_index('gene_name')
# get_indexer needs unique labels; keep the first column per gene label
score_matrix_cd4 = score_matrix_cd4.loc[:, ~score_matrix_cd4.columns.duplicated()]
row_pos_cd4 = score_matrix_cd4.index.get_indexer(coeqtl_CD4T['egene'])
col_pos_cd4 = score_matrix_cd4.columns.get_indexer(coeqtl_CD4T['coegene'])
# get_indexer marks unknown labels with -1, which numpy indexing would silently
# read as "last row/column" — fail loudly instead.
if (row_pos_cd4 < 0).any() or (col_pos_cd4 < 0).any():
    raise KeyError(
        "some egene/coegene labels are missing from the CRISPRi matrix; "
        "restrict the co-eQTL table to the overlapping genes first"
    )
crispr_score_coegene_cd4 = list(score_matrix_cd4.to_numpy()[row_pos_cd4, col_pos_cd4])
print(len(crispr_score_coegene_cd4))


972733


In [56]:
# Attach the looked-up scores as the `crispr_score` column.
# insert() raises when the column already exists, so overwrite it on a re-run;
# on the first run the column is placed at position 7 as before.
if 'crispr_score' in coeqtl_CD4T.columns:
    coeqtl_CD4T['crispr_score'] = crispr_score_coegene_cd4
else:
    coeqtl_CD4T.insert(7, 'crispr_score', crispr_score_coegene_cd4)

In [57]:
# Preview the full co-eQTL table with the added `crispr_score` column.
coeqtl_CD4T

Unnamed: 0,feature_id,eqtl,co_eqtl,snp_id,coeQTL,egene,coegene,crispr_score
23520,NDUFB8_TMED10,NDUFB8,TMED10,10:100586333:T:TA,0,NDUFB8,TMED10,-0.013124
23533,NDUFB8_VPS4A,NDUFB8,VPS4A,10:100586333:T:TA,0,NDUFB8,VPS4A,0.061456
23536,NDUFB8_PLEKHA5,NDUFB8,PLEKHA5,10:100586333:T:TA,0,NDUFB8,PLEKHA5,-0.037395
23543,ATP5PO_NDUFB8,ATP5PO,NDUFB8,10:100586333:T:TA,0,NDUFB8,ATP5PO,-0.028572
23548,NDUFB8_PDRG1,NDUFB8,PDRG1,10:100586333:T:TA,0,NDUFB8,PDRG1,-0.038526
...,...,...,...,...,...,...,...,...
31465351,ANP32B_PSMC1,ANP32B,PSMC1,9:98017781:T:G,0,ANP32B,PSMC1,-0.476127
31465355,ANP32B_LSG1,ANP32B,LSG1,9:98017781:T:G,0,ANP32B,LSG1,-0.893666
31465358,ANP32B_PAK1,ANP32B,PAK1,9:98017781:T:G,0,ANP32B,PAK1,-0.027756
31465370,ANP32B_MICA,ANP32B,MICA,9:98017781:T:G,0,ANP32B,MICA,-0.137480


In [None]:
#Drop the 'eqtl' and 'co_eqtl' columns (the two halves of feature_id); keep the annotated egene/coegene columns
coeqtl_CD4T = coeqtl_CD4T[['feature_id', 'snp_id', 'coeQTL', 'egene', 'coegene', 'crispr_score']]
coeqtl_CD4T

Unnamed: 0,feature_id,snp_id,coeQTL,egene,coegene,crispr_score
23520,NDUFB8_TMED10,10:100586333:T:TA,0,NDUFB8,TMED10,-0.013124
23533,NDUFB8_VPS4A,10:100586333:T:TA,0,NDUFB8,VPS4A,0.061456
23536,NDUFB8_PLEKHA5,10:100586333:T:TA,0,NDUFB8,PLEKHA5,-0.037395
23543,ATP5PO_NDUFB8,10:100586333:T:TA,0,NDUFB8,ATP5PO,-0.028572
23548,NDUFB8_PDRG1,10:100586333:T:TA,0,NDUFB8,PDRG1,-0.038526
...,...,...,...,...,...,...
31465351,ANP32B_PSMC1,9:98017781:T:G,0,ANP32B,PSMC1,-0.476127
31465355,ANP32B_LSG1,9:98017781:T:G,0,ANP32B,LSG1,-0.893666
31465358,ANP32B_PAK1,9:98017781:T:G,0,ANP32B,PAK1,-0.027756
31465370,ANP32B_MICA,9:98017781:T:G,0,ANP32B,MICA,-0.137480


In [None]:
#coeqtl_CD4T.to_csv("coeqtl/scores/coeqtl_CD4T_crispr.tsv", sep='\t', index=False)

***
#### co-eqtl crispr score statistical analysis

In [None]:
#Instead of assigning scores again load in here the path to the file with assigned scores
#filtered_coeqtl_CD4T = pd.read_csv("coeqtl/scores/filtered_coeqtl_CD4T_crispr.tsv.gz", sep='\t')

In [20]:
filtered_coeqtl_CD4T

Unnamed: 0,feature_id,snp_id,p_value,coeQTL,rb_gene,egene,coegene,crispr_score
0,NDUFB8_NRF1,10:100586333:T:TA,0.494715,0,0,NDUFB8,NRF1,-0.706254
1,NDUFB8_PSMD13,10:100586333:T:TA,0.608302,0,0,NDUFB8,PSMD13,-0.029283
2,NDUFB8_UBE2E1,10:100586333:T:TA,0.401664,0,0,NDUFB8,UBE2E1,0.168135
3,ARPC1B_NDUFB8,10:100586333:T:TA,0.275487,0,0,NDUFB8,ARPC1B,-0.064592
4,NDUFB8_SNRNP27,10:100586333:T:TA,0.583239,0,0,NDUFB8,SNRNP27,-0.749481
...,...,...,...,...,...,...,...,...
641833,ANP32B_DHX36,9:98017781:T:G,0.445571,0,0,ANP32B,DHX36,-0.399633
641834,ANP32B_RNGTT,9:98017781:T:G,0.827667,0,0,ANP32B,RNGTT,-0.181897
641835,ANP32B_REST,9:98017781:T:G,0.693192,0,0,ANP32B,REST,0.019119
641836,ANP32B_COX10,9:98017781:T:G,0.632558,0,0,ANP32B,COX10,-0.179195


In [6]:
filtered_coeqtl_CD4T['coeQTL'].value_counts()

0    640581
1      1257
Name: coeQTL, dtype: int64

In [21]:
# Significant co-eQTLs only; work on an explicit copy so adding a column never touches the source frame
sig_coeqtl = filtered_coeqtl_CD4T[filtered_coeqtl_CD4T['coeQTL'] == 1].copy()
# '<coegene>_<egene>' label per row, built column-wise instead of in a Python loop
z = list(sig_coeqtl['coegene'] + '_' + sig_coeqtl['egene'])
sig_coeqtl.insert(8, 'new_feature', z)
# How often each coegene/egene combination occurs among the significant co-eQTLs
sig_coeqtl['new_feature'].value_counts()

RPL12_RPL36AL    5
RPS17_SMDT1      5
RPS12_SMDT1      5
EEF2_SMDT1       5
RPL9_SMDT1       5
                ..
MBNL1_TFDP2      1
SUPT5H_ARF4      1
PRELID1_EIF5A    1
DDX17_EIF5A      1
RPS12_SNHG7      1
Name: new_feature, Length: 549, dtype: int64

In [None]:
#Compare the CRISPR scores of co-eQTL pairs (sig, coeQTL == 1) with those of
#non-co-eQTL pairs (nSig, coeQTL == 0) using an independent two-sample t-test
sig = list(filtered_coeqtl_CD4T[filtered_coeqtl_CD4T['coeQTL'] == 1 ]['crispr_score'])
nSig =list(filtered_coeqtl_CD4T[filtered_coeqtl_CD4T['coeQTL'] == 0]['crispr_score'])

print(np.mean((sig)))
print(np.mean((nSig)))
sp.ttest_ind((sig),(nSig))

-0.013867466790387861
0.02607834260103401


TtestResult(statistic=-6.493801801167904, pvalue=8.375755938704919e-11, df=641836.0)

In [None]:
def crispr_coegene_sig_nsig_unique(df):
    """
    Collect the distinct crispr_scores of every coegene that has at least one
    significant co-eQTL, split by significance.

    df = dataframe of coeqtls with crispr scores with annotated significance
    Columns: coeQTL, coegene, egene, crispr_score

    Returns four lists:
    pooled unique significant scores, pooled unique non-significant scores,
    mean significant score per coegene, mean non-significant score per coegene.
    The per-coegene lists follow the order in which the coegenes first appear
    among the significant rows.
    """
    unique_sig_list, unique_nsig_list = [], []
    mean_sig_list, mean_nsig_list = [], []

    sig_coegenes = list(df.loc[df['coeQTL'] == 1, 'coegene'].unique())
    for name in sig_coegenes:
        rows = df[df['coegene'] == name]

        # Distinct scores of this coegene's significant rows
        scores_sig = list(rows.loc[rows['coeQTL'] == 1, 'crispr_score'].unique())
        mean_sig_list.append(np.mean(scores_sig))
        unique_sig_list.extend(scores_sig)

        # Distinct scores of this coegene's non-significant rows
        scores_nsig = list(rows.loc[rows['coeQTL'] == 0, 'crispr_score'].unique())
        mean_nsig_list.append(np.mean(scores_nsig))
        unique_nsig_list.extend(scores_nsig)

    return unique_sig_list, unique_nsig_list, mean_sig_list, mean_nsig_list

In [76]:
unique_sig_list, unique_nsig_list, mean_sig_list, mean_nsig_list = crispr_coegene_sig_nsig_unique(filtered_coeqtl_CD4T)

In [None]:
print(np.mean(mean_sig_list))
print(np.mean(mean_nsig_list))
#Paired t-test: both lists hold one mean per coegene, in the same coegene order.
#The scores are assumed not to be independent because of the experiment.
sp.ttest_rel((mean_sig_list), (mean_nsig_list))


-0.12119457303260991
0.026787639055905848


TtestResult(statistic=-10.680848123428843, pvalue=1.4445590774483182e-21, df=211)

In [None]:
def crispr_coegene_sig_nsig_unique_perm(df, random_state=None):
    """
    Permuted version of the per-coegene mean crispr_scores.

    For every coegene that has at least one significant co-eQTL, the coeQTL
    labels of that coegene's rows are randomly shuffled among those rows, so
    the number of 0 and 1 labels per coegene stays the same. The input
    dataframe is not modified.

    df = dataframe of coeqtls with crispr scores with annotated significance
    Columns: coeQTL, coegene, crispr_score
    random_state = optional seed (or numpy Generator) for reproducible
                   permutations; None (default) uses NumPy's global random state

    Returns mean unique crispr_scores per coegene for the permuted
    significant labels and for the permuted non-significant labels (two lists,
    one entry per coegene).
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    sig_coegenes = list(df[df['coeQTL'] == 1]['coegene'].unique())
    mean_sig_list = []
    mean_nsig_list = []
    for coegene in sig_coegenes:
        df_one_coegene = df[df['coegene'] == coegene]
        # Shuffle the labels into a standalone array instead of writing a new
        # column onto a filtered frame (chained assignment)
        permutation = rng.permutation(df_one_coegene['coeQTL'].to_numpy())
        scores = df_one_coegene['crispr_score']

        #Significant coegenes
        sig = list(scores[permutation == 1].unique())
        mean_sig_list.append(np.mean(sig))

        #Non significant coegenes
        nsig = list(scores[permutation == 0].unique())
        mean_nsig_list.append(np.mean(nsig))

    return mean_sig_list, mean_nsig_list

In [125]:
def permutation_paired_ttest_counter(df, n=5):
    """
    Compare the paired t-test p-value of the observed data with the paired
    t-test p-values obtained from n label permutations, and count how often
    the permuted p-value is bigger than the observed one (all other cases,
    including ties, count as smaller).

    df = coeqtl dataframe with crispr zscores
    n = amount of permutations and comparisons you want done
    Prints a one-line summary and returns the two counts (bigger, smaller).
    """
    # Observed per-coegene means are the 3rd and 4th returned lists
    observed = crispr_coegene_sig_nsig_unique(df)
    original_pvalue = sp.ttest_rel((observed[2]), (observed[3]))[1]

    pvalues_bigger, pvalues_smaller = [], []
    for _ in range(n):
        perm_sig, perm_nsig = crispr_coegene_sig_nsig_unique_perm(df)
        pvalue = sp.ttest_rel((perm_sig), (perm_nsig))[1]
        target = pvalues_bigger if pvalue > original_pvalue else pvalues_smaller
        target.append(pvalue)

    bigger = len(pvalues_bigger)
    smaller = len(pvalues_smaller)
    print(f'With {n} permutations the pvalues were {bigger} times bigger than the original pvalue {original_pvalue:.3g} and {smaller} times smaller')
    return bigger, smaller




In [126]:
bigger, smaller = permutation_paired_ttest_counter(filtered_coeqtl_CD4T, n=100)

With 100 permutations the pvalues were 100 times bigger than the original pvalue 1.44e-21 and 0 times smaller


In [173]:
def crispr_coegene_sig_nsig(df):
    """
    Split every crispr_score of the dataframe by co-eQTL significance.

    df = coeqtl dataframe with the columns coeQTL and crispr_score
    Returns two lists: the scores of rows with coeQTL == 1 and the scores of
    rows with coeQTL == 0 (duplicates are kept, row order is preserved).
    """
    labels = df['coeQTL']
    scores = df['crispr_score']
    return list(scores[labels == 1]), list(scores[labels == 0])

In [78]:
def crispr_coegene_sig_nsig_perm(df, random_state=None):
    """
    Permute the co-eQTL significance labels within each coegene and split all
    crispr_scores by the permuted label.

    For every coegene with at least one significant co-eQTL, the coeQTL labels
    of that coegene's rows are shuffled among those rows (label counts per
    coegene are kept). Rows of coegenes without any significant co-eQTL keep
    the label 0. The labels are stored in a new 'permutation' column of the
    returned dataframe; the input dataframe is not modified.

    df = coeqtl dataframe with the columns coeQTL, coegene and crispr_score
    random_state = optional seed (or numpy Generator) for reproducible
                   permutations; None (default) uses NumPy's global random state

    Returns sig (scores with permutation == 1), nsig (scores with
    permutation == 0) and the dataframe with the 'permutation' column.
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    sig_coegenes = list(df[df['coeQTL'] == 1]['coegene'].unique())
    frames = []
    for coegene in sig_coegenes:
        df_one_coegene = df[df['coegene'] == coegene]
        # assign() returns a new frame, so no column is written onto a slice of df
        frames.append(
            df_one_coegene.assign(permutation=rng.permutation(df_one_coegene['coeQTL'].to_numpy()))
        )

    # Coegenes without a significant co-eQTL have nothing to shuffle: all labels stay 0.
    # Appending this frame last also keeps pd.concat from failing when no coegene is significant.
    frames.append(df[~df['coegene'].isin(sig_coegenes)].assign(permutation=0))
    df_final = pd.concat(frames)

    sig = list(df_final.loc[df_final['permutation'] == 1, 'crispr_score'])
    #Non significant coegenes
    nsig = list(df_final.loc[df_final['permutation'] == 0, 'crispr_score'])

    return sig, nsig, df_final

In [79]:
sig, nsig, df_one_coegene = crispr_coegene_sig_nsig_perm(filtered_coeqtl_CD4T)

In [200]:
filtered_coeqtl_CD4T['coeQTL'].value_counts()

0    640581
1      1257
Name: coeQTL, dtype: int64

In [264]:
df_one_coegene['permutation'].value_counts()

0    640581
1      1257
Name: permutation, dtype: int64

In [192]:
crispr_coegene_sig_nsig_perm(filtered_coeqtl_CD4T)

641838
852


In [179]:
sig, nsig = crispr_coegene_sig_nsig(filtered_coeqtl_CD4T)

In [181]:
print(sig)

[-0.096184956666462, -0.5329897541230528, 0.144026290466778, -0.0300611997698256, -0.5312129441331336, -0.0112775887607006, -0.096184956666462, -0.4404200692818193, -0.0300611997698256, 0.144026290466778, -0.5329897541230528, -0.5312129441331336, -0.0112775887607006, -0.1762984910584348, 0.0075983619026498, -0.3486161761346428, -0.0102855089135104, -0.2239038243619235, -0.2139067848642901, -0.3355955335748664, -0.0487517033773999, -0.4057104800682064, -0.3856486960870341, 0.1111681215592052, 0.0255580556787005, -0.1176167230335576, -0.211626798540597, -0.1099306892312859, -0.4644029803712441, -0.0320779358530056, 0.0707571277154075, -0.0061158838433121, -0.4252477147139791, -0.0389081257400597, -0.0374998862601307, -0.0722994757409636, -0.0530502710528825, 0.168964510002098, -0.2918711834125749, 0.1058980853124953, -0.0490698698351691, -0.4538709549993013, 0.1542083750361425, -0.2200290505388707, -0.3193654985352144, 0.0695504936296262, -0.045590243901775, 0.1119976433673708, -0.277530

In [None]:
def permutation_ind_ttest_counter(df, n=5):
    """
    Compare the p-value of an independent t-test on the observed data with the
    p-values of independent t-tests on n label permutations, and keep track of
    whether each permuted p-value is bigger than the observed one (all other
    cases, including ties, count as smaller).

    df = coeqtl dataframe with crispr zscores
    n = amount of permutations and comparisons you want done
    Prints a one-line summary and returns
    bigger, smaller (counts), pvalues_bigger, pvalues_smaller (lists of the
    permuted p-values in each group).
    """
    pvalues_bigger = []
    pvalues_smaller = []
    sig_scores, nsig_scores = crispr_coegene_sig_nsig(df)
    _, original_pvalue = sp.ttest_ind((sig_scores), (nsig_scores))
    for _ in range(n):
        # crispr_coegene_sig_nsig_perm also returns the permuted dataframe;
        # only the two score lists are needed here, so extra values are ignored
        sig_perm, nsig_perm, *_unused = crispr_coegene_sig_nsig_perm(df)
        _statistic, pvalue = sp.ttest_ind((sig_perm), (nsig_perm))
        if pvalue > original_pvalue:
            pvalues_bigger.append(pvalue)
        else:
            pvalues_smaller.append(pvalue)
    bigger = len(pvalues_bigger)
    smaller = len(pvalues_smaller)
    print(f'With {n} permutations the pvalues were {bigger} times bigger than the original pvalue {original_pvalue:.3g} and {smaller} times smaller')
    return bigger, smaller, pvalues_bigger, pvalues_smaller




In [252]:
bigger, smaller, pvalues_bigger, pvalues_smaller = permutation_ind_ttest_counter(filtered_coeqtl_CD4T, 100)

With 100 permutations the pvalues were 90 times bigger than the original pvalue 8.38e-11 and 10 times smaller


In [256]:
pvalues_smaller[5:]

[3.797271964354056e-11,
 3.182658262297133e-11,
 1.0870014409206127e-11,
 4.5600576154814334e-11,
 2.2303274782044517e-14]

***
#### Visualisation of co-eQTLs with CRISPR scores
Functions

In [38]:
def style(p):
    """
    Apply the shared look of the notebook's bokeh figures: legend in the top
    right with click-to-hide entries, auto-hiding toolbar, and identical line
    widths, label standoff and 18px fonts on both axes.

    p = bokeh figure; it is modified in place and returned.
    """
    p.legend.location = "top_right"
    p.legend.click_policy = 'hide'
    p.toolbar.autohide = True

    # Both axes receive exactly the same settings
    for axis in (p.xaxis, p.yaxis):
        axis.axis_line_width = 2
        axis.major_tick_line_width = 2
        axis.axis_label_standoff = 5
        axis.axis_label_text_font_size = '18px'
        axis.major_label_text_font_size = "18px"
    return p

In [36]:
def _log10_hist_source(values, bin_width=0.05):
    """Bin values in steps of bin_width; return (ColumnDataSource for p.quad, log10 bar heights)."""
    counts, edges = np.histogram(values, bins=np.arange(min(values), max(values), bin_width))
    # Pad the counts (0 -> 1, 1 -> 1.1, then +0.1 on everything) so that every
    # bin has a positive height after the log10 transform
    counts = [1.1 if x == 1 else (1 if x == 0 else x) for x in counts]
    counts = [x + 0.1 for x in counts]
    tops = np.log10(counts)
    return ColumnDataSource(dict(top=tops, left=edges[:-1], right=edges[1:])), tops


def create_bokeh_plot_cols(df, dictionary, selected_col, df_filter):
    """
    Build the histogram figure for one guide column.

    Two overlaid log10 histograms are drawn for selected_col: one of all values
    in df and one of the values in df_filter. For every gene listed under
    selected_col in dictionary a dashed vertical line marks that gene's value
    in df; with more than one gene, the mean of those values is added to the
    legend.

    df = dataframe with a 'gene_name' column followed by one column per guide
    dictionary = maps a guide column name to a list of gene names
    selected_col = name of the guide column to plot
    df_filter = dataframe with the same guide columns, restricted to a subset of rows
    Returns the styled bokeh figure. When selected_col has no dictionary entry
    or no data, the figure is returned without histogram content.
    """
    #Colors to use
    cmap = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

    #Create figure
    p = figure(title=('Distribution of target genes of guide ' + selected_col),
               x_axis_label='Z-score', y_axis_label='Count target genes (log10)', width=1000,
               tools='save')

    y_max = None  # stays None when nothing could be plotted
    # Direct membership test instead of looping over every dictionary key and column
    if selected_col in dictionary and selected_col in df.columns[1:]:
        col = selected_col
        value = dictionary[col]
        # Filter out NaN values before creating the histograms
        data = list(df[col].dropna())
        data1 = list(df_filter[col].dropna())

        if data:  # Check if there is data after dropping NaN values
            source, tops = _log10_hist_source(data)
            if len(tops):
                y_max = max(tops * 1.1)

            plot = p.quad(top='top', bottom=0, left='left', right='right',
                          fill_color='lightgrey', line_color="#033649", alpha=0.5,
                          legend_label=('mean z-score '+ str(round(stats.mean(data), 3))
                                        + ' (' +  str(len(data)) + ' genes)'), source=source)
            p.add_tools(HoverTool(renderers=[plot], tooltips=[('Count',"@top"), ('z-score', '@left')],
                                  mode='vline', description=str(len(data)) + ' genes') )

            if data1:  # the filtered frame may have no values for this column
                source1, _ = _log10_hist_source(data1)
                plot1 = p.quad(top='top', bottom=0, left='left', right='right',
                               fill_color='darkgrey', line_color="#033649", alpha= 0.7,
                               legend_label=('mean z-score ' + str(round(stats.mean(data1), 3))
                                             + ' (' + str(len(data1)) + ' genes)'), source=source1)
                p.add_tools(HoverTool(renderers=[plot1], tooltips=[('Count',"@top"), ('score', '@left')],
                                      mode='vline', description=str(len(data1)) + ' genes'))

            # Plot vertical lines
            for idx, val in enumerate(value):
                color = cmap[idx % 10]  # Use modulo to cycle through colors
                score = df[df['gene_name'] == val][col].iloc[-1]  # last matching row, looked up once
                p.line([score, score],
                       [0, 1000],
                       line_color=color, line_dash="dashed", line_width=2.5,
                       legend_label=(val + " " + str(round(score, 3))))
            # Show the average/mean z-score if there are multiple cis-genes/gene 1's
            if len(value) > 1:
                mean_score = round(df[df['gene_name'].isin(value)][col].mean(), 3)
                # Invisible line (alpha=0): it only adds the mean to the legend
                p.line([mean_score, mean_score],
                       [0, 1000], alpha=0,
                       legend_label=('mean lines '+ str(mean_score)))

    # Only fix the y-range when a histogram was drawn
    if y_max is not None:
        p.y_range = Range1d(0, y_max)
    p = style(p)
    return p


In [24]:
def hist_bokeh_2(df, dictionary, df_filter):
    """
    Serve an interactive panel app: a drop-down with every guide column of
    df_filter (all columns after the first) and, below it, the histogram figure
    of the selected guide, rebuilt whenever the selection changes.

    df, dictionary, df_filter are passed on unchanged to create_bokeh_plot_cols.
    """
    guide_names = list(df_filter.columns[1:])
    selected_column = pn.widgets.Select(options=guide_names, name='Select a guide')

    def update_plot(selected_col):
        return create_bokeh_plot_cols(df, dictionary, selected_col, df_filter)

    # Re-run update_plot each time the widget value changes
    reactive_plot = pn.depends(selected_column.param.value)(update_plot)

    layout = pn.Column(selected_column, reactive_plot)
    layout.show()

In [25]:
def co_eqtl_overlap_correct(df_co_eqtl, df_crispr):
    """
    Group the egenes of a co-eQTL dataframe by coegene and keep the egenes that
    are present in the CRISPR matrix.

    df_co_eqtl = dataframe with the columns 'egene' and 'coegene'
    df_crispr = dataframe with a 'gene_name' column

    Returns
    gene1 = list of all egenes (row order of df_co_eqtl)
    gene2 = list of all coegenes (row order of df_co_eqtl)
    coeqtl_dict_match = {coegene: [egenes found in df_crispr['gene_name']]},
                        egenes listed in the order of df_crispr['gene_name'];
                        coegenes without any match are left out
    coeqtl_dict = {coegene: [all egenes of that coegene]}
    """
    gene2 = list(df_co_eqtl['coegene'])
    gene1 = list(df_co_eqtl['egene'])

    # Separate loop names: reusing gene1/gene2 here would overwrite the lists
    # that are returned below
    coeqtl_dict = {}
    for egene, coegene in zip(gene1, gene2):
        coeqtl_dict.setdefault(coegene, []).append(egene)

    crispr_genes = list(df_crispr['gene_name'])
    coeqtl_dict_match = {}
    for coegene, egenes in coeqtl_dict.items():
        egene_set = set(egenes)  # constant-time membership test
        matched = [gene for gene in crispr_genes if gene in egene_set]
        if matched:
            coeqtl_dict_match[coegene] = matched
    return gene1, gene2, coeqtl_dict_match, coeqtl_dict

Data preparation

In [28]:
# NOTE(review): df_crispr_co_eqtl and sig_coeqtls are not defined in the cells shown
# around here — confirm they are created earlier so the notebook survives Restart & Run All.
df_crispr_coegene = df_crispr_co_eqtl
# Keep only the significant co-eQTLs (coeQTL == 1)
sig_df_egene_coegene_new = sig_coeqtls[sig_coeqtls['coeQTL'] == 1]

In [29]:
g1, g2, sig_coegene_dict, x= co_eqtl_overlap_correct(sig_df_egene_coegene_new,df_crispr_coegene)

In [180]:
len(sig_coegene_dict)

212

In [181]:
sig_df_egene_coegene_new['coegene'].value_counts()

EEF1G     42
FAU       34
RPL11     31
RPL13     30
RPL23A    29
          ..
USE1       1
SRSF8      1
TAF2       1
YBEY       1
LRSAM1     1
Name: coegene, Length: 212, dtype: int64

In [182]:
len(sig_df_egene_coegene_new['egene'].value_counts())

47

In [167]:
#CRISPRi matrix containing only the intersection with co-eqtls
# df_crispr_co_eqtl_intersect =df_crispr_co_eqtl[df_crispr_co_eqtl.columns.intersection(sig_coeqtls['coegene'])]
# df_crispr_co_eqtl_intersect

In [30]:
#Unique coegenes with all affected genes
df_sig_coegenes = df_copy.loc[:,~df_copy.columns.duplicated()].copy()
# Columns that are significant coegenes, sorted by name. 'gene_name' is excluded by
# name; slicing with .iloc[:, 1:] after the intersection would drop a real coegene column.
sig_coegene_cols = sorted(
    col for col in df_sig_coegenes.columns.intersection(sig_coeqtls['coegene']) if col != 'gene_name'
)
df_sig_coegenes = df_sig_coegenes[sig_coegene_cols]
df_sig_coegenes.insert(0, 'gene_name', list(df_copy['gene_name']))

In [184]:
df_sig_coegenes

Unnamed: 0,gene_name,ACTB,ANKRD11,ANP32B,ARPC1B,ATP5F1D,ATP5MG,ATP5PO,BTF3,CDK6,...,WNK1,XPO1,YBEY,YEATS2,YEATS4,ZFR,ZNF207,ZNF292,ZNF335,ZNHIT1
2,CYP51A1,-0.087624,-0.102150,0.122183,-0.179362,0.173553,-0.309697,-0.239874,-0.402794,-0.152949,...,0.102662,-0.011830,-0.142337,0.035929,-0.038241,0.321627,-0.104773,0.022562,-0.059978,0.746907
3,BAD,0.014339,-0.060277,0.096079,-0.016038,0.129077,0.043058,0.037041,0.252875,-0.010136,...,-0.062527,0.069799,-0.023542,0.010820,-0.241763,0.135631,0.050625,0.010686,0.046010,-0.200362
4,CD99,-0.052006,-0.119141,-0.031207,0.153649,0.154617,0.253837,0.087892,-0.016870,-0.237172,...,0.076297,0.182119,-0.030352,0.071017,-0.064191,-0.155991,0.436021,-0.046537,-0.128695,-0.092176
5,MAD1L1,0.140356,-0.066833,0.044450,0.198355,-0.101130,0.078105,0.048766,0.260557,-0.124514,...,0.075935,0.067429,0.073877,0.060587,-0.057528,0.068533,-0.028117,-0.044752,-0.052449,0.094862
6,CFLAR,0.014106,-0.023794,-0.049322,0.175197,-0.050689,0.294116,0.012185,0.284841,-0.198797,...,0.151068,0.176876,-0.081765,0.107937,-0.035196,-0.016753,0.428431,-0.040985,-0.150139,0.208963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2316,AC139493.2,-0.105180,-0.106436,0.020978,-0.199064,0.024484,-0.099833,0.045068,-0.365148,0.136766,...,0.054043,0.273045,0.153103,0.160054,-0.069895,0.130380,0.280272,-0.210760,-0.099149,-0.154597
2317,POLR2J3,0.110662,-0.059064,-0.044831,0.310161,-0.008577,-0.026816,0.050588,0.126441,-0.053518,...,0.041066,0.227188,-0.027637,0.020901,-0.039629,-0.169787,0.046470,-0.012299,-0.037838,0.163687
2318,BX890604.2,-0.064387,0.002892,-0.027926,0.036563,-0.042060,0.184467,-0.082343,-0.038038,-0.177864,...,0.063180,-0.031081,0.026889,-0.006933,-0.031753,0.038463,0.050427,-0.060951,-0.117941,0.038536
2319,AC016074.2,-0.066637,0.778836,-0.054772,0.667838,0.018808,0.084599,0.185526,-0.048126,0.581307,...,0.928768,0.173647,-0.082020,0.251644,0.083346,0.070784,1.933251,0.010075,0.022901,0.327709


In [31]:
# Filtered CRISPR matrix restricted to the significant coegene columns, sorted by name
df_crispr_coeqtl = df_crispr_co_eqtl.loc[:,~df_crispr_co_eqtl.columns.duplicated()].copy()
# 'gene_name' is excluded by name; slicing with .iloc[:, 1:] after the intersection
# would drop a real coegene column.
df_crispr_coeqtl = df_crispr_coeqtl[sorted(
    col for col in df_crispr_coeqtl.columns.intersection(sig_coeqtls['coegene']) if col != 'gene_name'
)]

In [186]:
df_crispr_coeqtl

Unnamed: 0,ACTB,ANKRD11,ANP32B,ARPC1B,ATP5F1D,ATP5MG,ATP5PO,BTF3,CDK6,CENPK,...,WNK1,XPO1,YBEY,YEATS2,YEATS4,ZFR,ZNF207,ZNF292,ZNF335,ZNHIT1
5,0.140356,-0.066833,0.044450,0.198355,-0.101130,0.078105,0.048766,0.260557,-0.124514,-0.173560,...,0.075935,0.067429,0.073877,0.060587,-0.057528,0.068533,-0.028117,-0.044752,-0.052449,0.094862
6,0.014106,-0.023794,-0.049322,0.175197,-0.050689,0.294116,0.012185,0.284841,-0.198797,-0.179543,...,0.151068,0.176876,-0.081765,0.107937,-0.035196,-0.016753,0.428431,-0.040985,-0.150139,0.208963
15,-0.029620,-0.125357,-0.108088,0.250185,-0.135939,0.008715,-0.032728,-0.096826,-0.121253,0.021009,...,0.091389,0.153504,0.087480,0.206128,-0.064640,-0.116076,-0.108389,0.072924,0.061298,-0.239464
18,0.020609,-0.019917,0.259541,-0.137618,0.152495,0.113867,-0.037664,-0.106009,-0.280307,-0.030240,...,0.140464,0.252648,-0.122916,0.125718,-0.105965,0.125881,0.386595,-0.041969,0.000720,0.184757
23,-0.023663,-0.036003,0.099825,-0.105885,-0.023538,-0.036832,-0.035646,0.371208,0.299632,-0.058494,...,-0.001574,-0.020215,-0.071061,0.022612,-0.208348,-0.239461,0.020040,0.045757,-0.107413,-0.064352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2297,-0.001775,-0.066351,0.042794,-0.136527,-0.104341,-0.017314,-0.060065,-0.018259,-0.126741,-0.196775,...,0.123681,-0.209295,-0.172765,0.050330,0.002482,-0.037433,-0.453667,0.030435,-0.014078,-0.105633
2299,-0.046160,-0.090704,0.017516,-0.173810,-0.018831,-0.279992,-0.159068,-0.045765,0.211489,0.054083,...,-0.097011,0.170226,0.102516,-0.004004,-0.056317,0.188583,-0.281496,-0.095054,0.000092,-0.367352
2303,-0.081125,-0.070460,-0.027897,-0.149444,0.010423,-0.249306,0.006201,-0.151408,0.104449,-0.057814,...,0.044083,-0.012190,0.039966,-0.095581,0.040918,0.206254,0.033324,-0.031259,-0.186926,-0.007825
2309,-0.077200,-0.092769,0.035503,0.010171,-0.196311,-0.176836,-0.105794,-0.006882,0.072589,0.047723,...,0.031505,-0.101525,-0.119016,0.142266,-0.082137,-0.055789,-0.032362,0.013826,0.283720,-0.151110


In [32]:
df_crispr_coeqtl.insert(0, 'gene_name', list(df_crispr_co_eqtl['gene_name']))

In [33]:
df_crispr_coeqtl

Unnamed: 0,gene_name,ACTB,ANKRD11,ANP32B,ARPC1B,ATP5F1D,ATP5MG,ATP5PO,BTF3,CDK6,...,WNK1,XPO1,YBEY,YEATS2,YEATS4,ZFR,ZNF207,ZNF292,ZNF335,ZNHIT1
5,MAD1L1,0.140356,-0.066833,0.044450,0.198355,-0.101130,0.078105,0.048766,0.260557,-0.124514,...,0.075935,0.067429,0.073877,0.060587,-0.057528,0.068533,-0.028117,-0.044752,-0.052449,0.094862
6,CFLAR,0.014106,-0.023794,-0.049322,0.175197,-0.050689,0.294116,0.012185,0.284841,-0.198797,...,0.151068,0.176876,-0.081765,0.107937,-0.035196,-0.016753,0.428431,-0.040985,-0.150139,0.208963
15,POLR2J,-0.029620,-0.125357,-0.108088,0.250185,-0.135939,0.008715,-0.032728,-0.096826,-0.121253,...,0.091389,0.153504,0.087480,0.206128,-0.064640,-0.116076,-0.108389,0.072924,0.061298,-0.239464
18,KMT2E,0.020609,-0.019917,0.259541,-0.137618,0.152495,0.113867,-0.037664,-0.106009,-0.280307,...,0.140464,0.252648,-0.122916,0.125718,-0.105965,0.125881,0.386595,-0.041969,0.000720,0.184757
23,REX1BD,-0.023663,-0.036003,0.099825,-0.105885,-0.023538,-0.036832,-0.035646,0.371208,0.299632,...,-0.001574,-0.020215,-0.071061,0.022612,-0.208348,-0.239461,0.020040,0.045757,-0.107413,-0.064352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2297,TAF15,-0.001775,-0.066351,0.042794,-0.136527,-0.104341,-0.017314,-0.060065,-0.018259,-0.126741,...,0.123681,-0.209295,-0.172765,0.050330,0.002482,-0.037433,-0.453667,0.030435,-0.014078,-0.105633
2299,GTF2H5,-0.046160,-0.090704,0.017516,-0.173810,-0.018831,-0.279992,-0.159068,-0.045765,0.211489,...,-0.097011,0.170226,0.102516,-0.004004,-0.056317,0.188583,-0.281496,-0.095054,0.000092,-0.367352
2303,TAF9,-0.081125,-0.070460,-0.027897,-0.149444,0.010423,-0.249306,0.006201,-0.151408,0.104449,...,0.044083,-0.012190,0.039966,-0.095581,0.040918,0.206254,0.033324,-0.031259,-0.186926,-0.007825
2309,PSMB3,-0.077200,-0.092769,0.035503,0.010171,-0.196311,-0.176836,-0.105794,-0.006882,0.072589,...,0.031505,-0.101525,-0.119016,0.142266,-0.082137,-0.055789,-0.032362,0.013826,0.283720,-0.151110


In [138]:
#df_sig_coegenes.insert(0, 'gene_name', list(df_copy['gene_name']))

In [189]:
df_sig_coegenes

Unnamed: 0,gene_name,ACTB,ANKRD11,ANP32B,ARPC1B,ATP5F1D,ATP5MG,ATP5PO,BTF3,CDK6,...,WNK1,XPO1,YBEY,YEATS2,YEATS4,ZFR,ZNF207,ZNF292,ZNF335,ZNHIT1
2,CYP51A1,-0.087624,-0.102150,0.122183,-0.179362,0.173553,-0.309697,-0.239874,-0.402794,-0.152949,...,0.102662,-0.011830,-0.142337,0.035929,-0.038241,0.321627,-0.104773,0.022562,-0.059978,0.746907
3,BAD,0.014339,-0.060277,0.096079,-0.016038,0.129077,0.043058,0.037041,0.252875,-0.010136,...,-0.062527,0.069799,-0.023542,0.010820,-0.241763,0.135631,0.050625,0.010686,0.046010,-0.200362
4,CD99,-0.052006,-0.119141,-0.031207,0.153649,0.154617,0.253837,0.087892,-0.016870,-0.237172,...,0.076297,0.182119,-0.030352,0.071017,-0.064191,-0.155991,0.436021,-0.046537,-0.128695,-0.092176
5,MAD1L1,0.140356,-0.066833,0.044450,0.198355,-0.101130,0.078105,0.048766,0.260557,-0.124514,...,0.075935,0.067429,0.073877,0.060587,-0.057528,0.068533,-0.028117,-0.044752,-0.052449,0.094862
6,CFLAR,0.014106,-0.023794,-0.049322,0.175197,-0.050689,0.294116,0.012185,0.284841,-0.198797,...,0.151068,0.176876,-0.081765,0.107937,-0.035196,-0.016753,0.428431,-0.040985,-0.150139,0.208963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2316,AC139493.2,-0.105180,-0.106436,0.020978,-0.199064,0.024484,-0.099833,0.045068,-0.365148,0.136766,...,0.054043,0.273045,0.153103,0.160054,-0.069895,0.130380,0.280272,-0.210760,-0.099149,-0.154597
2317,POLR2J3,0.110662,-0.059064,-0.044831,0.310161,-0.008577,-0.026816,0.050588,0.126441,-0.053518,...,0.041066,0.227188,-0.027637,0.020901,-0.039629,-0.169787,0.046470,-0.012299,-0.037838,0.163687
2318,BX890604.2,-0.064387,0.002892,-0.027926,0.036563,-0.042060,0.184467,-0.082343,-0.038038,-0.177864,...,0.063180,-0.031081,0.026889,-0.006933,-0.031753,0.038463,0.050427,-0.060951,-0.117941,0.038536
2319,AC016074.2,-0.066637,0.778836,-0.054772,0.667838,0.018808,0.084599,0.185526,-0.048126,0.581307,...,0.928768,0.173647,-0.082020,0.251644,0.083346,0.070784,1.933251,0.010075,0.022901,0.327709


In [39]:
hist_bokeh_2(df_sig_coegenes, sig_coegene_dict, df_crispr_coeqtl)

Launching server at http://localhost:49527
