## 07 - Infer depleted guides based on the initial guide pool

Some knockouts may affect cell viability. Here we identify the knockouts that have resulted in a significant
reduction in the number of cells compared to the initial guide pool.

In [None]:
from libraries import *
from parameters import *

In [None]:
# Load the rpy2 IPython extension; the last cell of this notebook uses its %%R cell magic.
%load_ext rpy2.ipython

In [None]:
# Run the rest of the notebook from the project directory.
# NOTE(review): projectDir is presumably provided by the star-imported
# `parameters` module -- confirm.
os.chdir(projectDir)
# Last expression of the cell, so the working directory now in effect is displayed.
# (Previously getcwd() was called before chdir() and its value was discarded.)
os.getcwd()

In [None]:
# Load the annotated data object from the path held in par_save_filename_5.
# NOTE(review): `sc` is presumably scanpy and the file presumably the output of an
# earlier pipeline step; both come from the star imports -- confirm.
adata = sc.read(par_save_filename_5)

In [None]:
# Names of the guide (feature barcode) columns, as recorded in adata.uns.
guideColumnNames = adata.uns['feature_barcode_names']

# Slice those columns out of the observation table.
# NOTE(review): presumably one row per cell and one 0/1 column per guide -- confirm.
fBarMat = adata.obs[guideColumnNames]
guideList = fBarMat.columns

Replace the control-guide identifier strings "ONE_NONGENE_SITE_" and "NO_TARGET_" according to your needs. They are used both in the next cell and in the cell that labels the guide types.

In [None]:

# Prefixes that identify the two families of control guides.
nonGenePrefix = "ONE_NONGENE_SITE_"
noTargetPrefix = "NO_TARGET_"


def countSingleGuideCells(guideMatrix, columnMask):
    """Count, per guide, the rows that carry exactly one guide of the selected group.

    Parameters
    ----------
    guideMatrix : pandas.DataFrame
        Guide table (fBarMat). Assumed to hold 0/1 indicators with one row per
        cell and one column per guide -- TODO confirm.
    columnMask : sequence of bool
        One flag per column of ``guideMatrix``; True selects the column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The selected columns restricted to the rows whose values sum to exactly 1,
        and a summary table indexed by guide with the columns "nCellsScreen"
        (column sums of the restricted table) and "Guide" (copy of the index).
    """
    groupGuides = guideMatrix.loc[:, columnMask]
    # The row sum is taken within the group only.
    # NOTE(review): a row with one gene guide plus one control guide therefore
    # passes the filter in both groups -- confirm this is intended.
    groupGuides = groupGuides.loc[groupGuides.sum(axis=1) == 1, :]
    counts = groupGuides.sum(axis=0).to_frame("nCellsScreen")
    counts["Guide"] = counts.index
    return groupGuides, counts


k = [x.startswith(nonGenePrefix) for x in guideList]
nonGeneGuides, k1 = countSingleGuideCells(fBarMat, k)

l = [x.startswith(noTargetPrefix) for x in guideList]
noTargetGuides, k2 = countSingleGuideCells(fBarMat, l)

# Every guide that is not one of the two control families is a gene (knockout) guide.
g = [not (isNonGene or isNoTarget) for isNonGene, isNoTarget in zip(k, l)]
geneGuides, k3 = countSingleGuideCells(fBarMat, g)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the three
# summaries in the same order and keeps the guide names as index.
k = pd.concat([k1, k2, k3])

Import the file containing the initial guide pool distribution. The first two columns of the file must contain, in this order, the name of the guide and the count of that guide in the pool. Hyphens in the guide names are replaced by underscores after import.

In [None]:
# Read the initial guide pool table. A plain "," is used as separator: the former
# regular expression "\\," made pandas fall back to the slower python parsing
# engine (with a ParserWarning) and regex separators do not respect quoted fields.
poolSummary = pd.read_csv(par_initial_guide_pool_file, sep=",")

# Only the first two columns are used (guide name, count in the pool). Dropping any
# further columns keeps the renaming below from failing on wider files.
poolSummary = poolSummary.iloc[:, :2].copy()
poolSummary.columns = ["Guide", "nCellsPool"]
poolSummary["nCellsPool"] = poolSummary["nCellsPool"].astype(int)

# Replace hyphens by underscores in the guide names.
# NOTE(review): presumably needed so the names match the screen's guide columns
# in the merge on "Guide" -- confirm.
poolSummary["Guide"] = poolSummary["Guide"].replace('-', '_', regex=True)

In [None]:
# pd.merge defaults to an inner join, so only guides present in both the pool file
# and the screen counts are kept.
result = pd.merge(poolSummary, k, on="Guide")

# Every guide is a gene (knockout) guide unless its name carries a control prefix.
result["type"] = "Gene"

# Both prefixes include the trailing underscore, exactly as in the cell that splits
# the screen counts into control and gene guides, so the two classifications cannot
# disagree about a guide.
l = result["Guide"].str.startswith("NO_TARGET_")
result.loc[l, "type"] = "Control_NO_TARGET"
l = result["Guide"].str.startswith("ONE_NONGENE_SITE_")
result.loc[l, "type"] = "Control_ONE_NON_GENE_SITE"

## Plot the distribution of the number of cells per guide type

In [None]:

def plotCellsPerGuide(guideCounts, color, xlabel, tickStop, showShape=False):
    """Print summary numbers and plot the distribution of cells per guide.

    Draws a box plot on top of a histogram (bin width 5) of the ``nCellsScreen``
    column, sharing the x axis, and marks the mean (red) and median (blue) on the
    histogram.

    Parameters
    ----------
    guideCounts : pandas.DataFrame
        Rows of ``result`` for one guide type; must contain ``nCellsScreen``.
    color : str
        Fill colour used for both the box plot and the histogram.
    xlabel : str
        Label of the shared x axis (shown below the histogram).
    tickStop : int
        Exclusive upper bound of the x tick positions, which are spaced 20 apart.
    showShape : bool, optional
        Also print ``guideCounts.shape`` between the sum and the median.
    """
    cellsPerGuide = guideCounts.nCellsScreen
    print(cellsPerGuide.sum())
    if showShape:
        print(guideCounts.shape)
    print(cellsPerGuide.median())

    fig, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)}, figsize=(15, 3)
    )
    sns.boxplot(data=guideCounts, x='nCellsScreen', ax=ax_box, color=color)
    sns.histplot(data=guideCounts, x='nCellsScreen', ax=ax_hist, color=color, binwidth=5)

    tickPositions = range(0, tickStop, 20)
    ax_hist.set_xticks(tickPositions)
    ax_hist.set_xticklabels(tickPositions)

    # Draw on the histogram axes explicitly instead of relying on pyplot's notion
    # of the "current" axes.
    ax_hist.axvline(x=cellsPerGuide.mean(), color='red', label="mean")
    ax_hist.axvline(x=cellsPerGuide.median(), color='blue', label="median")
    ax_hist.legend()

    # Remove x axis name for the boxplot
    ax_box.set(xlabel='')
    ax_hist.set(xlabel=xlabel)


# Knockout (gene) guides
kGene = result.loc[result.type == "Gene", :]
plotCellsPerGuide(kGene, "pink", "Number of cells per knock out guide", 600, showShape=True)

# Control guides targeting a non-gene site
kControlNonGene = result.loc[result.type == "Control_ONE_NON_GENE_SITE", :]
plotCellsPerGuide(kControlNonGene, "lightgreen",
                  "Number of cells per control non-gene-site guide", 500)

# Control guides without a target
kControlNoTarget = result.loc[result.type == "Control_NO_TARGET", :]
plotCellsPerGuide(kControlNoTarget, "lightblue",
                  "Number of cells per control no-target guide", 500)



In [None]:
screenCounts = result["nCellsScreen"]
poolCounts = result["nCellsPool"]

# Totals over all guides that survived the merge.
totalScreen = sum(screenCounts)
totalGuidePool = sum(poolCounts)

isKnockOut = result["type"] == "Gene"

# The "...Perc" columns are fractions of the respective total (not multiplied by 100);
# cellRatio is the raw screen count divided by the raw pool count.
result = result.assign(
    nCellsPoolPerc=poolCounts / totalGuidePool,
    nCellsScreenPerc=screenCounts / totalScreen,
    cellRatio=screenCounts / poolCounts,
    GuideType="CONTROL",
)
# Both control families share the label "CONTROL"; gene guides become "KO Guide".
result.loc[isKnockOut, "GuideType"] = "KO Guide"
result = result.sort_values('nCellsScreenPerc')

Test each guide, using the distribution of the screen-to-pool cell ratio (`cellRatio`) of the control guides as the background distribution.

In [None]:
%%R -i result -w 10 -h 5 -u in
library(ggplot2)
library(ggpubr)

# Background distribution: empirical CDF of the screen/pool cell ratio over all
# control guides.
controlRatios = result[result$GuideType == "CONTROL", "cellRatio"]
if (length(controlRatios) == 0) {
    stop("No CONTROL guides found in 'result'; cannot build the background distribution.")
}
d_fun <- ecdf(controlRatios)

# Optional diagnostic plot, currently disabled. Note that the coordinates of the
# two blue segments (0.08587 / 0.05) are hard-coded.
# ggplot(result, aes(cellRatio, colour = GuideType)) +
#       stat_ecdf(geom = "step")+theme_minimal()+
#       labs(
#          x="(# of guides in the screen) / (# of guides in the initial pool)", 
#          y = "CDF")+theme(axis.text = element_text(size=15),
#                   axis.title =  element_text(size=16))+xlim(0, 1)+
#         geom_segment(aes(x = 0, y = 0.05, xend = 0.08587, yend = 0.05), color="blue")+
#         geom_segment(aes(x = 0.08587, y = 0, xend = 0.08587, yend = 0.05), color="blue")


# The function returned by ecdf() is vectorised, so all guides are scored in one
# call. Pval is the fraction of control guides whose cellRatio is less than or
# equal to the cellRatio of the guide.
result$Pval = d_fun(result$cellRatio)

# Gene = text before the first underscore of the guide name. as.character() guards
# against Guide arriving as a factor, on which strsplit() fails.
# NOTE(review): control guides are not excluded here, so they are grouped under the
# first token of their prefix (e.g. "ONE", "NO") -- confirm this is intended.
result$Gene = sapply(as.character(result$Guide), function(x){strsplit(x,"_")[[1]][1]})

# Keep the guides below the 5% level and count them per gene; only genes with more
# than one such guide are reported.
result = result[result$Pval < 0.05,]
hh <- table(result$Gene)
hh = hh[order(hh)]
hh = hh[hh>1]

# as.vector() drops the "table" class. Passing the table itself makes data.frame()
# expand it into two columns (NumberOfGuides.Var1 and NumberOfGuides.Freq).
data.frame(GuideName=names(hh), NumberOfGuides=as.vector(hh))