In [2]:
#!quilt install --force kmoore/crispr_screen_sgRNAs

import pandas as pd
import numpy as np

from quilt.data.kmoore import crispr_screen_sgRNAs


In [3]:
# Print the notebook's working directory via the shell, then display the
# imported package node (the bare name on the last line is rendered as
# the cell's output and lists the package's top-level contents).
!pwd
crispr_screen_sgRNAs

/Users/kmoore/toa/github/crispr_screen_sgRNAs


<PackageNode '/Users/kmoore/toa/github/crispr_screen_sgRNAs/quilt_packages/kmoore/crispr_screen_sgRNAs'>
mappedAndCuratedSgRNAs/
sourceFilesFromAddgene/
README
hg19_allPAM_sites

In [4]:
# Calling a single library node under the mappedAndCuratedSgRNAs group
# displays that one library as a table (see the output below for its columns).
crispr_screen_sgRNAs.mappedAndCuratedSgRNAs.weissmanV2_I()

Unnamed: 0,targetGene,sgRNAname,sgRNA20merSeq,sgRNAgenomicContext30mer,chr_hg19,start_hg19,stop_hg19,strand,cripsrType
0,A1BG,A1BG_weissmanV2_I_1,GAGCAGCTCGAAGGTGACGT,TTAAGAGCAGCTGGAAGGTGACGTGAGCAT,chr10,13896886,13896916,-,sp_dCas9_inhibit
1,A1BG,A1BG_weissmanV2_I_11,GGCGAGGAACCGCCCAGCAA,TGCGGGCGAGGAACCGCCCAGCAAAGGCCC,chr19,58858546,58858576,-,sp_dCas9_inhibit
2,A1BG,A1BG_weissmanV2_I_14,GGGGCACCCAGGAGCGGTAG,GTGTGGGGCACCCAGGAGCGGTAGCGGCAC,chr19,58858762,58858792,+,sp_dCas9_inhibit
3,A1BG,A1BG_weissmanV2_I_15,GTCAAGGTGCACAGGCTCCT,GTGAGTCAAGGTGCACAGGCTCCTGGGCCA,chr19,58864410,58864440,+,sp_dCas9_inhibit
4,A1BG,A1BG_weissmanV2_I_3,GCAGGTGAGTCAAGGTGCAC,GATGGCAGGTGAGTCAAGGTGCACAGGCTC,chr19,58864402,58864432,+,sp_dCas9_inhibit
5,A1BG,A1BG_weissmanV2_I_7,GCTGCAGGGCCTTTGCTGGG,TCCAGCTGCAGGGCCTTTGCTGGGCGGTTC,chr19,58858536,58858566,+,sp_dCas9_inhibit
6,A1BG,A1BG_weissmanV2_I_8,GCTTGATGGCAGGTGAGTCA,TGGTGCTTGATGGCAGGTGAGTCAAGGTGC,chr19,58864394,58864424,+,sp_dCas9_inhibit
7,A1BG,A1BG_weissmanV2_I_9,GGAGACCCAGCGCTAACCAG,CTGGGGAGACCCAGCGCTAACCAGGGGTGC,chr19,58858591,58858621,+,sp_dCas9_inhibit
8,A1CF,A1CF_weissmanV2_I_4,GGGGATCTCTGAAATTACTC,GGGTGGGGATCTCTGAAATTACTCAGGTAT,chr10,52645336,52645366,-,sp_dCas9_inhibit
9,A2ML1,A2ML1_weissmanV2_I_2,GATCCTTTACCCAGAGAGGG,CACTGATCCTTTACCCAGAGAGGGAGGCTG,chr12,8975445,8975475,+,sp_dCas9_inhibit


Search by gene name
------------------
The example below searches all of the labs' published CRISPR libraries for gRNAs that target any of the search genes. All of the individual libraries are loaded as a single DataFrame by accessing the `mappedAndCuratedSgRNAs` group. The search terms (gene names) are entered as a separate DataFrame. The match itself is executed using Pandas [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) on these two DataFrames.

In [5]:
# Load all the mapped and curated CRISPR libraries into
# a single DataFrame. Calling the mappedAndCuratedSgRNAs group node
# itself (rather than one library under it) is what combines them.
all_labs = crispr_screen_sgRNAs.mappedAndCuratedSgRNAs()

In [None]:
# Gene names to look up. They are wrapped in a one-column DataFrame
# (column 'term') so they can be joined against the library table.
search_terms = ['DNMT3A', 'SHH', 'HOXD', 'LNP', 'EVX2']
search = pd.Series(search_terms, name='term').to_frame()
search

In [None]:
# Join the search terms to the combined library table on gene name;
# the default (inner) merge keeps only gRNAs whose targetGene is one
# of the search terms.
result = pd.merge(search, all_labs, left_on='term', right_on='targetGene')
result

Search by genomic location
-------------------------
The next example searches the same mapped and curated CRISPR libraries by genomic location (Chromosome, Start, Stop). For speed, the all-library DataFrame is partitioned on Chromosome using Pandas [groupby](https://pandas.pydata.org/pandas-docs/stable/groupby.html). Each search location is then matched against only the library entries from the matching chromosome.

In [None]:
# Genomic regions to search, one row per region
# (columns: chr, start, stop).
searchloc = pd.DataFrame({
    'chr': ['chr11', 'chr1'],
    'start': [134145657, 109106801],
    'stop': [134145680, 109106824],
})
searchloc

In [10]:
# Group the combined library table by its chromosome column so each
# location search only has to scan the rows of one chromosome.
crispr_chrs = all_labs.groupby(by='chr_hg19')

In [None]:
# Match each input location against the group corresponding to
# the input location's chromosome.
matches = []
for idx, a_row in searchloc.iterrows():
    try:
        chr_grp = crispr_chrs.get_group(a_row['chr'])
    except KeyError:
        # No library entries at all on this chromosome: nothing to match.
        continue
    # Keep library rows whose start/stop span overlaps the search span.
    overlaps = ((chr_grp['start_hg19'] < a_row['stop'])
                & (a_row['start'] <= chr_grp['stop_hg19']))
    a_match = chr_grp.loc[overlaps]
    if len(a_match.index) > 0:
        matches.append(a_match)

# pd.concat raises ValueError on an empty list, so when nothing matched
# fall back to an empty frame that keeps the library's columns.
allmatches = pd.concat(matches) if matches else all_labs.iloc[0:0]
allmatches

Search by genomic location and add non-library PAM-site matches
-----------------------------------------------------------
The last example extends the search-by-genomic-location example above by additionally searching a table of all human PAM sites. For each search region, the result contains the matching published and curated gRNAs if any exist. If no matches are found in the published libraries for a region, the result instead contains gRNAs matching any PAM site in that region. For background, see [CRISPR between the genes: how to experiment with enhancers and epigenomics](https://genomics.quiltdata.com/2016/04/18/crisper-between-the-genes-enhancers/).

In [6]:
# Load the table of hg19 PAM sites that ships with the package
# and preview its first rows.
pams = crispr_screen_sgRNAs.hg19_allPAM_sites()
pams.head()

Unnamed: 0,name,gRNA_20mer,Chromosome,start,stop,strand,oligo_plate_F,oligo_plate_R,oligo_library,gRNA_30mer
0,hg19DHS.chr1.100000160.100000310_000,GACGACCCGAATGAACCCAG,chr1,100000177,100000200,+,ACCGGACGACCCGAATGAACCCAG,AAACGACGACCCGAATGAACCCAG,GGAAAGGACGAAACACCGGACGACCCGAATGAACCCAGGTTTTAGA...,GACTGACGACCCGAATGAACCCAGAGGCAA
1,hg19DHS.chr1.100000160.100000310_001,ATGGCTTGCTTAGATATTTC,chr1,100000220,100000243,+,ACCGATGGCTTGCTTAGATATTTC,AAACATGGCTTGCTTAGATATTTC,GGAAAGGACGAAACACCGATGGCTTGCTTAGATATTTCGTTTTAGA...,TGAGATGGCTTGCTTAGATATTTCTGGGAA
2,hg19DHS.chr1.100000160.100000310_002,GTTAGAATGTTGTTGTTTCC,chr1,100000268,100000291,+,ACCGGTTAGAATGTTGTTGTTTCC,AAACGTTAGAATGTTGTTGTTTCC,GGAAAGGACGAAACACCGGTTAGAATGTTGTTGTTTCCGTTTTAGA...,AAAGGTTAGAATGTTGTTGTTTCCTGGTAA
3,hg19DHS.chr1.100000160.100000310_003,TGAGAATAAAAAATTGCCTC,chr1,100000193,100000216,-,ACCGTGAGAATAAAAAATTGCCTC,AAACTGAGAATAAAAAATTGCCTC,GGAAAGGACGAAACACCGTGAGAATAAAAAATTGCCTCGTTTTAGA...,CTCATGAGAATAAAAAATTGCCTCTGGGTT
4,hg19DHS.chr1.100001120.100001270_003,acaaatgccatagcaaagtt,chr1,100001234,100001257,-,ACCGacaaatgccatagcaaagtt,AAACacaaatgccatagcaaagtt,GGAAAGGACGAAACACCGacaaatgccatagcaaagttGTTTTAGA...,gtttacaaatgccatagcaaagttaggaag


In [None]:
# Enter the search terms into a DataFrame
searchloc = pd.DataFrame([{'chr' : 'chr11', 'start' : 134145657, 'stop' : 134145680},
                          {'chr' : 'chr1', 'start' : 109106801, 'stop' : 109106824},
                          {'chr' : 'chr2', 'start' : 243170014, 'stop' : 300000000}])
searchloc

In [8]:
# Group the PAM-site table by chromosome so each location search
# only has to scan the PAM sites of one chromosome.
pams_chrs = pams.groupby(by='Chromosome')

In [None]:
def _overlapping(grouped, chrom, start, stop, start_col, stop_col):
    """Return the rows of one chromosome's group that overlap a search span.

    Parameters
    ----------
    grouped : pandas GroupBy
        Table grouped by chromosome name.
    chrom : str
        Chromosome whose group should be searched.
    start, stop : int
        Bounds of the search span.
    start_col, stop_col : str
        Names of the columns holding each row's start and stop.

    Returns
    -------
    pandas.DataFrame or None
        The overlapping rows, or None when the chromosome has no group
        or none of its rows overlap the span.
    """
    try:
        grp = grouped.get_group(chrom)
    except KeyError:
        # The table has no rows at all on this chromosome.
        return None
    hits = grp.loc[(grp[start_col] < stop) & (start <= grp[stop_col])]
    return hits if len(hits.index) > 0 else None


# For each search region, prefer published/curated gRNAs; only when the
# libraries have nothing in that region fall back to the PAM-site table.
# NOTE: crispr_chrs is the chromosome-grouped library table built in the
# "Search by genomic location" section; that cell must have been run.
matches = []
for idx, a_row in searchloc.iterrows():
    a_match = _overlapping(crispr_chrs, a_row['chr'], a_row['start'],
                           a_row['stop'], 'start_hg19', 'stop_hg19')
    if a_match is None:
        a_match = _overlapping(pams_chrs, a_row['chr'], a_row['start'],
                               a_row['stop'], 'start', 'stop')
    if a_match is not None:
        matches.append(a_match)

# pd.concat raises ValueError on an empty list; show an empty frame instead.
allmatches = pd.concat(matches) if matches else pd.DataFrame()
allmatches

TEST/THROW-AWAY CODE BELOW
--------------

In [None]:
# PAM-only variant of the location search: for every search region,
# collect the PAM sites on the same chromosome whose start/stop span
# overlaps the region, then stack all hits into one frame.
matches = []
for _, region in searchloc.iterrows():
    pam_grp = pams_chrs.get_group(region['chr'])
    starts_before_region_end = pam_grp['start'] < region['stop']
    ends_at_or_after_region_start = pam_grp['stop'] >= region['start']
    pam_match = pam_grp[starts_before_region_end & ends_at_or_after_region_start]
    if len(pam_match) > 0:
        matches.append(pam_match)
allmatches = pd.concat(matches)
allmatches

In [11]:
# Pull the chr2 partition out of each grouped table so the PAM sites and
# the curated library entries on that chromosome can be compared directly.
pamCHR = pams_chrs.get_group('chr2')
crisprCHR = crispr_chrs.get_group('chr2')

In [None]:
# Print every chr2 PAM site whose span overlaps no chr2 library entry.
for idx, row in pamCHR.iterrows():
    starts_before_pam_end = crisprCHR['start_hg19'] < row['stop']
    ends_at_or_after_pam_start = crisprCHR['stop_hg19'] >= row['start']
    pam_match = crisprCHR[starts_before_pam_end & ends_at_or_after_pam_start]
    if pam_match.shape[0] == 0:
        print(idx, row)