In [1]:
import numpy as np
import pandas as pd
import os
import math
from itertools import chain
import pickle
import sqlalchemy
from sqlalchemy.orm import sessionmaker
from sqlalchemy.engine import Engine
from sqlalchemy import event


In [2]:
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    cursor = dbapi_connection.cursor()
    cursor.execute("PRAGMA foreign_keys=ON")
    cursor.close()

In [3]:
# @event.listens_for(Engine, "connect")
# def set_sqlite_pragma(dbapi_connection, connection_record):
#     cursor = dbapi_connection.cursor()
#     #cursor.execute("PRAGMA foreign_keys=OFF")
#     cursor.execute("PRAGMA foreign_keys=ON")

#     invalid_foreign_keys = cursor.execute("PRAGMA foreign_key_check").fetchall()
#     if invalid_foreign_keys:
#         print('A')
#         msg = "\n".join(
#             f"{invalid_key.table} (rowid: {invalid_key.rowid}) -> {invalid_key.parent}"
#             for invalid_key in invalid_foreign_keys
#         )
#         msg = 'A'
#         raise ValueError(f"Invalid Foreign Keys detected: {msg}")
        
#     cursor.close()

## Create a SL Database with Wet-Lab Experiments

all SL data:

* Diehl
* Han
* Horlbeck (Done)
* Ito
* Laufer (Done)
* Parrish
* Shen
* Thompson
* Wong
* Zhao (Done)

In [4]:
#PREV_REF = pd.read_csv('../Python/original_GI_110422-Copy1.csv')

In [5]:
# Load in the datasets
data_locs = "/users/PAS1376/bg12/SyntheticLethality/SyntheticLethalityReview/Project/ml_inputs"
learning_goals_loc_general =  os.path.join(data_locs, "learning_goals")
learning_goals_loc_general = '/users/PAS1376/bg12/SyntheticLethality - NewDB/data'


In [6]:
# read the database
#SLKB_engine = sqlalchemy.create_engine('sqlite:///SLKB_sqlite3')
#SLKB_engine = sqlalchemy.create_engine('sqlite:///shan_new_SL')
SLKB_engine = sqlalchemy.create_engine('sqlite:///shuai_SL')
#SLKB_engine_session = sessionmaker(bind=SLKB_engine)

In [7]:
db_metadata = sqlalchemy.MetaData(bind=SLKB_engine)
db_metadata.reflect(SLKB_engine)

In [8]:
db_metadata.tables

FacadeDict({'CDKO_EXPERIMENT_DESIGN': Table('CDKO_EXPERIMENT_DESIGN', MetaData(bind=Engine(sqlite:///shuai_SL)), Column('sgRNA_id', INTEGER(), table=<CDKO_EXPERIMENT_DESIGN>, primary_key=True), Column('sgRNA_guide_name', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('sgRNA_guide_seq', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('sgRNA_target_name', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('study_origin', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), schema=None), 'CDKO_ORIGINAL_SL_RESULTS': Table('CDKO_ORIGINAL_SL_RESULTS', MetaData(bind=Engine(sqlite:///shuai_SL)), Column('id', INTEGER(), table=<CDKO_ORIGINAL_SL_RESULTS>, primary_key=True), Column('gene_pair_id', INTEGER(), table=<CDKO_ORIGINAL_SL_RESULTS>), Column('gene_pair', TEXT(), table=<CDKO_ORIGINAL_SL_RESULTS>, nullable=False), Column('study_origin', TEXT(), table=<CDKO_ORIGINAL_SL_RESULTS>, nullable=False), Column('cell_line_origin', TEXT(), table=<CDKO_O

In [9]:
# To delete all
#db_metadata.drop_all()

In [10]:
def make_cell_line_matrix(input_matrix, cell_line = "temp"):
    input_matrix_res = input_matrix.copy(deep = True)
    input_matrix_res = input_matrix_res.astype(object)
    input_matrix_res.values[np.nonzero(input_matrix_res.values)] = cell_line
    return(input_matrix_res)

In [11]:
def create_matrix_from_df(input_data):
    all_genes = sorted(set(input_data.loc[:, 'Gene_A'].tolist() + input_data.loc[:, 'Gene_B'].tolist()))
    print("Total Number of genes: " + str(len(all_genes)))
    curr_GI_matrix = pd.DataFrame(columns = all_genes, index = all_genes)
    for i in range(len(input_data.index)):
        gene_a = input_data.loc[i, "Gene_A"]
        gene_b = input_data.loc[i, "Gene_B"]
        curr_GI_matrix.loc[gene_a, gene_b] = input_data.loc[i, "GI_Score"]
        curr_GI_matrix.loc[gene_b, gene_a] = input_data.loc[i, "GI_Score"]
        if i % 1000 == 0:
            print(str(i) + "/" + str(len(input_data.index)))
    return(curr_GI_matrix.fillna(0))

In [12]:
def check_repeated_constructs(x, index_loc):
    if len(x) < max(index_loc):
        sub = index_loc[index_loc < len(x)]
        return(x[sub])
    else:
        return(x[index_loc])

In [13]:
## store pubmed IDs
study_name_to_pubmed_id = {}

study_name_to_pubmed_id['diehl_data'] = '33956155'

study_name_to_pubmed_id['han_data'] = '28319085'

study_name_to_pubmed_id['horlbeck_data'] = '30033366'

study_name_to_pubmed_id['ito_data'] = '34857952'

study_name_to_pubmed_id['parrish_data'] = '34469736'

study_name_to_pubmed_id['shen_data'] = '28319113'

study_name_to_pubmed_id['thompson_data'] = '33637726'

study_name_to_pubmed_id['wong_data'] = '26864203'

study_name_to_pubmed_id['zhao_data'] = '29452643'

study_name_to_pubmed_id['shantang_data'] = '36060092'

study_name_to_pubmed_id['najm_data'] = '29251726'

study_name_to_pubmed_id['shantang_2CL_data'] = 'Shan_2CL'

study_name_to_pubmed_id['shuai_data'] = 'Shuai'



In [14]:
## controls
controls = {}

# diehl et al
controls['diehl_data'] = ["wildtype-I-CeuI", "wildtype-I-SceI", "nan"] + ["Non-Human-Target-114", "Non-Human-Target-122",
                                    "Non-Human-Target-144", "Non-Human-Target-148", "Non-Human-Target-161", "Non-Human-Target-178", "Non-Human-Target-185", "Non-Human-Target-222", "Non-Human-Target-223",
                                    "Non-Human-Target-224", "Non-Human-Target-23", "Non-Human-Target-243", "Non-Human-Target-245", "Non-Human-Target-249", "Non-Human-Target-292", "Non-Human-Target-298",
                                    "Non-Human-Target-31", "Non-Human-Target-311", "Non-Human-Target-313", "Non-Human-Target-327", "Non-Human-Target-333", "Non-Human-Target-335", "Non-Human-Target-339",
                                    "Non-Human-Target-341", "Non-Human-Target-343", "Non-Human-Target-389", "Non-Human-Target-39", "Non-Human-Target-397", "Non-Human-Target-398", "Non-Human-Target-40", 
                                    "Non-Human-Target-402", "Non-Human-Target-42", "Non-Human-Target-432", "Non-Human-Target-444", "Non-Human-Target-466", "Non-Human-Target-479", "Non-Human-Target-495",
                                    "Non-Human-Target-512", "Non-Human-Target-515", "Non-Human-Target-526", "Non-Human-Target-532", "Non-Human-Target-542", "Non-Human-Target-547", "Non-Human-Target-55", 
                                    "Non-Human-Target-551", "Non-Human-Target-560", "Non-Human-Target-565", "Non-Human-Target-584", "Non-Human-Target-588", "Non-Human-Target-595", "Non-Human-Target-602",
                                    "Non-Human-Target-622", "Non-Human-Target-636", "Non-Human-Target-637", "Non-Human-Target-654", "Non-Human-Target-659", "Non-Human-Target-668", "Non-Human-Target-676",
                                    "Non-Human-Target-678", "Non-Human-Target-681", "Non-Human-Target-692", "Non-Human-Target-719", "Non-Human-Target-732", "Non-Human-Target-736", "Non-Human-Target-748",
                                    "Non-Human-Target-752", "Non-Human-Target-766", "Non-Human-Target-782", "Non-Human-Target-789", "Non-Human-Target-798", "Non-Human-Target-799", "Non-Human-Target-801",
                                    "Non-Human-Target-805", "Non-Human-Target-807", "Non-Human-Target-808", "Non-Human-Target-814", "Non-Human-Target-816", "Non-Human-Target-822", "Non-Human-Target-824",
                                    "Non-Human-Target-827", "Non-Human-Target-828", "Non-Human-Target-835", "Non-Human-Target-85", "Non-Human-Target-857", "Non-Human-Target-863", "Non-Human-Target-864",
                                    "Non-Human-Target-877", "Non-Human-Target-88", "Non-Human-Target-884", "Non-Human-Target-900", "Non-Human-Target-902", "Non-Human-Target-905", "Non-Human-Target-941",
                                    "Non-Human-Target-942", "Non-Human-Target-953", "Non-Human-Target-954", "Non-Human-Target-958", "Non-Human-Target-959", "Non-Human-Target-965", "Non-Human-Target-969",
                                    "Non-Human-Target-970", "Non-Human-Target-99", "Non-Human-Target-995", "Non-Human-Target-997"]

# horlbeck et al
controls['horlbeck_data'] = ["negative"]

# parrish et al
controls['parrish_data'] = ["nt" + str(i+1) for i in range(975)] + ["FAKE_GENE_" + str(i+1) for i in range(50)]
controls['parrish_data_original'] = ["nt" + str(i+1) for i in range(975)] + ["FAKE_GENE_" + str(i+1) for i in range(50)]

# wong et al
controls['wong_data'] = ["DUMMYGUIDE"]#, 'GFP', 'RFP'

# zhao et al
controls['zhao_data'] = ["0", 'CONTROL']

# tang et al
controls['shantang_data'] = ['0SAFE',
                             '0SAFE-SAFE-GE',
                             '0SAFE-SAFE-SP',
                             '0SAFE-SAFE-MP',
                             '0SAFE-SAFE-U2',
                             '0SAFE-SAFE-DTKP',
                             '0SAFE-SAFE-ACOC',
                             '0SAFE-SAFE-TMM',
                             '0SAFE-SAFE-U1',
                             '0SAFE-SAFE-U3']

controls['ito_data'] = ['AAVS1']

controls['najm_data'] = ["HPRT INTRON",
                         "6T",
                         "EEF2",
                         "CD81"]

controls['shantang_2CL_data'] = ['0SAFE',
                                 '0SAFE-SAFE-GE',
                                 '0SAFE-SAFE-SP',
                                 '0SAFE-SAFE-MP',
                                 '0SAFE-SAFE-U2',
                                 '0SAFE-SAFE-DTKP',
                                 '0SAFE-SAFE-ACOC',
                                 '0SAFE-SAFE-TMM',
                                 '0SAFE-SAFE-U1',
                                 '0SAFE-SAFE-U3']


controls['shuai_data'] = ['0SAFE',
                                 '0SAFE-SAFE-GE',
                                 '0SAFE-SAFE-SP',
                                 '0SAFE-SAFE-MP',
                                 '0SAFE-SAFE-U2',
                                 '0SAFE-SAFE-DTKP',
                                 '0SAFE-SAFE-ACOC',
                                 '0SAFE-SAFE-TMM',
                                 '0SAFE-SAFE-U1',
                                 '0SAFE-SAFE-U3']

for study in controls:
    controls[study] = [i.upper() for i in controls[study]]# + [i for i in controls[study]]

In [110]:
## conditions
study_conditions = {}

# diehl et al
study_conditions['diehl_data'] = [['ctrl_1',
                                    'ctrl_2'],
                                   ['rep_1',
                                    'rep_2',
                                    'rep_3']]

# ctrl_1;ctrl_2;rep_1;rep_2;rep_3
# ["wildtype-I-CeuI", "wildtype-I-SceI", "nan"]

# horlbeck et al
study_conditions['horlbeck_data'] = {}
# study_conditions['horlbeck_data']['JURKAT'] = [['JURKAT_barcode,T0,rep1',
#                                                 'JURKAT_barcode,T0,rep2'],
#                                                ['JURKAT_barcode,cyc,rep1',
#                                                 'JURKAT_barcode,cyc,rep2',]]
study_conditions['horlbeck_data']['JURKAT'] = [['JURKAT_tripleseq,T0,rep1',
                                                'JURKAT_tripleseq,T0,rep2'],
                                               ['JURKAT_tripleseq,cyc,rep1',
                                                'JURKAT_tripleseq,cyc,rep2',]]

# study_conditions['horlbeck_data']['K562'] = [['K562_barcode,T0,rep1',
#                                         'K562_barcode,T0,rep2'],
#                                        ['K562_barcode,cyc,rep1',
#                                         'K562_barcode,cyc,rep2',]]
study_conditions['horlbeck_data']['K562'] = [['K562_barcode,T0,rep1',
                                        'K562_barcode,T0,rep2'],
                                       ['K562_tripleseq,cyc,rep1',
                                        'K562_tripleseq,cyc,rep2',]]

# parrish et al
study_conditions['parrish_data'] = [["plasmid_1", 
                                     "plasmid_2",
                                     "plasmid_3"],
                                    ["LTP_1",
                                     "LTP_2",
                                     "LTP_3"]]
study_conditions['parrish_data_original'] = [["plasmid_1", 
                                     "plasmid_2",
                                     "plasmid_3"],
                                    ["LTP_1",
                                     "LTP_2",
                                     "LTP_3"]]

# wong et al
study_conditions['wong_data'] = [["day5 (Replicate 1)", 
                                  "day5 (Replicate 2)"],
                                 ["day20 (Replicate 1)",
                                  "day20 (Replicate 2)"]]

# zhao et al
study_conditions['zhao_data'] = {}
study_conditions['zhao_data']["HELA"] = [["Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts",
                                          "Hela_MV4_d3_2_S2_trimmed53_len_filtered_counts"],
                                         ["Hela_MV4_d28_1_S5_trimmed53_len_filtered_counts",
                                          "Hela_MV4_d28_2_S6_trimmed53_len_filtered_counts"]]

study_conditions['zhao_data']["A549"] = [["A549_MV4_d3_1_S1_trimmed53_len_filtered_counts",
                                          "A549_MV4_d3_2_S2_trimmed53_len_filtered_counts"],
                                         ["A549_MV4_d28_1_S7_trimmed53_len_filtered_counts",
                                          "A549_MV4_d28_2_S8_trimmed53_len_filtered_counts"]]

study_conditions['shantang_data'] = [["T0_1", 
                                     "T0_2"],
                                    ["T12_1",
                                     "T12_2"]]

study_conditions['shantang_data'] = [["T0_1", 
                                     "T0_2"],
                                    ["T12_1",
                                     "T12_2"]]

study_conditions['shuai_data'] = [['T1',
                                   'T2',
                                   'T3'],
                                  ['F1',
                                   'F2',
                                   'F3']]


study_conditions['shantang_2CL_data'] = {}
study_conditions['shantang_2CL_data']["SAOS2"] = [["S0A", 
                                                   "S0B",
                                                   "S0C"],
                                                  ["SEA", 
                                                   "SEB",
                                                   "SEC"]]
study_conditions['shantang_2CL_data']["TT2"] =   [["T0A", 
                                                   "T0B",
                                                   "T0C"],
                                                  ["TEA", 
                                                   "TEB",
                                                   "TEC"]]

study_conditions['najm_data'] = [['pDNA_Reads'],
                                 ['Rep_A_Reads',
                                  'Rep_B_Reads',
                                  'Rep_C_Reads']]


In [16]:
def create_placeholder_scores(curr_counts, sequence_ref):
    # we should add genes to the KB that can later be modified following scoring
    
    # first, set the controls so they can be removed
    curr_counts.loc[curr_counts['Guide 1'].isin(sequence_ref.loc[sequence_ref['sgRNA_target_name'] == 'control', 'sgRNA_guide_name'].values), 'Gene 1'] = 'CONTROL'
    curr_counts.loc[curr_counts['Guide 2'].isin(sequence_ref.loc[sequence_ref['sgRNA_target_name'] == 'control', 'sgRNA_guide_name'].values), 'Gene 2'] = 'CONTROL'

    idx = (curr_counts['Gene 1'] == 'CONTROL') | (curr_counts['Gene 2'] == 'CONTROL')
    # remove them
    curr_counts = curr_counts[~idx]

    # add sorted genes so they can be removed
    curr_counts['sorted_genes'] = ['_'.join(sorted([curr_counts['Gene 1'].iloc[i], curr_counts['Gene 2'].iloc[i]])) for i in range(curr_counts.shape[0])]
    curr_counts.drop_duplicates(subset = ['sorted_genes', 'Cell Line'], keep = 'first', inplace = True)

    # drop the same genes as well
    curr_counts = curr_counts.loc[curr_counts['Gene 1'] != curr_counts['Gene 2']]
    curr_counts.reset_index(drop = True, inplace = True)

    # proceed to create the GI and return
    curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
    curr_GI["Gene_A"] = curr_counts['Gene 1'].values
    curr_GI["Gene_B"] = curr_counts['Gene 2'].values
    curr_GI["Study_Source"] = [curr_counts['Study'].iloc[0]] * curr_GI.shape[0]
    curr_GI["Cell_Line"] = curr_counts['Cell Line'].values
    curr_GI = curr_GI.fillna(0)
    
    return(curr_GI)


In [17]:
def prepare_study_for_export(sequence_ref, counts_ref, score_ref, study_controls = None, study_conditions = None, can_control_be_substring = True):
    
    ## make sure the columns are within each table, if not return error
    sequence_ref_needed_columns = {'sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name'}
    
    if sequence_ref is not None:
        if len(sequence_ref_needed_columns.difference(sequence_ref.columns)) > 0:
            print('Error')
            return
        # reset index by default
        sequence_ref.reset_index(drop = True, inplace = True)
    
    counts_ref_needed_columns = {'Guide 1', 'Guide 2', 'Gene 1', 'Gene 2', 'Count Replicates', 'Cell Line', 'Study', 'Condition'}
    if counts_ref is not None:
        if len(counts_ref_needed_columns.difference(counts_ref.columns)) > 0:
            print('Error')
            return
        # reset index by default
        counts_ref.reset_index(drop = True, inplace = True)
    
    score_ref_needed_columns = {'Gene_A', 'Gene_B', 'Study_Source', 'Cell_Line', 'GI_Score', 'GI_Cutoff', 'Stat_Score', 'Stat_Cutoff'}
    if (score_ref is None) and (counts_ref is not None):
        print('There are no scores, but there are counts...Generating Placeholder...')
        score_ref = create_placeholder_scores(counts_ref.copy(), sequence_ref.copy())
    if len(score_ref_needed_columns.difference(score_ref.columns)) > 0:
        print('Error')
        return
    # reset index by default
    score_ref.reset_index(drop = True, inplace = True)
    
    
    ## prepare each table to be inserted to their respective tables
    
    print("Starting processing...")
    
    ################################# first, handle the scores ref
    print('Score reference...')
    
    # fill NA
    score_ref = score_ref.fillna(0)
    
    for col in ['Gene_A', 'Gene_B', 'Cell_Line']:
        score_ref[col] = [i.upper() for i in score_ref[col]]
    
    genes_A = score_ref['Gene_A'].values
    genes_B = score_ref['Gene_B'].values
    sorted_genes = []
    for i in range(score_ref.shape[0]):
        sorted_genes.append('_'.join(sorted([genes_A[i], genes_B[i]])))
    score_ref["Gene Pair"] = sorted_genes

    # remove same ones
    score_ref = score_ref.loc[~(score_ref["Gene_A"].values == score_ref["Gene_B"].values)]

    # remove controls from SL scores
    if study_controls is not None:
        control_idx = np.array([False] * score_ref.shape[0])

        for curr_control in study_controls:
            
            if can_control_be_substring:
                control_idx = control_idx | np.array([True if curr_control in i else False for i in score_ref["Gene_A"]]) | np.array([True if curr_control in i else False for i in score_ref["Gene_B"]])
                
            control_idx = control_idx | np.array([True if i in curr_control else False for i in score_ref["Gene_A"]]) | np.array([True if i in curr_control else False for i in score_ref["Gene_B"]])

        print('Controls within SL score that are removed: ')
        print(control_idx.sum())
        print('---')

        score_ref = score_ref.loc[~control_idx]
        
    if (score_ref['Stat_Cutoff'].iloc[0] != 0) and (score_ref['GI_Cutoff'].iloc[0] != 0):
        print('Both GI and Stat cutoffs are present...')
        score_ref['SL_or_not'] = (score_ref['GI_Score'] <= (score_ref['GI_Cutoff'].iloc[0])) & (score_ref['Stat_Score'] <= (score_ref['Stat_Cutoff'].iloc[0]))
    elif score_ref['GI_Cutoff'].iloc[0] != 0:
        print('Only GI cutoff is present...')
        score_ref['SL_or_not'] = score_ref['GI_Score'] <= score_ref['GI_Cutoff'].iloc[0]
    elif score_ref['Stat_Score'].iloc[0] != 0:
        print('Only Stat cutoff is present...')
        score_ref['SL_or_not'] = score_ref['Stat_Score'] <= score_ref['Stat_Cutoff'].iloc[0]
    else:
        print('No scores/stats cutoffs are available, possibly generated. Setting all to be NOT SL')
        score_ref['SL_or_not'] = [False] * score_ref.shape[0]
    
    # rename the columns
    score_ref = score_ref.rename(columns = {  'Gene_A': 'gene_1',
                                    'Gene_B': 'gene_2',
                                    'SL_or_not': 'SL_or_not',
                                    'GI_Score': 'SL_score',
                                    'Stat_Score': 'statistical_score',
                                    'GI_Cutoff': 'SL_score_cutoff',
                                    'Stat_Cutoff': 'statistical_score_cutoff',
                                    'Cell_Line': 'cell_line_origin',
                                    'Study_Source': 'study_origin',
                                    'Gene Pair': 'gene_pair'})
    
    ################################# score ref - DONE
    
    print('Counts reference...')
    
    if counts_ref is not None:

        for col in ['Guide 1', 'Guide 2', 'Gene 1', 'Gene 2', 'Cell Line']:
            counts_ref[col] = [i.upper() for i in counts_ref[col]]


        # label whether single, double, or control
        sgRNA_true_pair_index = np.array([i for i in range(counts_ref.shape[0]) if (str(counts_ref["Gene 1"].iloc[i]) not in study_controls) and (str(counts_ref["Gene 2"].iloc[i]) not in study_controls) and (str(counts_ref["Gene 1"].iloc[i]) != str(counts_ref["Gene 2"].iloc[i]))])
        print(' '.join(["Number of double pairs:", str(len(sgRNA_true_pair_index))]))

        sgRNA_control_pair_index = np.array([i for i in range(counts_ref.shape[0]) if (str(counts_ref["Gene 1"].iloc[i]) in study_controls) and (str(counts_ref["Gene 2"].iloc[i]) in study_controls)])
        print(' '.join(["Number of controls:", str(len(sgRNA_control_pair_index))]))

        sgRNA_single_gene_index = np.array(sorted(list(set(range(counts_ref.shape[0])).difference(set(np.concatenate((sgRNA_true_pair_index, sgRNA_control_pair_index)))))))
        print(' '.join(["Number of singles:", str(len(sgRNA_single_gene_index))]))

        if (len(sgRNA_single_gene_index) + len(sgRNA_control_pair_index) + len(sgRNA_true_pair_index)) != counts_ref.shape[0]:
            print('Missing annotation')
            print(counts_ref.shape[0] - (len(sgRNA_single_gene_index) + len(sgRNA_control_pair_index) + len(sgRNA_true_pair_index)))

        counts_ref['Target Type'] = 'N/A'
        counts_ref['Target Type'].iloc[sgRNA_true_pair_index] = 'Dual'
        counts_ref['Target Type'].iloc[sgRNA_control_pair_index] = 'Control'
        counts_ref['Target Type'].iloc[sgRNA_single_gene_index] = 'Single'


        if 'Type' in counts_ref.columns:
            counts_ref = counts_ref.drop(columns = ['Type'])
        if 'Sequencing' in counts_ref.columns:
            counts_ref = counts_ref.drop(columns = ['Sequencing'])


        ## seperate the replicate counts across T0 and TEnd
        counts_ref['T0_counts'] = ""
        counts_ref['T0_replicate_names'] = ""
        counts_ref['TEnd_counts'] = ""
        counts_ref['TEnd_replicate_names'] = ""

        if isinstance(study_conditions, dict):
            # for different cell lines within a study

            for cell_line in study_conditions:
                curr_conditions = study_conditions[cell_line]

                # access the cell line counts
                access_level = counts_ref.loc[counts_ref['Cell Line'] == cell_line].copy()

                ## get all conditions
                condition = access_level['Condition'].value_counts().index.tolist()
                condition = condition[0].split(';')

                # time point T_0
                t_0_index = np.array([i for i in range(len(condition)) if condition[i] in curr_conditions[0]])
                # time point T_end
                t_end_index = np.array([i for i in range(len(condition)) if condition[i] in curr_conditions[1]])

                # get counts
                replicate_sep =  access_level["Count Replicates"].apply(    
                    lambda x: np.array(x.split(";"), dtype = np.float64)
                )

                # get time point 
                t_0_comb = replicate_sep.apply(
                    lambda x: check_repeated_constructs(x, t_0_index)
                ).apply(lambda x: ';'.join(x.astype(np.str_)))

                t_end_comb = replicate_sep.apply(
                    lambda x: check_repeated_constructs(x, t_end_index)#np.median(x[t_end_index])
                ).apply(lambda x: ';'.join(x.astype(np.str_)))

                access_level['T0_counts'] = t_0_comb
                access_level['T0_replicate_names'] = ';'.join(curr_conditions[0])
                access_level['TEnd_counts'] = t_end_comb
                access_level['TEnd_replicate_names'] = ';'.join(curr_conditions[1])

                counts_ref.loc[counts_ref['Cell Line'] == cell_line] = access_level
        else:
            # for only one cell line
            curr_conditions = study_conditions

            ## get all conditions
            condition = counts_ref['Condition'].value_counts().index.tolist()
            condition = condition[0].split(';')

            # time point T_0
            t_0_index = np.array([i for i in range(len(condition)) if condition[i] in curr_conditions[0]])
            # time point T_end
            t_end_index = np.array([i for i in range(len(condition)) if condition[i] in curr_conditions[1]])

            # get counts
            replicate_sep =  counts_ref["Count Replicates"].apply(    
                lambda x: np.array(x.split(";"), dtype = np.float64)
            )

            # get time point 
            t_0_comb = replicate_sep.apply(
                lambda x: check_repeated_constructs(x, t_0_index)
            ).apply(lambda x: ';'.join(x.astype(np.str_)))

            t_end_comb = replicate_sep.apply(
                lambda x: check_repeated_constructs(x, t_end_index)#np.median(x[t_end_index])
            ).apply(lambda x: ';'.join(x.astype(np.str_)))

            counts_ref['T0_counts'] = t_0_comb
            counts_ref['T0_replicate_names'] = ';'.join(curr_conditions[0])
            counts_ref['TEnd_counts'] = t_end_comb
            counts_ref['TEnd_replicate_names'] = ';'.join(curr_conditions[1])


        # proceed to add the orientation

        unsorted_orientations = np.array(['_'.join([counts_ref["Gene 1"].iloc[i], counts_ref["Gene 2"].iloc[i]]) for i in range(counts_ref.shape[0])])
        sorted_orientations = np.array(['_'.join(sorted([counts_ref["Gene 1"].iloc[i], counts_ref["Gene 2"].iloc[i]])) for i in range(counts_ref.shape[0])])

        counts_ref['Gene Pair'] = sorted_orientations
        counts_ref['Gene Pair Orientation'] = 'A_B'
        counts_ref['Gene Pair Orientation'].loc[sorted_orientations != unsorted_orientations] = 'B_A'

        # rename the columns
        counts_ref = counts_ref.rename(columns = {'Gene Pair Orientation': 'gene_pair_orientation',
                                        'Target Type': 'target_type',
                                        'Gene Pair': 'gene_pair',
                                        'Study': 'study_origin',
                                        'Cell Line': 'cell_line_origin'})
    ################################# counts ref - DONE
    
    print('Sequence reference...')
    if sequence_ref is not None:
        for col in ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']:
            sequence_ref[col] = [i.upper() for i in sequence_ref[col]]
            
        # set the target names to control
        control_idx = np.array([True if str(sequence_ref['sgRNA_target_name'].iloc[i]) in study_controls else False for i in range(sequence_ref.shape[0])]) | np.array([True if str(sequence_ref['sgRNA_guide_name'].iloc[i]) in study_controls else False for i in range(sequence_ref.shape[0])])
        sequence_ref.loc[control_idx, 'sgRNA_target_name'] = 'CONTROL'    
        # add study origin as well
        sequence_ref['study_origin'] = [counts_ref['study_origin'].iloc[0]] * sequence_ref.shape[0]
    
    ################################# sequence ref - DONE
    
    print('Done! Returning...')
    return({'sequence_ref': sequence_ref,
            'counts_ref': counts_ref,
            'score_ref': score_ref})

# temp = sequence_ref.copy()

# study_controls = controls['najm_data']

db_inserts['sequence_ref']

if sequence_ref is not None:
    for col in ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']:
        sequence_ref[col] = [i.upper() for i in sequence_ref[col]]

control_idx = np.array([True if str(sequence_ref['sgRNA_target_name'].iloc[i]) in study_controls else False for i in range(sequence_ref.shape[0])]) | np.array([True if str(sequence_ref['sgRNA_guide_name'].iloc[i]) in study_controls else False for i in range(sequence_ref.shape[0])])
sequence_ref.loc[control_idx, 'sgRNA_target_name'] = 'CONTROL'  

study_controls

sequence_ref.head(50)

In [18]:
def insert_study_to_db(engine_link, db_inserts):

    # first, get the metadata
    db_metadata = sqlalchemy.MetaData(bind=engine_link)
    db_metadata.reflect(engine_link)

    # access the tables
    sequence_table = db_metadata.tables['CDKO_EXPERIMENT_DESIGN']
    counts_table = db_metadata.tables['CDKO_SGRNA_COUNTS']
    scores_table = db_metadata.tables['CDKO_ORIGINAL_SL_RESULTS']

    # then, start the session
    engine_session = sessionmaker(bind=engine_link)
    curr_session = engine_session()

    # get number of records in each table
    sequence_records_num = curr_session.query(sequence_table).count()
    counts_records_num = curr_session.query(counts_table).count()
    scores_records_num = curr_session.query(scores_table).count()

    # get available gene pairs
    available_gene_pairs = curr_session.query(sqlalchemy.func.max(counts_table.c.gene_pair_id)).first()[0]

    # for the first instance
    if available_gene_pairs is None:
        available_gene_pairs = -1

    # in the case of studies that doesn't have counts, but were added
    temp = curr_session.query(sqlalchemy.func.max(scores_table.c.gene_pair_id)).first()[0]

    if temp is not None:
        available_gene_pairs = max(available_gene_pairs, temp)

    # proceed to add the IDs to each table and reindex
    if db_inserts['sequence_ref'] is not None:
        sequence_insert = db_inserts['sequence_ref'].reset_index(drop=True)
    else:
        sequence_insert = None

    if db_inserts['counts_ref'] is not None:
        counts_insert = db_inserts['counts_ref'].reset_index(drop=True)
    else:
        counts_insert = None
    score_insert = db_inserts['score_ref'].reset_index(drop=True)

    # add the current number of records for proper insertion
    if sequence_insert is not None:
        sequence_insert.index += sequence_records_num
    if counts_insert is not None:
        counts_insert.index += counts_records_num
    score_insert.index += scores_records_num

    # set IDs
    if sequence_insert is not None:
        sequence_insert['sgRNA_id'] = sequence_insert.index
    if counts_insert is not None:
        counts_insert['sgRNA_pair_id'] = counts_insert.index
    score_insert['id'] = score_insert.index
    score_insert['gene_pair_id'] = score_insert['id'].copy()

    if (sequence_insert is not None) or (counts_insert is not None):
        for_merging = sequence_insert.copy()
        for_merging['ref_id'] = for_merging.index

        # # add the foreign keys
        counts_insert['FK_guide_1_id'] = counts_insert.merge(for_merging, how = 'left', left_on = 'Guide 1', right_on = 'sgRNA_guide_name')['ref_id'].values
        counts_insert['FK_guide_2_id'] = counts_insert.merge(for_merging, how = 'left', left_on = 'Guide 2', right_on = 'sgRNA_guide_name')['ref_id'].values

    if counts_insert is not None:
        for_merging = score_insert.copy()

        # in the case of multiple cell lines
        for_merging['gene_pair+cell_line+study_origin'] = for_merging['gene_pair'] + '+' + for_merging['cell_line_origin'] + '+' + for_merging['study_origin']
        counts_insert['gene_pair+cell_line+study_origin'] = counts_insert['gene_pair'] + '+' + counts_insert['cell_line_origin'] + '+' + counts_insert['study_origin']

        # set the gene pair id
        #counts_insert['gene_pair_id_all'] = np.NaN
        # dual have the gene pair id
        #counts_insert.loc[counts_insert['target_type'] == 'Dual', 'gene_pair_id_all'] = counts_insert.loc[counts_insert['target_type'] == 'Dual'].groupby(['gene_pair+cell_line+study_origin']).ngroup() + (available_gene_pairs + 1)
        counts_insert['gene_pair_id_all'] = counts_insert.groupby(['gene_pair+cell_line+study_origin']).ngroup() + (available_gene_pairs + 1)

        score_insert['gene_pair_id'] = for_merging.merge(counts_insert.drop_duplicates(subset = 'gene_pair+cell_line+study_origin'), how = 'left', left_on = 'gene_pair+cell_line+study_origin', right_on = 'gene_pair+cell_line+study_origin')['gene_pair_id_all'].values

    ## check if there is any NA in the references
    if (sequence_insert is not None) or (counts_insert is not None):
        for col in ['FK_guide_1_id', 'FK_guide_2_id']:
            if counts_insert[col].isna().sum() > 0:
                print('NA in foreign keys: ' + col)
    else:
        print('No counts and sequences together')

    print('Final QC...')
    # Final quality control
    if sequence_insert is not None:
        sequence_insert = sequence_insert.applymap(lambda x: x.strip() if isinstance(x, str) else x, na_action='ignore')
    if counts_insert is not None:
        counts_insert = counts_insert.applymap(lambda x: x.strip() if isinstance(x, str) else x, na_action='ignore')
    score_insert = score_insert.applymap(lambda x: x.strip() if isinstance(x, str) else x, na_action='ignore')

    # proceed to insert to the database

    # start the transaction, # insert only the columns we need
    with engine_link.begin() as transaction:
        # insert sequence
        print('Beginning transaction...')

        if sequence_insert is not None:
            sequence_insert = sequence_insert.loc[:,['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name', 'study_origin', 'sgRNA_id']]
            sequence_insert.to_sql(name = 'CDKO_EXPERIMENT_DESIGN', con = transaction, if_exists = 'append', index = False, index_label = 'sgRNA_id')

            print('Done sequence')

        # insert CDKO counts
        if counts_insert is not None:
            counts_insert = counts_insert.loc[:,['sgRNA_pair_id', 'FK_guide_1_id', 'FK_guide_2_id', 'gene_pair_id_all', 'gene_pair_orientation', 'T0_counts', 'T0_replicate_names', 'TEnd_counts', 'TEnd_replicate_names', 'target_type', 'study_origin', 'cell_line_origin']]

            counts_insert = counts_insert.rename(columns = {'FK_guide_1_id': 'guide_1_id',
                                                                   'FK_guide_2_id': 'guide_2_id',
                                                                   'gene_pair_id_all': 'gene_pair_id'})

            counts_insert.to_sql(name = 'CDKO_SGRNA_COUNTS', con = transaction, if_exists = 'append', index = False, index_label = 'sgRNA_pair_id')

            print('Done counts')

        # finally, insert scores
        score_insert = score_insert.loc[:, ['gene_1', 'gene_2', 'study_origin', 'cell_line_origin', 'SL_score', 'SL_score_cutoff', 'statistical_score', 'statistical_score_cutoff', 'gene_pair', 'SL_or_not', 'gene_pair_id', 'id']]
        score_insert.to_sql(name = 'CDKO_ORIGINAL_SL_RESULTS', con = transaction, if_exists = 'append', index = False, index_label = 'id')

        print('Done score')

        print('Successfully inserted!')

        print('Added Record stats...')
        if sequence_insert is not None:
            print(' '.join(['Sequence insert:', str(sequence_insert.shape[0])]))
        if counts_insert is not None:
            print(' '.join(['Counts insert:', str(counts_insert.shape[0])]))
        print(' '.join(['Score insert:', str(score_insert.shape[0])]))

    print('Done!')



guide_id, sequence_id
gene1, gene2(control), guide1, guide2, replicates 1::N (sep = ;), "single", conditions, cell_line
gene1, gene2, guide1, guide2, replicates 1::N (sep=;), "dual", conditions

## Diehl Data (HEK293T, RPE1)

In [19]:
diehl_loc = os.path.join(learning_goals_loc_general, "Diehl")

#GI is for RPE1
atg_flux_gRNA = pd.read_excel(os.path.join(diehl_loc, "Supplementary Table S5_ATG_Single_MPX_grna_readcounts.xlsx"), skiprows = 1)
atg_flux_gRNA_sequences = pd.ExcelFile(os.path.join(diehl_loc, "Supplementary Table S1_DNA_oligonucleotides.xlsx"))

### Seq References

In [20]:
sequence_ref = pd.DataFrame(columns = ["Guide_id", "Sequence"])

In [21]:
atg_flux_gRNA_sequences.parse("Sheet1", header = None).head(8)

Unnamed: 0,0,1,2,3
0,h7SK-1N,ccagcatagctcttaaacCAGTGATGATNACTGAGGCTCAGTTAga...,,h7SK-1N
1,h7SK-2N,ccagcatagctcttaaacCAGTGATGATNNCTGAGGCTCAGTTAga...,,h7SK-2N
2,h7SK-3N,ccagcatagctcttaaacCAGTGATGANNNCTGAGGCTCAGTTAga...,,h7SK-3N
3,h7SK-4N,ccagcatagctcttaaacCAGTGATGANNNNTGAGGCTCAGTTAga...,,h7SK-4N
4,hU6-1N,gctaTTTCtagctctaaaacCAGTGATGATNACTGAGGCTCgtttc...,,hU6-1N
5,hU6-2N,gctaTTTCtagctctaaaacCAGTGATGATNNCTGAGGCTCgtttc...,,hU6-2N
6,hU6-3N,gctaTTTCtagctctaaaacCAGTGATGANNNCTGAGGCTCgtttc...,,hU6-3N
7,hU6-4N,gctaTTTCtagctctaaaacCAGTGATGANNNNTGAGGCTCgtttc...,,hU6-4N


In [22]:
curr_seq = atg_flux_gRNA_sequences.parse("Sheet1", header = None).head(8)[[0, 1]]
curr_seq.columns = ["Guide_id", "Sequence"]
sequence_ref = pd.concat([sequence_ref, curr_seq])

In [23]:
atg_flux_gRNA_sequences.parse("3Cs GFP-mCherry library", header = 0)

Unnamed: 0,name,seuqence
0,GFP-mpx1#1,CCAGCATAGCTCTTAAACGTGCCCTGGCCCACCCTCGTCAGTTAGA...
1,GFP-mpx1#2,CCAGCATAGCTCTTAAACGGAGCGCACCATCTTCTTCACAGTTAGA...
2,GFP-mpx1#3,CCAGCATAGCTCTTAAACTGCCCTGGCCCACCCTCGTGCAGTTAGA...
3,GFP-mpx1#4,CCAGCATAGCTCTTAAACCGCCCTCGCCGGACACGCTGCAGTTAGA...
4,GFP-mpx1#5,CCAGCATAGCTCTTAAACGGCCCACCCTCGTGACCACCCAGTTAGA...
...,...,...
95,mCherry-mpx1#46,GCTATTTCTAGCTCTAAAACGAGGGCTTCAAGTGGGAGCGCGTTTC...
96,mCherry-mpx1#47,GCTATTTCTAGCTCTAAAACCCCAGCCCATGGTCTTCTTCCGTTTC...
97,mCherry-mpx1#48,GCTATTTCTAGCTCTAAAACCCGACATCCCCGACTACTTGCGTTTC...
98,mCherry-mpx1#49,GCTATTTCTAGCTCTAAAACACAACGTCAACATCAAGCTGCGTTTC...


In [24]:
curr_seq = atg_flux_gRNA_sequences.parse("3Cs GFP-mCherry library", header = 0)
curr_seq.columns = ["Guide_id", "Sequence"]
sequence_ref = pd.concat([sequence_ref, curr_seq])

In [25]:
atg_flux_gRNA_sequences.parse("3Cs distorted library", header = 0)[1:161]

Unnamed: 0,guide,oligo sequence
1,CCT4-KO-1,gctatttctagctctaaaacTAGTGGGACAATTGATGACTcgtttc...
2,CCT4-KO-2,gctatttctagctctaaaacCTATTTGATTATCCATCTGTcgtttc...
3,CCT4-KO-3,gctatttctagctctaaaacCATCAGTAGTCATCATTGCTcgtttc...
4,CCT4-KO-4,gctatttctagctctaaaacCAGGTCGAGACATGTCAGTCcgtttc...
5,IARS-KO-1,gctatttctagctctaaaacCTTTCCAACTTCGAGTCACAcgtttc...
...,...,...
156,ARNT-KO-4,ccagcatagctcttaaac TGGTCGCCGCTTAATAGCCC cagtt...
157,BBX-KO-1,ccagcatagctcttaaac TCTTGCGAGGGGTATCTCCA cagtt...
158,BBX-KO-2,ccagcatagctcttaaac CCATCGGATTCTTTTGCTTT cagtt...
159,BBX-KO-3,ccagcatagctcttaaac CATGGAATTATGATCATTGA cagtt...


In [26]:
curr_seq = atg_flux_gRNA_sequences.parse("3Cs distorted library", header = 0)[1:161]
curr_seq.columns = ["Guide_id", "Sequence"]
sequence_ref = pd.concat([sequence_ref, curr_seq])

In [27]:
atg_flux_gRNA_sequences.parse("3Cs distorted library", header = 0)[163:]

Unnamed: 0,guide,oligo sequence
163,Non-Human-Target-258-KO-1,gctatttctagctctaaaacCAACGGGTTCTCCCGGCTACcgtttc...
164,Non-Human-Target-754-KO-2,gctatttctagctctaaaacGTGCGATGTCGCTTCAACGTcgtttc...
165,Non-Human-Target-340-KO-3,gctatttctagctctaaaacCCCTGTGAAGGAGGCGTAAGcgtttc...
166,Non-Human-Target-196-KO-4,gctatttctagctctaaaacATATTTCGGCAGTTGCAGCAcgtttc...
167,Non-Human-Target-17-KO-5,gctatttctagctctaaaacAAATGCACAGATCGCTGATCcgtttc...
...,...,...
318,Non-Human-Target-220-KO-152,ccagcatagctcttaaac ATGACATTGCGCGTCTACGG cagtt...
319,Non-Human-Target-310-KO-153,ccagcatagctcttaaac CCAATGATAAGCCCGAACGG cagtt...
320,Non-Human-Target-360-KO-154,ccagcatagctcttaaac CCTCGGGCGTAAATACTCAT cagtt...
321,Non-Human-Target-727-KO-155,ccagcatagctcttaaac GTACACACTTATGCCATCAC cagtt...


In [28]:
curr_seq = atg_flux_gRNA_sequences.parse("3Cs distorted library", header = 0)[163:]
curr_seq.columns = ["Guide_id", "Sequence"]
sequence_ref = pd.concat([sequence_ref, curr_seq])

In [29]:
atg_flux_gRNA_sequences.parse("3Cs ExtendedAutophagy library", header = 0)

Unnamed: 0,name,gRNA,homology,homology.1,sum(h7SK:v2),length
0,AKT1-KO-1-R,acgtgaggctcccctcaaca,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaacacgtgaggctcccctcaacaCAGTTAga...,57
1,AKT1-KO-2-R,cgttggcgtactccatgaca,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaaccgttggcgtactccatgacaCAGTTAga...,57
2,AKT1-KO-3-R,ctgtcatcgaacgcaccttc,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaacctgtcatcgaacgcaccttcCAGTTAga...,57
3,AKT1-KO-4-R,cgttcttctccgagtgcagg,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaaccgttcttctccgagtgcaggCAGTTAga...,57
4,AKT3-KO-1-R,ttacctagtagtttcaaata,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaacttacctagtagtttcaaataCAGTTAga...,57
...,...,...,...,...,...,...
871,Non-Human-Target-941-KO-76-R,taacaccataaactttcaca,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaactaacaccataaactttcacaCAGTTAga...,57
872,Non-Human-Target-532-KO-77-R,ggtgacgccagcacgtcttc,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaacggtgacgccagcacgtcttcCAGTTAga...,57
873,Non-Human-Target-398-KO-78-R,ggtagccacataaacgggcg,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaacggtagccacataaacgggcgCAGTTAga...,57
874,Non-Human-Target-798-KO-79-R,cgcaaggacaatccagggta,CAGTTAgaggtacccaagc,ccagcatagctcttaaac,ccagcatagctcttaaaccgcaaggacaatccagggtaCAGTTAga...,57


In [30]:
curr_seq = atg_flux_gRNA_sequences.parse("3Cs ExtendedAutophagy library", header = 0)[["name", "gRNA"]]
curr_seq.columns = ["Guide_id", "Sequence"]
sequence_ref = pd.concat([sequence_ref, curr_seq])

In [31]:
atg_flux_gRNA_sequences.parse("3Cs CoreAutophagy library", header = 0)

Unnamed: 0,name,gRNAs,homology,homology.1,sum (U6:v1),length
0,AMBRA1-KO-1-R,ttctaggtatcaccgagaaa,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaacttctaggtatcaccgagaaaCgtttc...,59
1,AMBRA1-KO-2-R,tgagagatactggatcatcc,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaactgagagatactggatcatccCgtttc...,59
2,AMBRA1-KO-3-R,cttggcaggtccccagctcc,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaaccttggcaggtccccagctccCgtttc...,59
3,AMBRA1-KO-4-R,ccgtaatatagatattatgg,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaacccgtaatatagatattatggCgtttc...,59
4,ATG10-KO-1-R,acgttattgtgcagaattca,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaacacgttattgtgcagaattcaCgtttc...,59
...,...,...,...,...,...,...
277,Non-Human-Target-668-KO-22-R,ataaggtctttcggagtgcc,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaacataaggtctttcggagtgccCgtttc...,59
278,Non-Human-Target-953-KO-23-R,gtaatggcgtgctcacataa,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaacgtaatggcgtgctcacataaCgtttc...,59
279,Non-Human-Target-551-KO-24-R,tcttttatattctagaagtc,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaactcttttatattctagaagtcCgtttc...,59
280,Non-Human-Target-148-KO-25-R,tacggtcaatgttggcgcct,gctaTTTCtagctctaaaac,Cgtttcgtcctttccacaa,gctaTTTCtagctctaaaactacggtcaatgttggcgcctCgtttc...,59


In [32]:
curr_seq = atg_flux_gRNA_sequences.parse("3Cs CoreAutophagy library", header = 0)[["name", "gRNAs"]]
curr_seq.columns = ["Guide_id", "Sequence"]
sequence_ref = pd.concat([sequence_ref, curr_seq])

In [33]:
sequence_ref

Unnamed: 0,Guide_id,Sequence
0,h7SK-1N,ccagcatagctcttaaacCAGTGATGATNACTGAGGCTCAGTTAga...
1,h7SK-2N,ccagcatagctcttaaacCAGTGATGATNNCTGAGGCTCAGTTAga...
2,h7SK-3N,ccagcatagctcttaaacCAGTGATGANNNCTGAGGCTCAGTTAga...
3,h7SK-4N,ccagcatagctcttaaacCAGTGATGANNNNTGAGGCTCAGTTAga...
4,hU6-1N,gctaTTTCtagctctaaaacCAGTGATGATNACTGAGGCTCgtttc...
...,...,...
277,Non-Human-Target-668-KO-22-R,ataaggtctttcggagtgcc
278,Non-Human-Target-953-KO-23-R,gtaatggcgtgctcacataa
279,Non-Human-Target-551-KO-24-R,tcttttatattctagaagtc
280,Non-Human-Target-148-KO-25-R,tacggtcaatgttggcgcct


In [34]:
sgRNA_targets = np.array([i.split('-')[0] for i in sequence_ref['Guide_id']])

In [35]:
sgRNA_targets = sgRNA_targets.astype('<U14')

In [36]:
sgRNA_targets[sgRNA_targets == 'Non'] = 'Control'


In [37]:
sequence_ref['Guide_target'] = sgRNA_targets

In [38]:
sequence_ref['Guide_target'].value_counts()

Control    266
GFP         50
mCherry     50
KEAP1       12
TP53        12
          ... 
NEDD4        4
NEDD4L       4
hU6          4
PEX13        4
VMP1         4
Name: Guide_target, Length: 223, dtype: int64

In [39]:
sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']

In [40]:
sequence_ref = pd.concat([sequence_ref, pd.DataFrame({'sgRNA_guide_name' : ['wildtype-I-CeuI', 'wildtype-I-SceI'],
                     'sgRNA_guide_seq' : ["ctcTAACTATAACGGTCCTAAGGTAGCGAaacTCGCTACCTTAGGACCGTTATAGTTA", 'cacctagggataacagggtaatgttttagagctaGAAAtagcaagttaaaataaggctagtccgttatcaacttgaaaaagtggcaccgagtcggtgcTTTTTTggatccAAAAAAgcaccgactcggtgccactttttcaagttgataacggactagccttattttaacttgctaTTTCtagctctaaaacattaccctgttatcccta'],
                     'sgRNA_target_name' : ['Control', 'Control']})])

In [41]:
sequence_ref = sequence_ref.drop_duplicates(subset = ['sgRNA_guide_name'])

In [42]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,h7SK-1N,ccagcatagctcttaaacCAGTGATGATNACTGAGGCTCAGTTAga...,h7SK
1,h7SK-2N,ccagcatagctcttaaacCAGTGATGATNNCTGAGGCTCAGTTAga...,h7SK
2,h7SK-3N,ccagcatagctcttaaacCAGTGATGANNNCTGAGGCTCAGTTAga...,h7SK
3,h7SK-4N,ccagcatagctcttaaacCAGTGATGANNNNTGAGGCTCAGTTAga...,h7SK
4,hU6-1N,gctaTTTCtagctctaaaacCAGTGATGATNACTGAGGCTCgtttc...,hU6
...,...,...,...
279,Non-Human-Target-551-KO-24-R,tcttttatattctagaagtc,Control
280,Non-Human-Target-148-KO-25-R,tacggtcaatgttggcgcct,Control
281,Non-Human-Target-995-KO-26-R,caagctatgcgttgaccaaa,Control
0,wildtype-I-CeuI,ctcTAACTATAACGGTCCTAAGGTAGCGAaacTCGCTACCTTAGGA...,Control


## Counts
### Single sgRNAs

In [43]:
atg_flux_gRNA

Unnamed: 0,Guide,Gene,library,n1,n2,Unnamed: 5,Guide1,Guide2,Gene_1,Gene_2,library.1,ProlifCTRL_1,ProlifCTRL_2,n1.1,n2.1,n3,Highgate_1,Highgate_2,Highgate_3
0,AKT1-KO-1-R,AKT1,900.0,99.0,34.0,,AKT1-KO-1-R,AMBRA1-KO-1-R,AKT1,AMBRA1,94,906,869,6,195,0,0,2,0
1,AKT1-KO-2-R,AKT1,1223.0,54.0,44.0,,AKT1-KO-1-R,AMBRA1-KO-2-R,AKT1,AMBRA1,101,1140,1069,3,5,0,0,0,0
2,AKT1-KO-3-R,AKT1,788.0,46.0,66.0,,AKT1-KO-1-R,AMBRA1-KO-3-R,AKT1,AMBRA1,90,703,583,14,13,19,0,1,0
3,AKT1-KO-4-R,AKT1,1221.0,24.0,57.0,,AKT1-KO-1-R,AMBRA1-KO-4-R,AKT1,AMBRA1,59,698,656,25,19,21,0,4,0
4,AKT3-KO-1-R,AKT3,471.0,18.0,5.0,,AKT1-KO-1-R,ATG10-KO-1-R,AKT1,ATG10,13,721,660,161,75,21,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248186,,,,,,,wildtype-I-CeuI,ZKSCAN3-KO-1-R,wildtype-I-CeuI,ZKSCAN3,917,6290,6245,133,527,33,4,9,0
248187,,,,,,,wildtype-I-CeuI,ZKSCAN3-KO-2-R,wildtype-I-CeuI,ZKSCAN3,3271,4317,4129,268,237,15,1,16,0
248188,,,,,,,wildtype-I-CeuI,ZKSCAN3-KO-3-R,wildtype-I-CeuI,ZKSCAN3,4500,8497,8366,677,194,31,19,17,0
248189,,,,,,,wildtype-I-CeuI,ZKSCAN3-KO-4-R,wildtype-I-CeuI,ZKSCAN3,3800,3505,3541,62,124,9,3,2,0


In [44]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [45]:
atg_flux_gRNA.iloc[:, 0:5].dropna()

Unnamed: 0,Guide,Gene,library,n1,n2
0,AKT1-KO-1-R,AKT1,900.0,99.0,34.0
1,AKT1-KO-2-R,AKT1,1223.0,54.0,44.0
2,AKT1-KO-3-R,AKT1,788.0,46.0,66.0
3,AKT1-KO-4-R,AKT1,1221.0,24.0,57.0
4,AKT3-KO-1-R,AKT3,471.0,18.0,5.0
...,...,...,...,...,...
872,ZKSCAN3-KO-1-R,ZKSCAN3,725.0,5.0,6.0
873,ZKSCAN3-KO-2-R,ZKSCAN3,1001.0,33.0,46.0
874,ZKSCAN3-KO-3-R,ZKSCAN3,1808.0,27.0,20.0
875,ZKSCAN3-KO-4-R,ZKSCAN3,340.0,5.0,4.0


In [46]:
curr_sgRNA = atg_flux_gRNA.iloc[:, 0:5].dropna()
curr_sgRNA = curr_sgRNA.drop("library", axis = 1)

In [47]:
curr_sgRNA['Count Replicates'] = curr_sgRNA[["n1", "n2"]].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)
curr_sgRNA = curr_sgRNA.drop(["n1", "n2"], axis = 1)
curr_sgRNA.columns = ["Guide 1", "Gene 1", "Count Replicates"]
curr_sgRNA["Type"] = ["Single"] * curr_sgRNA.shape[0]
curr_sgRNA["Sequencing"] = ["sgRNA"] * curr_sgRNA.shape[0]
curr_sgRNA["Condition"] = ["rep1;rep2"] * curr_sgRNA.shape[0]

In [48]:
#skip singles
#sequence_ref = pd.concat([sequence_ref, curr_sgRNA])

### Dual sgRNAs

In [49]:
curr_sgRNA = atg_flux_gRNA.iloc[:, 6:]
curr_sgRNA = curr_sgRNA.drop(["library.1", "Highgate_1", "Highgate_2", "Highgate_3"], axis = 1)#"ProlifCTRL_1", "ProlifCTRL_2"

In [50]:
curr_sgRNA

Unnamed: 0,Guide1,Guide2,Gene_1,Gene_2,ProlifCTRL_1,ProlifCTRL_2,n1.1,n2.1,n3
0,AKT1-KO-1-R,AMBRA1-KO-1-R,AKT1,AMBRA1,906,869,6,195,0
1,AKT1-KO-1-R,AMBRA1-KO-2-R,AKT1,AMBRA1,1140,1069,3,5,0
2,AKT1-KO-1-R,AMBRA1-KO-3-R,AKT1,AMBRA1,703,583,14,13,19
3,AKT1-KO-1-R,AMBRA1-KO-4-R,AKT1,AMBRA1,698,656,25,19,21
4,AKT1-KO-1-R,ATG10-KO-1-R,AKT1,ATG10,721,660,161,75,21
...,...,...,...,...,...,...,...,...,...
248186,wildtype-I-CeuI,ZKSCAN3-KO-1-R,wildtype-I-CeuI,ZKSCAN3,6290,6245,133,527,33
248187,wildtype-I-CeuI,ZKSCAN3-KO-2-R,wildtype-I-CeuI,ZKSCAN3,4317,4129,268,237,15
248188,wildtype-I-CeuI,ZKSCAN3-KO-3-R,wildtype-I-CeuI,ZKSCAN3,8497,8366,677,194,31
248189,wildtype-I-CeuI,ZKSCAN3-KO-4-R,wildtype-I-CeuI,ZKSCAN3,3505,3541,62,124,9


In [51]:
curr_sgRNA['Count Replicates'] = curr_sgRNA[["ProlifCTRL_1", "ProlifCTRL_2", "n1.1", "n2.1", "n3"]].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)
curr_sgRNA = curr_sgRNA.drop(["ProlifCTRL_1", "ProlifCTRL_2", "n1.1", "n2.1", "n3"], axis = 1)

curr_sgRNA.columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates"]
curr_sgRNA["Type"] = ["Dual"] * curr_sgRNA.shape[0]
curr_sgRNA["Sequencing"] = ["sgRNA"] * curr_sgRNA.shape[0]
curr_sgRNA["Condition"] = ["ctrl_1;ctrl_2;rep_1;rep_2;rep_3"] * curr_sgRNA.shape[0]

In [52]:
curr_sgRNA

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Condition
0,AKT1-KO-1-R,AMBRA1-KO-1-R,AKT1,AMBRA1,906;869;6;195;0,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3
1,AKT1-KO-1-R,AMBRA1-KO-2-R,AKT1,AMBRA1,1140;1069;3;5;0,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3
2,AKT1-KO-1-R,AMBRA1-KO-3-R,AKT1,AMBRA1,703;583;14;13;19,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3
3,AKT1-KO-1-R,AMBRA1-KO-4-R,AKT1,AMBRA1,698;656;25;19;21,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3
4,AKT1-KO-1-R,ATG10-KO-1-R,AKT1,ATG10,721;660;161;75;21,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3
...,...,...,...,...,...,...,...,...
248186,wildtype-I-CeuI,ZKSCAN3-KO-1-R,wildtype-I-CeuI,ZKSCAN3,6290;6245;133;527;33,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3
248187,wildtype-I-CeuI,ZKSCAN3-KO-2-R,wildtype-I-CeuI,ZKSCAN3,4317;4129;268;237;15,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3
248188,wildtype-I-CeuI,ZKSCAN3-KO-3-R,wildtype-I-CeuI,ZKSCAN3,8497;8366;677;194;31,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3
248189,wildtype-I-CeuI,ZKSCAN3-KO-4-R,wildtype-I-CeuI,ZKSCAN3,3505;3541;62;124;9,Dual,sgRNA,ctrl_1;ctrl_2;rep_1;rep_2;rep_3


In [53]:
counts_ref = pd.concat([counts_ref, curr_sgRNA])
counts_ref["Cell Line"] = ["RPE1"] * counts_ref.shape[0]
counts_ref["Study"] = [study_name_to_pubmed_id['diehl_data']] * counts_ref.shape[0]

In [54]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,AKT1-KO-1-R,AMBRA1-KO-1-R,AKT1,AMBRA1,906;869;6;195;0,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
1,AKT1-KO-1-R,AMBRA1-KO-2-R,AKT1,AMBRA1,1140;1069;3;5;0,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
2,AKT1-KO-1-R,AMBRA1-KO-3-R,AKT1,AMBRA1,703;583;14;13;19,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
3,AKT1-KO-1-R,AMBRA1-KO-4-R,AKT1,AMBRA1,698;656;25;19;21,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
4,AKT1-KO-1-R,ATG10-KO-1-R,AKT1,ATG10,721;660;161;75;21,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
...,...,...,...,...,...,...,...,...,...,...
248186,wildtype-I-CeuI,ZKSCAN3-KO-1-R,wildtype-I-CeuI,ZKSCAN3,6290;6245;133;527;33,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
248187,wildtype-I-CeuI,ZKSCAN3-KO-2-R,wildtype-I-CeuI,ZKSCAN3,4317;4129;268;237;15,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
248188,wildtype-I-CeuI,ZKSCAN3-KO-3-R,wildtype-I-CeuI,ZKSCAN3,8497;8366;677;194;31,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
248189,wildtype-I-CeuI,ZKSCAN3-KO-4-R,wildtype-I-CeuI,ZKSCAN3,3505;3541;62;124;9,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155


## SL Scores

In [55]:
diehl_loc = os.path.join(learning_goals_loc_general, "Diehl")

#GI is for RPE1
atg_flux_GI = pd.read_excel(os.path.join(diehl_loc, "Supplementary Table S6_ATG_GI_models_genes.xlsx"))

In [56]:
# RPE1
# negative LFC are depleted, -1 cutoff
atg_flux_GI

Unnamed: 0,Gene1,Gene_2,prolif_ctrl_avg_norm,post_facs_avg_norm,lfc,prolif_ctrl_avg_norm_h7SK,post_facs_avg_norm_h7SK,lfc_h7SK,prolif_ctrl_avg_norm_U6,post_facs_avg_norm_U6,...,max_model,max_dLFC,sum_model,sum_dLFC,min_model,min_dLFC,mult_model,mult_dLFC,log_model,log_dLFC
0,AKT1,AMBRA1,49.509296,49.133852,-0.010982,66.726643,10.566977,-2.658700,40.667392,34.655258,...,-0.195852,0.184870,-2.854552,2.843570,-2.658700,2.647718,0.520711,-0.531693,0.146446,-0.157428
1,AKT3,AMBRA1,15.129326,67.584450,2.159344,20.007618,8.004394,-1.483954,40.667392,34.655258,...,-0.195852,2.355195,-1.679806,3.839149,-1.483954,3.643298,0.290635,1.868709,0.113113,2.046230
2,AMBRA1,AMBRA1,35.618412,84.113111,1.239708,36.815835,23.038215,-0.731591,40.667392,34.655258,...,-0.195852,1.435559,-0.927443,2.167151,-0.731591,1.971299,0.143283,1.096424,0.071066,1.168641
3,ARF6,AMBRA1,31.232640,21.116277,-0.564699,44.586703,5.954327,-2.645284,40.667392,34.655258,...,-0.195852,-0.368847,-2.841135,2.276436,-2.645284,2.080585,0.518083,-1.082782,0.146202,-0.710901
4,ATF4,AMBRA1,30.673669,61.306122,0.999030,43.737172,10.396138,-1.827212,40.667392,34.655258,...,-0.195852,1.194882,-2.023064,3.022094,-1.827212,2.826242,0.357863,0.641168,0.125875,0.873156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25475,WIPI1,wildtype-I-SceI,5.690665,2.879228,-0.982915,34.241181,6.979361,-2.134569,,,...,-2.134569,1.151654,,,-2.134569,1.151654,,,,
25476,WIPI2,wildtype-I-SceI,11.798954,2.708389,-2.123152,68.029605,15.179626,-2.108286,,,...,-2.108286,-0.014867,,,-2.108286,-0.014867,,,,
25477,ZFYVE1,wildtype-I-SceI,5.002701,1.683355,-1.571367,27.293784,5.270972,-2.183353,,,...,-2.183353,0.611985,,,-2.183353,0.611985,,,,
25478,ZKSCAN3,wildtype-I-SceI,11.590480,3.562583,-1.701945,68.316257,9.883621,-2.791084,,,...,-2.791084,1.089139,,,-2.791084,1.089139,,,,


In [57]:
atg_flux_GI.columns

Index(['Gene1', 'Gene_2', 'prolif_ctrl_avg_norm', 'post_facs_avg_norm', 'lfc',
       'prolif_ctrl_avg_norm_h7SK', 'post_facs_avg_norm_h7SK', 'lfc_h7SK',
       'prolif_ctrl_avg_norm_U6', 'post_facs_avg_norm_U6', 'lfc_U6',
       'max_model', 'max_dLFC', 'sum_model', 'sum_dLFC', 'min_model',
       'min_dLFC', 'mult_model', 'mult_dLFC', 'log_model', 'log_dLFC'],
      dtype='object')

In [58]:
atg_flux_GI = atg_flux_GI.fillna(0)

mult_dLFC is the best according to Defining Gene Expression. by R Mani

In [59]:
curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
curr_GI["Gene_A"] = atg_flux_GI.loc[:, 'Gene1']
curr_GI["Gene_B"] = atg_flux_GI.loc[:, 'Gene_2']
curr_GI["GI_Score"] = atg_flux_GI.loc[:, 'lfc']
curr_GI["Study_Source"] = [study_name_to_pubmed_id['diehl_data']] * atg_flux_GI.shape[0]
curr_GI["Cell_Line"] = ["RPE1"] * atg_flux_GI.shape[0]
curr_GI = curr_GI.fillna(0)
curr_GI["GI_Cutoff"] = [-1] * atg_flux_GI.shape[0]
curr_GI["Stat_Score"] = [float("nan")] * atg_flux_GI.shape[0]
curr_GI["Stat_Cutoff"] = [float("nan")] * atg_flux_GI.shape[0]

In [60]:
atg_flux_GI.loc[:, 'lfc']

0       -0.010982
1        2.159344
2        1.239708
3       -0.564699
4        0.999030
           ...   
25475   -0.982915
25476   -2.123152
25477   -1.571367
25478   -1.701945
25479   -0.141042
Name: lfc, Length: 25480, dtype: float64

In [61]:
sorted_genes = []
for i in range(curr_GI.shape[0]):
    gene_A = curr_GI["Gene_A"].iloc[i]
    gene_B = curr_GI["Gene_B"].iloc[i]
    sorted_genes.append('_'.join(sorted([gene_A, gene_B])))
curr_GI["Sorted Genes"] = sorted_genes

In [62]:
curr_GI

Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,GI_Score,GI_Cutoff,Stat_Score,Stat_Cutoff,Sorted Genes
0,AKT1,AMBRA1,33956155,RPE1,-0.010982,-1,,,AKT1_AMBRA1
1,AKT3,AMBRA1,33956155,RPE1,2.159344,-1,,,AKT3_AMBRA1
2,AMBRA1,AMBRA1,33956155,RPE1,1.239708,-1,,,AMBRA1_AMBRA1
3,ARF6,AMBRA1,33956155,RPE1,-0.564699,-1,,,AMBRA1_ARF6
4,ATF4,AMBRA1,33956155,RPE1,0.999030,-1,,,AMBRA1_ATF4
...,...,...,...,...,...,...,...,...,...
25475,WIPI1,wildtype-I-SceI,33956155,RPE1,-0.982915,-1,,,WIPI1_wildtype-I-SceI
25476,WIPI2,wildtype-I-SceI,33956155,RPE1,-2.123152,-1,,,WIPI2_wildtype-I-SceI
25477,ZFYVE1,wildtype-I-SceI,33956155,RPE1,-1.571367,-1,,,ZFYVE1_wildtype-I-SceI
25478,ZKSCAN3,wildtype-I-SceI,33956155,RPE1,-1.701945,-1,,,ZKSCAN3_wildtype-I-SceI


In [63]:
# average over duplicate SLs
res = curr_GI.groupby('Sorted Genes', as_index = False)['GI_Score'].apply(lambda x: np.mean(x))

In [64]:
curr_GI = pd.merge(curr_GI, res, on=['Sorted Genes'], how='inner')

In [65]:
curr_GI = curr_GI.drop_duplicates(subset = ['Sorted Genes'])
curr_GI['GI_Score'] = curr_GI['GI_Score_y']

In [66]:
curr_GI


Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,GI_Score_x,GI_Cutoff,Stat_Score,Stat_Cutoff,Sorted Genes,GI_Score_y,GI_Score
0,AKT1,AMBRA1,33956155,RPE1,-0.010982,-1,,,AKT1_AMBRA1,-0.010982,-0.010982
1,AKT3,AMBRA1,33956155,RPE1,2.159344,-1,,,AKT3_AMBRA1,2.159344,2.159344
2,AMBRA1,AMBRA1,33956155,RPE1,1.239708,-1,,,AMBRA1_AMBRA1,1.239708,1.239708
3,ARF6,AMBRA1,33956155,RPE1,-0.564699,-1,,,AMBRA1_ARF6,-0.564699,-0.564699
4,ATF4,AMBRA1,33956155,RPE1,0.999030,-1,,,AMBRA1_ATF4,0.999030,0.999030
...,...,...,...,...,...,...,...,...,...,...,...
25475,WIPI1,wildtype-I-SceI,33956155,RPE1,-0.982915,-1,,,WIPI1_wildtype-I-SceI,-0.982915,-0.982915
25476,WIPI2,wildtype-I-SceI,33956155,RPE1,-2.123152,-1,,,WIPI2_wildtype-I-SceI,-2.123152,-2.123152
25477,ZFYVE1,wildtype-I-SceI,33956155,RPE1,-1.571367,-1,,,ZFYVE1_wildtype-I-SceI,-1.571367,-1.571367
25478,ZKSCAN3,wildtype-I-SceI,33956155,RPE1,-1.701945,-1,,,ZKSCAN3_wildtype-I-SceI,-1.701945,-1.701945


In [67]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,AKT1-KO-1-R,AMBRA1-KO-1-R,AKT1,AMBRA1,906;869;6;195;0,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
1,AKT1-KO-1-R,AMBRA1-KO-2-R,AKT1,AMBRA1,1140;1069;3;5;0,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
2,AKT1-KO-1-R,AMBRA1-KO-3-R,AKT1,AMBRA1,703;583;14;13;19,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
3,AKT1-KO-1-R,AMBRA1-KO-4-R,AKT1,AMBRA1,698;656;25;19;21,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
4,AKT1-KO-1-R,ATG10-KO-1-R,AKT1,ATG10,721;660;161;75;21,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
...,...,...,...,...,...,...,...,...,...,...
248186,wildtype-I-CeuI,ZKSCAN3-KO-1-R,wildtype-I-CeuI,ZKSCAN3,6290;6245;133;527;33,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
248187,wildtype-I-CeuI,ZKSCAN3-KO-2-R,wildtype-I-CeuI,ZKSCAN3,4317;4129;268;237;15,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
248188,wildtype-I-CeuI,ZKSCAN3-KO-3-R,wildtype-I-CeuI,ZKSCAN3,8497;8366;677;194;31,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155
248189,wildtype-I-CeuI,ZKSCAN3-KO-4-R,wildtype-I-CeuI,ZKSCAN3,3505;3541;62;124;9,Dual,sgRNA,RPE1,ctrl_1;ctrl_2;rep_1;rep_2;rep_3,33956155


In [68]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = curr_GI.copy(), study_controls = controls['diehl_data'], study_conditions = study_conditions['diehl_data'])

Starting processing...
Score reference...
Controls within SL score that are removed: 
12615
---
Only GI cutoff is present...
Counts reference...
Number of double pairs: 202768
Number of controls: 2187
Number of singles: 43236
Sequence reference...
Done! Returning...


In [69]:
#PREV_REF.loc[PREV_REF['Study_Source'] == 'Diehl',:]

In [70]:
#PREV_REF.loc[(PREV_REF['Study_Source'] == 'Diehl') & (PREV_REF['Gene_A'] == PREV_REF['Gene_B']),:]

In [71]:
db_inserts['score_ref']['SL_or_not'].value_counts()

True     6776
False    3944
Name: SL_or_not, dtype: int64

In [72]:
#PREV_REF.loc[(PREV_REF['Study_Source'] == 'Diehl') & (PREV_REF['Gene Pair'] == 'ATG2B_MTOR'),:]

In [73]:
#ATG2B_MTOR

In [74]:
#set(PREV_REF.loc[PREV_REF['Study_Source'] == 'Diehl','Gene Pair']).difference(set(db_inserts['score_ref']['gene_pair']))

In [75]:
#db_inserts['score_ref']['gene_pair']

In [76]:
#(db_inserts['score_ref']['GI_Score_x'] < -1).sum()

In [77]:
#(atg_flux_GI.loc[:, 'lfc'] < -1).sum()

In [78]:
#db_inserts['score_ref']['SL_or_not'].value_counts()

In [79]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 1176
Counts insert: 248191
Score insert: 10720
Done!


## Han Data (K562) No counts
No counts are available

In [80]:
han_loc = os.path.join(learning_goals_loc_general, "Han")

ricin_CDKO = pd.read_excel(os.path.join(han_loc, "7 - GI scores of Ricin-CDKO screen.xlsx"))
drug_CDKO = pd.read_excel(os.path.join(han_loc, "4 - GI scores of DrugTarget-CDKO screen.xlsx"))

In [81]:
ricin_CDKO

Unnamed: 0,Gene_Pair,GeneA ρ Phenotype,GeneB ρ Phenotype,GeneAB Observed\nρ Phenotype,GeneAB RawGI,GeneAB NormGI,GeneAB GIT score,GeneAB GIM score,Correlation of\nGI profiles\n(GIT score),Correlation of\nGI profiles\n(GIM score),q-values based on GIM score
0,B4GALT1__B4GALT3,4.574319,1.231391,9.000142,-3.328240,-4.150589,-23.494356,-24.621381,-0.389358,-0.383814,1.960799e-22
1,SLC39A9__TRAPPC13,-5.004521,10.661590,8.043506,-2.838739,-3.546919,-17.692541,-24.275939,0.028368,0.072512,2.895902e-22
2,SLC39A9__TRAPPC2L,-5.004521,8.633068,5.033394,-1.581442,-2.062448,-11.040980,-18.488956,-0.051350,0.035278,6.649633e-17
3,MAN1B1__MOGS,3.992183,2.377593,7.415340,-1.291890,-1.679146,-9.784799,-16.660538,-0.176978,-0.206039,2.107934e-15
4,MAN2A1__MOGS,2.566985,2.377593,5.997311,-1.311117,-1.729301,-9.283981,-17.769419,-0.326060,-0.324798,2.535316e-16
...,...,...,...,...,...,...,...,...,...,...,...
3076,SAR1A__SAR1A,6.586146,6.586146,10.018327,2.097951,2.303398,10.144232,12.393864,,,1.576634e-11
3077,EED__TBL1XR1,5.256384,3.905370,7.379280,1.349383,1.735638,10.256917,18.996254,0.340102,0.349537,2.757013e-17
3078,SEC31A__SEC31A,4.740903,4.740903,7.211232,2.019320,2.852353,10.770399,12.522567,,,1.230882e-11
3079,SEC24C__SLC39A9,6.633404,-5.004521,-0.143315,1.874443,1.980267,11.239310,16.727586,-0.061064,-0.076536,1.919277e-15


In [82]:
drug_CDKO

Unnamed: 0,Gene_Pair,GeneA γ Phenotype,GeneB γ Phenotype,GeneAB Observed\nγ Phenotype,GeneAB RawGI,GeneAB NormGI,GeneAB GIT score,GeneAB GIM score,Correlation of\nGI profiles\n(GIT score),Correlation of\nGI profiles\n(GIM score),q-values based on GIM score
0,AKT1__AKT2,-1.613746,-3.475764,-6.505799,-1.303365,-1.851698,-10.002585,-14.394872,-0.220295,-0.363181,3.230000e-11
1,PIM1__PIM2,-3.018919,-2.275498,-6.978705,-1.643037,-2.228615,-9.855829,-14.423427,-0.165029,-0.315924,6.050000e-11
2,BCL2L1__MCL1,-5.370227,-2.488718,-10.066704,-1.970779,-2.274515,-8.359275,-10.374774,-0.010717,-0.052520,9.670000e-08
3,TK1__TK1,-1.890373,-1.890373,-5.974678,-2.383823,-3.588639,-8.116117,-1.408927,,,1.622756e-01
4,ALAD__GPI,-4.894231,-3.285434,-9.437143,-1.432968,-1.737535,-7.047904,-11.272882,-0.108534,-0.249004,2.140000e-08
...,...,...,...,...,...,...,...,...,...,...,...
20985,PIK3CA__PIK3CA,-2.546795,-2.546795,-3.802928,1.609335,2.240952,7.348141,7.094248,,,7.590000e-05
20986,LCK__LCK,-2.887190,-2.887190,-3.557350,1.994927,2.784584,8.181569,6.979700,,,8.850000e-05
20987,NPY4R__NPY4R,-2.263467,-2.263467,-2.580027,2.297937,3.294392,8.476707,5.956021,,,5.220400e-04
20988,AKT2__AKT2,-3.475764,-3.475764,-5.327819,2.178670,2.464910,8.742763,8.521079,,,4.390000e-06


In [83]:
drug_CDKO.loc[:, 'q-values based on GIM score']

0        3.230000e-11
1        6.050000e-11
2        9.670000e-08
3        1.622756e-01
4        2.140000e-08
             ...     
20985    7.590000e-05
20986    8.850000e-05
20987    5.220400e-04
20988    4.390000e-06
20989    2.440000e-08
Name: q-values based on GIM score, Length: 20990, dtype: float64

In [84]:
drug_CDKO.loc[:, ['GeneA γ Phenotype', 'GeneB γ Phenotype', 'GeneAB Observed\nγ Phenotype']].astype('str').agg('|'.join, axis=1)

0        -1.613745903|-3.475764088|-6.505798868
1        -3.018919429|-2.275498445|-6.978705237
2        -5.370226669|-2.488718326|-10.06670369
3          -1.89037342|-1.89037342|-5.974678125
4        -4.894230657|-3.285433863|-9.437142687
                          ...                  
20985    -2.546794911|-2.546794911|-3.802928425
20986      -2.88719015|-2.88719015|-3.557350168
20987    -2.263466552|-2.263466552|-2.580027035
20988    -3.475764088|-3.475764088|-5.327819195
20989    -3.289454989|-3.289454989|-4.626701805
Length: 20990, dtype: object

In [85]:
gene_A_list, gene_B_list = [], []
for i in drug_CDKO['Gene_Pair']:
    split_ab = i.split('__')
    gene_A_list.append(split_ab[0])
    gene_B_list.append(split_ab[1])

In [86]:
curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "Phenotype", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
curr_GI["Gene_A"] = gene_A_list
curr_GI["Gene_B"] = gene_B_list
curr_GI["GI_Score"] = drug_CDKO.loc[:, ['GeneAB GIT score', 'GeneAB GIM score']].mean(axis = 1)
curr_GI["Study_Source"] = [study_name_to_pubmed_id['han_data']] * drug_CDKO.shape[0]
curr_GI["Cell_Line"] = ["K562"] * drug_CDKO.shape[0]
curr_GI = curr_GI.fillna(0)
curr_GI["Phenotype"] = drug_CDKO.loc[:, ['GeneA γ Phenotype', 'GeneB γ Phenotype', 'GeneAB Observed\nγ Phenotype']].astype('str').agg('|'.join, axis=1)
curr_GI["GI_Cutoff"] = [-4] * drug_CDKO.shape[0]
curr_GI["Stat_Score"] = drug_CDKO.loc[:, 'q-values based on GIM score']
curr_GI["Stat_Cutoff"] = [float("nan")] * drug_CDKO.shape[0]

In [87]:
drug_CDKO.loc[:, ['GeneAB GIT score', 'GeneAB GIM score']].mean(axis = 1)

0       -12.198728
1       -12.139628
2        -9.367025
3        -4.762522
4        -9.160393
           ...    
20985     7.221194
20986     7.580635
20987     7.216364
20988     8.631921
20989    10.014652
Length: 20990, dtype: float64

In [88]:
drug_CDKO

Unnamed: 0,Gene_Pair,GeneA γ Phenotype,GeneB γ Phenotype,GeneAB Observed\nγ Phenotype,GeneAB RawGI,GeneAB NormGI,GeneAB GIT score,GeneAB GIM score,Correlation of\nGI profiles\n(GIT score),Correlation of\nGI profiles\n(GIM score),q-values based on GIM score
0,AKT1__AKT2,-1.613746,-3.475764,-6.505799,-1.303365,-1.851698,-10.002585,-14.394872,-0.220295,-0.363181,3.230000e-11
1,PIM1__PIM2,-3.018919,-2.275498,-6.978705,-1.643037,-2.228615,-9.855829,-14.423427,-0.165029,-0.315924,6.050000e-11
2,BCL2L1__MCL1,-5.370227,-2.488718,-10.066704,-1.970779,-2.274515,-8.359275,-10.374774,-0.010717,-0.052520,9.670000e-08
3,TK1__TK1,-1.890373,-1.890373,-5.974678,-2.383823,-3.588639,-8.116117,-1.408927,,,1.622756e-01
4,ALAD__GPI,-4.894231,-3.285434,-9.437143,-1.432968,-1.737535,-7.047904,-11.272882,-0.108534,-0.249004,2.140000e-08
...,...,...,...,...,...,...,...,...,...,...,...
20985,PIK3CA__PIK3CA,-2.546795,-2.546795,-3.802928,1.609335,2.240952,7.348141,7.094248,,,7.590000e-05
20986,LCK__LCK,-2.887190,-2.887190,-3.557350,1.994927,2.784584,8.181569,6.979700,,,8.850000e-05
20987,NPY4R__NPY4R,-2.263467,-2.263467,-2.580027,2.297937,3.294392,8.476707,5.956021,,,5.220400e-04
20988,AKT2__AKT2,-3.475764,-3.475764,-5.327819,2.178670,2.464910,8.742763,8.521079,,,4.390000e-06


In [89]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = None, counts_ref = None, score_ref = curr_GI.copy(), study_controls = None, study_conditions = None)

Starting processing...
Score reference...
Only GI cutoff is present...
Counts reference...
Sequence reference...
Done! Returning...


In [90]:
# #HAN specific, only top 30 instead of the higher number (40)
# db_inserts['score_ref'].sort_values('SL_score', inplace = True)
# db_inserts['score_ref']['SL_or_not'] = [False] * db_inserts['score_ref'].shape[0]
# #db_inserts['score_ref']['SL_or_not'][:31] = [True] * 31
# db_inserts['score_ref'].sort_values('gene_pair', inplace = True)

In [91]:
#db_inserts['score_ref']['SL_or_not'].value_counts()

In [92]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

No counts and sequences together
Final QC...
Beginning transaction...
Done score
Successfully inserted!
Added Record stats...
Score insert: 20791
Done!


## Horlbeck Data

In [93]:
horlbeck_loc = os.path.join(learning_goals_loc_general, "Horlbeck")


sgRNA_ref = pd.read_excel(os.path.join(horlbeck_loc, "2 - Genes Included in the GI Map with Localization, Function, Neighboring Gene Annotation, and sgRNA Information.xlsx"), sheet_name = "Table S2b sgRNA sequences")
sgRNA_level_GI = pd.read_excel(os.path.join(horlbeck_loc, "5 - sgRNA-Level and Gene-Level GI Scores and Correlations; GO Terms Used and Clusters Identified in K562, Jurkat, and Combined GI Maps.xlsx"))
sgRNA_counts = pd.read_csv(os.path.join(horlbeck_loc, "3 - sgRNA Pair Read Counts Based on Barcode, sgRNA, and Triple Sequencing.txt"), sep = "\t")
sgRNA_phenotypes = pd.read_csv(os.path.join(horlbeck_loc, "4 - sgRNA Pair Phenotypes.txt"), sep = "\t")



  sgRNA_counts = pd.read_csv(os.path.join(horlbeck_loc, "3 - sgRNA Pair Read Counts Based on Barcode, sgRNA, and Triple Sequencing.txt"), sep = "\t")
  sgRNA_phenotypes = pd.read_csv(os.path.join(horlbeck_loc, "4 - sgRNA Pair Phenotypes.txt"), sep = "\t")


### Seq Ref

In [94]:
sequence_ref = pd.DataFrame(columns = ["Guide_ID", "Sequence"])

In [95]:
sgRNA_ref['Sequence'] = sgRNA_ref['sgRNA sequence']

# sgRNA_ref[["Upstream Barcode", "sgRNA sequence", "Downstream Barcode"]].apply(    
#     lambda x: ''.join(x.astype(str)),
#     axis=1
# )

In [96]:
sgRNA_counts

Unnamed: 0.1,Unnamed: 0,K562,K562.1,K562.2,K562.3,K562.4,K562.5,K562.6,K562.7,Jurkat,...,Jurkat.2,Jurkat.3,Jurkat.4,Jurkat.5,Jurkat.6,Jurkat.7,Jurkat.8,Jurkat.9,Jurkat.10,Jurkat.11
0,,barcode,barcode,sgRNA,sgRNA,tripleseq,tripleseq,barcode,barcode,barcode,...,barcode,barcode,sgRNA,sgRNA,sgRNA,sgRNA,tripleseq,tripleseq,tripleseq,tripleseq
1,,cyc,cyc,cyc,cyc,cyc,cyc,T0,T0,T0,...,cyc,cyc,T0,T0,cyc,cyc,T0,T0,cyc,cyc
2,,rep1,rep2,rep1,rep2,rep1,rep2,rep1,rep2,rep1,...,rep1,rep2,rep1,rep2,rep1,rep2,rep1,rep2,rep1,rep2
3,combined name,,,,,,,,,,...,,,,,,,,,,
4,AARS2_+_44281027.23-P1P2++AARS2_+_44281027.23-...,153.0,180.0,248.0,285.0,113.0,99.0,288,442,455.0,...,510.0,364.0,827.0,705.0,637.0,576.0,197.0,165.0,221.0,160.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044483,negative_control-10927++negative_control-10115,3128.0,4936.0,5474.0,5653.0,2395.0,3702.0,266,496,586.0,...,984.0,825.0,638.0,423.0,1132.0,1014.0,301.0,178.0,539.0,485.0
1044484,negative_control-10927++negative_control-10130,2228.0,2131.0,2135.0,2909.0,1409.0,1481.0,294,298,371.0,...,651.0,743.0,402.0,338.0,737.0,840.0,191.0,165.0,335.0,411.0
1044485,negative_control-10927++negative_control-10899,4896.0,4908.0,5784.0,5686.0,3594.0,3673.0,612,582,882.0,...,1905.0,1571.0,849.0,707.0,1767.0,1771.0,427.0,313.0,1007.0,1002.0
1044486,negative_control-10927++negative_control-10909,388.0,139.0,300.0,141.0,225.0,111.0,30,21,16.0,...,26.0,11.0,9.0,11.0,23.0,7.0,6.0,8.0,19.0,7.0


In [97]:
sgRNA_ref = sgRNA_ref.drop(["gene name", "sgRNA short name", "Upstream Barcode", "sgRNA sequence", "Downstream Barcode"], axis = 1)

In [98]:
sgRNA_ref.columns = ["Guide_ID", "Sequence"]
sequence_ref = pd.concat([sequence_ref, sgRNA_ref])

In [99]:
sequence_ref

Unnamed: 0,Guide_ID,Sequence
0,AARS2_+_44281027.23-P1P2,GAGTGGCAGCTGCAGCCCGG
1,AARS2_+_44281044.23-P1P2,GGCTACGATGGCAGCGTCAG
2,AATF_-_35306286.23-P1P2,GAGTGGCCGGTCCAGAGCTG
3,AATF_-_35306346.23-P1P2,GGGATCAAGGCGAGAGGATC
4,ABCB7_+_74375984.23-P1P2,GTAGCGGCTCAGGTCCGCAG
...,...,...
1018,negative_control-10115,GCCGTACTACCGGCGGCCCT
1019,negative_control-10130,GATGTAAATTAAGACGAAGC
1020,negative_control-10899,GAGTCCATTCAGGCGCGCCG
1021,negative_control-10909,GGAGCGGTCCGACCGAATGG


In [100]:
target_names = np.array([i.split('_')[0] for i in sequence_ref['Guide_ID']])

In [101]:
target_names[target_names == 'negative'] = 'control'

In [102]:
sequence_ref['targets'] = target_names

In [103]:
sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']

In [104]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,AARS2_+_44281027.23-P1P2,GAGTGGCAGCTGCAGCCCGG,AARS2
1,AARS2_+_44281044.23-P1P2,GGCTACGATGGCAGCGTCAG,AARS2
2,AATF_-_35306286.23-P1P2,GAGTGGCCGGTCCAGAGCTG,AATF
3,AATF_-_35306346.23-P1P2,GGGATCAAGGCGAGAGGATC,AATF
4,ABCB7_+_74375984.23-P1P2,GTAGCGGCTCAGGTCCGCAG,ABCB7
...,...,...,...
1018,negative_control-10115,GCCGTACTACCGGCGGCCCT,control
1019,negative_control-10130,GATGTAAATTAAGACGAAGC,control
1020,negative_control-10899,GAGTCCATTCAGGCGCGCCG,control
1021,negative_control-10909,GGAGCGGTCCGACCGAATGG,control


### Counts

In [105]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [106]:
sgRNA_counts

Unnamed: 0.1,Unnamed: 0,K562,K562.1,K562.2,K562.3,K562.4,K562.5,K562.6,K562.7,Jurkat,...,Jurkat.2,Jurkat.3,Jurkat.4,Jurkat.5,Jurkat.6,Jurkat.7,Jurkat.8,Jurkat.9,Jurkat.10,Jurkat.11
0,,barcode,barcode,sgRNA,sgRNA,tripleseq,tripleseq,barcode,barcode,barcode,...,barcode,barcode,sgRNA,sgRNA,sgRNA,sgRNA,tripleseq,tripleseq,tripleseq,tripleseq
1,,cyc,cyc,cyc,cyc,cyc,cyc,T0,T0,T0,...,cyc,cyc,T0,T0,cyc,cyc,T0,T0,cyc,cyc
2,,rep1,rep2,rep1,rep2,rep1,rep2,rep1,rep2,rep1,...,rep1,rep2,rep1,rep2,rep1,rep2,rep1,rep2,rep1,rep2
3,combined name,,,,,,,,,,...,,,,,,,,,,
4,AARS2_+_44281027.23-P1P2++AARS2_+_44281027.23-...,153.0,180.0,248.0,285.0,113.0,99.0,288,442,455.0,...,510.0,364.0,827.0,705.0,637.0,576.0,197.0,165.0,221.0,160.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044483,negative_control-10927++negative_control-10115,3128.0,4936.0,5474.0,5653.0,2395.0,3702.0,266,496,586.0,...,984.0,825.0,638.0,423.0,1132.0,1014.0,301.0,178.0,539.0,485.0
1044484,negative_control-10927++negative_control-10130,2228.0,2131.0,2135.0,2909.0,1409.0,1481.0,294,298,371.0,...,651.0,743.0,402.0,338.0,737.0,840.0,191.0,165.0,335.0,411.0
1044485,negative_control-10927++negative_control-10899,4896.0,4908.0,5784.0,5686.0,3594.0,3673.0,612,582,882.0,...,1905.0,1571.0,849.0,707.0,1767.0,1771.0,427.0,313.0,1007.0,1002.0
1044486,negative_control-10927++negative_control-10909,388.0,139.0,300.0,141.0,225.0,111.0,30,21,16.0,...,26.0,11.0,9.0,11.0,23.0,7.0,6.0,8.0,19.0,7.0


In [107]:
K562_counts = sgRNA_counts.iloc[:, 0:9]
#K562_counts = K562_counts.dropna()
K562_counts = K562_counts.fillna(0)
K562_counts = K562_counts.iloc[4:, :]

In [108]:
K562_counts.iloc[4:, :]

Unnamed: 0.1,Unnamed: 0,K562,K562.1,K562.2,K562.3,K562.4,K562.5,K562.6,K562.7
8,AARS2_+_44281027.23-P1P2++ABCB7_+_74375984.23-...,210.0,207.0,215.0,208.0,100.0,110.0,559,458
9,AARS2_+_44281027.23-P1P2++ABCB7_-_74376019.23-...,288.0,266.0,258.0,265.0,133.0,167.0,639,523
10,AARS2_+_44281027.23-P1P2++ACTL6A_+_179280849.2...,890.0,644.0,639.0,851.0,422.0,502.0,1176,1609
11,AARS2_+_44281027.23-P1P2++ACTR10_+_58666892.23...,186.0,404.0,240.0,445.0,42.0,280.0,624,828
12,AARS2_+_44281027.23-P1P2++ACTR10_+_58666933.23...,191.0,198.0,123.0,213.0,84.0,102.0,510,585
...,...,...,...,...,...,...,...,...,...
1044483,negative_control-10927++negative_control-10115,3128.0,4936.0,5474.0,5653.0,2395.0,3702.0,266,496
1044484,negative_control-10927++negative_control-10130,2228.0,2131.0,2135.0,2909.0,1409.0,1481.0,294,298
1044485,negative_control-10927++negative_control-10899,4896.0,4908.0,5784.0,5686.0,3594.0,3673.0,612,582
1044486,negative_control-10927++negative_control-10909,388.0,139.0,300.0,141.0,225.0,111.0,30,21


In [109]:
K562_count_replicates = K562_counts[K562_counts.columns[1:]].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)

In [110]:
K562_conditions = ';'.join(sgRNA_counts.loc[0:2, K562_counts.columns[1:]].apply(    
    lambda x: ','.join(x.astype(str)),
    axis=0
).tolist())

In [111]:
K562_conditions

'barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,cyc,rep1;sgRNA,cyc,rep2;tripleseq,cyc,rep1;tripleseq,cyc,rep2;barcode,T0,rep1;barcode,T0,rep2'

In [112]:
JURKAT_counts = sgRNA_counts.iloc[:,np.r_[0, 9:21]]
#JURKAT_counts = JURKAT_counts.dropna()
JURKAT_counts = JURKAT_counts.fillna(0)
JURKAT_counts = JURKAT_counts.iloc[4:, :]

In [113]:
JURKAT_count_replicates = JURKAT_counts[JURKAT_counts.columns[1:]].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)

In [114]:
JURKAT_conditions = ';'.join(sgRNA_counts.loc[0:2, JURKAT_counts.columns[1:]].apply(    
    lambda x: ','.join(x.astype(str)),
    axis=0
).tolist())

In [115]:
JURKAT_conditions

'barcode,T0,rep1;barcode,T0,rep2;barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,T0,rep1;sgRNA,T0,rep2;sgRNA,cyc,rep1;sgRNA,cyc,rep2;tripleseq,T0,rep1;tripleseq,T0,rep2;tripleseq,cyc,rep1;tripleseq,cyc,rep2'

In [116]:
guide_1_list_K562 = []
guide_2_list_K562 = []
gene_1_list_K562 = []
gene_2_list_K562 = []

for i in K562_counts['Unnamed: 0']:
    guide_1, guide_2 = str(i).split('++')
    
    guide_1_list_K562.append(guide_1)
    guide_2_list_K562.append(guide_2)
    
    gene_1, gene_2 = guide_1.split('_')[0].upper(), guide_2.split('_')[0].upper()
    
    gene_1_list_K562.append(gene_1)
    gene_2_list_K562.append(gene_2)

In [117]:
guide_1_list_JURKAT = []
guide_2_list_JURKAT = []
gene_1_list_JURKAT = []
gene_2_list_JURKAT = []

for i in JURKAT_counts['Unnamed: 0']:
    guide_1, guide_2 = str(i).split('++')
    
    guide_1_list_JURKAT.append(guide_1)
    guide_2_list_JURKAT.append(guide_2)
    
    gene_1, gene_2 = guide_1.split('_')[0].upper(), guide_2.split('_')[0].upper()
    
    gene_1_list_JURKAT.append(gene_1)
    gene_2_list_JURKAT.append(gene_2)

In [118]:
counts_ref["Guide 1"] = guide_1_list_K562 + guide_1_list_JURKAT
counts_ref["Guide 2"] = guide_2_list_K562 + guide_2_list_JURKAT
counts_ref["Gene 1"] = gene_1_list_K562 + gene_1_list_JURKAT
counts_ref["Gene 2"] = gene_2_list_K562 + gene_2_list_JURKAT
counts_ref["Count Replicates"] = K562_count_replicates.tolist() + JURKAT_count_replicates.tolist()
counts_ref["Cell Line"] = ["K562"] * len(guide_1_list_K562) + ["JURKAT"] * len(guide_1_list_JURKAT)
counts_ref["Condition"] = [K562_conditions] * len(guide_1_list_K562) + [JURKAT_conditions] * len(guide_1_list_JURKAT)
counts_ref["Sequencing"] = ["Barcode, sgRNA, Tripleseq"] * len(counts_ref["Guide 1"])
counts_ref["Study"] = [study_name_to_pubmed_id['horlbeck_data']] * counts_ref.shape[0]

In [119]:
type_list = ["Dual"] * len(counts_ref["Guide 1"])
# for i in range(len(type_list)):
#     if "NEGATIVE" == sequence_ref["Gene 1"][i] == "NEGATIVE" in sequence_ref["Gene 2"][i]:
#         type_list[i] = "Single"
counts_ref["Type"] = type_list

In [120]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,AARS2_+_44281027.23-P1P2,AARS2_+_44281027.23-P1P2,AARS2,AARS2,153.0;180.0;248.0;285.0;113.0;99.0;288;442,Dual,"Barcode, sgRNA, Tripleseq",K562,"barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,cyc,re...",30033366
1,AARS2_+_44281027.23-P1P2,AARS2_+_44281044.23-P1P2,AARS2,AARS2,570.0;568.0;748.0;513.0;422.0;344.0;641;712,Dual,"Barcode, sgRNA, Tripleseq",K562,"barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,cyc,re...",30033366
2,AARS2_+_44281027.23-P1P2,AATF_-_35306286.23-P1P2,AARS2,AATF,324.0;180.0;330.0;218.0;213.0;117.0;354;363,Dual,"Barcode, sgRNA, Tripleseq",K562,"barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,cyc,re...",30033366
3,AARS2_+_44281027.23-P1P2,AATF_-_35306346.23-P1P2,AARS2,AATF,170.0;254.0;183.0;306.0;130.0;171.0;264;427,Dual,"Barcode, sgRNA, Tripleseq",K562,"barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,cyc,re...",30033366
4,AARS2_+_44281027.23-P1P2,ABCB7_+_74375984.23-P1P2,AARS2,ABCB7,210.0;207.0;215.0;208.0;100.0;110.0;559;458,Dual,"Barcode, sgRNA, Tripleseq",K562,"barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,cyc,re...",30033366
...,...,...,...,...,...,...,...,...,...,...
2088963,negative_control-10927,negative_control-10115,NEGATIVE,NEGATIVE,586.0;440.0;984.0;825.0;638.0;423.0;1132.0;101...,Dual,"Barcode, sgRNA, Tripleseq",JURKAT,"barcode,T0,rep1;barcode,T0,rep2;barcode,cyc,re...",30033366
2088964,negative_control-10927,negative_control-10130,NEGATIVE,NEGATIVE,371.0;429.0;651.0;743.0;402.0;338.0;737.0;840....,Dual,"Barcode, sgRNA, Tripleseq",JURKAT,"barcode,T0,rep1;barcode,T0,rep2;barcode,cyc,re...",30033366
2088965,negative_control-10927,negative_control-10899,NEGATIVE,NEGATIVE,882.0;700.0;1905.0;1571.0;849.0;707.0;1767.0;1...,Dual,"Barcode, sgRNA, Tripleseq",JURKAT,"barcode,T0,rep1;barcode,T0,rep2;barcode,cyc,re...",30033366
2088966,negative_control-10927,negative_control-10909,NEGATIVE,NEGATIVE,16.0;19.0;26.0;11.0;9.0;11.0;23.0;7.0;6.0;8.0;...,Dual,"Barcode, sgRNA, Tripleseq",JURKAT,"barcode,T0,rep1;barcode,T0,rep2;barcode,cyc,re...",30033366


In [121]:
K562_conditions = ';'.join(["K562_" + i for i in 'barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,cyc,rep1;sgRNA,cyc,rep2;tripleseq,cyc,rep1;tripleseq,cyc,rep2;barcode,T0,rep1;barcode,T0,rep2'.split(';')])
JURKAT_conditions = ';'.join(["JURKAT_" + i for i in 'barcode,T0,rep1;barcode,T0,rep2;barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,T0,rep1;sgRNA,T0,rep2;sgRNA,cyc,rep1;sgRNA,cyc,rep2;tripleseq,T0,rep1;tripleseq,T0,rep2;tripleseq,cyc,rep1;tripleseq,cyc,rep2'.split(';')])


In [122]:
counts_ref['Condition'][counts_ref['Condition'] == 'barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,cyc,rep1;sgRNA,cyc,rep2;tripleseq,cyc,rep1;tripleseq,cyc,rep2;barcode,T0,rep1;barcode,T0,rep2'] = [K562_conditions] * 1044484
counts_ref['Condition'][counts_ref['Condition'] == 'barcode,T0,rep1;barcode,T0,rep2;barcode,cyc,rep1;barcode,cyc,rep2;sgRNA,T0,rep1;sgRNA,T0,rep2;sgRNA,cyc,rep1;sgRNA,cyc,rep2;tripleseq,T0,rep1;tripleseq,T0,rep2;tripleseq,cyc,rep1;tripleseq,cyc,rep2'] = [JURKAT_conditions] * 1044484

In [123]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,AARS2_+_44281027.23-P1P2,AARS2_+_44281027.23-P1P2,AARS2,AARS2,153.0;180.0;248.0;285.0;113.0;99.0;288;442,Dual,"Barcode, sgRNA, Tripleseq",K562,"K562_barcode,cyc,rep1;K562_barcode,cyc,rep2;K5...",30033366
1,AARS2_+_44281027.23-P1P2,AARS2_+_44281044.23-P1P2,AARS2,AARS2,570.0;568.0;748.0;513.0;422.0;344.0;641;712,Dual,"Barcode, sgRNA, Tripleseq",K562,"K562_barcode,cyc,rep1;K562_barcode,cyc,rep2;K5...",30033366
2,AARS2_+_44281027.23-P1P2,AATF_-_35306286.23-P1P2,AARS2,AATF,324.0;180.0;330.0;218.0;213.0;117.0;354;363,Dual,"Barcode, sgRNA, Tripleseq",K562,"K562_barcode,cyc,rep1;K562_barcode,cyc,rep2;K5...",30033366
3,AARS2_+_44281027.23-P1P2,AATF_-_35306346.23-P1P2,AARS2,AATF,170.0;254.0;183.0;306.0;130.0;171.0;264;427,Dual,"Barcode, sgRNA, Tripleseq",K562,"K562_barcode,cyc,rep1;K562_barcode,cyc,rep2;K5...",30033366
4,AARS2_+_44281027.23-P1P2,ABCB7_+_74375984.23-P1P2,AARS2,ABCB7,210.0;207.0;215.0;208.0;100.0;110.0;559;458,Dual,"Barcode, sgRNA, Tripleseq",K562,"K562_barcode,cyc,rep1;K562_barcode,cyc,rep2;K5...",30033366
...,...,...,...,...,...,...,...,...,...,...
2088963,negative_control-10927,negative_control-10115,NEGATIVE,NEGATIVE,586.0;440.0;984.0;825.0;638.0;423.0;1132.0;101...,Dual,"Barcode, sgRNA, Tripleseq",JURKAT,"JURKAT_barcode,T0,rep1;JURKAT_barcode,T0,rep2;...",30033366
2088964,negative_control-10927,negative_control-10130,NEGATIVE,NEGATIVE,371.0;429.0;651.0;743.0;402.0;338.0;737.0;840....,Dual,"Barcode, sgRNA, Tripleseq",JURKAT,"JURKAT_barcode,T0,rep1;JURKAT_barcode,T0,rep2;...",30033366
2088965,negative_control-10927,negative_control-10899,NEGATIVE,NEGATIVE,882.0;700.0;1905.0;1571.0;849.0;707.0;1767.0;1...,Dual,"Barcode, sgRNA, Tripleseq",JURKAT,"JURKAT_barcode,T0,rep1;JURKAT_barcode,T0,rep2;...",30033366
2088966,negative_control-10927,negative_control-10909,NEGATIVE,NEGATIVE,16.0;19.0;26.0;11.0;9.0;11.0;23.0;7.0;6.0;8.0;...,Dual,"Barcode, sgRNA, Tripleseq",JURKAT,"JURKAT_barcode,T0,rep1;JURKAT_barcode,T0,rep2;...",30033366


## SL Scores

In [124]:
Horlbeck_GI_JURKAT = pd.read_csv(os.path.join(learning_goals_loc_general, "Horlbeck", "GI_Jurkat_w_score.txt"), delim_whitespace=True, index_col = 0)
Horlbeck_GI_K562 = pd.read_csv(os.path.join(learning_goals_loc_general, "Horlbeck", "GI_K562_w_score.txt"), delim_whitespace=True, index_col = 0)

In [125]:
curr_learning_goal = Horlbeck_GI_JURKAT.copy()
row_idx, col_idx = np.triu_indices(curr_learning_goal.shape[0])

In [126]:
curr_learning_goal.columns[0]

'FARSA'

In [127]:
curr_learning_goal = Horlbeck_GI_JURKAT.copy()
row_idx, col_idx = np.triu_indices(curr_learning_goal.shape[0])

gene_A_list = []
gene_B_list = []
SL_scores = []


for idx in range(len(row_idx)):
    i = row_idx[idx]
    j = col_idx[idx]
    gene_A_list.append(curr_learning_goal.index[i])
    gene_B_list.append(curr_learning_goal.columns[j])
    SL_scores.append(curr_learning_goal.iloc[i, j])

In [128]:
curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "Phenotype", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
curr_GI["Gene_A"] = gene_A_list
curr_GI["Gene_B"] = gene_B_list
curr_GI["GI_Score"] = SL_scores
curr_GI["Study_Source"] = [study_name_to_pubmed_id['horlbeck_data']] * len(gene_A_list)
curr_GI["Cell_Line"] = ["JURKAT"] * len(gene_A_list)
curr_GI = curr_GI.fillna(0)
curr_GI["Phenotype"] = [float("nan")] * len(gene_A_list)
curr_GI["GI_Cutoff"] = [-3] * len(gene_A_list)
curr_GI["Stat_Score"] = [float("nan")] * len(gene_A_list)
curr_GI["Stat_Cutoff"] = [float("nan")] * len(gene_A_list)

In [129]:
curr_GI_JURKAT = curr_GI.copy()

In [130]:
curr_learning_goal = Horlbeck_GI_K562.copy()
row_idx, col_idx = np.triu_indices(curr_learning_goal.shape[0])

gene_A_list = []
gene_B_list = []
SL_scores = []


for idx in range(len(row_idx)):
    i = row_idx[idx]
    j = col_idx[idx]
    gene_A_list.append(curr_learning_goal.index[i])
    gene_B_list.append(curr_learning_goal.columns[j])
    SL_scores.append(curr_learning_goal.iloc[i, j])

In [131]:
curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "Phenotype", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
curr_GI["Gene_A"] = gene_A_list
curr_GI["Gene_B"] = gene_B_list
curr_GI["GI_Score"] = SL_scores
curr_GI["Study_Source"] = [study_name_to_pubmed_id['horlbeck_data']] * len(gene_A_list)
curr_GI["Cell_Line"] = ["K562"] * len(gene_A_list)
curr_GI = curr_GI.fillna(0)
curr_GI["Phenotype"] = [float("nan")] * len(gene_A_list)
curr_GI["GI_Cutoff"] = [-3] * len(gene_A_list)
curr_GI["Stat_Score"] = [float("nan")] * len(gene_A_list)
curr_GI["Stat_Cutoff"] = [float("nan")] * len(gene_A_list)

In [132]:
curr_GI_K562 = curr_GI.copy()

In [133]:
curr_GI = pd.concat([curr_GI_JURKAT, curr_GI_K562])

In [134]:
curr_GI 

Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,Phenotype,GI_Score,GI_Cutoff,Stat_Score,Stat_Cutoff
0,FARSA,FARSA,30033366,JURKAT,,0.968186,-3,,
1,FARSA,LIAS,30033366,JURKAT,,-2.872620,-3,,
2,FARSA,TBC1D31,30033366,JURKAT,,-0.595562,-3,,
3,FARSA,COX7C,30033366,JURKAT,,-2.642902,-3,,
4,FARSA,ATP5J2,30033366,JURKAT,,0.626772,-3,,
...,...,...,...,...,...,...,...,...,...
100571,NEMF,RAE1,30033366,K562,,-3.728041,-3,,
100572,NEMF,TGIF2,30033366,K562,,-0.564289,-3,,
100573,RAE1,RAE1,30033366,K562,,-5.444090,-3,,
100574,RAE1,TGIF2,30033366,K562,,0.998261,-3,,


In [135]:
#curr_GI['Stat_Cutoff']
#score_ref['Stat_Cutoff'][0]

In [136]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = curr_GI.copy(), study_controls = controls['horlbeck_data'], study_conditions = study_conditions['horlbeck_data'])

Starting processing...
Score reference...
Controls within SL score that are removed: 
0
---
Only GI cutoff is present...
Counts reference...
Number of double pairs: 2011444
Number of controls: 648
Number of singles: 76876
Sequence reference...
Done! Returning...


In [137]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 1023
Counts insert: 2088968
Score insert: 175208
Done!


## Ito Data (HSC5, MeWo, MELJUSO, IPC298, HS936T, HS944T, PATU8988S, PK1, A549 GI1, MEL202) No Counts

In [138]:
ito_loc = os.path.join(learning_goals_loc_general, "Ito")

inferred_LFC = pd.read_excel(os.path.join(ito_loc, "Supplementary Tables.xlsx"), sheet_name = "Supplementary table 5", skiprows = 3)
gemini_SL_score = pd.read_excel(os.path.join(ito_loc, "Supplementary Tables.xlsx"), sheet_name = "Supplementary table 8", skiprows = 3)
gemini_FDR_score = pd.read_excel(os.path.join(ito_loc, "Supplementary Tables.xlsx"), sheet_name = "Supplementary table 10", skiprows = 3)

In [139]:
(inferred_LFC['A549_LUNG'] < 0).sum()

2351

In [140]:
inferred_LFC

Unnamed: 0,gene 1,gene 2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
0,A3GALT2,AAVS1,0.227807,0.326647,0.274665,0.152530,0.505637,0.070358,0.373297,0.189235,0.086962,0.222683,0.374522
1,A3GALT2,ABO,0.469130,0.026932,0.033748,0.127937,0.243435,-0.013569,0.350034,0.326564,0.160496,0.103914,0.242022
2,A3GALT2,GBGT1,0.282571,-0.149230,-0.098934,-0.085788,-0.218262,-0.144974,0.301605,0.221792,0.092004,0.024993,-0.139925
3,A3GALT2,GLT6D1,0.164828,-0.077907,0.100564,0.072888,-0.203077,-0.131575,0.377808,0.253479,0.068689,0.046162,-0.017748
4,AADAC,AADACL2,0.019660,0.307797,-0.193339,0.011265,0.051313,0.258473,0.166144,-0.115806,-0.158089,0.016727,0.071597
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8344,ZNF362,ZNF384,0.492029,0.158074,-0.063436,0.186603,0.357797,0.676371,0.272882,0.132875,0.053505,0.009301,0.041217
8345,ZNF423,ZNF521,0.437752,0.189360,0.449655,0.217239,0.368404,0.227551,0.382624,0.242068,0.182690,0.266521,0.598274
8346,ZNRF1,ZNRF2,0.264205,0.588086,0.340166,0.123244,0.301472,0.150680,0.070817,-0.161118,0.115489,0.044563,0.344359
8347,ZXDA,ZXDC,0.634686,0.138514,0.681951,0.408652,0.391544,0.089159,0.294511,0.302321,0.194356,0.313704,0.635204


In [141]:
gemini_SL_score

Unnamed: 0.1,Unnamed: 0,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
0,A3GALT2;ABO,-0.241323,0.299715,0.240917,0.024593,0.218901,0.083927,-0.077458,-0.137329,-0.073534,0.118768,0.132501
1,A3GALT2;GBGT1,-0.054765,0.348611,0.303613,0.234600,0.363287,0.215332,-0.150396,-0.032557,-0.005041,0.197135,0.360829
2,A3GALT2;GLT6D1,0.062979,0.200752,0.138522,0.079642,0.540686,0.201932,-0.140968,-0.105989,0.018274,0.130981,0.366193
3,AADAC;AADACL2,0.352531,0.002588,0.373891,0.168104,0.270224,-0.037628,0.067509,0.314345,0.300868,0.090486,0.227840
4,AADAC;AADACL3,0.257603,0.406478,0.277750,0.426975,0.142934,0.160611,0.100796,-0.026351,0.374977,0.129949,0.133039
...,...,...,...,...,...,...,...,...,...,...,...,...
4854,ZNF362;ZNF384,-0.447865,-0.024214,0.078828,-0.046643,-0.216521,-0.325250,-0.308825,-0.187287,0.089056,-0.170584,0.056888
4855,ZNF423;ZNF521,-0.185968,0.119744,-0.105866,0.026296,0.093787,0.161879,-0.119130,-0.070896,-0.025065,-0.127838,-0.071252
4856,ZNRF1;ZNRF2,0.067613,-0.193392,0.081893,0.009510,0.015224,0.057746,0.187882,0.201913,0.048178,0.108719,-0.181989
4857,ZXDA;ZXDC,-0.400328,-0.001426,-0.385589,-0.156762,-0.163813,0.006693,-0.236212,-0.102695,-0.113680,-0.142767,-0.331000


In [142]:
gemini_FDR_score

Unnamed: 0.1,Unnamed: 0,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
0,A3GALT2;ABO,0.921199,0.332505,0.424278,0.837115,0.336008,0.764955,0.935477,0.921540,0.955418,0.584562,0.563288
1,A3GALT2;GBGT1,0.751001,0.263071,0.307590,0.381261,0.135696,0.547510,0.977996,0.755984,0.896801,0.380175,0.172056
2,A3GALT2;GLT6D1,0.570461,0.487910,0.622668,0.741919,0.020182,0.571000,0.974356,0.884787,0.866267,0.553824,0.165021
3,AADAC;AADACL2,0.139850,0.776708,0.196587,0.542847,0.248709,0.904911,0.726176,0.058205,0.204460,0.652104,0.389172
4,AADAC;AADACL3,0.252618,0.191253,0.354247,0.067707,0.491352,0.642586,0.659100,0.742404,0.090457,0.556450,0.562338
...,...,...,...,...,...,...,...,...,...,...,...,...
4854,ZNF362;ZNF384,0.981917,0.806990,0.725792,0.920152,0.978128,0.997034,0.999124,0.960840,0.735925,0.969437,0.688228
4855,ZNF423;ZNF521,0.886454,0.616188,0.921000,0.834580,0.597923,0.640425,0.964026,0.830805,0.918407,0.948940,0.849562
4856,ZNRF1;ZNRF2,0.562628,0.936957,0.720903,0.858420,0.754345,0.801502,0.475185,0.186117,0.818102,0.609197,0.934004
4857,ZXDA;ZXDC,0.970700,0.781413,0.992768,0.978433,0.956582,0.862882,0.995520,0.880310,0.973317,0.957046,0.984665


In [143]:
# updated_gemini_SL_score = inferred_LFC.copy()
# merged = updated_gemini_SL_score[["gene 1", "gene 2"]].agg(';'.join, axis=1) 
# for i in range(len(merged)):
#     pair = merged[i]
#     ref_list = gemini_SL_score["Unnamed: 0"].tolist()
#     if pair not in ref_list:
#         updated_gemini_SL_score.iloc[i, 2:] = float("nan")
#     else:
#         updated_gemini_SL_score.iloc[i, 2:] = gemini_SL_score.iloc[ref_list.index(pair), 1:]

In [144]:
updated_gemini_SL_score = inferred_LFC.copy()
merged = updated_gemini_SL_score[["gene 1", "gene 2"]].agg(';'.join, axis=1) 
for i in range(len(merged)):
    pair = merged[i]
    ref_list = gemini_SL_score["Unnamed: 0"].tolist()
    if pair not in ref_list:
        updated_gemini_SL_score.iloc[i, 2:] = float("nan")
    else:
        updated_gemini_SL_score.iloc[i, 2:] = gemini_SL_score.iloc[ref_list.index(pair), 1:]

In [145]:
updated_gemini_FDR_score = inferred_LFC.copy()
merged = updated_gemini_FDR_score[["gene 1", "gene 2"]].agg(';'.join, axis=1) 
for i in range(len(merged)):
    pair = merged[i]
    ref_list = gemini_FDR_score["Unnamed: 0"].tolist()
    if pair not in ref_list:
        updated_gemini_FDR_score.iloc[i, 2:] = float("nan")
    else:
        updated_gemini_FDR_score.iloc[i, 2:] = gemini_FDR_score.iloc[ref_list.index(pair), 1:]

In [146]:
updated_gemini_FDR_score

Unnamed: 0,gene 1,gene 2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
0,A3GALT2,AAVS1,,,,,,,,,,,
1,A3GALT2,ABO,0.921199,0.332505,0.424278,0.837115,0.336008,0.764955,0.935477,0.921540,0.955418,0.584562,0.563288
2,A3GALT2,GBGT1,0.751001,0.263071,0.307590,0.381261,0.135696,0.547510,0.977996,0.755984,0.896801,0.380175,0.172056
3,A3GALT2,GLT6D1,0.570461,0.487910,0.622668,0.741919,0.020182,0.571000,0.974356,0.884787,0.866267,0.553824,0.165021
4,AADAC,AADACL2,0.139850,0.776708,0.196587,0.542847,0.248709,0.904911,0.726176,0.058205,0.204460,0.652104,0.389172
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8344,ZNF362,ZNF384,0.981917,0.806990,0.725792,0.920152,0.978128,0.997034,0.999124,0.960840,0.735925,0.969437,0.688228
8345,ZNF423,ZNF521,0.886454,0.616188,0.921000,0.834580,0.597923,0.640425,0.964026,0.830805,0.918407,0.948940,0.849562
8346,ZNRF1,ZNRF2,0.562628,0.936957,0.720903,0.858420,0.754345,0.801502,0.475185,0.186117,0.818102,0.609197,0.934004
8347,ZXDA,ZXDC,0.970700,0.781413,0.992768,0.978433,0.956582,0.862882,0.995520,0.880310,0.973317,0.957046,0.984665


In [147]:
gemini_SL_score['gene 1'] = [i.split(';')[0] for i in gemini_SL_score['Unnamed: 0']]
gemini_SL_score['gene 2'] = [i.split(';')[1] for i in gemini_SL_score['Unnamed: 0']]

In [148]:
(gemini_SL_score['Unnamed: 0'] == gemini_FDR_score['Unnamed: 0']).sum()

4859

In [149]:
gemini_FDR_score

Unnamed: 0.1,Unnamed: 0,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
0,A3GALT2;ABO,0.921199,0.332505,0.424278,0.837115,0.336008,0.764955,0.935477,0.921540,0.955418,0.584562,0.563288
1,A3GALT2;GBGT1,0.751001,0.263071,0.307590,0.381261,0.135696,0.547510,0.977996,0.755984,0.896801,0.380175,0.172056
2,A3GALT2;GLT6D1,0.570461,0.487910,0.622668,0.741919,0.020182,0.571000,0.974356,0.884787,0.866267,0.553824,0.165021
3,AADAC;AADACL2,0.139850,0.776708,0.196587,0.542847,0.248709,0.904911,0.726176,0.058205,0.204460,0.652104,0.389172
4,AADAC;AADACL3,0.252618,0.191253,0.354247,0.067707,0.491352,0.642586,0.659100,0.742404,0.090457,0.556450,0.562338
...,...,...,...,...,...,...,...,...,...,...,...,...
4854,ZNF362;ZNF384,0.981917,0.806990,0.725792,0.920152,0.978128,0.997034,0.999124,0.960840,0.735925,0.969437,0.688228
4855,ZNF423;ZNF521,0.886454,0.616188,0.921000,0.834580,0.597923,0.640425,0.964026,0.830805,0.918407,0.948940,0.849562
4856,ZNRF1;ZNRF2,0.562628,0.936957,0.720903,0.858420,0.754345,0.801502,0.475185,0.186117,0.818102,0.609197,0.934004
4857,ZXDA;ZXDC,0.970700,0.781413,0.992768,0.978433,0.956582,0.862882,0.995520,0.880310,0.973317,0.957046,0.984665


In [150]:
gemini_FDR_score['gene 1'] = [i.split(';')[0] for i in gemini_FDR_score['Unnamed: 0']]
gemini_FDR_score['gene 2'] = [i.split(';')[1] for i in gemini_FDR_score['Unnamed: 0']]

In [151]:
all_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])

In [152]:
for cell_line in inferred_LFC.columns[2:]:
    orig_cell_line = cell_line
    curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "Phenotype", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
    curr_GI["Gene_A"] = gemini_SL_score["gene 1"]
    curr_GI["Gene_B"] = gemini_SL_score["gene 2"]
    curr_GI["GI_Score"] = gemini_SL_score.loc[:, cell_line]
    
    curr_GI = curr_GI.fillna(0)
    curr_GI["GI_Cutoff"] = [float("nan")] * len(curr_GI["Gene_A"])
    curr_GI["Stat_Score"] = gemini_FDR_score.loc[:, cell_line]
    curr_GI["Stat_Cutoff"] = [0.05] * len(curr_GI["Gene_A"])
    
    # Total number of GIs (negatives)
    cell_line = cell_line.split('_')[0]
    curr_GI["Study_Source"] = [study_name_to_pubmed_id['ito_data']] * len(curr_GI["Gene_A"])
    curr_GI["Cell_Line"] = [cell_line] * len(curr_GI["Gene_A"])
    
    all_GI = pd.concat([all_GI, curr_GI])
    
#     all_data["ito_GI_" + cell_line] = curr_GI
#     all_data["Notes"]["ito_GI_" + cell_line] = {}
#     all_data["Notes"]["ito_GI_" + cell_line]["Control"] = "AAVS1"
#     all_data["Notes"]["ito_GI_" + cell_line]["Phenotype"] = "log fold change"

    print(cell_line)
#     print("Total Neg Phenotype: " + str(sum(curr_GI["GI_Score"] < 0)))
#     print("Total Neg Phenotype: " + str(sum(curr_GI["Phenotype"] < 0)))
#     print("Total Sig Neg Phenotype: " + str(sum((gemini_FDR_score[orig_cell_line] < 0.05))))#(gemini_SL_score[orig_cell_line] < 0) & 
#     print("Percentages")
#     print("Total Neg Phenotype: " + str(100 * sum(curr_GI["GI_Score"] < 0)/len(curr_GI["GI_Score"])))
#     print("Total Neg Phenotype: " + str(100 * sum(curr_GI["Phenotype"] < 0)/len(curr_GI["Phenotype"])))
#     print("Total Sig Neg Phenotype: " + str(100 * sum((gemini_SL_score[orig_cell_line] > 0) & (gemini_FDR_score[orig_cell_line] < 0.05))/len(gemini_FDR_score[orig_cell_line])))
# #     curr_GI_matrix = create_matrix_from_df(curr_GI)
#     all_str += "{}% {} ({} GI), ".format(round(100 * sum(curr_GI["GI_Score"] < 0)/len(curr_GI["GI_Score"]), 2),
#                                   cell_line,
#                                   sum(curr_GI["GI_Score"] > 0)
          
#     )
#     sig_str += "{}% {} ({} SL), ".format(round(100 * sum((gemini_SL_score[orig_cell_line] > 0) & (gemini_FDR_score[orig_cell_line] < 0.05))/len(gemini_FDR_score[orig_cell_line]), 2),
#                                   cell_line,
#                                   sum((gemini_SL_score[orig_cell_line] < 0) & (gemini_FDR_score[orig_cell_line] < 0.05))
#          
#    )
#     curr_GI_matrix_cell_lines = make_cell_line_matrix(curr_GI_matrix, cell_line = cell_line)
#     curr_GI_matrix.to_csv(os.path.join(ito_loc, "GI_matrix.csv"))
#     curr_GI_matrix_cell_lines.to_csv(os.path.join(ito_loc, "CellLine_matrix.csv"))
    print("###############################")
    

A549
###############################
GI1
###############################
HS936T
###############################
HS944T
###############################
HSC5
###############################
IPC298
###############################
MEL202
###############################
MELJUSO
###############################
MEWO
###############################
PATU8988S
###############################
PK1
###############################


In [153]:
gemini_FDR_score

Unnamed: 0.1,Unnamed: 0,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS,gene 1,gene 2
0,A3GALT2;ABO,0.921199,0.332505,0.424278,0.837115,0.336008,0.764955,0.935477,0.921540,0.955418,0.584562,0.563288,A3GALT2,ABO
1,A3GALT2;GBGT1,0.751001,0.263071,0.307590,0.381261,0.135696,0.547510,0.977996,0.755984,0.896801,0.380175,0.172056,A3GALT2,GBGT1
2,A3GALT2;GLT6D1,0.570461,0.487910,0.622668,0.741919,0.020182,0.571000,0.974356,0.884787,0.866267,0.553824,0.165021,A3GALT2,GLT6D1
3,AADAC;AADACL2,0.139850,0.776708,0.196587,0.542847,0.248709,0.904911,0.726176,0.058205,0.204460,0.652104,0.389172,AADAC,AADACL2
4,AADAC;AADACL3,0.252618,0.191253,0.354247,0.067707,0.491352,0.642586,0.659100,0.742404,0.090457,0.556450,0.562338,AADAC,AADACL3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4854,ZNF362;ZNF384,0.981917,0.806990,0.725792,0.920152,0.978128,0.997034,0.999124,0.960840,0.735925,0.969437,0.688228,ZNF362,ZNF384
4855,ZNF423;ZNF521,0.886454,0.616188,0.921000,0.834580,0.597923,0.640425,0.964026,0.830805,0.918407,0.948940,0.849562,ZNF423,ZNF521
4856,ZNRF1;ZNRF2,0.562628,0.936957,0.720903,0.858420,0.754345,0.801502,0.475185,0.186117,0.818102,0.609197,0.934004,ZNRF1,ZNRF2
4857,ZXDA;ZXDC,0.970700,0.781413,0.992768,0.978433,0.956582,0.862882,0.995520,0.880310,0.973317,0.957046,0.984665,ZXDA,ZXDC


In [154]:
updated_gemini_FDR_score

Unnamed: 0,gene 1,gene 2,A549_LUNG,GI1_CENTRAL_NERVOUS_SYSTEM,HS936T_SKIN,HS944T_SKIN,HSC5_SKIN,IPC298_SKIN,MEL202_UVEA,MELJUSO_SKIN,MEWO_SKIN,PATU8988S_PANCREAS,PK1_PANCREAS
0,A3GALT2,AAVS1,,,,,,,,,,,
1,A3GALT2,ABO,0.921199,0.332505,0.424278,0.837115,0.336008,0.764955,0.935477,0.921540,0.955418,0.584562,0.563288
2,A3GALT2,GBGT1,0.751001,0.263071,0.307590,0.381261,0.135696,0.547510,0.977996,0.755984,0.896801,0.380175,0.172056
3,A3GALT2,GLT6D1,0.570461,0.487910,0.622668,0.741919,0.020182,0.571000,0.974356,0.884787,0.866267,0.553824,0.165021
4,AADAC,AADACL2,0.139850,0.776708,0.196587,0.542847,0.248709,0.904911,0.726176,0.058205,0.204460,0.652104,0.389172
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8344,ZNF362,ZNF384,0.981917,0.806990,0.725792,0.920152,0.978128,0.997034,0.999124,0.960840,0.735925,0.969437,0.688228
8345,ZNF423,ZNF521,0.886454,0.616188,0.921000,0.834580,0.597923,0.640425,0.964026,0.830805,0.918407,0.948940,0.849562
8346,ZNRF1,ZNRF2,0.562628,0.936957,0.720903,0.858420,0.754345,0.801502,0.475185,0.186117,0.818102,0.609197,0.934004
8347,ZXDA,ZXDC,0.970700,0.781413,0.992768,0.978433,0.956582,0.862882,0.995520,0.880310,0.973317,0.957046,0.984665


In [155]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = None, counts_ref = None, score_ref = all_GI.copy(), study_controls = controls['ito_data'], study_conditions = None)

Starting processing...
Score reference...
Controls within SL score that are removed: 
0
---
Only Stat cutoff is present...
Counts reference...
Sequence reference...
Done! Returning...


In [156]:
#PREV_REF['Study_Source'].value_counts()

In [157]:
#PREV_REF.loc[PREV_REF['Study_Source'] == 'Ito', :]

In [158]:
#len(set(PREV_REF.loc[PREV_REF['Study_Source'] == 'Ito', 'Gene Pair']))

In [159]:
#set(PREV_REF.loc[PREV_REF['Study_Source'] == 'Ito', 'Gene Pair']).difference(set(db_inserts['score_ref']['gene_pair']))

In [160]:
#len(set(db_inserts['score_ref']['gene_pair']))

In [161]:
db_inserts['score_ref']

Unnamed: 0,gene_1,gene_2,study_origin,cell_line_origin,SL_score,SL_score_cutoff,statistical_score,statistical_score_cutoff,Phenotype,gene_pair,SL_or_not
0,A3GALT2,ABO,34857952,A549,-0.241323,0,0.921199,0.05,0.0,A3GALT2_ABO,False
1,A3GALT2,GBGT1,34857952,A549,-0.054765,0,0.751001,0.05,0.0,A3GALT2_GBGT1,False
2,A3GALT2,GLT6D1,34857952,A549,0.062979,0,0.570461,0.05,0.0,A3GALT2_GLT6D1,False
3,AADAC,AADACL2,34857952,A549,0.352531,0,0.139850,0.05,0.0,AADAC_AADACL2,False
4,AADAC,AADACL3,34857952,A549,0.257603,0,0.252618,0.05,0.0,AADAC_AADACL3,False
...,...,...,...,...,...,...,...,...,...,...,...
53444,ZNF362,ZNF384,34857952,PK1,0.056888,0,0.688228,0.05,0.0,ZNF362_ZNF384,False
53445,ZNF423,ZNF521,34857952,PK1,-0.071252,0,0.849562,0.05,0.0,ZNF423_ZNF521,False
53446,ZNRF1,ZNRF2,34857952,PK1,-0.181989,0,0.934004,0.05,0.0,ZNRF1_ZNRF2,False
53447,ZXDA,ZXDC,34857952,PK1,-0.331000,0,0.984665,0.05,0.0,ZXDA_ZXDC,False


In [162]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

No counts and sequences together
Final QC...
Beginning transaction...
Done score
Successfully inserted!
Added Record stats...
Score insert: 53449
Done!


## Parrish Data (PC9, HeLa)

In [163]:
parrish_loc = os.path.join(learning_goals_loc_general, "Parrish")

sgRNA_ref = pd.read_excel(os.path.join(parrish_loc, "1 - pgPEN library composition.xlsx"), skiprows = 1)
PC9_sgRNA = pd.read_excel(os.path.join(parrish_loc, "3 - PC9 screen pgRNA CRISPR scores and normalized counts across three biological replicates.xlsx"), skiprows = 1)
HELA_sgRNA = pd.read_excel(os.path.join(parrish_loc, "6 - HeLa screen pgRNA CRISPR scores and normalized counts across three biological replicates.xlsx"), skiprows = 1)

### Seq Ref

In [164]:
old_to_new_map = {}
old_IDs = sgRNA_ref.loc[:, 'pgRNA_ID'].tolist()

In [165]:
sgRNA_ref

Unnamed: 0,pgRNA_ID,target1_sgRNA_seq,target2_sgRNA_seq,paralog_pair,target_type,target1,target2,target1_ensembl_id,target2_ensembl_id
0,A2M_nt1,ACTGCATCTGTGCAAACGGG,ATACGGCCGAAGCCCCTTCA,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
1,A2M_nt2,ATGTCTCATGAACTACCCTG,CTCTGTGAGATGTCCCGAAA,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
2,A2M_nt3,TGAAATGAAACTTCACACTG,AGGGCACCCGGTTCATACGC,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
3,A2M_nt4,TTACTCATATAGGATCCCAA,CCAATGATAAGCCCGAACGG,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
4,A2M_nt5,ACTGCATCTGTGCAAACGGG,TTCGAGGTCCGGACAGGTCG,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
...,...,...,...,...,...,...,...,...,...
33165,ZYG11B_ZYG11A_pg5,CAAGTTCCACTAACTTGTGG,AATGCACTATCTGAAATGCC,ZYG11B|ZYG11A,gene_gene,ZYG11B,ZYG11A,ENSG00000162378,ENSG00000203995
33166,ZYG11B_ZYG11A_pg6,CAAGTTCCACTAACTTGTGG,CATCACCCTGAGAAATCGCT,ZYG11B|ZYG11A,gene_gene,ZYG11B,ZYG11A,ENSG00000162378,ENSG00000203995
33167,ZYG11B_ZYG11A_pg7,CAAGTTCCACTAACTTGTGG,GCTCTCAACCTAACACGCCA,ZYG11B|ZYG11A,gene_gene,ZYG11B,ZYG11A,ENSG00000162378,ENSG00000203995
33168,ZYG11B_ZYG11A_pg8,CAAGTTCCACTAACTTGTGG,TAGAACTGTTTATACGACTG,ZYG11B|ZYG11A,gene_gene,ZYG11B,ZYG11A,ENSG00000162378,ENSG00000203995


In [166]:
sgRNA_ref['target_type'].value_counts()

gene_gene    16396
ntc_gene      8149
gene_ntc      8125
ntc_ntc        500
Name: target_type, dtype: int64

In [167]:
all_control_single_sgRNA_sequences = []

In [168]:
all_control_single_sgRNA_sequences += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'ntc_gene']['target1_sgRNA_seq'].values.tolist()

In [169]:
all_control_single_sgRNA_sequences += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'gene_ntc']['target2_sgRNA_seq'].values.tolist()

In [170]:
all_control_single_sgRNA_sequences = set(all_control_single_sgRNA_sequences)

In [171]:
all_control_dual_sgRNA_sequences = set(sgRNA_ref.loc[sgRNA_ref['target_type'] == 'ntc_ntc']['target1_sgRNA_seq'].values.tolist() + sgRNA_ref.loc[sgRNA_ref['target_type'] == 'ntc_ntc']['target2_sgRNA_seq'].values.tolist())

In [172]:
len(all_control_single_sgRNA_sequences)

975

In [173]:
len(all_control_dual_sgRNA_sequences)

967

In [174]:
all_control_dual_sgRNA_sequences.difference(all_control_single_sgRNA_sequences)

set()

In [175]:
seq_to_control_map = {}
i = 1
for seq in all_control_single_sgRNA_sequences:
    seq_to_control_map[seq] = "nt" + str(i)
    i += 1

In [176]:
all_gene_sgRNA_sequences = []
all_gene_sgRNA_sequences += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'ntc_gene']['target2_sgRNA_seq'].values.tolist()
all_gene_sgRNA_sequences += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'gene_ntc']['target1_sgRNA_seq'].values.tolist()
all_gene_sgRNA_sequences += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'gene_gene']['target1_sgRNA_seq'].values.tolist()
all_gene_sgRNA_sequences += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'gene_gene']['target2_sgRNA_seq'].values.tolist()

all_genes = []
all_genes += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'ntc_gene']['target2'].values.tolist()
all_genes += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'gene_ntc']['target1'].values.tolist()
all_genes += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'gene_gene']['target1'].values.tolist()
all_genes += sgRNA_ref.loc[sgRNA_ref['target_type'] == 'gene_gene']['target2'].values.tolist()

In [177]:
seq_to_gene_map = {}
for i in range(len(all_genes)):
    if all_genes[i] not in seq_to_gene_map:
        seq_to_gene_map[all_genes[i]] = []
    if all_gene_sgRNA_sequences[i] not in seq_to_gene_map[all_genes[i]]:
        seq_to_gene_map[all_genes[i]].append(all_gene_sgRNA_sequences[i])

In [178]:
guide_to_guide_map = {}
for i in range(len(sgRNA_ref['pgRNA_ID'])):
    curr_ID = sgRNA_ref['pgRNA_ID'][i]
    target1_seq = sgRNA_ref['target1_sgRNA_seq'][i]
    target2_seq = sgRNA_ref['target2_sgRNA_seq'][i]
    target1_name = sgRNA_ref['target1'][i]
    target2_name = sgRNA_ref['target2'][i]
    cond = sgRNA_ref['target_type'][i]
    
    new_target_1 = target1_name
    new_target_2 = target2_name
    if cond == 'gene_gene':
        new_target_1 = target1_name + '+' + str(seq_to_gene_map[target1_name].index(target1_seq) + 1)
        new_target_2 = target2_name + '+' + str(seq_to_gene_map[target2_name].index(target2_seq) + 1)
    elif cond == 'gene_ntc':
        new_target_1 = target1_name + '+' + str(seq_to_gene_map[target1_name].index(target1_seq) + 1)
        new_target_2 = curr_ID.split('_')[1]
    elif cond == 'ntc_gene':
        new_target_1 = curr_ID.split('_')[0]
        new_target_2 = target2_name + '+' + str(seq_to_gene_map[target2_name].index(target2_seq) + 1)
    else:
        continue
    new_guide = '_'.join([new_target_1, new_target_2])
    
    guide_to_guide_map[curr_ID] = new_guide
    

In [179]:
sgRNA_ref['pgRNA_ID'] =  [guide_to_guide_map[i] if i in guide_to_guide_map else i for i in sgRNA_ref['pgRNA_ID']]

In [180]:
## replace the original values
curr_idx = sgRNA_ref['target_type'] == 'gene_ntc'

gene_1 = [i.split('_')[0] for i in sgRNA_ref.loc[curr_idx]['pgRNA_ID']]
gene_2 = [seq_to_control_map[i] for i in sgRNA_ref.loc[curr_idx]['target2_sgRNA_seq']]

sgRNA_ref.loc[curr_idx, 'pgRNA_ID'] = [gene_1[i] + "_" + gene_2[i] for i in range(sum(curr_idx))]#[gene_1[i] + "+" + str(i+1) + "_" + gene_2[i] for i in range(sum(curr_idx))]

In [181]:
curr_idx = sgRNA_ref['target_type'] == 'ntc_gene'

gene_1 = [seq_to_control_map[i] for i in sgRNA_ref.loc[curr_idx]['target1_sgRNA_seq']]
gene_2 = [i.split('_')[1] for i in sgRNA_ref.loc[curr_idx]['pgRNA_ID']]

sgRNA_ref.loc[curr_idx, 'pgRNA_ID'] = [gene_1[i] + "_" + gene_2[i] for i in range(sum(curr_idx))]#[gene_1[i] + "_" + gene_2[i] + "+" + str(i+1) for i in range(sum(curr_idx))]

In [182]:
curr_idx = sgRNA_ref['target_type'] == 'ntc_ntc'

gene_1 = [seq_to_control_map[i] for i in sgRNA_ref.loc[curr_idx]['target1_sgRNA_seq']]
gene_2 = [seq_to_control_map[i] for i in sgRNA_ref.loc[curr_idx]['target2_sgRNA_seq']]

sgRNA_ref.loc[curr_idx, 'pgRNA_ID'] = [gene_1[i] + "_" + gene_2[i] for i in range(sum(curr_idx))]

In [183]:
new_IDs = sgRNA_ref['pgRNA_ID'].tolist()

In [184]:
len(old_IDs) == len(new_IDs)

True

In [185]:
for i in range(len(old_IDs)):
    old_to_new_map[old_IDs[i]] = new_IDs[i]

In [186]:
sgRNA_ref

Unnamed: 0,pgRNA_ID,target1_sgRNA_seq,target2_sgRNA_seq,paralog_pair,target_type,target1,target2,target1_ensembl_id,target2_ensembl_id
0,A2M+1_nt382,ACTGCATCTGTGCAAACGGG,ATACGGCCGAAGCCCCTTCA,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
1,A2M+2_nt70,ATGTCTCATGAACTACCCTG,CTCTGTGAGATGTCCCGAAA,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
2,A2M+3_nt740,TGAAATGAAACTTCACACTG,AGGGCACCCGGTTCATACGC,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
3,A2M+4_nt249,TTACTCATATAGGATCCCAA,CCAATGATAAGCCCGAACGG,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
4,A2M+1_nt574,ACTGCATCTGTGCAAACGGG,TTCGAGGTCCGGACAGGTCG,A2M|PZP,gene_ntc,A2M,,ENSG00000175899,
...,...,...,...,...,...,...,...,...,...
33165,ZYG11B+2_ZYG11A+1,CAAGTTCCACTAACTTGTGG,AATGCACTATCTGAAATGCC,ZYG11B|ZYG11A,gene_gene,ZYG11B,ZYG11A,ENSG00000162378,ENSG00000203995
33166,ZYG11B+2_ZYG11A+2,CAAGTTCCACTAACTTGTGG,CATCACCCTGAGAAATCGCT,ZYG11B|ZYG11A,gene_gene,ZYG11B,ZYG11A,ENSG00000162378,ENSG00000203995
33167,ZYG11B+2_ZYG11A+3,CAAGTTCCACTAACTTGTGG,GCTCTCAACCTAACACGCCA,ZYG11B|ZYG11A,gene_gene,ZYG11B,ZYG11A,ENSG00000162378,ENSG00000203995
33168,ZYG11B+2_ZYG11A+4,CAAGTTCCACTAACTTGTGG,TAGAACTGTTTATACGACTG,ZYG11B|ZYG11A,gene_gene,ZYG11B,ZYG11A,ENSG00000162378,ENSG00000203995


In [187]:
guides = []
guides += [i.split('_')[0] for i in sgRNA_ref['pgRNA_ID'].tolist()]
guides += [i.split('_')[1] for i in sgRNA_ref['pgRNA_ID'].tolist()]
sequences = []
sequences += sgRNA_ref['target1_sgRNA_seq'].tolist()
sequences += sgRNA_ref['target2_sgRNA_seq'].tolist()

sgRNA_ref2 = pd.DataFrame({"Guide_ID": guides, 
                           "Sequence" : sequences})

In [188]:
sgRNA_ref2

Unnamed: 0,Guide_ID,Sequence
0,A2M+1,ACTGCATCTGTGCAAACGGG
1,A2M+2,ATGTCTCATGAACTACCCTG
2,A2M+3,TGAAATGAAACTTCACACTG
3,A2M+4,TTACTCATATAGGATCCCAA
4,A2M+1,ACTGCATCTGTGCAAACGGG
...,...,...
66335,ZYG11A+1,AATGCACTATCTGAAATGCC
66336,ZYG11A+2,CATCACCCTGAGAAATCGCT
66337,ZYG11A+3,GCTCTCAACCTAACACGCCA
66338,ZYG11A+4,TAGAACTGTTTATACGACTG


In [189]:
sequence_ref = sgRNA_ref2.copy()

In [190]:
sequence_ref['Target'] = [i.split('+')[0] for i in sequence_ref['Guide_ID']]
sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']

In [191]:
sequence_ref = sequence_ref.drop_duplicates(subset=['sgRNA_guide_name'])
sequence_ref = sequence_ref.reset_index(drop = True)

In [192]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,A2M+1,ACTGCATCTGTGCAAACGGG,A2M
1,A2M+2,ATGTCTCATGAACTACCCTG,A2M
2,A2M+3,TGAAATGAAACTTCACACTG,A2M
3,A2M+4,TTACTCATATAGGATCCCAA,A2M
4,AADAC+1,AAGTCTGAAGCACTAAGAAG,AADAC
...,...,...,...
9189,PLAG1+4,TCGCCGGTTCTACACCCGAA,PLAG1
9190,PRPF40A+4,TTGAAGTACTACCTGTATCG,PRPF40A
9191,PXDN+4,TGAAAACCTACGCGGAGTCG,PXDN
9192,RHEB+4,GCAAATTGTTGGATATGGTG,RHEB


### Counts

In [193]:
# replace the guide ids
PC9_sgRNA['pgRNA_ID'] = [old_to_new_map[i] if i in old_to_new_map else i for i in PC9_sgRNA['pgRNA_ID']]
HELA_sgRNA['pgRNA'] = [old_to_new_map[i] if i in old_to_new_map else i for i in HELA_sgRNA['pgRNA']]

In [194]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [195]:
PC9_sgRNA

Unnamed: 0,pgRNA_ID,paralog_pair,pgRNA_target,broad_target_type,mean_CRISPR_score,RepA_CRISPR_score,RepB_CRISPR_score,RepC_CRISPR_score,RepA_plasmid_count,RepB_plasmid_count,RepC_plasmid_count,RepA_LTP_count,RepB_LTP_count,RepC_LTP_count
0,A2M+1_nt382,A2M|PZP,A2M|ntc,single_targeting,0.368602,0.495471,0.196626,0.413707,2435.90,2367.70,2182.40,3364.10,2893.90,3027.60
1,A2M+2_nt70,A2M|PZP,A2M|ntc,single_targeting,0.483738,0.303549,0.531184,0.616480,868.75,844.39,778.32,1102.40,1224.90,1207.40
2,A2M+3_nt740,A2M|PZP,A2M|ntc,single_targeting,0.368685,-0.456572,0.736245,0.826380,911.10,885.56,816.27,826.22,1426.70,1421.20
3,A2M+4_nt249,A2M|PZP,A2M|ntc,single_targeting,0.041019,-1.270615,0.010252,1.383419,416.65,404.97,373.28,263.42,450.09,883.55
4,A2M+1_nt574,A2M|PZP,A2M|ntc,single_targeting,-0.196169,-0.170177,-0.495369,0.077040,1852.50,1800.50,1659.60,1906.60,1544.90,1913.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32735,ZYG11B+2_ZYG11A+1,ZYG11B|ZYG11A,ZYG11B|ZYG11A,double_targeting,-0.325519,-0.609131,-0.399958,0.032532,1926.80,1872.80,1726.20,1495.80,1525.80,1759.30
32736,ZYG11B+2_ZYG11A+2,ZYG11B|ZYG11A,ZYG11B|ZYG11A,double_targeting,-0.606071,-0.907578,-0.333128,-0.577508,2270.00,2206.30,2033.70,1544.50,1860.00,1482.10
32737,ZYG11B+2_ZYG11A+3,ZYG11B|ZYG11A,ZYG11B|ZYG11A,double_targeting,-0.763798,-0.759516,-0.782239,-0.749639,1378.80,1340.10,1235.20,1001.40,897.78,818.77
32738,ZYG11B+2_ZYG11A+4,ZYG11B|ZYG11A,ZYG11B|ZYG11A,double_targeting,-0.418372,0.221395,-1.528546,0.052036,272.29,264.66,243.95,305.19,120.58,251.33


In [196]:
HELA_sgRNA

Unnamed: 0,pgRNA,paralog_pair,pgRNA_target,broad_target_type,mean_CRISPR_score,RepA_CRISPR_score,RepB_CRISPR_score,RepC_CRISPR_score,RepA_plasmid_count,RepB_plasmid_count,RepC_plasmid_count,RepA_LTP_count,RepB_LTP_count,RepC_LTP_count
0,TNPO2+1_nt928,TNPO2|TNPO1,TNPO2|ntc,single_targeting,1.404089,1.839640,1.220548,1.152079,238.76,266.44,267.51,1555.70,1106.90,1069.80
1,nt713_SEC23A+2,SEC23B|SEC23A,ntc|SEC23A,single_targeting,1.376141,1.719647,1.432025,0.976752,448.63,500.64,502.65,2625.00,2544.70,1691.10
2,nt453_IL1RAPL1+4,IL1RAPL2|IL1RAPL1,ntc|IL1RAPL1,single_targeting,0.401452,1.764124,-0.273009,-0.286759,238.76,266.44,267.51,1455.30,263.25,261.14
3,nt756_TGFB2+3,TGFB3|TGFB2,ntc|TGFB2,single_targeting,0.399684,1.297354,-0.345809,0.247506,2651.80,2959.30,2971.20,10675.00,2726.60,4892.20
4,nt713_PAFAH1B2+2,PAFAH1B3|PAFAH1B2,ntc|PAFAH1B2,single_targeting,1.474571,1.432591,1.727355,1.263767,1123.30,1253.50,1258.50,5096.60,8451.50,5602.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31828,nt591_nt267,,FAKE_GENE_25,double_nontargeting,-0.025413,-0.600200,0.054393,0.469568,364.51,406.77,408.40,358.82,750.63,1125.70
31829,nt91_nt963,,FAKE_GENE_14,double_nontargeting,-1.004934,-0.574149,-1.615088,-0.825566,1119.00,1248.80,1253.80,1127.20,463.07,971.92
31830,nt45_nt899,,FAKE_GENE_25,double_nontargeting,-0.114991,-0.576438,0.239666,-0.008203,814.83,909.30,912.96,819.14,2003.40,1575.30
31831,nt912_nt563,,FAKE_GENE_25,double_nontargeting,-0.127443,-0.579808,0.163396,0.034084,1275.40,1423.20,1428.90,1278.30,2913.70,2569.30


In [197]:
curr_counts = PC9_sgRNA.copy()
curr_counts = curr_counts.fillna(0)
curr_counts.index = range(curr_counts.shape[0])

In [198]:
curr_counts.columns

Index(['pgRNA_ID', 'paralog_pair', 'pgRNA_target', 'broad_target_type',
       'mean_CRISPR_score', 'RepA_CRISPR_score', 'RepB_CRISPR_score',
       'RepC_CRISPR_score', 'RepA_plasmid_count', 'RepB_plasmid_count',
       'RepC_plasmid_count', 'RepA_LTP_count', 'RepB_LTP_count',
       'RepC_LTP_count'],
      dtype='object')

In [199]:
curr_counts

Unnamed: 0,pgRNA_ID,paralog_pair,pgRNA_target,broad_target_type,mean_CRISPR_score,RepA_CRISPR_score,RepB_CRISPR_score,RepC_CRISPR_score,RepA_plasmid_count,RepB_plasmid_count,RepC_plasmid_count,RepA_LTP_count,RepB_LTP_count,RepC_LTP_count
0,A2M+1_nt382,A2M|PZP,A2M|ntc,single_targeting,0.368602,0.495471,0.196626,0.413707,2435.90,2367.70,2182.40,3364.10,2893.90,3027.60
1,A2M+2_nt70,A2M|PZP,A2M|ntc,single_targeting,0.483738,0.303549,0.531184,0.616480,868.75,844.39,778.32,1102.40,1224.90,1207.40
2,A2M+3_nt740,A2M|PZP,A2M|ntc,single_targeting,0.368685,-0.456572,0.736245,0.826380,911.10,885.56,816.27,826.22,1426.70,1421.20
3,A2M+4_nt249,A2M|PZP,A2M|ntc,single_targeting,0.041019,-1.270615,0.010252,1.383419,416.65,404.97,373.28,263.42,450.09,883.55
4,A2M+1_nt574,A2M|PZP,A2M|ntc,single_targeting,-0.196169,-0.170177,-0.495369,0.077040,1852.50,1800.50,1659.60,1906.60,1544.90,1913.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32735,ZYG11B+2_ZYG11A+1,ZYG11B|ZYG11A,ZYG11B|ZYG11A,double_targeting,-0.325519,-0.609131,-0.399958,0.032532,1926.80,1872.80,1726.20,1495.80,1525.80,1759.30
32736,ZYG11B+2_ZYG11A+2,ZYG11B|ZYG11A,ZYG11B|ZYG11A,double_targeting,-0.606071,-0.907578,-0.333128,-0.577508,2270.00,2206.30,2033.70,1544.50,1860.00,1482.10
32737,ZYG11B+2_ZYG11A+3,ZYG11B|ZYG11A,ZYG11B|ZYG11A,double_targeting,-0.763798,-0.759516,-0.782239,-0.749639,1378.80,1340.10,1235.20,1001.40,897.78,818.77
32738,ZYG11B+2_ZYG11A+4,ZYG11B|ZYG11A,ZYG11B|ZYG11A,double_targeting,-0.418372,0.221395,-1.528546,0.052036,272.29,264.66,243.95,305.19,120.58,251.33


In [200]:
all_counts = curr_counts[['RepA_plasmid_count', 'RepB_plasmid_count', 'RepC_plasmid_count', 'RepA_LTP_count', 'RepB_LTP_count', 'RepC_LTP_count']].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)

In [201]:
type_list = ["Single"] * curr_counts.shape[0]
for i in range(len(type_list)):
    if "single" not in curr_counts.loc[i, 'broad_target_type']:
        type_list[i] = "Dual"


In [202]:
targeted_genes = curr_counts['pgRNA_target'].values

In [203]:
curr_counts = curr_counts.drop(['pgRNA_target', 'paralog_pair', 'mean_CRISPR_score', 
                                'RepA_CRISPR_score', 'RepB_CRISPR_score', 'RepC_CRISPR_score',
                               'RepA_plasmid_count', 'RepB_plasmid_count', 'RepC_plasmid_count', 
                                'RepA_LTP_count', 'RepB_LTP_count', 'RepC_LTP_count', 'broad_target_type'], axis = 1)

In [204]:
curr_counts.columns = ["Guide 1"]

In [205]:
gene_1_list = np.array([i.split('_')[0].split('+')[0] for i in curr_counts['Guide 1']], dtype = object)
gene_2_list = np.array([i.split('_')[1].split('+')[0] for i in curr_counts['Guide 1']], dtype = object)
guide_2_list = [i.split('_')[1] for i in curr_counts['Guide 1'].tolist()]
curr_counts['Guide 1'] = [i.split('_')[0] for i in curr_counts['Guide 1'].tolist()]

gene_1_list = gene_1_list.tolist()
gene_2_list = gene_2_list.tolist()

In [206]:
curr_counts['Gene 1'] = gene_1_list
curr_counts['Guide 2'] = guide_2_list
curr_counts['Gene 2'] = gene_2_list
curr_counts['Count Replicates'] = all_counts
curr_counts['Type'] = type_list
curr_counts['Cell Line'] = ["PC9"] * curr_counts.shape[0]
curr_counts['Condition'] = ["plasmid;plasmid;plasmid;LTP;LTP;LTP"] * curr_counts.shape[0]

In [207]:
curr_counts

Unnamed: 0,Guide 1,Gene 1,Guide 2,Gene 2,Count Replicates,Type,Cell Line,Condition
0,A2M+1,A2M,nt382,nt382,2435.9;2367.7;2182.4;3364.1;2893.9;3027.6,Single,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
1,A2M+2,A2M,nt70,nt70,868.75;844.39;778.32;1102.4;1224.9;1207.4,Single,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
2,A2M+3,A2M,nt740,nt740,911.1;885.56;816.27;826.22;1426.7;1421.2,Single,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
3,A2M+4,A2M,nt249,nt249,416.65;404.97;373.28;263.42;450.09;883.55,Single,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
4,A2M+1,A2M,nt574,nt574,1852.5;1800.5;1659.6;1906.6;1544.9;1913.5,Single,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
...,...,...,...,...,...,...,...,...
32735,ZYG11B+2,ZYG11B,ZYG11A+1,ZYG11A,1926.8;1872.8;1726.2;1495.8;1525.8;1759.3,Dual,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
32736,ZYG11B+2,ZYG11B,ZYG11A+2,ZYG11A,2270.0;2206.3;2033.7;1544.5;1860.0;1482.1,Dual,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
32737,ZYG11B+2,ZYG11B,ZYG11A+3,ZYG11A,1378.8;1340.1;1235.2;1001.4;897.78;818.77,Dual,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
32738,ZYG11B+2,ZYG11B,ZYG11A+4,ZYG11A,272.29;264.66;243.95;305.19;120.58;251.33,Dual,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP


In [208]:
counts_ref = pd.concat([counts_ref, curr_counts])

In [209]:
curr_counts = HELA_sgRNA.copy()
#curr_counts = curr_counts.dropna()
curr_counts = curr_counts.fillna(0)
curr_counts.index = range(curr_counts.shape[0])

In [210]:
curr_counts.columns

Index(['pgRNA', 'paralog_pair', 'pgRNA_target', 'broad_target_type',
       'mean_CRISPR_score', 'RepA_CRISPR_score', 'RepB_CRISPR_score',
       'RepC_CRISPR_score', 'RepA_plasmid_count', 'RepB_plasmid_count',
       'RepC_plasmid_count', 'RepA_LTP_count', 'RepB_LTP_count',
       'RepC_LTP_count'],
      dtype='object')

In [211]:
curr_counts

Unnamed: 0,pgRNA,paralog_pair,pgRNA_target,broad_target_type,mean_CRISPR_score,RepA_CRISPR_score,RepB_CRISPR_score,RepC_CRISPR_score,RepA_plasmid_count,RepB_plasmid_count,RepC_plasmid_count,RepA_LTP_count,RepB_LTP_count,RepC_LTP_count
0,TNPO2+1_nt928,TNPO2|TNPO1,TNPO2|ntc,single_targeting,1.404089,1.839640,1.220548,1.152079,238.76,266.44,267.51,1555.70,1106.90,1069.80
1,nt713_SEC23A+2,SEC23B|SEC23A,ntc|SEC23A,single_targeting,1.376141,1.719647,1.432025,0.976752,448.63,500.64,502.65,2625.00,2544.70,1691.10
2,nt453_IL1RAPL1+4,IL1RAPL2|IL1RAPL1,ntc|IL1RAPL1,single_targeting,0.401452,1.764124,-0.273009,-0.286759,238.76,266.44,267.51,1455.30,263.25,261.14
3,nt756_TGFB2+3,TGFB3|TGFB2,ntc|TGFB2,single_targeting,0.399684,1.297354,-0.345809,0.247506,2651.80,2959.30,2971.20,10675.00,2726.60,4892.20
4,nt713_PAFAH1B2+2,PAFAH1B3|PAFAH1B2,ntc|PAFAH1B2,single_targeting,1.474571,1.432591,1.727355,1.263767,1123.30,1253.50,1258.50,5096.60,8451.50,5602.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31828,nt591_nt267,0,FAKE_GENE_25,double_nontargeting,-0.025413,-0.600200,0.054393,0.469568,364.51,406.77,408.40,358.82,750.63,1125.70
31829,nt91_nt963,0,FAKE_GENE_14,double_nontargeting,-1.004934,-0.574149,-1.615088,-0.825566,1119.00,1248.80,1253.80,1127.20,463.07,971.92
31830,nt45_nt899,0,FAKE_GENE_25,double_nontargeting,-0.114991,-0.576438,0.239666,-0.008203,814.83,909.30,912.96,819.14,2003.40,1575.30
31831,nt912_nt563,0,FAKE_GENE_25,double_nontargeting,-0.127443,-0.579808,0.163396,0.034084,1275.40,1423.20,1428.90,1278.30,2913.70,2569.30


In [212]:
all_counts = curr_counts[['RepA_plasmid_count', 'RepB_plasmid_count', 'RepC_plasmid_count', 'RepA_LTP_count', 'RepB_LTP_count', 'RepC_LTP_count']].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)

In [213]:
type_list = ["Single"] * curr_counts.shape[0]
for i in range(len(type_list)):
    if "single" not in curr_counts.loc[i, 'broad_target_type']:
        type_list[i] = "Dual"


In [214]:
targeted_genes = curr_counts['pgRNA_target'].values

In [215]:
curr_counts = curr_counts.drop(['pgRNA_target', 'paralog_pair', 'mean_CRISPR_score', 
                                'RepA_CRISPR_score', 'RepB_CRISPR_score', 'RepC_CRISPR_score',
                               'RepA_plasmid_count', 'RepB_plasmid_count', 'RepC_plasmid_count', 
                                'RepA_LTP_count', 'RepB_LTP_count', 'RepC_LTP_count', 'broad_target_type'], axis = 1)

In [216]:
curr_counts.columns = ["Guide 1"]

In [217]:
curr_counts['Guide 1']

0           TNPO2+1_nt928
1          nt713_SEC23A+2
2        nt453_IL1RAPL1+4
3           nt756_TGFB2+3
4        nt713_PAFAH1B2+2
               ...       
31828         nt591_nt267
31829          nt91_nt963
31830          nt45_nt899
31831         nt912_nt563
31832          nt601_nt38
Name: Guide 1, Length: 31833, dtype: object

In [218]:
gene_1_list = np.array([i.split('_')[0].split('+')[0] for i in curr_counts['Guide 1']], dtype = object)
gene_2_list = np.array([i.split('_')[1].split('+')[0] for i in curr_counts['Guide 1']], dtype = object)
guide_2_list = [i.split('_')[1] for i in curr_counts['Guide 1'].tolist()]
curr_counts['Guide 1'] = [i.split('_')[0] for i in curr_counts['Guide 1'].tolist()]

gene_1_list = gene_1_list.tolist()
gene_2_list = gene_2_list.tolist()

In [219]:
curr_counts['Gene 1'] = gene_1_list
curr_counts['Guide 2'] = guide_2_list
curr_counts['Gene 2'] = gene_2_list
curr_counts['Count Replicates'] = all_counts
curr_counts['Type'] = type_list
curr_counts['Cell Line'] = ["HELA"] * curr_counts.shape[0]
curr_counts['Condition'] = ["plasmid;plasmid;plasmid;LTP;LTP;LTP"] * curr_counts.shape[0]

In [220]:
curr_counts

Unnamed: 0,Guide 1,Gene 1,Guide 2,Gene 2,Count Replicates,Type,Cell Line,Condition
0,TNPO2+1,TNPO2,nt928,nt928,238.76;266.44;267.51;1555.7;1106.9;1069.8,Single,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
1,nt713,nt713,SEC23A+2,SEC23A,448.63;500.64;502.65;2625.0;2544.7;1691.1,Single,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
2,nt453,nt453,IL1RAPL1+4,IL1RAPL1,238.76;266.44;267.51;1455.3;263.25;261.14,Single,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
3,nt756,nt756,TGFB2+3,TGFB2,2651.8;2959.3;2971.2;10675.0;2726.6;4892.2,Single,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
4,nt713,nt713,PAFAH1B2+2,PAFAH1B2,1123.3;1253.5;1258.5;5096.6;8451.5;5602.0,Single,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
...,...,...,...,...,...,...,...,...
31828,nt591,nt591,nt267,nt267,364.51;406.77;408.4;358.82;750.63;1125.7,Dual,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
31829,nt91,nt91,nt963,nt963,1119.0;1248.8;1253.8;1127.2;463.07;971.92,Dual,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
31830,nt45,nt45,nt899,nt899,814.83;909.3;912.96;819.14;2003.4;1575.3,Dual,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
31831,nt912,nt912,nt563,nt563,1275.4;1423.2;1428.9;1278.3;2913.7;2569.3,Dual,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP


In [221]:
counts_ref = pd.concat([counts_ref, curr_counts])

In [222]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition
0,A2M+1,nt382,A2M,nt382,2435.9;2367.7;2182.4;3364.1;2893.9;3027.6,Single,,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
1,A2M+2,nt70,A2M,nt70,868.75;844.39;778.32;1102.4;1224.9;1207.4,Single,,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
2,A2M+3,nt740,A2M,nt740,911.1;885.56;816.27;826.22;1426.7;1421.2,Single,,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
3,A2M+4,nt249,A2M,nt249,416.65;404.97;373.28;263.42;450.09;883.55,Single,,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
4,A2M+1,nt574,A2M,nt574,1852.5;1800.5;1659.6;1906.6;1544.9;1913.5,Single,,PC9,plasmid;plasmid;plasmid;LTP;LTP;LTP
...,...,...,...,...,...,...,...,...,...
31828,nt591,nt267,nt591,nt267,364.51;406.77;408.4;358.82;750.63;1125.7,Dual,,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
31829,nt91,nt963,nt91,nt963,1119.0;1248.8;1253.8;1127.2;463.07;971.92,Dual,,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
31830,nt45,nt899,nt45,nt899,814.83;909.3;912.96;819.14;2003.4;1575.3,Dual,,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP
31831,nt912,nt563,nt912,nt563,1275.4;1423.2;1428.9;1278.3;2913.7;2569.3,Dual,,HELA,plasmid;plasmid;plasmid;LTP;LTP;LTP


In [223]:
counts_ref['Condition'] = 'plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3'
counts_ref["Study"] = [study_name_to_pubmed_id['parrish_data']] * counts_ref.shape[0]

In [224]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,A2M+1,nt382,A2M,nt382,2435.9;2367.7;2182.4;3364.1;2893.9;3027.6,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
1,A2M+2,nt70,A2M,nt70,868.75;844.39;778.32;1102.4;1224.9;1207.4,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
2,A2M+3,nt740,A2M,nt740,911.1;885.56;816.27;826.22;1426.7;1421.2,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
3,A2M+4,nt249,A2M,nt249,416.65;404.97;373.28;263.42;450.09;883.55,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
4,A2M+1,nt574,A2M,nt574,1852.5;1800.5;1659.6;1906.6;1544.9;1913.5,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
...,...,...,...,...,...,...,...,...,...,...
31828,nt591,nt267,nt591,nt267,364.51;406.77;408.4;358.82;750.63;1125.7,Dual,,HELA,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
31829,nt91,nt963,nt91,nt963,1119.0;1248.8;1253.8;1127.2;463.07;971.92,Dual,,HELA,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
31830,nt45,nt899,nt45,nt899,814.83;909.3;912.96;819.14;2003.4;1575.3,Dual,,HELA,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
31831,nt912,nt563,nt912,nt563,1275.4;1423.2;1428.9;1278.3;2913.7;2569.3,Dual,,HELA,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736


## SL Scores

In [225]:
parrish_loc = os.path.join(learning_goals_loc_general, "Parrish")

PC9_HeLa_GIs = pd.read_excel(os.path.join(parrish_loc, "4 - PC9 and HeLa screen paralog genetic interaction (GI) scores.xlsx"), skiprows = 1)

In [226]:
PC9_HeLa_GIs

Unnamed: 0,paralog_pair,target1,target2,GI_flag,PC9_GI_flag,PC9_GI_score_rank,PC9_GI_score,PC9_GI_fdr,HeLa_GI_flag,HeLa_GI_score_rank,HeLa_GI_score,HeLa_GI_fdr,PC9_DKO_expected_CS,PC9_DKO_observed_CS,HeLa_DKO_expected_CS,HeLa_DKO_observed_CS,same_chr,same_chr_dist,proximity
0,CCNL2|CCNL1,CCNL2,CCNL1,synthetic_lethal,SL_in_PC9,1,-2.313788,0.000165,SL_in_HeLa,10,-1.261084,0.026168,0.375079,-1.987383,-0.099315,-1.332104,False,,diff_chr
1,CDK6|CDK4,CDK6,CDK4,synthetic_lethal,SL_in_PC9,2,-1.537568,0.000340,neither_in_HeLa,210,-0.251834,0.307389,-2.309117,-3.408344,-0.451694,-0.632285,False,,diff_chr
2,GSK3B|GSK3A,GSK3B,GSK3A,synthetic_lethal,SL_in_PC9,3,-1.429767,0.001618,SL_in_HeLa,4,-1.659580,0.012451,-0.343343,-1.691436,0.649198,-1.073316,False,,diff_chr
3,G3BP2|G3BP1,G3BP2,G3BP1,synthetic_lethal,SL_in_PC9,4,-1.429397,0.004693,neither_in_HeLa,663,0.021784,0.951159,-0.260061,-1.622894,-0.858532,-0.715920,False,,diff_chr
4,CNOT8|CNOT7,CNOT8,CNOT7,synthetic_lethal,SL_in_PC9,5,-1.208182,0.004693,SL_in_HeLa,1,-2.063079,0.000022,0.775066,-0.554362,0.166142,-1.900996,False,,diff_chr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025,ACTL6A|ACTL6B,ACTL6A,ACTL6B,buffering,buffering_in_PC9,1026,0.728780,0.029700,neither_in_HeLa,121,-0.388877,0.033472,-1.909384,-0.814789,-0.574970,-0.877579,False,,diff_chr
1026,CLEC4C|CLEC6A,CLEC4C,CLEC6A,buffering,buffering_in_PC9,1027,0.748590,0.000489,neither_in_HeLa,751,0.069986,0.789673,-0.330330,0.497574,-0.308274,-0.184525,True,726547.0,less_than_1Mb
1027,ZFAND6|ZFAND5,ZFAND6,ZFAND5,buffering,buffering_in_PC9,1028,0.763221,0.039909,neither_in_HeLa,951,0.249774,0.107288,-1.434036,-0.391247,-0.486414,-0.161166,False,,diff_chr
1028,METTL7B|METTL7A,METTL7B,METTL7A,buffering,buffering_in_PC9,1029,0.789800,0.026260,neither_in_HeLa,338,-0.142798,0.424612,-1.460019,-0.385937,-1.060845,-1.058157,True,4758206.0,greater_than_1Mb


In [227]:
PC9_HeLa_GIs.columns

Index(['paralog_pair', 'target1', 'target2', 'GI_flag', 'PC9_GI_flag',
       'PC9_GI_score_rank', 'PC9_GI_score', 'PC9_GI_fdr', 'HeLa_GI_flag',
       'HeLa_GI_score_rank', 'HeLa_GI_score', 'HeLa_GI_fdr',
       'PC9_DKO_expected_CS', 'PC9_DKO_observed_CS', 'HeLa_DKO_expected_CS',
       'HeLa_DKO_observed_CS', 'same_chr', 'same_chr_dist', 'proximity'],
      dtype='object')

In [228]:
all_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "Phenotype", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])

In [229]:
for cell_line in ['PC9', 'HeLa']:
    curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "Phenotype", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
    curr_GI["Gene_A"] = PC9_HeLa_GIs["target1"]
    curr_GI["Gene_B"] = PC9_HeLa_GIs["target2"]
    curr_GI["GI_Score"] = PC9_HeLa_GIs.loc[:, cell_line + "_GI_score"]
    
    curr_GI["Study_Source"] = [study_name_to_pubmed_id['parrish_data']] * len(curr_GI["Gene_A"])
    curr_GI["Cell_Line"] = [cell_line] * len(curr_GI["Gene_A"])
    curr_GI = curr_GI.fillna(0)
    curr_GI["GI_Cutoff"] = [-0.5] * len(curr_GI["Gene_A"])
    curr_GI["Stat_Score"] = PC9_HeLa_GIs.loc[:, cell_line + "_GI_fdr"]
    curr_GI["Stat_Cutoff"] = [0.1] * len(curr_GI["Gene_A"])
    curr_GI["Phenotype"] = [float("nan")] * len(curr_GI["Gene_A"])
    
    all_GI = pd.concat([all_GI, curr_GI])
    print(cell_line)
    print("###############################")

PC9
###############################
HeLa
###############################


In [230]:
all_GI

Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,Phenotype,GI_Score,GI_Cutoff,Stat_Score,Stat_Cutoff
0,CCNL2,CCNL1,34469736,PC9,,-2.313788,-0.5,0.000165,0.1
1,CDK6,CDK4,34469736,PC9,,-1.537568,-0.5,0.00034,0.1
2,GSK3B,GSK3A,34469736,PC9,,-1.429767,-0.5,0.001618,0.1
3,G3BP2,G3BP1,34469736,PC9,,-1.429397,-0.5,0.004693,0.1
4,CNOT8,CNOT7,34469736,PC9,,-1.208182,-0.5,0.004693,0.1
...,...,...,...,...,...,...,...,...,...
1025,ACTL6A,ACTL6B,34469736,HeLa,,-0.388877,-0.5,0.033472,0.1
1026,CLEC4C,CLEC6A,34469736,HeLa,,0.069986,-0.5,0.789673,0.1
1027,ZFAND6,ZFAND5,34469736,HeLa,,0.249774,-0.5,0.107288,0.1
1028,METTL7B,METTL7A,34469736,HeLa,,-0.142798,-0.5,0.424612,0.1


In [231]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,A2M+1,ACTGCATCTGTGCAAACGGG,A2M
1,A2M+2,ATGTCTCATGAACTACCCTG,A2M
2,A2M+3,TGAAATGAAACTTCACACTG,A2M
3,A2M+4,TTACTCATATAGGATCCCAA,A2M
4,AADAC+1,AAGTCTGAAGCACTAAGAAG,AADAC
...,...,...,...
9189,PLAG1+4,TCGCCGGTTCTACACCCGAA,PLAG1
9190,PRPF40A+4,TTGAAGTACTACCTGTATCG,PRPF40A
9191,PXDN+4,TGAAAACCTACGCGGAGTCG,PXDN
9192,RHEB+4,GCAAATTGTTGGATATGGTG,RHEB


In [232]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,A2M+1,nt382,A2M,nt382,2435.9;2367.7;2182.4;3364.1;2893.9;3027.6,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
1,A2M+2,nt70,A2M,nt70,868.75;844.39;778.32;1102.4;1224.9;1207.4,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
2,A2M+3,nt740,A2M,nt740,911.1;885.56;816.27;826.22;1426.7;1421.2,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
3,A2M+4,nt249,A2M,nt249,416.65;404.97;373.28;263.42;450.09;883.55,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
4,A2M+1,nt574,A2M,nt574,1852.5;1800.5;1659.6;1906.6;1544.9;1913.5,Single,,PC9,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
...,...,...,...,...,...,...,...,...,...,...
31828,nt591,nt267,nt591,nt267,364.51;406.77;408.4;358.82;750.63;1125.7,Dual,,HELA,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
31829,nt91,nt963,nt91,nt963,1119.0;1248.8;1253.8;1127.2;463.07;971.92,Dual,,HELA,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
31830,nt45,nt899,nt45,nt899,814.83;909.3;912.96;819.14;2003.4;1575.3,Dual,,HELA,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736
31831,nt912,nt563,nt912,nt563,1275.4;1423.2;1428.9;1278.3;2913.7;2569.3,Dual,,HELA,plasmid_1;plasmid_2;plasmid_3;LTP_1;LTP_2;LTP_3,34469736


In [233]:
study_conditions['parrish_data']

[['plasmid_1', 'plasmid_2', 'plasmid_3'], ['LTP_1', 'LTP_2', 'LTP_3']]

In [234]:
all_GI

Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,Phenotype,GI_Score,GI_Cutoff,Stat_Score,Stat_Cutoff
0,CCNL2,CCNL1,34469736,PC9,,-2.313788,-0.5,0.000165,0.1
1,CDK6,CDK4,34469736,PC9,,-1.537568,-0.5,0.00034,0.1
2,GSK3B,GSK3A,34469736,PC9,,-1.429767,-0.5,0.001618,0.1
3,G3BP2,G3BP1,34469736,PC9,,-1.429397,-0.5,0.004693,0.1
4,CNOT8,CNOT7,34469736,PC9,,-1.208182,-0.5,0.004693,0.1
...,...,...,...,...,...,...,...,...,...
1025,ACTL6A,ACTL6B,34469736,HeLa,,-0.388877,-0.5,0.033472,0.1
1026,CLEC4C,CLEC6A,34469736,HeLa,,0.069986,-0.5,0.789673,0.1
1027,ZFAND6,ZFAND5,34469736,HeLa,,0.249774,-0.5,0.107288,0.1
1028,METTL7B,METTL7A,34469736,HeLa,,-0.142798,-0.5,0.424612,0.1


In [235]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = all_GI.copy(), study_controls = controls['parrish_data'], study_conditions = study_conditions['parrish_data'], can_control_be_substring = False)

Starting processing...
Score reference...
Controls within SL score that are removed: 
0
---
Both GI and Stat cutoffs are present...
Counts reference...
Number of double pairs: 31630
Number of controls: 983
Number of singles: 31960
Sequence reference...
Done! Returning...


In [236]:
#PREV_REF['Study_Source'].value_counts()

In [237]:
PREV_REF.loc[PREV_REF['Study_Source'] == 'Parrish', :]

Unnamed: 0.1,Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,Phenotype,GI_Score,GI_Cutoff,Stat_Score,Stat_Cutoff,Gene Pair
287675,0,CCNL2,CCNL1,Parrish,PC9,0,-2.313788,-0.5,0.000165,0.1,CCNL1_CCNL2
287676,1,CDK6,CDK4,Parrish,PC9,0,-1.537568,-0.5,0.000340,0.1,CDK4_CDK6
287677,2,GSK3B,GSK3A,Parrish,PC9,0,-1.429767,-0.5,0.001618,0.1,GSK3A_GSK3B
287678,3,G3BP2,G3BP1,Parrish,PC9,0,-1.429397,-0.5,0.004693,0.1,G3BP1_G3BP2
287679,4,CNOT8,CNOT7,Parrish,PC9,0,-1.208182,-0.5,0.004693,0.1,CNOT7_CNOT8
...,...,...,...,...,...,...,...,...,...,...,...
289730,1025,ACTL6A,ACTL6B,Parrish,HELA,0,-0.388877,-0.5,0.033472,0.1,ACTL6A_ACTL6B
289731,1026,CLEC4C,CLEC6A,Parrish,HELA,0,0.069986,-0.5,0.789673,0.1,CLEC4C_CLEC6A
289732,1027,ZFAND6,ZFAND5,Parrish,HELA,0,0.249774,-0.5,0.107288,0.1,ZFAND5_ZFAND6
289733,1028,METTL7B,METTL7A,Parrish,HELA,0,-0.142798,-0.5,0.424612,0.1,METTL7A_METTL7B


In [238]:
db_inserts['score_ref']['SL_or_not'].value_counts()

False    1903
True      157
Name: SL_or_not, dtype: int64

In [239]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 9194
Counts insert: 64573
Score insert: 2060
Done!


## Shen (HeLa, 293T, A549) No Counts

In [240]:
shen_loc = os.path.join(learning_goals_loc_general, "Shen")

HELA_A549_293T = pd.read_excel(os.path.join(shen_loc, "2 - Raw and processed data from CRISPR competitive growth assay.xlsx"), sheet_name = "Table_S2B_CRISPR-KO-assay")
SL_labels = pd.read_excel(os.path.join(shen_loc, "3 - List of hit genetic interactions from CRISPR assay.xlsx"), skiprows = 1, sheet_name = "TableS3B_CRISPR-KO-hits")

In [241]:
HELA_A549_293T

Unnamed: 0,gene_gene,geneA,geneB,fA_HeLa,fB_HeLa,fA+fB_HeLa,pi_HeLa,sd_HeLa,PP_HeLa,abs_pi_HeLa,...,fA_293T,fB_293T,fA+fB_293T,pi_293T,sd_293T,PP_293T,abs_pi_293T,FDR_left_293T,FDR_right__293T,293T_Z
0,ABL1_ADA,ABL1,ADA,-0.010891,-0.023380,-0.034270,-0.038661,0.036023,0.704,0.038661,...,-0.039056,-0.026479,-0.065535,0.005756,0.031896,0.207,0.005756,0.800666,1.0,0.198972
1,ABL1_AKT1,ABL1,AKT1,-0.010891,-0.014149,-0.025039,-0.009330,0.037599,0.182,0.009330,...,-0.039056,-0.024441,-0.063497,-0.015256,0.027199,0.423,0.015256,0.557710,1.0,-0.527369
2,ABL1_ALK,ABL1,ALK,-0.010891,-0.013610,-0.024501,-0.015444,0.020846,0.793,0.015444,...,-0.039056,-0.024261,-0.063316,-0.034626,0.027275,0.783,0.034626,0.407198,1.0,-1.196939
3,ABL1_APC,ABL1,APC,-0.010891,-0.010225,-0.021116,-0.011350,0.019483,0.512,0.011350,...,-0.039056,-0.022621,-0.061677,-0.007403,0.008770,0.669,0.007403,0.639785,1.0,-0.255905
4,ABL1_ARID1A,ABL1,ARID1A,-0.010891,-0.016038,-0.026929,-0.029612,0.032232,0.600,0.029612,...,-0.039056,-0.018836,-0.057892,-0.039848,0.033482,0.755,0.039848,0.392145,1.0,-1.377448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625,VEGFA_VHL,VEGFA,VHL,-0.019953,-0.081441,-0.101394,-0.027529,0.019448,0.850,0.027529,...,-0.042977,-0.035299,-0.078276,0.017510,0.018634,0.648,0.017510,0.865491,1.0,0.605266
2626,VEGFA_WEE1,VEGFA,WEE1,-0.019953,-0.024776,-0.044729,-0.045977,0.025619,0.931,0.045977,...,-0.042977,-0.047175,-0.090152,-0.020847,0.028925,0.526,0.020847,0.499459,1.0,-0.720636
2627,VHL_WEE1,VHL,WEE1,-0.081441,-0.024776,-0.106217,-0.045337,0.038023,0.783,0.045337,...,-0.035299,-0.047175,-0.082474,0.001916,0.021769,0.106,0.001916,0.769568,1.0,0.066241
2628,,,,,,,,,,,...,,,,,,,,,,


In [242]:
SL_labels

Unnamed: 0,Interaction,geneA,geneB,HeLa_Z,A549_Z,293T_Z,Hit_Cell_Line,Interaction_type,Conserved?,Validation?,PriorSL_report?,PMID / ref,geneA.1,geneB.1,geneA.2,geneB.2,geneA.3,geneB.3
0,BRCA1_WEE1,BRCA1,WEE1,-0.420565,-1.101509,-5.859982,293T,Synthetic Lethal,private,,yes,25964244,16.81875,19.7427,9.098395,5.24626,14.7990,23.8672
1,BRCA2_CDK9,BRCA2,CDK9,-0.581225,1.231318,-5.130425,293T,Synthetic Lethal,private,,,,0.670472,11.60678,0.469309,6.88599,1.6771,21.0598
2,FNTA_TOP1,FNTA,TOP1,-1.700798,-2.046243,-4.888855,293T,Synthetic Lethal,private,,,,12.97785,3.776725,10.53671,4.040535,35.2184,89.1235
3,CHEK1_IGF1R,CHEK1,IGF1R,-1.694748,-1.732183,-4.781605,293T,Synthetic Lethal,private,,,,9.18234,3.231415,5.582225,11.213015,22.6362,7.9260
4,CHEK1_VEGFA,CHEK1,VEGFA,-2.323439,-2.475270,-4.599751,293T,Synthetic Lethal,private,,,,9.18234,21.09175,5.582225,9.042085,22.6362,15.8428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,CHEK2_KMT2D,CHEK2,KMT2D,-3.060446,0.617330,1.211524,HeLa,Synthetic Lethal,private,,,,10.545135,0.635619,5.51226,1.142524,40.8714,3.9515
158,MAP2K1_VHL,MAP2K1,VHL,-3.586458,-3.138479,1.971959,"Hela, A549",Synthetic Lethal,conserved,,yes,18948595,11.77885,8.94539,5.56903,4.59031,51.4010,27.9101
159,CHEK1_SRC,CHEK1,SRC,-3.586796,0.240960,2.935314,HeLa,Synthetic Lethal,private,,yes,21148814,9.18234,0.004156,5.582225,32.8265,22.6362,11.3629
160,KIT_TP53,KIT,TP53,-0.705059,-0.652426,3.102322,293T,Epistasis,private,,,,0,1.605015,0.000841,12.2452,3.1454,42.3220


In [243]:
all_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])

In [244]:
for cell_line in ['293T', 'A549', 'HeLa']:
    curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
    curr_GI["Gene_A"] = HELA_A549_293T["geneA"]
    curr_GI["Gene_B"] = HELA_A549_293T["geneB"]
    curr_GI["GI_Score"] = HELA_A549_293T.loc[:, cell_line + "_Z"]

    curr_GI["Study_Source"] = [study_name_to_pubmed_id['shen_data']] * len(curr_GI["Gene_A"])
    curr_GI["GI_Cutoff"] = [-3] * len(curr_GI["Gene_A"])
    curr_GI["Stat_Score"] = HELA_A549_293T.loc[:, "FDR_left_" + cell_line]
    cell_line = cell_line.upper()
    curr_GI["Cell_Line"] = [cell_line] * len(curr_GI["Gene_A"])
    
    curr_GI = curr_GI.fillna(0)
    curr_GI = curr_GI[:-2]
    
    all_GI = pd.concat([all_GI, curr_GI])
    
    
    print(cell_line)
    print("###############################")

293T
###############################
A549
###############################
HELA
###############################


In [245]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = None, counts_ref = None, score_ref = all_GI.copy(), study_controls = None, study_conditions = None)

Starting processing...
Score reference...
Only GI cutoff is present...
Counts reference...
Sequence reference...
Done! Returning...


In [246]:
db_inserts['score_ref']['SL_or_not'].value_counts()

False    7716
True      168
Name: SL_or_not, dtype: int64

In [247]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

No counts and sequences together
Final QC...
Beginning transaction...
Done score
Successfully inserted!
Added Record stats...
Score insert: 7884
Done!


## Thompson (A375, MeWo, RPE1) No Counts

In [248]:
thompson_loc = os.path.join(learning_goals_loc_general, "Thompson")

all_pairs = pd.read_excel(os.path.join(thompson_loc, "1 - Gene pairs used in the screen and analysis results.xlsx"), skiprows = 2)
statistically_sig = pd.read_excel(os.path.join(thompson_loc, "5 - Statistically significant gene pairs after filtering.xlsx"), skiprows = 3)

#sgRNAs = pd.read_excel(os.path.join(thompson_loc, "4 - gRNAs used for library construction.xlsx"), skiprows = 3)

In [249]:
all_pairs

Unnamed: 0,Pair,GENE PAIR,Gene pair class,A375_D28_rra_fdr_low,A375_D28_t_fdr_low,A375_D14_rra_fdr_low,A375_D14_t_fdr_low,Passes A375 filter?,Mewo_D28_rra_fdr_low,Mewo_D28_t_fdr_low,...,MEWO_D14_Bagel_Gene1,MEWO_D14_Bagel_Gene2,RPE_D14_Bagel_Gene1,RPE_D14_Bagel_Gene2,A375_D28_Bagel_Gene1,A375_D28_Bagel_Gene2,MEWO_D28_Bagel_Gene1,MEWO_D28_Bagel_Gene2,RPE_D28_Bagel_Gene1,RPE_D28_Bagel_Gene2
0,1,AARS2_AARS,Paralogous_gene_pair,0.228786,1.000000e+00,0.999996,1.000000,No,0.999996,1.000000e+00,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,2,ABCB7_ABCB6,Paralogous_gene_pair,0.999996,1.000000e+00,0.999996,1.000000,No,0.999996,1.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,ABCF1_ABCF3,Paralogous_gene_pair,0.999996,1.000000e+00,0.999996,1.000000,No,0.999996,1.000000e+00,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,4,ABHD12B_ABHD12,Paralogous_gene_pair,0.999996,1.000000e+00,0.999996,1.000000,Yes,0.999996,1.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,ABL1_EGFR,SynLethDB_gene_pair,0.999996,1.000000e+00,0.999996,1.000000,Yes,0.999996,1.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187,1188,ZSWIM3_CENPF,Bioinformatically_derived_pair,0.999996,1.000000e+00,0.999996,1.000000,Yes,0.999996,1.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1188,1189,ZSWIM3_WDHD1,Bioinformatically_derived_pair,0.000028,8.774696e-03,0.000030,0.019452,No,0.000030,1.180000e-07,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1189,1190,ZYX_EXO1,Bioinformatically_derived_pair,0.000028,5.970000e-09,0.999996,0.041627,No,0.999996,1.000000e+00,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1190,1191,ZYX_MKI67,Bioinformatically_derived_pair,0.999996,1.000000e+00,0.999996,1.000000,Yes,0.999996,1.000000e+00,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [250]:
statistically_sig

Unnamed: 0,A375,Mewo,RPE
0,AP2A2_AP2A1,ARID1A_ARID1B,ALAS1_ALAS2
1,ARID4B_ARID4A,EAF1_EAF2,ALKBH4_ATR
2,ASF1A_ASF1B,ARPC5_ARPC5L,ARFGEF1_ARFGEF2
3,CCNL2_CCNL1,SRRM3_SRRM2,ARPC1A_ARPC1B
4,CDS1_CDS2,CCNT2_CCNT1,ASF1A_ASF1B
5,CHMP1B_CHMP1A,CHML_CHM,ATP6V1C2_ATP6V1C1
6,CLASP1_CLASP2,UAP1_UAP1L1,CCNI2_CCNI
7,CNOT8_CNOT7,CPPED1_ASPM,CCNT2_CCNT1
8,CSTF2T_CSTF2,CRK_CRKL,CDK13_CDK12
9,DNAJC6_GAK,FAM50B_FAM50A,CDS1_CDS2


In [251]:
gene_A_list, gene_B_list = [], []
for i in all_pairs['GENE PAIR']:
    split_ab = i.split('_')
    gene_A_list.append(split_ab[0])
    gene_B_list.append(split_ab[1])

In [252]:
all_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
for cell_line in ["A375", "Mewo"]:
    curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "GI_Score"])
    curr_GI["Gene_A"] = gene_A_list
    curr_GI["Gene_B"] = gene_B_list
    #curr_GI["GI_Score"] = None#all_pairs[cell_line + "_D14_t_fdr_low"]
    
    curr_GI["Study_Source"] = [study_name_to_pubmed_id['thompson_data']] * len(curr_GI["Gene_A"])
    curr_GI["Cell_Line"] = [cell_line] * len(curr_GI["Gene_A"])
    
    all_GI = pd.concat([all_GI, curr_GI])

In [253]:
all_GI

Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,GI_Score,GI_Cutoff,Stat_Score,Stat_Cutoff
0,AARS2,AARS,33637726,A375,,,,
1,ABCB7,ABCB6,33637726,A375,,,,
2,ABCF1,ABCF3,33637726,A375,,,,
3,ABHD12B,ABHD12,33637726,A375,,,,
4,ABL1,EGFR,33637726,A375,,,,
...,...,...,...,...,...,...,...,...
1187,ZSWIM3,CENPF,33637726,Mewo,,,,
1188,ZSWIM3,WDHD1,33637726,Mewo,,,,
1189,ZYX,EXO1,33637726,Mewo,,,,
1190,ZYX,MKI67,33637726,Mewo,,,,


In [254]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = None, counts_ref = None, score_ref = all_GI.copy(), study_controls = None, study_conditions = None)

Starting processing...
Score reference...
Only Stat cutoff is present...
Counts reference...
Sequence reference...
Done! Returning...


In [255]:
all_GI = db_inserts['score_ref'].copy()
all_GI['SL_or_not'] = False

In [256]:
temp = statistically_sig['A375'].dropna()
temp = ['_'.join(sorted(i.split('_'))) for i in temp]
all_GI.loc[all_GI['gene_pair'].isin(temp) & (all_GI['cell_line_origin'] == 'A375'), 'SL_or_not'] = True

In [257]:
temp = statistically_sig['Mewo'].dropna()
temp = ['_'.join(sorted(i.split('_'))) for i in temp]
all_GI.loc[all_GI['gene_pair'].isin(temp) & (all_GI['cell_line_origin'] == 'A375'), 'SL_or_not'] = True

In [258]:
temp = statistically_sig['RPE'].dropna()
temp = ['_'.join(sorted(i.split('_'))) for i in temp]
all_GI.loc[all_GI['gene_pair'].isin(temp) & (all_GI['cell_line_origin'] == 'RPE'), 'SL_or_not'] = True

In [259]:
db_inserts['score_ref'] = all_GI

In [260]:
db_inserts['score_ref']['SL_or_not'].value_counts()

False    2315
True       69
Name: SL_or_not, dtype: int64

In [261]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

No counts and sequences together
Final QC...
Beginning transaction...
Done score
Successfully inserted!
Added Record stats...
Score insert: 2384
Done!


## Wong (OVCAR8)

In [262]:
wong_loc = os.path.join(learning_goals_loc_general, "Wong")

all_sgRNA = pd.read_excel(os.path.join(wong_loc, "barcode_counts.xlsx"), skiprows = 2)
sgRNA_ref = pd.read_excel(os.path.join(wong_loc, "sequences_ref.xlsx"))
#sgRNA_ref = pd.read_excel(os.path.join(wong_loc, "pnas.1517883113.sd03.xlsx"), skiprows = 2)

  warn(msg)


### Seq Ref

In [263]:
sgRNA_ref

Unnamed: 0,Guide_ID,Sequence
0,GFP_1,GGGCGAGGAGCTGTTCACCG
1,RFP_1,CACCCAGACCATGAAGATCA
2,RFP_2,CCACTTCAAGTGCACATCCG
3,NF1_1,GTTGTGCTCAGTACTGACTT
4,NF1_4,TTTCAGCTTCCAATAAAAAC
...,...,...
155,ING4_2,CCTAGAAGGCCGGACTCAAA
156,ING4_3,GGCACTACTCATATACTCAG
157,ING5_1,GATCTGCTTCAAAGCGCGCC
158,ING5_2,CTTCCAGCTGATGCGAGAGC


In [264]:
sequence_ref = sgRNA_ref.copy()

In [265]:
sequence_ref['Target'] = [i.split('_')[0] for i in sequence_ref['Guide_ID']]
sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']

In [266]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,GFP_1,GGGCGAGGAGCTGTTCACCG,GFP
1,RFP_1,CACCCAGACCATGAAGATCA,RFP
2,RFP_2,CCACTTCAAGTGCACATCCG,RFP
3,NF1_1,GTTGTGCTCAGTACTGACTT,NF1
4,NF1_4,TTTCAGCTTCCAATAAAAAC,NF1
...,...,...,...
155,ING4_2,CCTAGAAGGCCGGACTCAAA,ING4
156,ING4_3,GGCACTACTCATATACTCAG,ING4
157,ING5_1,GATCTGCTTCAAAGCGCGCC,ING5
158,ING5_2,CTTCCAGCTGATGCGAGAGC,ING5


### Counts

In [267]:
all_sgRNA

Unnamed: 0,Key,2-wise gRNA combination,sgRNA-A,sgRNA-B,day5 (Replicate 1),day15 (Replicate 1),day20 (Replicate 1),day5 (Replicate 2),day15 (Replicate 2),day20 (Replicate 2)
0,11,dummyguide_1 + dummyguide_1,dummyguide_1,dummyguide_1,1987.0,1618.0,1919.0,1299.0,987.0,1485.0
1,12,dummyguide_1 + dummyguide_2,dummyguide_1,dummyguide_2,757.0,868.0,679.0,511.0,465.0,531.0
2,13,dummyguide_1 + dummyguide_3,dummyguide_1,dummyguide_3,1657.0,1901.0,1558.0,1166.0,1091.0,1243.0
3,14,dummyguide_1 + DNMT1_1,dummyguide_1,DNMT1_1,1178.0,1555.0,900.0,1093.0,972.0,638.0
4,15,dummyguide_1 + DNMT1_2,dummyguide_1,DNMT1_2,2103.0,2150.0,1291.0,1775.0,1387.0,2080.0
...,...,...,...,...,...,...,...,...,...,...
23404,153149,ING5_3 + ING4_2,ING5_3,ING4_2,1056.0,1430.0,1560.0,1264.0,826.0,1195.0
23405,153150,ING5_3 + ING4_3,ING5_3,ING4_3,1344.0,1051.0,2309.0,1552.0,1063.0,987.0
23406,153151,ING5_3 + ING5_1,ING5_3,ING5_1,1120.0,1129.0,1222.0,942.0,745.0,896.0
23407,153152,ING5_3 + ING5_2,ING5_3,ING5_2,922.0,715.0,1047.0,783.0,452.0,886.0


In [268]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [269]:
all_sgRNA.columns[4:]

Index(['day5 (Replicate 1)', 'day15 (Replicate 1)', 'day20 (Replicate 1)',
       'day5 (Replicate 2)', 'day15 (Replicate 2)', 'day20 (Replicate 2)'],
      dtype='object')

In [270]:
';'.join(all_sgRNA.columns[4:])

'day5 (Replicate 1);day15 (Replicate 1);day20 (Replicate 1);day5 (Replicate 2);day15 (Replicate 2);day20 (Replicate 2)'

In [271]:
all_sgRNA['Condition'] = ';'.join(all_sgRNA.columns[4:])

all_sgRNA['Count Replicates'] = all_sgRNA[['day5 (Replicate 1)', 'day15 (Replicate 1)', 'day20 (Replicate 1)',
       'day5 (Replicate 2)', 'day15 (Replicate 2)', 'day20 (Replicate 2)']].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)

In [272]:
type_list = ["Dual"] * all_sgRNA.shape[0]
for i in range(len(all_sgRNA['2-wise gRNA combination'])):
    if "dummyguide" in all_sgRNA.loc[i, '2-wise gRNA combination']:
        type_list[i] = "Single"

In [273]:
all_sgRNA

Unnamed: 0,Key,2-wise gRNA combination,sgRNA-A,sgRNA-B,day5 (Replicate 1),day15 (Replicate 1),day20 (Replicate 1),day5 (Replicate 2),day15 (Replicate 2),day20 (Replicate 2),Condition,Count Replicates
0,11,dummyguide_1 + dummyguide_1,dummyguide_1,dummyguide_1,1987.0,1618.0,1919.0,1299.0,987.0,1485.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,1987.0;1618.0;1919.0;1299.0;987.0;1485.0
1,12,dummyguide_1 + dummyguide_2,dummyguide_1,dummyguide_2,757.0,868.0,679.0,511.0,465.0,531.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,757.0;868.0;679.0;510.99999999999994;465.0;531.0
2,13,dummyguide_1 + dummyguide_3,dummyguide_1,dummyguide_3,1657.0,1901.0,1558.0,1166.0,1091.0,1243.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,1657.0;1901.0;1558.0;1165.9999999999998;1091.0...
3,14,dummyguide_1 + DNMT1_1,dummyguide_1,DNMT1_1,1178.0,1555.0,900.0,1093.0,972.0,638.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,1178.0;1555.0;900.0;1093.0;972.0000000000001;6...
4,15,dummyguide_1 + DNMT1_2,dummyguide_1,DNMT1_2,2103.0,2150.0,1291.0,1775.0,1387.0,2080.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,2103.0;2150.0;1291.0;1775.0000000000002;1387.0...
...,...,...,...,...,...,...,...,...,...,...,...,...
23404,153149,ING5_3 + ING4_2,ING5_3,ING4_2,1056.0,1430.0,1560.0,1264.0,826.0,1195.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,1056.0;1430.0;1560.0;1264.0;826.0;1195.0
23405,153150,ING5_3 + ING4_3,ING5_3,ING4_3,1344.0,1051.0,2309.0,1552.0,1063.0,987.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,1344.0;1051.0;2309.0;1552.0;1063.0;987.0
23406,153151,ING5_3 + ING5_1,ING5_3,ING5_1,1120.0,1129.0,1222.0,942.0,745.0,896.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,1120.0;1129.0;1222.0;942.0;745.0;896.0
23407,153152,ING5_3 + ING5_2,ING5_3,ING5_2,922.0,715.0,1047.0,783.0,452.0,886.0,day5 (Replicate 1);day15 (Replicate 1);day20 (...,921.9999999999999;715.0;1047.0;783.0;452.0;886.0


In [274]:
guide_1_list = []
guide_2_list = []
gene_1_list = []
gene_2_list = []

for i in all_sgRNA['2-wise gRNA combination']:
    guide_1, guide_2 = i.split(' + ')
    
    guide_1_list.append(guide_1)
    guide_2_list.append(guide_2)
    
    gene_1, gene_2 = guide_1.split('_')[0].upper(), guide_2.split('_')[0].upper()
    
    gene_1_list.append(gene_1)
    gene_2_list.append(gene_2)

In [275]:
all_sgRNA = all_sgRNA.drop(['2-wise gRNA combination', 'Key', 'sgRNA-A', 'sgRNA-B', 'day5 (Replicate 1)', 'day15 (Replicate 1)', 'day20 (Replicate 1)',
                            'day5 (Replicate 2)', 'day15 (Replicate 2)', 'day20 (Replicate 2)'], axis = 1)

In [276]:
all_sgRNA['Guide 1'] = guide_1_list
all_sgRNA['Guide 2'] = guide_2_list
all_sgRNA['Gene 1'] = gene_1_list
all_sgRNA['Gene 2'] = gene_2_list
all_sgRNA['Type'] = type_list
all_sgRNA['Sequencing'] = ['CombiGEM-CRISPR'] * all_sgRNA.shape[0]
all_sgRNA['Cell Line'] = ['OVCAR8'] * all_sgRNA.shape[0]
all_sgRNA["Study"] = [study_name_to_pubmed_id['wong_data']] * all_sgRNA.shape[0]

In [277]:
counts_ref = pd.concat([counts_ref, all_sgRNA])

In [278]:
#counts_ref

## SL Scores

In [279]:
wong_loc = os.path.join(learning_goals_loc_general, "Wong")

statistically_sig = pd.read_excel(os.path.join(wong_loc, "OVCAR8-ADR Cell Proliferation.xlsx"))
all_SL = pd.read_excel(os.path.join(wong_loc, "log2_ratios.xlsx"), skiprows = 2)


  warn(msg)


In [280]:
all_SL

Unnamed: 0,2-wise gRNA combination,sgRNA-A,sgRNA-B,Log2 ratio – Day 20/Day15 (Replicate 1),Log2 ratio – Day 20/Day15 (Replicate 2),Log2 ratio – Day 15/Day5 (Replicate 1),Log2 ratio – Day 15/Day5 (Replicate 2)
0,dummyguide_1 + dummyguide_1,dummyguide_1,dummyguide_1,0.205770,0.534492,-0.441889,-0.084731
1,dummyguide_2 + dummyguide_1,dummyguide_2,dummyguide_1,-0.175483,-0.491619,-0.157136,0.320183
2,dummyguide_3 + dummyguide_1,dummyguide_3,dummyguide_1,-0.182784,-0.010254,-0.087128,0.372476
3,DNMT1_1 + dummyguide_1,DNMT1_1,dummyguide_1,0.152098,-0.430941,-0.148725,0.111234
4,DNMT1_2 + dummyguide_1,DNMT1_2,dummyguide_1,-0.005401,0.504039,-0.261538,-0.125159
...,...,...,...,...,...,...,...
11202,ING5_3 + ING5_1,ING5_3,ING5_1,-0.007535,0.324034,-0.043012,-0.441795
11203,ING5_2 + ING5_2,ING5_2,ING5_2,0.091024,1.334409,0.256652,-0.542246
11204,ING5_3 + ING5_2,ING5_3,ING5_2,0.447290,0.598651,-0.425111,-0.362895
11205,BRD4_3 + ING5_3,BRD4_3,ING5_3,0.073002,,-0.020841,


In [281]:
statistically_sig

Unnamed: 0,sgRNA-A,sgRNA-B,Log2 ratio – Day 20/Day15 (Replicate 1),Log2 ratio – Day 20/Day15 (Replicate 2),Q-value
0,BRD4_sg3,MLL_sg3,-2.61,-3.32,7.010000e-33
1,BMI1_sg2,HDAC2_sg3,-3.91,-1.93,3.410000e-32
2,BMI1_sg2,KDM1B_sg3,-1.34,-3.71,1.200000e-23
3,ING3_sg3,BMI1_sg1,-3.68,-1.11,4.570000e-21
4,BRD4_sg3,KDM6A_sg2,-2.30,-2.20,1.520000e-18
...,...,...,...,...,...
56,PRMT5_sg2,MBD1_sg1,-0.94,-1.04,2.470000e-03
57,EP300_sg3,MBD1_sg3,-0.92,-1.03,2.930000e-03
58,ING3_sg1,BRD4_sg3,-0.99,-0.96,2.930000e-03
59,KDM1A_sg1,HDAC2_sg3,-0.94,-0.95,4.710000e-03


In [282]:
all_SL.columns

Index(['2-wise gRNA combination', 'sgRNA-A', 'sgRNA-B',
       'Log2 ratio – Day 20/Day15 (Replicate 1)',
       'Log2 ratio – Day 20/Day15 (Replicate 2)',
       'Log2 ratio – Day 15/Day5 (Replicate 1)',
       'Log2 ratio – Day 15/Day5 (Replicate 2)'],
      dtype='object')

In [283]:
Wong_map_sig = {}

In [284]:
gene_A_list = all_SL['sgRNA-A']
gene_B_list = all_SL['sgRNA-B']

for row in range(statistically_sig.shape[0]):
    gene_A = statistically_sig.loc[row, 'sgRNA-A'].split('_')[0]
    gene_B = statistically_sig.loc[row, 'sgRNA-B'].split('_')[0]
    
    # sorted for uniqueness
    gene_A, gene_B = sorted([gene_A, gene_B])
    
    if gene_A == 'dummyguide' or gene_B == 'dummyguide':
        continue
    
    key = gene_A + "_" + gene_B
    if key not in Wong_map_sig:
        Wong_map_sig[key] = {}
        Wong_map_sig[key]['Log2 ratio – Day 20/Day15'] = statistically_sig.loc[row, 'Log2 ratio – Day 20/Day15 (Replicate 1)'] + statistically_sig.loc[row, 'Log2 ratio – Day 20/Day15 (Replicate 2)']
    else:
        Wong_map_sig[key]['Log2 ratio – Day 20/Day15'] += statistically_sig.loc[row, 'Log2 ratio – Day 20/Day15 (Replicate 1)'] + statistically_sig.loc[row, 'Log2 ratio – Day 20/Day15 (Replicate 2)']
        

In [285]:
all_SL = all_SL.fillna(0)

In [286]:
Wong_map = {}
Wong_map_repeats = {}

In [287]:
sig_combined = statistically_sig[['sgRNA-A', 'sgRNA-B']].agg(' + '.join, axis=1).tolist()

In [288]:
gene_A_list = all_SL['sgRNA-A']
gene_B_list = all_SL['sgRNA-B']

for row in range(all_SL.shape[0]):
    gene_A = all_SL.loc[row, 'sgRNA-A'].split('_')[0]
    gene_B = all_SL.loc[row, 'sgRNA-B'].split('_')[0]

    # sorted for uniqueness
    gene_A, gene_B = sorted([gene_A, gene_B])
    
#     if gene_A == 'dummyguide' or gene_B == 'dummyguide':
#         continue
    
    key = gene_A + "_" + gene_B
        
    if key not in Wong_map:
        Wong_map[key] = {}
        Wong_map[key]['Log2 ratio – Day 20/Day15'] = all_SL.loc[row, 'Log2 ratio – Day 20/Day15 (Replicate 1)'] + all_SL.loc[row, 'Log2 ratio – Day 20/Day15 (Replicate 2)']
        Wong_map[key]['Log2 ratio – Day 15/Day5'] = all_SL.loc[row, 'Log2 ratio – Day 15/Day5 (Replicate 1)'] + all_SL.loc[row, 'Log2 ratio – Day 15/Day5 (Replicate 2)']
        Wong_map[key]["Q_Val"] = 0
        Wong_map_repeats[key] = 1
    else:
        Wong_map[key]['Log2 ratio – Day 20/Day15'] += all_SL.loc[row, 'Log2 ratio – Day 20/Day15 (Replicate 1)'] + all_SL.loc[row, 'Log2 ratio – Day 20/Day15 (Replicate 2)']
        Wong_map[key]['Log2 ratio – Day 15/Day5'] += all_SL.loc[row, 'Log2 ratio – Day 15/Day5 (Replicate 1)'] + all_SL.loc[row, 'Log2 ratio – Day 15/Day5 (Replicate 2)']
        Wong_map_repeats[key] += 1
      
    curr_combined_1 = gene_A + "_sg" + all_SL.loc[row, 'sgRNA-A'].split('_')[1]  + " + " + gene_B + "_sg" + all_SL.loc[row, 'sgRNA-B'].split('_')[1]
    curr_combined_2 = gene_B + "_sg" + all_SL.loc[row, 'sgRNA-B'].split('_')[1]  + " + " + gene_A + "_sg" + all_SL.loc[row, 'sgRNA-A'].split('_')[1]
    
    if curr_combined_1 in sig_combined:
        Wong_map[key]["Q_Val"] += statistically_sig.loc[sig_combined.index(curr_combined_1), "Q-value"]
    elif curr_combined_2 in sig_combined:
        Wong_map[key]["Q_Val"] += statistically_sig.loc[sig_combined.index(curr_combined_2), "Q-value"]

In [289]:
statistically_sig

Unnamed: 0,sgRNA-A,sgRNA-B,Log2 ratio – Day 20/Day15 (Replicate 1),Log2 ratio – Day 20/Day15 (Replicate 2),Q-value
0,BRD4_sg3,MLL_sg3,-2.61,-3.32,7.010000e-33
1,BMI1_sg2,HDAC2_sg3,-3.91,-1.93,3.410000e-32
2,BMI1_sg2,KDM1B_sg3,-1.34,-3.71,1.200000e-23
3,ING3_sg3,BMI1_sg1,-3.68,-1.11,4.570000e-21
4,BRD4_sg3,KDM6A_sg2,-2.30,-2.20,1.520000e-18
...,...,...,...,...,...
56,PRMT5_sg2,MBD1_sg1,-0.94,-1.04,2.470000e-03
57,EP300_sg3,MBD1_sg3,-0.92,-1.03,2.930000e-03
58,ING3_sg1,BRD4_sg3,-0.99,-0.96,2.930000e-03
59,KDM1A_sg1,HDAC2_sg3,-0.94,-0.95,4.710000e-03


In [290]:
for key in Wong_map.keys():
    Wong_map[key]['Log2 ratio – Day 20/Day15'] = Wong_map[key]['Log2 ratio – Day 20/Day15']/Wong_map_repeats[key]
    Wong_map[key]['Log2 ratio – Day 15/Day5'] = Wong_map[key]['Log2 ratio – Day 15/Day5']/Wong_map_repeats[key]

In [291]:
all_SL

Unnamed: 0,2-wise gRNA combination,sgRNA-A,sgRNA-B,Log2 ratio – Day 20/Day15 (Replicate 1),Log2 ratio – Day 20/Day15 (Replicate 2),Log2 ratio – Day 15/Day5 (Replicate 1),Log2 ratio – Day 15/Day5 (Replicate 2)
0,dummyguide_1 + dummyguide_1,dummyguide_1,dummyguide_1,0.205770,0.534492,-0.441889,-0.084731
1,dummyguide_2 + dummyguide_1,dummyguide_2,dummyguide_1,-0.175483,-0.491619,-0.157136,0.320183
2,dummyguide_3 + dummyguide_1,dummyguide_3,dummyguide_1,-0.182784,-0.010254,-0.087128,0.372476
3,DNMT1_1 + dummyguide_1,DNMT1_1,dummyguide_1,0.152098,-0.430941,-0.148725,0.111234
4,DNMT1_2 + dummyguide_1,DNMT1_2,dummyguide_1,-0.005401,0.504039,-0.261538,-0.125159
...,...,...,...,...,...,...,...
11202,ING5_3 + ING5_1,ING5_3,ING5_1,-0.007535,0.324034,-0.043012,-0.441795
11203,ING5_2 + ING5_2,ING5_2,ING5_2,0.091024,1.334409,0.256652,-0.542246
11204,ING5_3 + ING5_2,ING5_3,ING5_2,0.447290,0.598651,-0.425111,-0.362895
11205,BRD4_3 + ING5_3,BRD4_3,ING5_3,0.073002,0.000000,-0.020841,0.000000


In [292]:
gene_A_list = []
gene_B_list = []
SL_scores = []
phenotype_scores = []
stat_scores = []

for row in range(all_SL.shape[0]):
    gene_A = all_SL.loc[row, 'sgRNA-A'].split('_')[0]
    gene_B = all_SL.loc[row, 'sgRNA-B'].split('_')[0]
    
    # sorted for uniqueness
    gene_A, gene_B = sorted([gene_A, gene_B])
    
    key = gene_A + "_" + gene_B
    
    gene_A_list.append(gene_A)
    gene_B_list.append(gene_B)
    SL_scores.append(Wong_map[key]['Log2 ratio – Day 20/Day15'])
    phenotype_scores.append(Wong_map[key]['Log2 ratio – Day 15/Day5'])
    stat_scores.append(Wong_map[key]['Q_Val'])

In [293]:
curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
curr_GI["Gene_A"] = gene_A_list
curr_GI["Gene_B"] = gene_B_list
curr_GI["GI_Score"] = SL_scores
curr_GI["Stat_Score"] = stat_scores
curr_GI["GI_Cutoff"] = [-0.90] * len(gene_A_list)
curr_GI["Stat_Cutoff"] = [0.01] * len(gene_A_list)
curr_GI["Study_Source"] = [study_name_to_pubmed_id['wong_data']] * len(gene_A_list)
curr_GI["Cell_Line"] = ["OVCAR8"] * len(gene_A_list)
curr_GI = curr_GI.fillna(0)
curr_GI['Sorted_Genes'] = ['_'.join(sorted([curr_GI["Gene_A"][i], curr_GI["Gene_B"][i]])) for i in range(curr_GI.shape[0])]

In [294]:
temp = curr_GI.groupby('Sorted_Genes')['GI_Score'].apply(lambda x: np.mean(x))
curr_GI = curr_GI.merge(temp, how = 'left', left_on = 'Sorted_Genes', right_on = temp.index)

In [295]:
curr_GI = curr_GI.drop(columns = 'GI_Score_x')
curr_GI = curr_GI.rename(columns = {'GI_Score_y': 'GI_Score'})

In [296]:
curr_GI = curr_GI.groupby(["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff", "Sorted_Genes"], as_index=False).count()
curr_GI = curr_GI.drop(columns = 'Sorted_Genes')

In [297]:
#curr_GI

In [298]:
#counts_ref

In [299]:
#curr_GI

In [300]:
#controls['wong_data']

In [301]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = curr_GI.copy(), study_controls = controls['wong_data'], study_conditions = study_conditions['wong_data'])

Starting processing...
Score reference...
Controls within SL score that are removed: 
50
---
Both GI and Stat cutoffs are present...
Counts reference...
Number of double pairs: 22050
Number of controls: 9
Number of singles: 1350
Sequence reference...
Done! Returning...


In [302]:
# Wong Specific
sig_gene_pairs = list(set(['_'.join(sorted([statistically_sig['sgRNA-A'].iloc[i].split('_')[0], statistically_sig['sgRNA-B'].iloc[i].split('_')[0]])) for i in range(statistically_sig.shape[0])]))
db_inserts['score_ref']['SL_or_not'] = [False] * db_inserts['score_ref'].shape[0]
db_inserts['score_ref']['SL_or_not'].loc[db_inserts['score_ref']['gene_pair'].isin(sig_gene_pairs)] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db_inserts['score_ref']['SL_or_not'].loc[db_inserts['score_ref']['gene_pair'].isin(sig_gene_pairs)] = True


In [303]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 160
Counts insert: 23409
Score insert: 1225
Done!


## Zhao Data (A549, HELA)

In [304]:
zhao_loc = os.path.join(learning_goals_loc_general, "Zhao")

Zhao_excel = pd.ExcelFile(os.path.join(zhao_loc, "1 - sgRNA Counts.xlsx"))

sgRNA_ref = Zhao_excel.parse("Metabolism_dual_spacers")
Zhao_sgRNA_A549 =  Zhao_excel.parse("Raw counts from A549 screen")
Zhao_sgRNA_HELA =  Zhao_excel.parse("Raw counts from HeLa screen")
Zhao_sgRNA_plasmid = Zhao_excel.parse("Raw counts from plasmid library")

# Zhao_sgRNA_A549 =  pd.read_excel(os.path.join(zhao_loc, "1 - sgRNA Counts.xlsx"), sheet_name = "Raw counts from A549 screen")
# Zhao_sgRNA_HELA =  pd.read_excel(os.path.join(zhao_loc, "1 - sgRNA Counts.xlsx"), sheet_name = "Raw counts from HeLa screen")
# Zhao_sgRNA_plasmid =  pd.read_excel(os.path.join(zhao_loc, "1 - sgRNA Counts.xlsx"), sheet_name = "Raw counts from plasmid library")

### Seq Ref

In [305]:
sgRNA_ref

Unnamed: 0,SequenceID (GeneA_chr_position__GeneB_chr_position),Gene_A,Gene_A_spacer sequence,Gene_B,Gene_B_spacer sequence,Sequence for oligo synthesis
0,ALDOA_chr16_30078215__ALDOB_chr9_104188893,ALDOA,CTGGCTTGCGCCTTGCCATG,ALDOB,GGTACCTATTGTTGAACCAG,tatatatcttgtggaaaggacgaaacACCGCTGGCTTGCGCCTTGC...
1,ALDOA_chr16_30078215__ALDOC_chr17_26901140,ALDOA,CTGGCTTGCGCCTTGCCATG,ALDOC,CTCCTCTGGGGTATACTTGA,tatatatcttgtggaaaggacgaaacACCGCTGGCTTGCGCCTTGC...
2,ALDOA_chr16_30078215__ALDOC_chr17_26902525,ALDOA,CTGGCTTGCGCCTTGCCATG,ALDOC,GAAAGGGCTGGGTACGAGTG,tatatatcttgtggaaaggacgaaacACCGCTGGCTTGCGCCTTGC...
3,ALDOA_chr16_30078215__DLAT_chr11_111896198,ALDOA,CTGGCTTGCGCCTTGCCATG,DLAT,TGTGGCGCGTCTGTGCGCGA,tatatatcttgtggaaaggacgaaacACCGCTGGCTTGCGCCTTGC...
4,ALDOA_chr16_30078215__DLAT_chr11_111899333,ALDOA,CTGGCTTGCGCCTTGCCATG,DLAT,GATACAGATGATCGCTCCGA,tatatatcttgtggaaaggacgaaacACCGCTGGCTTGCGCCTTGC...
...,...,...,...,...,...,...
11929,TPI1_chr12_6976836__RPIA_chr2_88991246,TPI1,GAGGGCTTACCGGTGTCGGC,RPIA,GGGCCAAGACCCGCCCGTAG,tatatatcttgtggaaaggacgaaacACCGGAGGGCTTACCGGTGT...
11930,TPI1_chr12_6976836__TALDO1_chr11_747467,TPI1,GAGGGCTTACCGGTGTCGGC,TALDO1,CTCGACATAGCAAGACCGAG,tatatatcttgtggaaaggacgaaacACCGGAGGGCTTACCGGTGT...
11931,TPI1_chr12_6976836__TALDO1_chr11_755880,TPI1,GAGGGCTTACCGGTGTCGGC,TALDO1,CATCGACGAGTACAAGCCCC,tatatatcttgtggaaaggacgaaacACCGGAGGGCTTACCGGTGT...
11932,TPI1_chr12_6976836__TALDO1_chr11_755937,TPI1,GAGGGCTTACCGGTGTCGGC,TALDO1,AGCACAGATGCCCGCTTACC,tatatatcttgtggaaaggacgaaacACCGGAGGGCTTACCGGTGT...


In [306]:
guide_list = []
seq_list = []
for i in range(len(sgRNA_ref['SequenceID (GeneA_chr_position__GeneB_chr_position)'])):
    probes = sgRNA_ref['SequenceID (GeneA_chr_position__GeneB_chr_position)'][i]
    curr_A, curr_B = probes.split('__')
    
    if curr_A not in guide_list:
        guide_list.append(curr_A)
        seq_list.append(sgRNA_ref['Gene_A_spacer sequence'][i])
        
    if curr_B not in guide_list:
        guide_list.append(curr_B)
        seq_list.append(sgRNA_ref['Gene_B_spacer sequence'][i])

In [307]:
sgRNA_ref = pd.DataFrame({"Guide_ID": guide_list,
                         "Sequence": seq_list})

In [308]:
sequence_ref = sgRNA_ref.copy()
sequence_ref['Target'] = [i.split('_')[0] for i in sequence_ref['Guide_ID']]
sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']

In [309]:
idx = np.array([i for i in range(len(sequence_ref['sgRNA_guide_name'])) if 'HUMAN' in sequence_ref['sgRNA_guide_name'][i].upper()])
sequence_ref.loc[idx, 'sgRNA_target_name'] = 'control'

In [310]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,ALDOA_chr16_30078215,CTGGCTTGCGCCTTGCCATG,ALDOA
1,ALDOB_chr9_104188893,GGTACCTATTGTTGAACCAG,ALDOB
2,ALDOC_chr17_26901140,CTCCTCTGGGGTATACTTGA,ALDOC
3,ALDOC_chr17_26902525,GAAAGGGCTGGGTACGAGTG,ALDOC
4,DLAT_chr11_111896198,TGTGGCGCGTCTGTGCGCGA,DLAT
...,...,...,...
151,LDHB_chr12_21796959,AGGGGAGAGTCGGCTCAATC,LDHB
152,G6PD_chrX_153775081,CCCGCCCCCGCCGATTAAAT,G6PD
153,PGLS_chr19_17622472,CGCCCTCGCCATGGCCGCGC,PGLS
154,PGAM1_chr10_99186191,CAGGCGCTACGAGGTGCGGA,PGAM1


### Counts

In [311]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [312]:
Zhao_sgRNA_plasmid

Unnamed: 0,SequenceID (GeneA_chr_position__GeneB_chr_position),MV4_plasmid_S7_trimmed53_len_filtered_counts
0,ALDOA_chr16_30078215__ALDOB_chr9_104188893,442
1,ALDOA_chr16_30078215__ALDOC_chr17_26901140,1199
2,ALDOA_chr16_30078215__ALDOC_chr17_26902525,284
3,ALDOA_chr16_30078215__DLAT_chr11_111896198,1012
4,ALDOA_chr16_30078215__DLAT_chr11_111899333,589
...,...,...
11929,TPI1_chr12_6976836__RPIA_chr2_88991246,194
11930,TPI1_chr12_6976836__TALDO1_chr11_747467,167
11931,TPI1_chr12_6976836__TALDO1_chr11_755880,144
11932,TPI1_chr12_6976836__TALDO1_chr11_755937,600


In [313]:
Zhao_sgRNA_A549['MV4_plasmid_S7_trimmed53_len_filtered_counts'] = Zhao_sgRNA_plasmid['MV4_plasmid_S7_trimmed53_len_filtered_counts']
Zhao_sgRNA_HELA['MV4_plasmid_S7_trimmed53_len_filtered_counts'] = Zhao_sgRNA_plasmid['MV4_plasmid_S7_trimmed53_len_filtered_counts']

In [314]:
Zhao_sgRNA_A549

Unnamed: 0,ID,probeA,probeB,geneA,geneB,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts,A549_MV4_d3_2_S2_trimmed53_len_filtered_counts,A549_MV4_d14_1_S3_trimmed53_len_filtered_counts,A549_MV4_d14_2_S4_trimmed53_len_filtered_counts,A549_MV4_d20_1_S5_trimmed53_len_filtered_counts,A549_MV4_d20_2_S6_trimmed53_len_filtered_counts,A549_MV4_d28_1_S7_trimmed53_len_filtered_counts,A549_MV4_d28_2_S8_trimmed53_len_filtered_counts,MV4_plasmid_S7_trimmed53_len_filtered_counts
0,ALDOA_chr16_30078215__ALDOB_chr9_104188893,ALDOA_chr16_30078215,ALDOB_chr9_104188893,ALDOA,ALDOB,697,526,761,1144,728,1107,852,794,442
1,ALDOA_chr16_30078215__ALDOC_chr17_26901140,ALDOA_chr16_30078215,ALDOC_chr17_26901140,ALDOA,ALDOC,1370,1127,1452,1594,1434,1870,1771,917,1199
2,ALDOA_chr16_30078215__ALDOC_chr17_26902525,ALDOA_chr16_30078215,ALDOC_chr17_26902525,ALDOA,ALDOC,492,465,380,604,266,708,267,383,284
3,ALDOA_chr16_30078215__DLAT_chr11_111896198,ALDOA_chr16_30078215,DLAT_chr11_111896198,ALDOA,DLAT,1507,1170,1248,1444,968,1294,1214,824,1012
4,ALDOA_chr16_30078215__DLAT_chr11_111899333,ALDOA_chr16_30078215,DLAT_chr11_111899333,ALDOA,DLAT,678,635,643,771,537,887,378,517,589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11929,TPI1_chr12_6976836__RPIA_chr2_88991246,TPI1_chr12_6976836,RPIA_chr2_88991246,TPI1,RPIA,267,355,147,139,178,70,254,97,194
11930,TPI1_chr12_6976836__TALDO1_chr11_747467,TPI1_chr12_6976836,TALDO1_chr11_747467,TPI1,TALDO1,194,221,109,139,220,39,294,1,167
11931,TPI1_chr12_6976836__TALDO1_chr11_755880,TPI1_chr12_6976836,TALDO1_chr11_755880,TPI1,TALDO1,227,365,224,292,161,169,188,64,144
11932,TPI1_chr12_6976836__TALDO1_chr11_755937,TPI1_chr12_6976836,TALDO1_chr11_755937,TPI1,TALDO1,418,681,526,772,531,1127,640,954,600


In [315]:
curr_counts = Zhao_sgRNA_A549.copy()
curr_counts = curr_counts.drop(["ID"], axis = 1)

In [316]:
curr_counts.columns[:4]

Index(['probeA', 'probeB', 'geneA', 'geneB'], dtype='object')

In [317]:
curr_counts.columns[4:]

Index(['A549_MV4_d3_1_S1_trimmed53_len_filtered_counts',
       'A549_MV4_d3_2_S2_trimmed53_len_filtered_counts',
       'A549_MV4_d14_1_S3_trimmed53_len_filtered_counts',
       'A549_MV4_d14_2_S4_trimmed53_len_filtered_counts',
       'A549_MV4_d20_1_S5_trimmed53_len_filtered_counts',
       'A549_MV4_d20_2_S6_trimmed53_len_filtered_counts',
       'A549_MV4_d28_1_S7_trimmed53_len_filtered_counts',
       'A549_MV4_d28_2_S8_trimmed53_len_filtered_counts',
       'MV4_plasmid_S7_trimmed53_len_filtered_counts'],
      dtype='object')

In [318]:
curr_counts.columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2"] + curr_counts.columns[4:].tolist()

In [319]:
replicates = curr_counts.columns[4:]

In [320]:
curr_counts['Count Replicates'] = curr_counts[replicates].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)
curr_counts = curr_counts.drop(replicates, axis = 1)

In [321]:
curr_counts["Condition"] = [';'.join(replicates)] * curr_counts.shape[0]
curr_counts["Cell Line"] = ["A549"] * curr_counts.shape[0]

In [322]:
curr_counts

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Condition,Cell Line
0,ALDOA_chr16_30078215,ALDOB_chr9_104188893,ALDOA,ALDOB,697;526;761;1144;728;1107;852;794;442,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549
1,ALDOA_chr16_30078215,ALDOC_chr17_26901140,ALDOA,ALDOC,1370;1127;1452;1594;1434;1870;1771;917;1199,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549
2,ALDOA_chr16_30078215,ALDOC_chr17_26902525,ALDOA,ALDOC,492;465;380;604;266;708;267;383;284,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549
3,ALDOA_chr16_30078215,DLAT_chr11_111896198,ALDOA,DLAT,1507;1170;1248;1444;968;1294;1214;824;1012,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549
4,ALDOA_chr16_30078215,DLAT_chr11_111899333,ALDOA,DLAT,678;635;643;771;537;887;378;517;589,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549
...,...,...,...,...,...,...,...
11929,TPI1_chr12_6976836,RPIA_chr2_88991246,TPI1,RPIA,267;355;147;139;178;70;254;97;194,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549
11930,TPI1_chr12_6976836,TALDO1_chr11_747467,TPI1,TALDO1,194;221;109;139;220;39;294;1;167,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549
11931,TPI1_chr12_6976836,TALDO1_chr11_755880,TPI1,TALDO1,227;365;224;292;161;169;188;64;144,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549
11932,TPI1_chr12_6976836,TALDO1_chr11_755937,TPI1,TALDO1,418;681;526;772;531;1127;640;954;600,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,A549


In [323]:
counts_ref = pd.concat([counts_ref, curr_counts])

In [324]:
Zhao_sgRNA_A549

Unnamed: 0,ID,probeA,probeB,geneA,geneB,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts,A549_MV4_d3_2_S2_trimmed53_len_filtered_counts,A549_MV4_d14_1_S3_trimmed53_len_filtered_counts,A549_MV4_d14_2_S4_trimmed53_len_filtered_counts,A549_MV4_d20_1_S5_trimmed53_len_filtered_counts,A549_MV4_d20_2_S6_trimmed53_len_filtered_counts,A549_MV4_d28_1_S7_trimmed53_len_filtered_counts,A549_MV4_d28_2_S8_trimmed53_len_filtered_counts,MV4_plasmid_S7_trimmed53_len_filtered_counts
0,ALDOA_chr16_30078215__ALDOB_chr9_104188893,ALDOA_chr16_30078215,ALDOB_chr9_104188893,ALDOA,ALDOB,697,526,761,1144,728,1107,852,794,442
1,ALDOA_chr16_30078215__ALDOC_chr17_26901140,ALDOA_chr16_30078215,ALDOC_chr17_26901140,ALDOA,ALDOC,1370,1127,1452,1594,1434,1870,1771,917,1199
2,ALDOA_chr16_30078215__ALDOC_chr17_26902525,ALDOA_chr16_30078215,ALDOC_chr17_26902525,ALDOA,ALDOC,492,465,380,604,266,708,267,383,284
3,ALDOA_chr16_30078215__DLAT_chr11_111896198,ALDOA_chr16_30078215,DLAT_chr11_111896198,ALDOA,DLAT,1507,1170,1248,1444,968,1294,1214,824,1012
4,ALDOA_chr16_30078215__DLAT_chr11_111899333,ALDOA_chr16_30078215,DLAT_chr11_111899333,ALDOA,DLAT,678,635,643,771,537,887,378,517,589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11929,TPI1_chr12_6976836__RPIA_chr2_88991246,TPI1_chr12_6976836,RPIA_chr2_88991246,TPI1,RPIA,267,355,147,139,178,70,254,97,194
11930,TPI1_chr12_6976836__TALDO1_chr11_747467,TPI1_chr12_6976836,TALDO1_chr11_747467,TPI1,TALDO1,194,221,109,139,220,39,294,1,167
11931,TPI1_chr12_6976836__TALDO1_chr11_755880,TPI1_chr12_6976836,TALDO1_chr11_755880,TPI1,TALDO1,227,365,224,292,161,169,188,64,144
11932,TPI1_chr12_6976836__TALDO1_chr11_755937,TPI1_chr12_6976836,TALDO1_chr11_755937,TPI1,TALDO1,418,681,526,772,531,1127,640,954,600


In [325]:
Zhao_sgRNA_HELA

Unnamed: 0,ID,probeA,probeB,geneA,geneB,Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts,Hela_MV4_d3_2_S2_trimmed53_len_filtered_counts,Hela_MV4_d14_1_S3_trimmed53_len_filtered_counts,Hela_MV4_d14_2_S4_trimmed53_len_filtered_counts,Hela_MV4_d20_1_S9_trimmed53_len_filtered_counts,Hela_MV4_d20_2_S10_trimmed53_len_filtered_counts,Hela_MV4_d28_1_S5_trimmed53_len_filtered_counts,Hela_MV4_d28_2_S6_trimmed53_len_filtered_counts,MV4_plasmid_S7_trimmed53_len_filtered_counts
0,ALDOA_chr16_30078215__ALDOB_chr9_104188893,ALDOA_chr16_30078215,ALDOB_chr9_104188893,ALDOA,ALDOB,640,422,621,563,818,450,782,367,442
1,ALDOA_chr16_30078215__ALDOC_chr17_26901140,ALDOA_chr16_30078215,ALDOC_chr17_26901140,ALDOA,ALDOC,1026,736,1455,1263,1504,2381,1358,1371,1199
2,ALDOA_chr16_30078215__ALDOC_chr17_26902525,ALDOA_chr16_30078215,ALDOC_chr17_26902525,ALDOA,ALDOC,305,365,375,397,434,806,319,560,284
3,ALDOA_chr16_30078215__DLAT_chr11_111896198,ALDOA_chr16_30078215,DLAT_chr11_111896198,ALDOA,DLAT,1008,947,1201,1283,1859,2112,1562,1604,1012
4,ALDOA_chr16_30078215__DLAT_chr11_111899333,ALDOA_chr16_30078215,DLAT_chr11_111899333,ALDOA,DLAT,206,620,885,855,958,1159,693,648,589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11929,TPI1_chr12_6976836__RPIA_chr2_88991246,TPI1_chr12_6976836,RPIA_chr2_88991246,TPI1,RPIA,206,133,95,34,56,47,2,19,194
11930,TPI1_chr12_6976836__TALDO1_chr11_747467,TPI1_chr12_6976836,TALDO1_chr11_747467,TPI1,TALDO1,177,242,102,90,58,81,11,38,167
11931,TPI1_chr12_6976836__TALDO1_chr11_755880,TPI1_chr12_6976836,TALDO1_chr11_755880,TPI1,TALDO1,115,161,49,95,11,43,16,5,144
11932,TPI1_chr12_6976836__TALDO1_chr11_755937,TPI1_chr12_6976836,TALDO1_chr11_755937,TPI1,TALDO1,504,487,399,401,280,288,230,217,600


In [326]:
curr_counts = Zhao_sgRNA_HELA.copy()
curr_counts = curr_counts.drop(["ID"], axis = 1)

In [327]:
curr_counts.columns[:4]

Index(['probeA', 'probeB', 'geneA', 'geneB'], dtype='object')

In [328]:
curr_counts.columns[4:]

Index(['Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts',
       'Hela_MV4_d3_2_S2_trimmed53_len_filtered_counts',
       'Hela_MV4_d14_1_S3_trimmed53_len_filtered_counts',
       'Hela_MV4_d14_2_S4_trimmed53_len_filtered_counts',
       'Hela_MV4_d20_1_S9_trimmed53_len_filtered_counts',
       'Hela_MV4_d20_2_S10_trimmed53_len_filtered_counts',
       'Hela_MV4_d28_1_S5_trimmed53_len_filtered_counts',
       'Hela_MV4_d28_2_S6_trimmed53_len_filtered_counts',
       'MV4_plasmid_S7_trimmed53_len_filtered_counts'],
      dtype='object')

In [329]:
curr_counts.columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2"] + curr_counts.columns[4:].tolist()

In [330]:
replicates = curr_counts.columns[4:]

In [331]:
curr_counts['Count Replicates'] = curr_counts[replicates].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)
curr_counts = curr_counts.drop(replicates, axis = 1)

In [332]:
curr_counts["Condition"] = [';'.join(replicates)] * curr_counts.shape[0]
curr_counts["Cell Line"] = ["HELA"] * curr_counts.shape[0]

In [333]:
counts_ref = pd.concat([counts_ref, curr_counts])

In [334]:
counts_ref["Sequencing"] = ["Combinatorial CRISPR"] * counts_ref.shape[0]

In [335]:
counts_ref.loc[counts_ref['Gene 1'] == 0, 'Gene 1'] = 'control'
counts_ref.loc[counts_ref['Gene 2'] == 0, 'Gene 2'] = 'control'
counts_ref["Study"] = [study_name_to_pubmed_id['zhao_data']] * counts_ref.shape[0]

In [336]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,ALDOA_chr16_30078215,ALDOB_chr9_104188893,ALDOA,ALDOB,697;526;761;1144;728;1107;852;794;442,,Combinatorial CRISPR,A549,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643
1,ALDOA_chr16_30078215,ALDOC_chr17_26901140,ALDOA,ALDOC,1370;1127;1452;1594;1434;1870;1771;917;1199,,Combinatorial CRISPR,A549,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643
2,ALDOA_chr16_30078215,ALDOC_chr17_26902525,ALDOA,ALDOC,492;465;380;604;266;708;267;383;284,,Combinatorial CRISPR,A549,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643
3,ALDOA_chr16_30078215,DLAT_chr11_111896198,ALDOA,DLAT,1507;1170;1248;1444;968;1294;1214;824;1012,,Combinatorial CRISPR,A549,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643
4,ALDOA_chr16_30078215,DLAT_chr11_111899333,ALDOA,DLAT,678;635;643;771;537;887;378;517;589,,Combinatorial CRISPR,A549,A549_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643
...,...,...,...,...,...,...,...,...,...,...
11929,TPI1_chr12_6976836,RPIA_chr2_88991246,TPI1,RPIA,206;133;95;34;56;47;2;19;194,,Combinatorial CRISPR,HELA,Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643
11930,TPI1_chr12_6976836,TALDO1_chr11_747467,TPI1,TALDO1,177;242;102;90;58;81;11;38;167,,Combinatorial CRISPR,HELA,Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643
11931,TPI1_chr12_6976836,TALDO1_chr11_755880,TPI1,TALDO1,115;161;49;95;11;43;16;5;144,,Combinatorial CRISPR,HELA,Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643
11932,TPI1_chr12_6976836,TALDO1_chr11_755937,TPI1,TALDO1,504;487;399;401;280;288;230;217;600,,Combinatorial CRISPR,HELA,Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts...,29452643


## SL Data

In [337]:
zhao_loc = os.path.join(learning_goals_loc_general, "Zhao", 'created')

Zhao_A549 =  pd.read_excel(os.path.join(zhao_loc, "Zhao_GI.xlsx"), sheet_name = "A549 pi scores")
Zhao_HELA =  pd.read_excel(os.path.join(zhao_loc, "Zhao_GI.xlsx"), sheet_name = "HeLa pi socres")

In [338]:
Zhao_A549

Unnamed: 0,gene_gene,geneA,fA,geneB,fB,fA+fB,pi,sd,PP,abs pi,FDR left,FDR right,z
0,ALDOA_ALDOB,ALDOA,-0.047146,ALDOB,-0.005978,-0.053124,0.004084,0.018323,0.163,0.004084,0.761518,1,0.169437
1,ALDOA_ALDOC,ALDOA,-0.047146,ALDOC,-0.018737,-0.065883,-0.058298,0.034592,0.912,0.058298,0.205590,1,-2.418505
2,ALDOA_DLAT,ALDOA,-0.047146,DLAT,-0.012969,-0.060115,-0.071055,0.037351,0.949,0.071055,0.178348,1,-2.947731
3,ALDOA_DLD,ALDOA,-0.047146,DLD,-0.027896,-0.075042,-0.065549,0.016643,1.000,0.065549,0.184315,1,-2.719313
4,ALDOA_ENO1,ALDOA,-0.047146,ENO1,-0.044006,-0.091152,-0.048887,0.024589,0.958,0.048887,0.251583,1,-2.028083
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270,RPIA_TKT,RPIA,-0.030796,TKT,-0.036411,-0.067207,-0.014492,0.020125,0.525,0.014492,0.462568,1,-0.601182
1271,RPIA_TPI1,RPIA,-0.030796,TPI1,-0.030328,-0.061124,-0.026897,0.027609,0.675,0.026897,0.372878,1,-1.115811
1272,TALDO1_TKT,TALDO1,-0.031732,TKT,-0.036411,-0.068143,-0.080038,0.031667,0.990,0.080038,0.130720,1,-3.320382
1273,TALDO1_TPI1,TALDO1,-0.031732,TPI1,-0.030328,-0.062060,-0.056565,0.037941,0.860,0.056565,0.208709,1,-2.346589


In [339]:
Zhao_HELA

Unnamed: 0,gene_gene,geneA,fA,geneB,fB,fA+fB,pi,sd,PP,abs pi,FDR left,FDR right,z
0,ALDOA_ALDOB,ALDOA,-0.090395,ALDOB,-0.023448,-0.113843,0.004951,0.022351,0.165,0.004951,0.790159,1,0.188401
1,ALDOA_ALDOC,ALDOA,-0.090395,ALDOC,-0.027559,-0.117955,-0.049581,0.049662,0.684,0.049581,0.369074,1,-1.886880
2,ALDOA_DLAT,ALDOA,-0.090395,DLAT,0.006216,-0.084180,-0.055467,0.052478,0.697,0.055467,0.355821,1,-2.110882
3,ALDOA_DLD,ALDOA,-0.090395,DLD,-0.019896,-0.110292,-0.019820,0.026161,0.555,0.019820,0.517702,1,-0.754265
4,ALDOA_ENO1,ALDOA,-0.090395,ENO1,-0.090564,-0.180959,0.003899,0.065412,0.046,0.003899,0.787505,1,0.148370
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270,RPIA_TKT,RPIA,-0.037973,TKT,-0.065134,-0.103107,-0.034815,0.029087,0.745,0.034815,0.402030,1,-1.324924
1271,RPIA_TPI1,RPIA,-0.037973,TPI1,-0.089012,-0.126985,-0.024122,0.034493,0.519,0.024122,0.479355,1,-0.918003
1272,TALDO1_TKT,TALDO1,-0.026272,TKT,-0.065134,-0.091406,-0.004685,0.021982,0.158,0.004685,0.689172,1,-0.178284
1273,TALDO1_TPI1,TALDO1,-0.026272,TPI1,-0.089012,-0.115284,-0.036088,0.034647,0.694,0.036088,0.401973,1,-1.373376


In [340]:
all_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])


In [341]:
curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
curr_GI["Gene_A"] = Zhao_A549["geneA"]
curr_GI["Gene_B"] = Zhao_A549["geneB"]
curr_GI["GI_Score"] = Zhao_A549["z"]
curr_GI["GI_Cutoff"] = [-3] * len(curr_GI["Gene_A"])
curr_GI["Study_Source"] = [study_name_to_pubmed_id['zhao_data']] * len(curr_GI["Gene_A"])
curr_GI["Cell_Line"] = ["A549"] * len(curr_GI["Gene_A"])
curr_GI = curr_GI.fillna(0)
curr_GI["Stat_Score"] = Zhao_A549.loc[:, "FDR left"]
curr_GI["Stat_Cutoff"] = [float("nan")] * len(curr_GI["Gene_A"])

all_GI = pd.concat([all_GI, curr_GI])

In [342]:
curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
curr_GI["Gene_A"] = Zhao_HELA["geneA"]
curr_GI["Gene_B"] = Zhao_HELA["geneB"]
curr_GI["GI_Score"] = Zhao_HELA["z"]
curr_GI["GI_Cutoff"] = [-3] * len(curr_GI["Gene_A"])
curr_GI["Study_Source"] = [study_name_to_pubmed_id['zhao_data']] * len(curr_GI["Gene_A"])
curr_GI["Cell_Line"] = ["HELA"] * len(curr_GI["Gene_A"])
curr_GI = curr_GI.fillna(0)
curr_GI["Stat_Score"] = Zhao_HELA.loc[:, "FDR left"]
curr_GI["Stat_Cutoff"] = [float("nan")] * len(curr_GI["Gene_A"])

all_GI = pd.concat([all_GI, curr_GI])

In [343]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = all_GI.copy(), study_controls = controls['zhao_data'], study_conditions = study_conditions['zhao_data'])

Starting processing...
Score reference...
Controls within SL score that are removed: 
0
---
Only GI cutoff is present...
Counts reference...
Number of double pairs: 22950
Number of controls: 0
Number of singles: 918
Sequence reference...
Done! Returning...


In [344]:
#PREV_REF['Study_Source'].value_counts()

In [345]:
#PREV_REF.loc[PREV_REF['Study_Source'] == 'Zhao', :]

In [346]:
#db_inserts['score_ref']['SL_or_not'].value_counts()

In [347]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 156
Counts insert: 23868
Score insert: 2550
Done!


## ShanTang Data

In [348]:
shantang_loc = os.path.join(learning_goals_loc_general, "ShanTang")
shan_data = pd.read_csv(os.path.join(shantang_loc, "DKO_22RV1_RawCounts_1.csv"))

In [349]:
shan_data

Unnamed: 0,construct_id,target_a_id,probe_a_id,probe_a_seq,target_b_id,probe_b_id,probe_b_seq,T0_2,T0_3,T12_2,T12_3
0,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe_safe_ACOC,0Safe_safe_ACOC_204550.4522,GTGTATTTGGCTTCCAAAA,0Safe_safe_ACOC,0Safe_safe_ACOC_204550.4522,GTGTATTTGGCTTCCAAAA,22,17,36,43
1,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe_safe_ACOC,0Safe_safe_ACOC_204550.4522,GTGTATTTGGCTTCCAAAA,0Safe_safe_ACOC,0Safe_safe_ACOC_204550.4525,GCATGGCCTCCACTTGCAA,57,73,107,98
2,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe_safe_ACOC,0Safe_safe_ACOC_204550.4522,GTGTATTTGGCTTCCAAAA,0Safe_safe_ACOC,0Safe_safe_ACOC_204550.4590,GCACCAGTCTATGCCACCAC,78,55,124,166
3,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,0Safe_safe_ACOC,0Safe_safe_ACOC_204550.4522,GTGTATTTGGCTTCCAAAA,0Safe_safe_DTKP,0Safe_safe_DTKP_204550.5261,GTGAATTTAAGGCACAACCC,42,41,62,62
4,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,0Safe_safe_ACOC,0Safe_safe_ACOC_204550.4522,GTGTATTTGGCTTCCAAAA,0Safe_safe_DTKP,0Safe_safe_DTKP_204550.5308,GCTGGACTATGCCAGGACCT,17,27,38,50
...,...,...,...,...,...,...,...,...,...,...,...
60625,WNT5A_4__UPF3A_4,WNT5A,WNT5A_4,GTAACTTCCAACAGGGGGTGG,UPF3A,UPF3A_4,GACCCAGTCTATCAAAAGACC,105,97,227,235
60626,WNT5A_4__WNT5A_1,WNT5A,WNT5A_4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A_1,GAGGATGCGAGCACTCTCGT,339,282,455,564
60627,WNT5A_4__WNT5A_2,WNT5A,WNT5A_4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A_2,GAGTATCAATTCCGACATCGA,288,308,448,597
60628,WNT5A_4__WNT5A_3,WNT5A,WNT5A_4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A_3,GAGTTAACTTCCAACAGGGGG,201,198,333,416


In [350]:
for column in ['target_a_id', 'probe_a_id', 'target_b_id', 'probe_b_id']:
    shan_data[column] = ['-'.join(i.split('_')) for i in shan_data[column]]

In [351]:
shan_data

Unnamed: 0,construct_id,target_a_id,probe_a_id,probe_a_seq,target_b_id,probe_b_id,probe_b_seq,T0_2,T0_3,T12_2,T12_3
0,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,22,17,36,43
1,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4525,GCATGGCCTCCACTTGCAA,57,73,107,98
2,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4590,GCACCAGTCTATGCCACCAC,78,55,124,166
3,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-DTKP,0Safe-safe-DTKP-204550.5261,GTGAATTTAAGGCACAACCC,42,41,62,62
4,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-DTKP,0Safe-safe-DTKP-204550.5308,GCTGGACTATGCCAGGACCT,17,27,38,50
...,...,...,...,...,...,...,...,...,...,...,...
60625,WNT5A_4__UPF3A_4,WNT5A,WNT5A-4,GTAACTTCCAACAGGGGGTGG,UPF3A,UPF3A-4,GACCCAGTCTATCAAAAGACC,105,97,227,235
60626,WNT5A_4__WNT5A_1,WNT5A,WNT5A-4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A-1,GAGGATGCGAGCACTCTCGT,339,282,455,564
60627,WNT5A_4__WNT5A_2,WNT5A,WNT5A-4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A-2,GAGTATCAATTCCGACATCGA,288,308,448,597
60628,WNT5A_4__WNT5A_3,WNT5A,WNT5A-4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A-3,GAGTTAACTTCCAACAGGGGG,201,198,333,416


### Seq Ref

In [352]:
sgRNA_ref = pd.DataFrame(columns = ["Guide_ID", "Sequence"])
sgRNA_ref['Guide_ID'] = pd.concat([shan_data['probe_a_id'], shan_data['probe_b_id']])
sgRNA_ref['Sequence'] = pd.concat([shan_data['probe_a_seq'], shan_data['probe_b_seq']])
sgRNA_ref.index = sgRNA_ref['Guide_ID']
sgRNA_ref = sgRNA_ref.drop_duplicates()
sgRNA_ref = sgRNA_ref.reset_index(drop = True)

In [353]:
sgRNA_ref

Unnamed: 0,Guide_ID,Sequence
0,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA
1,0Safe-safe-ACOC-204550.4525,GCATGGCCTCCACTTGCAA
2,0Safe-safe-ACOC-204550.4590,GCACCAGTCTATGCCACCAC
3,0Safe-safe-DTKP-204550.5261,GTGAATTTAAGGCACAACCC
4,0Safe-safe-DTKP-204550.5308,GCTGGACTATGCCAGGACCT
...,...,...
242,UPF3A-4,GACCCAGTCTATCAAAAGACC
243,WNT5A-1,GAGGATGCGAGCACTCTCGT
244,WNT5A-2,GAGTATCAATTCCGACATCGA
245,WNT5A-3,GAGTTAACTTCCAACAGGGGG


In [354]:
sequence_ref = sgRNA_ref.copy()
sequence_ref['Target'] = [i.split('-')[0] for i in sequence_ref['Guide_ID']]
sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']
sequence_ref.loc[sequence_ref['sgRNA_target_name'] == '0Safe', 'sgRNA_target_name'] = 'control'


In [355]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,control
1,0Safe-safe-ACOC-204550.4525,GCATGGCCTCCACTTGCAA,control
2,0Safe-safe-ACOC-204550.4590,GCACCAGTCTATGCCACCAC,control
3,0Safe-safe-DTKP-204550.5261,GTGAATTTAAGGCACAACCC,control
4,0Safe-safe-DTKP-204550.5308,GCTGGACTATGCCAGGACCT,control
...,...,...,...
242,UPF3A-4,GACCCAGTCTATCAAAAGACC,UPF3A
243,WNT5A-1,GAGGATGCGAGCACTCTCGT,WNT5A
244,WNT5A-2,GAGTATCAATTCCGACATCGA,WNT5A
245,WNT5A-3,GAGTTAACTTCCAACAGGGGG,WNT5A


### Counts

In [356]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [357]:
counts_ref['Guide 1'] = shan_data['probe_a_id']
counts_ref['Guide 2'] = shan_data['probe_b_id']
counts_ref['Gene 1'] = shan_data['target_a_id']
counts_ref['Gene 2'] = shan_data['target_b_id']

In [358]:
shan_data

Unnamed: 0,construct_id,target_a_id,probe_a_id,probe_a_seq,target_b_id,probe_b_id,probe_b_seq,T0_2,T0_3,T12_2,T12_3
0,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,22,17,36,43
1,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4525,GCATGGCCTCCACTTGCAA,57,73,107,98
2,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4590,GCACCAGTCTATGCCACCAC,78,55,124,166
3,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-DTKP,0Safe-safe-DTKP-204550.5261,GTGAATTTAAGGCACAACCC,42,41,62,62
4,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,0Safe-safe-ACOC,0Safe-safe-ACOC-204550.4522,GTGTATTTGGCTTCCAAAA,0Safe-safe-DTKP,0Safe-safe-DTKP-204550.5308,GCTGGACTATGCCAGGACCT,17,27,38,50
...,...,...,...,...,...,...,...,...,...,...,...
60625,WNT5A_4__UPF3A_4,WNT5A,WNT5A-4,GTAACTTCCAACAGGGGGTGG,UPF3A,UPF3A-4,GACCCAGTCTATCAAAAGACC,105,97,227,235
60626,WNT5A_4__WNT5A_1,WNT5A,WNT5A-4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A-1,GAGGATGCGAGCACTCTCGT,339,282,455,564
60627,WNT5A_4__WNT5A_2,WNT5A,WNT5A-4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A-2,GAGTATCAATTCCGACATCGA,288,308,448,597
60628,WNT5A_4__WNT5A_3,WNT5A,WNT5A-4,GTAACTTCCAACAGGGGGTGG,WNT5A,WNT5A-3,GAGTTAACTTCCAACAGGGGG,201,198,333,416


In [359]:
counts_ref['Count Replicates'] = shan_data[['T0_2', 'T0_3', 'T12_2', 'T12_3']].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)


In [360]:
counts_ref['Sequencing'] = ['Combinatorial CRISPR'] * counts_ref.shape[0]
counts_ref['Cell Line'] = ['22Rv1'] * counts_ref.shape[0]

In [361]:
counts_ref["Condition"] = [';'.join(['T0_1', 'T0_2', 'T12_1', 'T12_2'])] * counts_ref.shape[0]

In [362]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition
0,0Safe-safe-ACOC-204550.4522,0Safe-safe-ACOC-204550.4522,0Safe-safe-ACOC,0Safe-safe-ACOC,22;17;36;43,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2
1,0Safe-safe-ACOC-204550.4522,0Safe-safe-ACOC-204550.4525,0Safe-safe-ACOC,0Safe-safe-ACOC,57;73;107;98,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2
2,0Safe-safe-ACOC-204550.4522,0Safe-safe-ACOC-204550.4590,0Safe-safe-ACOC,0Safe-safe-ACOC,78;55;124;166,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2
3,0Safe-safe-ACOC-204550.4522,0Safe-safe-DTKP-204550.5261,0Safe-safe-ACOC,0Safe-safe-DTKP,42;41;62;62,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2
4,0Safe-safe-ACOC-204550.4522,0Safe-safe-DTKP-204550.5308,0Safe-safe-ACOC,0Safe-safe-DTKP,17;27;38;50,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2
...,...,...,...,...,...,...,...,...,...
60625,WNT5A-4,UPF3A-4,WNT5A,UPF3A,105;97;227;235,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2
60626,WNT5A-4,WNT5A-1,WNT5A,WNT5A,339;282;455;564,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2
60627,WNT5A-4,WNT5A-2,WNT5A,WNT5A,288;308;448;597,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2
60628,WNT5A-4,WNT5A-3,WNT5A,WNT5A,201;198;333;416,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2


In [363]:
idx = (counts_ref['Gene 1'].isin(['luciferase', 'EGFP', 'LacZ'])) | (counts_ref['Gene 2'].isin(['luciferase', 'EGFP', 'LacZ']))

In [364]:
idx.sum()

11699

In [365]:
# remove the 3 of them
counts_ref = counts_ref[~idx]
counts_ref["Study"] = [study_name_to_pubmed_id['shantang_data']] * counts_ref.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_ref["Study"] = [study_name_to_pubmed_id['shantang_data']] * counts_ref.shape[0]


In [366]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,0Safe-safe-ACOC-204550.4522,0Safe-safe-ACOC-204550.4522,0Safe-safe-ACOC,0Safe-safe-ACOC,22;17;36;43,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092
1,0Safe-safe-ACOC-204550.4522,0Safe-safe-ACOC-204550.4525,0Safe-safe-ACOC,0Safe-safe-ACOC,57;73;107;98,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092
2,0Safe-safe-ACOC-204550.4522,0Safe-safe-ACOC-204550.4590,0Safe-safe-ACOC,0Safe-safe-ACOC,78;55;124;166,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092
3,0Safe-safe-ACOC-204550.4522,0Safe-safe-DTKP-204550.5261,0Safe-safe-ACOC,0Safe-safe-DTKP,42;41;62;62,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092
4,0Safe-safe-ACOC-204550.4522,0Safe-safe-DTKP-204550.5308,0Safe-safe-ACOC,0Safe-safe-DTKP,17;27;38;50,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092
...,...,...,...,...,...,...,...,...,...,...
60625,WNT5A-4,UPF3A-4,WNT5A,UPF3A,105;97;227;235,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092
60626,WNT5A-4,WNT5A-1,WNT5A,WNT5A,339;282;455;564,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092
60627,WNT5A-4,WNT5A-2,WNT5A,WNT5A,288;308;448;597,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092
60628,WNT5A-4,WNT5A-3,WNT5A,WNT5A,201;198;333;416,,Combinatorial CRISPR,22Rv1,T0_1;T0_2;T12_1;T12_2,36060092


## SL Scores

In [367]:
shantang_loc = os.path.join(learning_goals_loc_general, "ShanTang")
shan_SL = pd.read_csv(os.path.join(shantang_loc, "shan_sgRNA_level_calculated_121722.csv"))

In [368]:
shan_SL

Unnamed: 0,Gene Pair (Original),GI_Replicate_Z_Score_0,GI_Replicate_Z_Score_1,GI_Replicate_Average
0,AKT3_AR,-0.241486,-1.712796,-0.977141
1,AKT3_AURKA,0.687381,-0.385961,0.150710
2,AKT3_BMP6,-0.938292,-0.793694,-0.865993
3,AKT3_CCNE2,0.262964,-0.366995,-0.052016
4,AKT3_CDC6,-0.238720,0.188453,-0.025134
...,...,...,...,...
1220,ULK1_UPF3A,0.433624,-0.659875,-0.113125
1221,ULK1_WNT5A,0.184594,-0.416153,-0.115779
1222,UPF2_UPF3A,0.047645,1.053003,0.550324
1223,UPF2_WNT5A,0.068202,0.713417,0.390810


In [369]:
curr_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
curr_GI["Gene_A"] = [i.split('_')[0] for i in shan_SL['Gene Pair (Original)']]
curr_GI["Gene_B"] = [i.split('_')[1] for i in shan_SL['Gene Pair (Original)']]
curr_GI["GI_Score"] = shan_SL['GI_Replicate_Average']
curr_GI["GI_Cutoff"] = [-0.5] * len(curr_GI["Gene_A"])
curr_GI["Study_Source"] = [study_name_to_pubmed_id['shantang_data']] * len(curr_GI["Gene_A"])
curr_GI["Cell_Line"] = ["22Rv1"] * len(curr_GI["Gene_A"])
curr_GI = curr_GI.fillna(0)
curr_GI["Stat_Score"] = [float("nan")] * len(curr_GI["Gene_A"])
curr_GI["Stat_Cutoff"] = [float("nan")] * len(curr_GI["Gene_A"])

In [370]:
curr_GI

Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,GI_Score,GI_Cutoff,Stat_Score,Stat_Cutoff
0,AKT3,AR,36060092,22Rv1,-0.977141,-0.5,,
1,AKT3,AURKA,36060092,22Rv1,0.150710,-0.5,,
2,AKT3,BMP6,36060092,22Rv1,-0.865993,-0.5,,
3,AKT3,CCNE2,36060092,22Rv1,-0.052016,-0.5,,
4,AKT3,CDC6,36060092,22Rv1,-0.025134,-0.5,,
...,...,...,...,...,...,...,...,...
1220,ULK1,UPF3A,36060092,22Rv1,-0.113125,-0.5,,
1221,ULK1,WNT5A,36060092,22Rv1,-0.115779,-0.5,,
1222,UPF2,UPF3A,36060092,22Rv1,0.550324,-0.5,,
1223,UPF2,WNT5A,36060092,22Rv1,0.390810,-0.5,,


In [371]:
(curr_GI['GI_Score'] < -1).sum()

10

In [372]:
study_conditions['shantang_data']

[['T0_1', 'T0_2'], ['T12_1', 'T12_2']]

In [373]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = curr_GI.copy(), study_controls = controls['shantang_data'], study_conditions = study_conditions['shantang_data'])

Starting processing...
Score reference...
Controls within SL score that are removed: 
0
---
Only GI cutoff is present...
Counts reference...
Number of double pairs: 37767
Number of controls: 614
Number of singles: 10550
Sequence reference...
Done! Returning...


In [374]:
db_inserts['score_ref']['SL_or_not'].value_counts()

False    1052
True      173
Name: SL_or_not, dtype: int64

In [375]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 247
Counts insert: 48931
Score insert: 1225
Done!


## Najm et al

## Sequence

In [376]:
# najm_loc = os.path.join(learning_goals_loc_general, "Najm", "Najm_29251726_supplementary")

# #GI is for RPE1
# synleth_lib = pd.read_csv(os.path.join(najm_loc, "Supplementary Table 2 SynLet library.txt"), sep = '\t')

In [377]:
# synleth_lib['Guide_Name'] = synleth_lib['Gene'] + '_' + synleth_lib['Well ID'] + '_' + synleth_lib['Cas9']

In [378]:
# synleth_lib = synleth_lib.loc[:,['Guide_Name', 'sgRNA sequence', 'Gene']]

In [379]:
# sequence_ref = synleth_lib.copy()
# sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']

In [380]:
# sequence_ref.loc[sequence_ref['sgRNA_guide_seq'] == 'AAAAAAAGAGTCGAATGTTTT']

In [381]:
# seq_to_guide = {}
# for i in range(sequence_ref.shape[0]):
#     seq_to_guide[sequence_ref.loc[i, 'sgRNA_guide_seq']] = sequence_ref.loc[i, 'sgRNA_guide_name']

## Sequence (Create from counts)

In [382]:
najm_loc = os.path.join(learning_goals_loc_general, "Najm", "Najm_29251726_supplementary")

#GI is for RPE1
synleth_lib = pd.read_csv(os.path.join(najm_loc, "Supplementary Table 3 SynLet screening data.txt"), sep = '\t')

In [383]:
synleth_lib

Unnamed: 0,U6 Sequence;H1 Sequence,U6 gene,U6 well,H1 gene,H1 well,Cell Line,Time Point,Rep A Reads,Rep B Reads,Rep C Reads,...,Rep B Log Norm,Rep C Log Norm,Rep A LFC,Rep B LFC,Rep C LFC,Avg LFC,U6 expected,H1 expected,Sum Expected,Measured - Expected
0,AAAGTGGAACTCAGGACATG;AAAAAAAGAGTCGAATGTTTT,HPRT intron,G08,6T,H12,786O,Day 21,2438,1997,2175.0,...,8.035362,8.057403,0.193558,0.523903,0.545944,0.421135,0.441435,0.579674,1.021109,-0.599974
1,AAAGTGGAACTCAGGACATG;AAAGAGTCCACTCTGCACTTG,HPRT intron,G08,UBC,F10,786O,Day 21,2176,1385,1485.0,...,7.509841,7.509365,0.062268,0.030280,0.029804,0.040784,0.441435,0.272248,0.713683,-0.672899
2,AAAGTGGAACTCAGGACATG;AACAGCTCCGTGTACTGAGGC,HPRT intron,G08,CD81,G08,786O,Day 21,2294,1579,1844.0,...,7.697994,7.820199,0.617168,0.697543,0.819748,0.711486,0.441435,0.593754,1.035189,-0.323703
3,AAAGTGGAACTCAGGACATG;AAGACGAAATTGAAGACGAAG,HPRT intron,G08,CD81,G12,786O,Day 21,2169,1229,1705.0,...,7.338445,7.707652,0.461431,0.262669,0.631877,0.451992,0.441435,0.696023,1.137459,-0.685466
4,AAAGTGGAACTCAGGACATG;AAGCGTACTGCTCATCATCGT,HPRT intron,G08,HSP90AA1,D01,786O,Day 21,1771,984,1526.0,...,7.019908,7.548444,0.452550,0.225973,0.754510,0.477678,0.441435,0.474748,0.916183,-0.438506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92155,TTCTGACTACAACATCCAGA;TTGCTTTCATTTAATGCTACA,UBB,F03,PARP2,G03,OVCAR8,Day 11,134,118,123.0,...,6.379506,6.124881,0.526160,0.290210,0.254625,0.356998,0.386108,0.225575,0.611683,-0.254684
92156,TTCTGACTACAACATCCAGA;TTGGGACGAGTCCTGTGAGAA,UBB,F03,IMPDH1,D04,OVCAR8,Day 11,117,119,102.0,...,6.112980,6.015239,0.442162,0.411885,0.097741,0.317263,0.386108,0.115624,0.501732,-0.184469
92157,TTCTGACTACAACATCCAGA;TTTAGGAATTGCTGTTGGGAC,UBB,F03,HPRT intron,H05,OVCAR8,Day 11,149,126,136.0,...,6.522797,6.324821,0.477853,0.183833,0.197976,0.286554,0.386108,0.105574,0.491681,-0.205127
92158,TTCTGACTACAACATCCAGA;TTTCCATCACTTGGTTGAATA,UBB,F03,BCL2A1,B02,OVCAR8,Day 11,169,147,202.0,...,7.088410,6.466948,0.515907,0.261833,0.621462,0.466401,0.386108,0.388544,0.774652,-0.308251


In [384]:
synleth_lib['CL+TIME'] = synleth_lib['Cell Line'] + '_' + synleth_lib['Time Point']

In [385]:
synleth_lib['CL+TIME']

0          786O_Day 21
1          786O_Day 21
2          786O_Day 21
3          786O_Day 21
4          786O_Day 21
             ...      
92155    OVCAR8_Day 11
92156    OVCAR8_Day 11
92157    OVCAR8_Day 11
92158    OVCAR8_Day 11
92159    OVCAR8_Day 11
Name: CL+TIME, Length: 92160, dtype: object

In [386]:
set(synleth_lib['CL+TIME'])

{'786O_Day 11',
 '786O_Day 21',
 'A375_Day 21',
 'A375_Day 9',
 'A549_Day 21',
 'HT29_Day 21',
 'Meljuso_Day 21',
 'Meljuso_Day 9',
 'OVCAR8_Day 11',
 'OVCAR8_Day 21'}

In [387]:
# only get the final time points for each cell line
synleth_lib = synleth_lib.loc[synleth_lib['Time Point'] == 'Day 21']

In [388]:
synleth_lib.reset_index(drop = True, inplace = True)

In [389]:
set(synleth_lib['Cell Line'])

{'786O', 'A375', 'A549', 'HT29', 'Meljuso', 'OVCAR8'}

In [390]:
seq_to_guide = {}
target_counts = {}
for i in range(synleth_lib.shape[0]):
    split = synleth_lib.loc[i, 'U6 Sequence;H1 Sequence'].split(';')
    
    guide_1_seq = split[0]
    guide_2_seq = split[1]
    
    target_1 = synleth_lib.loc[i, 'U6 gene']
    target_2 = synleth_lib.loc[i, 'H1 gene']
    
    if (guide_1_seq not in seq_to_guide):
        if (target_1 not in target_counts):
            target_counts[target_1] = 1
        else:
            target_counts[target_1] += 1
        seq_to_guide[guide_1_seq] = '_'.join([target_1, str(target_counts[target_1])])
    if (guide_2_seq not in seq_to_guide):
        if (target_2 not in target_counts):
            target_counts[target_2] = 1
        else:
            target_counts[target_2] += 1
        seq_to_guide[guide_2_seq] = '_'.join([target_2, str(target_counts[target_2])])

In [391]:
len(seq_to_guide)

192

In [392]:
seq_list = []
guide_list = []

for seq in seq_to_guide:
    seq_list.append(seq)
    guide_list.append(seq_to_guide[seq])

In [393]:
sequence_ref = pd.DataFrame({'sgRNA_guide_name': guide_list,
                             'sgRNA_guide_seq': seq_list})

In [394]:
sequence_ref['sgRNA_target_name'] = [i.split('_')[0] for i in sequence_ref['sgRNA_guide_name']]

In [395]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,HPRT intron_1,AAAGTGGAACTCAGGACATG,HPRT intron
1,6T_1,AAAAAAAGAGTCGAATGTTTT,6T
2,UBC_1,AAAGAGTCCACTCTGCACTTG,UBC
3,CD81_1,AACAGCTCCGTGTACTGAGGC,CD81
4,CD81_2,AAGACGAAATTGAAGACGAAG,CD81
...,...,...,...
187,AKT1_6,TGTCATGGAGTACGCCAACG,AKT1
188,BCL2_6,TGTCGCAGAGGGGCTACGAG,BCL2
189,BCL2L10_6,TGTTGCTGGCCGACTACCTG,BCL2L10
190,UBB_5,TGTTGTAGTCAGAAAGAGTA,UBB


## Counts

In [396]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [397]:
najm_loc = os.path.join(learning_goals_loc_general, "Najm", "Najm_29251726_supplementary")

#GI is for RPE1
synleth_lib = pd.read_csv(os.path.join(najm_loc, "Supplementary Table 3 SynLet screening data.txt"), sep = '\t')

In [398]:
synleth_lib

Unnamed: 0,U6 Sequence;H1 Sequence,U6 gene,U6 well,H1 gene,H1 well,Cell Line,Time Point,Rep A Reads,Rep B Reads,Rep C Reads,...,Rep B Log Norm,Rep C Log Norm,Rep A LFC,Rep B LFC,Rep C LFC,Avg LFC,U6 expected,H1 expected,Sum Expected,Measured - Expected
0,AAAGTGGAACTCAGGACATG;AAAAAAAGAGTCGAATGTTTT,HPRT intron,G08,6T,H12,786O,Day 21,2438,1997,2175.0,...,8.035362,8.057403,0.193558,0.523903,0.545944,0.421135,0.441435,0.579674,1.021109,-0.599974
1,AAAGTGGAACTCAGGACATG;AAAGAGTCCACTCTGCACTTG,HPRT intron,G08,UBC,F10,786O,Day 21,2176,1385,1485.0,...,7.509841,7.509365,0.062268,0.030280,0.029804,0.040784,0.441435,0.272248,0.713683,-0.672899
2,AAAGTGGAACTCAGGACATG;AACAGCTCCGTGTACTGAGGC,HPRT intron,G08,CD81,G08,786O,Day 21,2294,1579,1844.0,...,7.697994,7.820199,0.617168,0.697543,0.819748,0.711486,0.441435,0.593754,1.035189,-0.323703
3,AAAGTGGAACTCAGGACATG;AAGACGAAATTGAAGACGAAG,HPRT intron,G08,CD81,G12,786O,Day 21,2169,1229,1705.0,...,7.338445,7.707652,0.461431,0.262669,0.631877,0.451992,0.441435,0.696023,1.137459,-0.685466
4,AAAGTGGAACTCAGGACATG;AAGCGTACTGCTCATCATCGT,HPRT intron,G08,HSP90AA1,D01,786O,Day 21,1771,984,1526.0,...,7.019908,7.548444,0.452550,0.225973,0.754510,0.477678,0.441435,0.474748,0.916183,-0.438506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92155,TTCTGACTACAACATCCAGA;TTGCTTTCATTTAATGCTACA,UBB,F03,PARP2,G03,OVCAR8,Day 11,134,118,123.0,...,6.379506,6.124881,0.526160,0.290210,0.254625,0.356998,0.386108,0.225575,0.611683,-0.254684
92156,TTCTGACTACAACATCCAGA;TTGGGACGAGTCCTGTGAGAA,UBB,F03,IMPDH1,D04,OVCAR8,Day 11,117,119,102.0,...,6.112980,6.015239,0.442162,0.411885,0.097741,0.317263,0.386108,0.115624,0.501732,-0.184469
92157,TTCTGACTACAACATCCAGA;TTTAGGAATTGCTGTTGGGAC,UBB,F03,HPRT intron,H05,OVCAR8,Day 11,149,126,136.0,...,6.522797,6.324821,0.477853,0.183833,0.197976,0.286554,0.386108,0.105574,0.491681,-0.205127
92158,TTCTGACTACAACATCCAGA;TTTCCATCACTTGGTTGAATA,UBB,F03,BCL2A1,B02,OVCAR8,Day 11,169,147,202.0,...,7.088410,6.466948,0.515907,0.261833,0.621462,0.466401,0.386108,0.388544,0.774652,-0.308251


In [399]:
synleth_lib['CL+TIME'] = synleth_lib['Cell Line'] + '_' + synleth_lib['Time Point']

In [400]:
synleth_lib['CL+TIME']

0          786O_Day 21
1          786O_Day 21
2          786O_Day 21
3          786O_Day 21
4          786O_Day 21
             ...      
92155    OVCAR8_Day 11
92156    OVCAR8_Day 11
92157    OVCAR8_Day 11
92158    OVCAR8_Day 11
92159    OVCAR8_Day 11
Name: CL+TIME, Length: 92160, dtype: object

In [401]:
set(synleth_lib['CL+TIME'])

{'786O_Day 11',
 '786O_Day 21',
 'A375_Day 21',
 'A375_Day 9',
 'A549_Day 21',
 'HT29_Day 21',
 'Meljuso_Day 21',
 'Meljuso_Day 9',
 'OVCAR8_Day 11',
 'OVCAR8_Day 21'}

In [402]:
# only get the final time points for each cell line
synleth_lib = synleth_lib.loc[synleth_lib['Time Point'] == 'Day 21']

In [403]:
synleth_lib.reset_index(drop = True, inplace = True)

In [404]:
set(synleth_lib['Cell Line'])

{'786O', 'A375', 'A549', 'HT29', 'Meljuso', 'OVCAR8'}

In [405]:
synleth_lib

Unnamed: 0,U6 Sequence;H1 Sequence,U6 gene,U6 well,H1 gene,H1 well,Cell Line,Time Point,Rep A Reads,Rep B Reads,Rep C Reads,...,Rep C Log Norm,Rep A LFC,Rep B LFC,Rep C LFC,Avg LFC,U6 expected,H1 expected,Sum Expected,Measured - Expected,CL+TIME
0,AAAGTGGAACTCAGGACATG;AAAAAAAGAGTCGAATGTTTT,HPRT intron,G08,6T,H12,786O,Day 21,2438,1997,2175.0,...,8.057403,0.193558,0.523903,0.545944,0.421135,0.441435,0.579674,1.021109,-0.599974,786O_Day 21
1,AAAGTGGAACTCAGGACATG;AAAGAGTCCACTCTGCACTTG,HPRT intron,G08,UBC,F10,786O,Day 21,2176,1385,1485.0,...,7.509365,0.062268,0.030280,0.029804,0.040784,0.441435,0.272248,0.713683,-0.672899,786O_Day 21
2,AAAGTGGAACTCAGGACATG;AACAGCTCCGTGTACTGAGGC,HPRT intron,G08,CD81,G08,786O,Day 21,2294,1579,1844.0,...,7.820199,0.617168,0.697543,0.819748,0.711486,0.441435,0.593754,1.035189,-0.323703,786O_Day 21
3,AAAGTGGAACTCAGGACATG;AAGACGAAATTGAAGACGAAG,HPRT intron,G08,CD81,G12,786O,Day 21,2169,1229,1705.0,...,7.707652,0.461431,0.262669,0.631877,0.451992,0.441435,0.696023,1.137459,-0.685466,786O_Day 21
4,AAAGTGGAACTCAGGACATG;AAGCGTACTGCTCATCATCGT,HPRT intron,G08,HSP90AA1,D01,786O,Day 21,1771,984,1526.0,...,7.548444,0.452550,0.225973,0.754510,0.477678,0.441435,0.474748,0.916183,-0.438506,786O_Day 21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55291,TTCTGACTACAACATCCAGA;TTGCTTTCATTTAATGCTACA,UBB,F03,PARP2,G03,OVCAR8,Day 21,140,93,291.0,...,6.124881,-0.486649,-1.041673,-0.625293,-0.717872,-0.161739,0.242457,0.080718,-0.798590,OVCAR8_Day 21
55292,TTCTGACTACAACATCCAGA;TTGGGACGAGTCCTGTGAGAA,UBB,F03,IMPDH1,D04,OVCAR8,Day 21,185,270,418.0,...,6.015239,0.018028,0.577453,-0.002889,0.197531,-0.161739,0.743171,0.581432,-0.383901,OVCAR8_Day 21
55293,TTCTGACTACAACATCCAGA;TTTAGGAATTGCTGTTGGGAC,UBB,F03,HPRT intron,H05,OVCAR8,Day 21,215,208,337.0,...,6.324821,-0.077820,-0.104056,-0.617864,-0.266580,-0.161739,0.350282,0.188542,-0.455122,OVCAR8_Day 21
55294,TTCTGACTACAACATCCAGA;TTTCCATCACTTGGTTGAATA,UBB,F03,BCL2A1,B02,OVCAR8,Day 21,178,186,486.0,...,6.466948,-0.488464,-0.405178,-0.240275,-0.377972,-0.161739,0.323509,0.161769,-0.539742,OVCAR8_Day 21


In [406]:
synleth_lib['Gene 1'] = synleth_lib['U6 gene']
synleth_lib['Gene 2'] = synleth_lib['H1 gene']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synleth_lib['Gene 1'] = synleth_lib['U6 gene']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synleth_lib['Gene 2'] = synleth_lib['H1 gene']


In [407]:
guide_1_seqs = [i.split(';')[0] for i in synleth_lib['U6 Sequence;H1 Sequence']]
guide_2_seqs = [i.split(';')[1] for i in synleth_lib['U6 Sequence;H1 Sequence']]

In [408]:
guide_1_ids = [seq_to_guide[i] for i in guide_1_seqs]
guide_2_ids = [seq_to_guide[i] for i in guide_2_seqs]

In [409]:
synleth_lib['Guide 1'] = guide_1_ids
synleth_lib['Guide 2'] = guide_2_ids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synleth_lib['Guide 1'] = guide_1_ids
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synleth_lib['Guide 2'] = guide_2_ids


In [410]:
synleth_lib.columns

Index(['U6 Sequence;H1 Sequence', 'U6 gene', 'U6 well', 'H1 gene', 'H1 well',
       'Cell Line', 'Time Point', 'Rep A Reads', 'Rep B Reads', 'Rep C Reads',
       'pDNA Reads', 'pDNA Log Norm', 'Rep A Log Norm', 'Rep B Log Norm',
       'Rep C Log Norm', 'Rep A LFC', 'Rep B LFC', 'Rep C LFC', 'Avg LFC',
       'U6 expected', 'H1 expected', 'Sum Expected', 'Measured - Expected',
       'CL+TIME', 'Gene 1', 'Gene 2', 'Guide 1', 'Guide 2'],
      dtype='object')

In [411]:
synleth_lib["Count Replicates"] = synleth_lib[['pDNA Reads', 'Rep A Reads', 'Rep B Reads', 'Rep C Reads']].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synleth_lib["Count Replicates"] = synleth_lib[['pDNA Reads', 'Rep A Reads', 'Rep B Reads', 'Rep C Reads']].apply(


In [412]:
synleth_lib["Condition"] = ';'.join(['pDNA_Reads', 'Rep_A_Reads', 'Rep_B_Reads', 'Rep_C_Reads'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synleth_lib["Condition"] = ';'.join(['pDNA_Reads', 'Rep_A_Reads', 'Rep_B_Reads', 'Rep_C_Reads'])


In [413]:
synleth_lib[["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Cell Line", "Condition"]]

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Cell Line,Condition
0,HPRT intron_1,6T_1,HPRT intron,6T,1137.0;2438.0;1997.0;2175.0,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads
1,HPRT intron_1,UBC_1,HPRT intron,UBC,1112.0;2176.0;1385.0;1485.0,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads
2,HPRT intron_1,CD81_1,HPRT intron,CD81,796.0;2294.0;1579.0;1844.0,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads
3,HPRT intron_1,CD81_2,HPRT intron,CD81,839.0;2169.0;1229.0;1705.0,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads
4,HPRT intron_1,HSP90AA1_1,HPRT intron,HSP90AA1,689.0;1771.0;984.0;1526.0,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads
...,...,...,...,...,...,...,...
55291,UBB_6,PARP2_3,UBB,PARP2,431.0;140.0;93.0;291.0,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads
55292,UBB_6,IMPDH1_3,UBB,IMPDH1,399.0;185.0;270.0;418.0,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads
55293,UBB_6,HPRT intron_6,UBB,HPRT intron,496.0;215.0;208.0;337.0,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads
55294,UBB_6,BCL2A1_3,UBB,BCL2A1,548.0;178.0;186.0;486.0,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads


In [414]:
counts_ref = pd.concat([counts_ref, synleth_lib[["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Cell Line", "Condition"]]])
counts_ref["Study"] = [study_name_to_pubmed_id['najm_data']] * counts_ref.shape[0]

In [415]:
counts_ref 

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,HPRT intron_1,6T_1,HPRT intron,6T,1137.0;2438.0;1997.0;2175.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
1,HPRT intron_1,UBC_1,HPRT intron,UBC,1112.0;2176.0;1385.0;1485.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
2,HPRT intron_1,CD81_1,HPRT intron,CD81,796.0;2294.0;1579.0;1844.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
3,HPRT intron_1,CD81_2,HPRT intron,CD81,839.0;2169.0;1229.0;1705.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
4,HPRT intron_1,HSP90AA1_1,HPRT intron,HSP90AA1,689.0;1771.0;984.0;1526.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
...,...,...,...,...,...,...,...,...,...,...
55291,UBB_6,PARP2_3,UBB,PARP2,431.0;140.0;93.0;291.0,,,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
55292,UBB_6,IMPDH1_3,UBB,IMPDH1,399.0;185.0;270.0;418.0,,,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
55293,UBB_6,HPRT intron_6,UBB,HPRT intron,496.0;215.0;208.0;337.0,,,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
55294,UBB_6,BCL2A1_3,UBB,BCL2A1,548.0;178.0;186.0;486.0,,,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726


## SL Score

In [416]:
SL_results = pd.read_csv(os.path.join(najm_loc, "Supplementary Table 4 SynLet FDRs.txt"), sep = '\t')
SL_results['Gene 1'] = [i.upper() for i in SL_results['Gene 1']]
SL_results['Gene 2'] = [i.upper() for i in SL_results['Gene 2']]

In [417]:
SL_results

Unnamed: 0,Gene 1,Gene 2,Group 1,Group 2,Source,SynLet q-value,Buffer q-value
0,AKT1,AKT1,AKT,AKT,no 786O,1.000000,0.0001
1,AKT1,AKT1,AKT,AKT,no A375,1.000000,0.0001
2,AKT1,AKT1,AKT,AKT,no A549,1.000000,0.0001
3,AKT1,AKT1,AKT,AKT,no HT29,1.000000,0.0001
4,AKT1,AKT1,AKT,AKT,no Meljuso,1.000000,0.0001
...,...,...,...,...,...,...,...
10928,EEF2,BCL2A1,Ctrl,Apop,Meljuso,0.845612,1.0000
10929,EEF2,MAP2K1,Ctrl,Mapk,OVCAR8,0.857619,1.0000
10930,EEF2,BCL2,Ctrl,Apop,Meljuso,0.867560,1.0000
10931,EEF2,AKT2,Ctrl,AKT,HT29,0.893805,1.0000


In [418]:
gene_pair_sorted = []
for i in range(SL_results.shape[0]):
    gene1 = SL_results.loc[i, 'Gene 1']
    gene2 = SL_results.loc[i, 'Gene 2']
    
    gene1, gene2 = sorted([gene1, gene2])
    gene_pair_sorted.append('_'.join([gene1, gene2]))

SL_results['gene pair sorted'] = gene_pair_sorted

In [419]:
SL_results

Unnamed: 0,Gene 1,Gene 2,Group 1,Group 2,Source,SynLet q-value,Buffer q-value,gene pair sorted
0,AKT1,AKT1,AKT,AKT,no 786O,1.000000,0.0001,AKT1_AKT1
1,AKT1,AKT1,AKT,AKT,no A375,1.000000,0.0001,AKT1_AKT1
2,AKT1,AKT1,AKT,AKT,no A549,1.000000,0.0001,AKT1_AKT1
3,AKT1,AKT1,AKT,AKT,no HT29,1.000000,0.0001,AKT1_AKT1
4,AKT1,AKT1,AKT,AKT,no Meljuso,1.000000,0.0001,AKT1_AKT1
...,...,...,...,...,...,...,...,...
10928,EEF2,BCL2A1,Ctrl,Apop,Meljuso,0.845612,1.0000,BCL2A1_EEF2
10929,EEF2,MAP2K1,Ctrl,Mapk,OVCAR8,0.857619,1.0000,EEF2_MAP2K1
10930,EEF2,BCL2,Ctrl,Apop,Meljuso,0.867560,1.0000,BCL2_EEF2
10931,EEF2,AKT2,Ctrl,AKT,HT29,0.893805,1.0000,AKT2_EEF2


In [420]:
all_cell_lines = {'786O', 'A375', 'Meljuso', 'OVCAR8', 'A549', 'HT29'}

In [421]:
single_cell_lines = np.array([True if i in all_cell_lines else False for i in SL_results['Source']])
SL_results = SL_results.loc[single_cell_lines]

In [422]:
SL_results = SL_results.reset_index(drop=True)

In [423]:
all_GI = pd.DataFrame(columns = ["Gene_A", "Gene_B", "Study_Source", "Cell_Line", "GI_Score", "GI_Cutoff", "Stat_Score", "Stat_Cutoff"])
all_GI["Gene_A"] = SL_results['Gene 1']
all_GI["Gene_B"] = SL_results['Gene 2']
all_GI["GI_Score"] = SL_results['SynLet q-value']
all_GI["GI_Cutoff"] = [0.01] * len(all_GI["Gene_A"])
all_GI["Study_Source"] = [study_name_to_pubmed_id['najm_data']] * len(all_GI["Gene_A"])
all_GI["Cell_Line"] = SL_results['Source']
all_GI = all_GI.fillna(0)
all_GI["Stat_Score"] = [float("nan")] * len(all_GI["Gene_A"])
all_GI["Stat_Cutoff"] = [float("nan")] * len(all_GI["Gene_A"])

In [424]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,HPRT intron_1,6T_1,HPRT intron,6T,1137.0;2438.0;1997.0;2175.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
1,HPRT intron_1,UBC_1,HPRT intron,UBC,1112.0;2176.0;1385.0;1485.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
2,HPRT intron_1,CD81_1,HPRT intron,CD81,796.0;2294.0;1579.0;1844.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
3,HPRT intron_1,CD81_2,HPRT intron,CD81,839.0;2169.0;1229.0;1705.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
4,HPRT intron_1,HSP90AA1_1,HPRT intron,HSP90AA1,689.0;1771.0;984.0;1526.0,,,786O,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
...,...,...,...,...,...,...,...,...,...,...
55291,UBB_6,PARP2_3,UBB,PARP2,431.0;140.0;93.0;291.0,,,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
55292,UBB_6,IMPDH1_3,UBB,IMPDH1,399.0;185.0;270.0;418.0,,,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
55293,UBB_6,HPRT intron_6,UBB,HPRT intron,496.0;215.0;208.0;337.0,,,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726
55294,UBB_6,BCL2A1_3,UBB,BCL2A1,548.0;178.0;186.0;486.0,,,OVCAR8,pDNA_Reads;Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,29251726


In [425]:
all_GI

Unnamed: 0,Gene_A,Gene_B,Study_Source,Cell_Line,GI_Score,GI_Cutoff,Stat_Score,Stat_Cutoff
0,AKT1,AKT2,29251726,HT29,0.001196,0.01,,
1,AKT2,AKT1,29251726,HT29,0.001196,0.01,,
2,AKT2,AKT3,29251726,Meljuso,0.091691,0.01,,
3,AKT3,AKT2,29251726,Meljuso,0.091691,0.01,,
4,AKT1,AKT3,29251726,OVCAR8,0.131109,0.01,,
...,...,...,...,...,...,...,...,...
5041,EEF2,BCL2A1,29251726,Meljuso,0.845612,0.01,,
5042,EEF2,MAP2K1,29251726,OVCAR8,0.857619,0.01,,
5043,EEF2,BCL2,29251726,Meljuso,0.867560,0.01,,
5044,EEF2,AKT2,29251726,HT29,0.893805,0.01,,


In [426]:
all_GI['Cell_Line']

0          HT29
1          HT29
2       Meljuso
3       Meljuso
4        OVCAR8
         ...   
5041    Meljuso
5042     OVCAR8
5043    Meljuso
5044       HT29
5045     OVCAR8
Name: Cell_Line, Length: 5046, dtype: object

In [427]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = all_GI.copy(), study_controls = controls['najm_data'], study_conditions = study_conditions['najm_data'])

Starting processing...
Score reference...
Controls within SL score that are removed: 
1272
---
Only GI cutoff is present...
Counts reference...
Number of double pairs: 32400
Number of controls: 2646
Number of singles: 20250
Sequence reference...
Done! Returning...


In [428]:
# drop duplicates (Najm Specific)
db_inserts['score_ref'] = db_inserts['score_ref'].drop_duplicates(subset = ['gene_pair', 'cell_line_origin'])

In [429]:
#db_inserts['score_ref']

In [430]:
# Najm 
#db_inserts['score_ref'].loc[db_inserts['score_ref']['gene_1'] != db_inserts['score_ref']['gene_2'], 'SL_or_not'].value_counts()

In [431]:
#db_inserts['score_ref']['SL_or_not'].value_counts()

In [432]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 192
Counts insert: 55296
Score insert: 1800
Done!


## ShanTang Data (Saos2 and TT2 cell line)

In [122]:
shantang_loc = os.path.join(learning_goals_loc_general, "ShanTang", 'OS_DKOScreen_Final')
lib_file = pd.read_csv(os.path.join(shantang_loc, "lib_dko_os.txt"), sep = '\t')
saos_counts = pd.read_csv(os.path.join(shantang_loc, "OS_DKO_SaOS2_Miss1_20230209.txt"), sep = ' ')
tt2_counts = pd.read_csv(os.path.join(shantang_loc, "OS_DKO_TT2_Miss1_20230209.txt"), sep = '\t')

In [123]:
for column in ['target_a_id', 'target_b_id']:
    lib_file[column] = [i.split(':')[0] for i in lib_file[column]]

In [124]:
idx = (lib_file['target_a_id'].isin(['luciferase', 'EGFP', 'LacZ'])) | (lib_file['target_b_id'].isin(['luciferase', 'EGFP', 'LacZ']))

In [125]:
idx.sum()

5379

In [126]:
# remove the 3 of them
lib_file = lib_file[~idx]

In [127]:
lib_file

Unnamed: 0,construct_id,target_a_id,probe_a_id,probe_a_seq,target_b_id,probe_b_id,probe_b_seq
0,BRD4_1__BRD4_1,BRD4,BRD4_1,GGGAACAATAAAGAAGCGCT,BRD4,BRD4_1,GGGAACAATAAAGAAGCGCT
1,BRD4_2__BRD4_1,BRD4,BRD4_2,ACAGGAGGAGGATTCGGCTG,BRD4,BRD4_1,GGGAACAATAAAGAAGCGCT
2,BRD4_3__BRD4_1,BRD4,BRD4_3,CACCAAACTCCTGAGCATCA,BRD4,BRD4_1,GGGAACAATAAAGAAGCGCT
3,BRD4_4__BRD4_1,BRD4,BRD4_4,TTCGACTGATGACTCTGAGG,BRD4,BRD4_1,GGGAACAATAAAGAAGCGCT
4,CHEK1_1__BRD4_1,CHEK1,CHEK1_1,TGGTATTGGAATAACTCACA,BRD4,BRD4_1,GGGAACAATAAAGAAGCGCT
...,...,...,...,...,...,...,...
59734,0Safe_safe_TMM_204550.337__0Safe_safe_U2_20455...,0Safe,0Safe_safe_TMM_204550.337,GAGAGTATACATTCAACC,0Safe,0Safe_safe_U2_204550.3037,GTAGCTGACATTGCTAT
59735,0Safe_safe_U1_204550.3846__0Safe_safe_U2_20455...,0Safe,0Safe_safe_U1_204550.3846,GGTTTAAACCCTTTAAAAT,0Safe,0Safe_safe_U2_204550.3037,GTAGCTGACATTGCTAT
59736,0Safe_safe_U1_204550.4037__0Safe_safe_U2_20455...,0Safe,0Safe_safe_U1_204550.4037,GATGAAAGTCGAATCCTAT,0Safe,0Safe_safe_U2_204550.3037,GTAGCTGACATTGCTAT
59737,0Safe_safe_U2_204550.3016__0Safe_safe_U2_20455...,0Safe,0Safe_safe_U2_204550.3016,GCACAAAAGTATTGGGGT,0Safe,0Safe_safe_U2_204550.3037,GTAGCTGACATTGCTAT


In [82]:
saos_counts

Unnamed: 0,construct_id,S0A,S0B,S0C,SEA,SEB,SEC
0,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,9,12,12,19,33,30
1,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,32,29,39,32,62,90
2,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,22,36,39,31,39,26
3,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,30,36,24,64,95,101
4,0Safe_safe_ACOC_204550.4522__0Safe_safe_GE_204...,23,30,43,31,58,61
...,...,...,...,...,...,...,...
62495,YAP1_4__WEE1_4,5,5,11,1,2,8
62496,YAP1_4__YAP1_1,35,59,62,56,124,72
62497,YAP1_4__YAP1_2,33,44,60,49,54,98
62498,YAP1_4__YAP1_3,40,50,48,20,17,42


In [83]:
tt2_counts

Unnamed: 0,construct_id,TEA,T0A,TEC,TEB,T0B,T0C
0,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,15,11,24,40,9,9
1,0Safe_safe_ACOC_204550.4522__0Safe_safe_ACOC_2...,36,32,49,36,51,39
2,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,26,23,34,36,22,40
3,0Safe_safe_ACOC_204550.4522__0Safe_safe_DTKP_2...,47,36,64,64,38,53
4,0Safe_safe_ACOC_204550.4522__0Safe_safe_GE_204...,59,28,56,78,92,71
...,...,...,...,...,...,...,...
62495,luciferase:1028-1047_luciferase_-__YAP1_3,40,82,80,28,85,78
62496,luciferase:1028-1047_luciferase_-__YAP1_4,11,55,57,60,48,49
62497,luciferase:1028-1047_luciferase_-__luciferase:...,18,18,15,23,10,24
62498,luciferase:1028-1047_luciferase_-__luciferase:...,0,0,0,0,1,0


In [84]:
for column in ['target_a_id', 'probe_a_id', 'target_b_id', 'probe_b_id']:
    lib_file[column] = ['-'.join(i.split('_')) for i in lib_file[column]]

### Seq Ref

In [92]:
sgRNA_ref = pd.DataFrame(columns = ["Guide_ID", "Sequence"])
sgRNA_ref['Guide_ID'] = pd.concat([lib_file['probe_a_id'], lib_file['probe_b_id']])
sgRNA_ref['Sequence'] = pd.concat([lib_file['probe_a_seq'], lib_file['probe_b_seq']])
sgRNA_ref.index = sgRNA_ref['Guide_ID']
sgRNA_ref = sgRNA_ref.drop_duplicates()
sgRNA_ref = sgRNA_ref.reset_index(drop = True)

In [93]:
sgRNA_ref['Guide_ID'] = [i.upper() for i in sgRNA_ref['Guide_ID']]

In [94]:
sgRNA_ref

Unnamed: 0,Guide_ID,Sequence
0,BRD4-1,GGGAACAATAAAGAAGCGCT
1,BRD4-2,ACAGGAGGAGGATTCGGCTG
2,BRD4-3,CACCAAACTCCTGAGCATCA
3,BRD4-4,TTCGACTGATGACTCTGAGG
4,CHEK1-1,TGGTATTGGAATAACTCACA
...,...,...
234,0SAFE-SAFE-TMM-204550.337,GAGAGTATACATTCAACC
235,0SAFE-SAFE-U1-204550.3846,GGTTTAAACCCTTTAAAAT
236,0SAFE-SAFE-U1-204550.4037,GATGAAAGTCGAATCCTAT
237,0SAFE-SAFE-U2-204550.3016,GCACAAAAGTATTGGGGT


In [95]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,BRD4-1,GGGAACAATAAAGAAGCGCT,BRD4
1,BRD4-2,ACAGGAGGAGGATTCGGCTG,BRD4
2,BRD4-3,CACCAAACTCCTGAGCATCA,BRD4
3,BRD4-4,TTCGACTGATGACTCTGAGG,BRD4
4,CHEK1-1,TGGTATTGGAATAACTCACA,CHEK1
...,...,...,...
234,0Safe-safe-TMM-204550.337,GAGAGTATACATTCAACC,0Safe
235,0Safe-safe-U1-204550.3846,GGTTTAAACCCTTTAAAAT,0Safe
236,0Safe-safe-U1-204550.4037,GATGAAAGTCGAATCCTAT,0Safe
237,0Safe-safe-U2-204550.3016,GCACAAAAGTATTGGGGT,0Safe


In [96]:
sequence_ref = sgRNA_ref.copy()
sequence_ref['Target'] = [i.split('-')[0] for i in sequence_ref['Guide_ID']]
sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']
sequence_ref.loc[sequence_ref['sgRNA_target_name'].isin(controls['shantang_2CL_data']), 'sgRNA_target_name'] = 'control'


In [97]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,BRD4-1,GGGAACAATAAAGAAGCGCT,BRD4
1,BRD4-2,ACAGGAGGAGGATTCGGCTG,BRD4
2,BRD4-3,CACCAAACTCCTGAGCATCA,BRD4
3,BRD4-4,TTCGACTGATGACTCTGAGG,BRD4
4,CHEK1-1,TGGTATTGGAATAACTCACA,CHEK1
...,...,...,...
234,0SAFE-SAFE-TMM-204550.337,GAGAGTATACATTCAACC,control
235,0SAFE-SAFE-U1-204550.3846,GGTTTAAACCCTTTAAAAT,control
236,0SAFE-SAFE-U1-204550.4037,GATGAAAGTCGAATCCTAT,control
237,0SAFE-SAFE-U2-204550.3016,GCACAAAAGTATTGGGGT,control


In [98]:
(sequence_ref['sgRNA_target_name'] == 'control').sum()

15

### Counts

In [33]:
saos_counts = lib_file.merge(saos_counts, how = 'left', left_on = 'construct_id', right_on = 'construct_id')
tt2_counts = lib_file.merge(tt2_counts, how = 'left', left_on = 'construct_id', right_on = 'construct_id')

In [34]:
saos_counts['Count Replicates'] = saos_counts[['S0A', 'S0B', 'S0C', 'SEA', 'SEB', 'SEC']].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)
saos_counts['Cell Line'] = ['SAOS2'] * saos_counts.shape[0]
saos_counts["Condition"] = [';'.join(['S0A', 'S0B', 'S0C', 'SEA', 'SEB', 'SEC'])] * saos_counts.shape[0]

tt2_counts['Count Replicates'] = tt2_counts[['T0A', 'T0B', 'T0C', 'TEA', 'TEB', 'TEC']].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)

tt2_counts['Cell Line'] = ['TT2'] * tt2_counts.shape[0]
tt2_counts["Condition"] = [';'.join(['T0A', 'T0B', 'T0C', 'TEA', 'TEB', 'TEC'])] * tt2_counts.shape[0]

In [35]:
saos_counts

Unnamed: 0,construct_id,target_a_id,probe_a_id,probe_a_seq,target_b_id,probe_b_id,probe_b_seq,S0A,S0B,S0C,SEA,SEB,SEC,Count Replicates,Cell Line,Condition
0,BRD4_1__BRD4_1,BRD4,BRD4-1,GGGAACAATAAAGAAGCGCT,BRD4,BRD4-1,GGGAACAATAAAGAAGCGCT,163,110,163,78,103,101,163;110;163;78;103;101,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC
1,BRD4_2__BRD4_1,BRD4,BRD4-2,ACAGGAGGAGGATTCGGCTG,BRD4,BRD4-1,GGGAACAATAAAGAAGCGCT,719,525,845,308,449,434,719;525;845;308;449;434,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC
2,BRD4_3__BRD4_1,BRD4,BRD4-3,CACCAAACTCCTGAGCATCA,BRD4,BRD4-1,GGGAACAATAAAGAAGCGCT,170,164,240,65,98,125,170;164;240;65;98;125,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC
3,BRD4_4__BRD4_1,BRD4,BRD4-4,TTCGACTGATGACTCTGAGG,BRD4,BRD4-1,GGGAACAATAAAGAAGCGCT,194,179,231,107,134,177,194;179;231;107;134;177,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC
4,CHEK1_1__BRD4_1,CHEK1,CHEK1-1,TGGTATTGGAATAACTCACA,BRD4,BRD4-1,GGGAACAATAAAGAAGCGCT,24,21,35,14,20,12,24;21;35;14;20;12,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57116,0Safe_safe_TMM_204550.337__0Safe_safe_U2_20455...,0Safe,0Safe-safe-TMM-204550.337,GAGAGTATACATTCAACC,0Safe,0Safe-safe-U2-204550.3037,GTAGCTGACATTGCTAT,5,11,12,22,20,18,5;11;12;22;20;18,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC
57117,0Safe_safe_U1_204550.3846__0Safe_safe_U2_20455...,0Safe,0Safe-safe-U1-204550.3846,GGTTTAAACCCTTTAAAAT,0Safe,0Safe-safe-U2-204550.3037,GTAGCTGACATTGCTAT,12,14,11,18,37,28,12;14;11;18;37;28,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC
57118,0Safe_safe_U1_204550.4037__0Safe_safe_U2_20455...,0Safe,0Safe-safe-U1-204550.4037,GATGAAAGTCGAATCCTAT,0Safe,0Safe-safe-U2-204550.3037,GTAGCTGACATTGCTAT,4,8,10,11,24,25,4;8;10;11;24;25,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC
57119,0Safe_safe_U2_204550.3016__0Safe_safe_U2_20455...,0Safe,0Safe-safe-U2-204550.3016,GCACAAAAGTATTGGGGT,0Safe,0Safe-safe-U2-204550.3037,GTAGCTGACATTGCTAT,56,31,86,70,76,106,56;31;86;70;76;106,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC


In [36]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [37]:
study_name_to_pubmed_id.keys()

dict_keys(['diehl_data', 'han_data', 'horlbeck_data', 'ito_data', 'parrish_data', 'shen_data', 'thompson_data', 'wong_data', 'zhao_data', 'shantang_data', 'najm_data', 'shantang_2CL_data'])

In [38]:
counts_ref['Guide 1'] = saos_counts['probe_a_id'].tolist() + tt2_counts['probe_a_id'].tolist()
counts_ref['Guide 2'] = saos_counts['probe_b_id'].tolist() + tt2_counts['probe_b_id'].tolist()
counts_ref['Gene 1'] = saos_counts['target_a_id'].tolist() + tt2_counts['target_a_id'].tolist()
counts_ref['Gene 2'] = saos_counts['target_b_id'].tolist() + tt2_counts['target_b_id'].tolist()
counts_ref['Count Replicates'] = saos_counts['Count Replicates'].tolist() + tt2_counts['Count Replicates'].tolist()
counts_ref['Cell Line'] = saos_counts['Cell Line'].tolist() + tt2_counts['Cell Line'].tolist()
counts_ref['Condition'] = saos_counts['Condition'].tolist() + tt2_counts['Condition'].tolist()
counts_ref['Sequencing'] = ['Combinatorial CRISPR'] * counts_ref.shape[0]
counts_ref["Study"] = [study_name_to_pubmed_id['shantang_2CL_data']] * counts_ref.shape[0]

In [39]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,BRD4-1,BRD4-1,BRD4,BRD4,163;110;163;78;103;101,,Combinatorial CRISPR,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC,Shan_2CL
1,BRD4-2,BRD4-1,BRD4,BRD4,719;525;845;308;449;434,,Combinatorial CRISPR,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC,Shan_2CL
2,BRD4-3,BRD4-1,BRD4,BRD4,170;164;240;65;98;125,,Combinatorial CRISPR,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC,Shan_2CL
3,BRD4-4,BRD4-1,BRD4,BRD4,194;179;231;107;134;177,,Combinatorial CRISPR,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC,Shan_2CL
4,CHEK1-1,BRD4-1,CHEK1,BRD4,24;21;35;14;20;12,,Combinatorial CRISPR,SAOS2,S0A;S0B;S0C;SEA;SEB;SEC,Shan_2CL
...,...,...,...,...,...,...,...,...,...,...
114237,0Safe-safe-TMM-204550.337,0Safe-safe-U2-204550.3037,0Safe,0Safe,22;12;15;6;20;14,,Combinatorial CRISPR,TT2,T0A;T0B;T0C;TEA;TEB;TEC,Shan_2CL
114238,0Safe-safe-U1-204550.3846,0Safe-safe-U2-204550.3037,0Safe,0Safe,21;37;25;16;24;14,,Combinatorial CRISPR,TT2,T0A;T0B;T0C;TEA;TEB;TEC,Shan_2CL
114239,0Safe-safe-U1-204550.4037,0Safe-safe-U2-204550.3037,0Safe,0Safe,10;3;8;12;81;14,,Combinatorial CRISPR,TT2,T0A;T0B;T0C;TEA;TEB;TEC,Shan_2CL
114240,0Safe-safe-U2-204550.3016,0Safe-safe-U2-204550.3037,0Safe,0Safe,50;50;64;37;59;73,,Combinatorial CRISPR,TT2,T0A;T0B;T0C;TEA;TEB;TEC,Shan_2CL


## SL Scores

In [40]:
# there are no scores available, generate them instead

In [41]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = None, study_controls = controls['shantang_2CL_data'], study_conditions = study_conditions['shantang_2CL_data'])

There are no scores, but there are counts...Generating Placeholder...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['sorted_genes'] = ['_'.join(sorted([curr_counts['Gene 1'].iloc[i], curr_counts['Gene 2'].iloc[i]])) for i in range(curr_counts.shape[0])]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts.drop_duplicates(subset = ['sorted_genes', 'Cell Line'], keep = 'first', inplace = True)


Starting processing...
Score reference...
Controls within SL score that are removed: 
0
---
No scores/stats cutoffs are available, possibly generated. Setting all to be NOT SL
Counts reference...
Number of double pairs: 98560
Number of controls: 450
Number of singles: 15232
Sequence reference...
Done! Returning...


In [42]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 239
Counts insert: 114242
Score insert: 3080
Done!


## Shuai Data

In [154]:
shuai_loc = os.path.join(learning_goals_loc_general, "Shuai")
lib_file = pd.read_csv(os.path.join(shuai_loc, "cdko_lib.txt"), sep = '\t', skiprows = 3)
counts_file = pd.read_csv(os.path.join(shuai_loc, "CDCDKOLib_20230201003949_counts_combined.txt"), sep = '\t')


In [155]:
for column in ['target_a_id', 'probe_a_id', 'target_b_id', 'probe_b_id']:
    lib_file[column] = ['-'.join(i.split('_')) for i in lib_file[column]]

In [156]:
lib_file['target_a_id'] = [i.split('-')[0] for i in lib_file['probe_a_id']]
lib_file['target_b_id'] = [i.split('-')[0] for i in lib_file['probe_b_id']]

In [157]:
lib_file

Unnamed: 0,construct_id,target_a_id,probe_a_id,probe_a_seq,target_b_id,probe_b_id,probe_b_seq
0,FTH1_1__FTH1_1,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-1,CACCATGGACAGGTAAACGT
1,FTH1_1__FTH1_2,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-2,TGTTCACCTTGATATCCTGA
2,FTH1_1__FTH1_3,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-3,AATCTTCCTTCAGGATATCA
3,FTH1_1__ANAPC10_1,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,ANAPC10,ANAPC10-1,GACATACCCGAATTTCTTGA
4,FTH1_1__ANAPC10_2,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,ANAPC10,ANAPC10-2,ATTTAGTGAACATCCAATTC
...,...,...,...,...,...,...,...
52895,0Safe_safe_U1_204550.4461_1__0Safe_safe_DTKP_2...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-DTKP-204550.5816-1,GTGGAGTGGCGGAATCAGAT
52896,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4097-1,GCTTCAATATGACAGAACTC
52897,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4192-1,GGATGATCAAAAATACTGTA
52898,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4450-1,GCTTATTTAGTTTGGTTCAA


In [158]:
counts_file

Unnamed: 0,construct_id,F01_S4_001_trimmed53_len_filtered_counts,F02_S5_001_trimmed53_len_filtered_counts,F03_S6_001_trimmed53_len_filtered_counts,T01_S1_001_trimmed53_len_filtered_counts,T02_S2_001_trimmed53_len_filtered_counts,T03_S3_001_trimmed53_len_filtered_counts
0,0Safe_safe_ACOC_204550.4590_1__0Safe_safe_ACOC...,344,572,563,524,638,647
1,0Safe_safe_ACOC_204550.4590_1__0Safe_safe_ACOC...,57,68,68,72,95,178
2,0Safe_safe_ACOC_204550.4590_1__0Safe_safe_ACOC...,126,218,237,80,104,131
3,0Safe_safe_ACOC_204550.4590_1__0Safe_safe_ACOC...,75,161,113,49,63,106
4,0Safe_safe_ACOC_204550.4590_1__0Safe_safe_ACOC...,48,100,74,42,65,70
...,...,...,...,...,...,...,...
52895,XIAP_3__WEE1_2,37,62,70,508,676,595
52896,XIAP_3__WEE1_3,54,109,75,185,230,236
52897,XIAP_3__XIAP_1,411,538,585,429,560,572
52898,XIAP_3__XIAP_2,285,498,504,371,497,482


### Seq Ref

In [159]:
sgRNA_ref = pd.DataFrame(columns = ["Guide_ID", "Sequence"])
sgRNA_ref['Guide_ID'] = pd.concat([lib_file['probe_a_id'], lib_file['probe_b_id']])
sgRNA_ref['Sequence'] = pd.concat([lib_file['probe_a_seq'], lib_file['probe_b_seq']])
sgRNA_ref.index = sgRNA_ref['Guide_ID']
sgRNA_ref = sgRNA_ref.drop_duplicates()
sgRNA_ref = sgRNA_ref.reset_index(drop = True)

In [160]:
sgRNA_ref['Guide_ID'] = [i.upper() for i in sgRNA_ref['Guide_ID']]

In [161]:
sequence_ref = sgRNA_ref.copy()
sequence_ref['Target'] = [i.split('-')[0] for i in sequence_ref['Guide_ID']]
sequence_ref.columns = ['sgRNA_guide_name', 'sgRNA_guide_seq', 'sgRNA_target_name']
sequence_ref.loc[sequence_ref['sgRNA_target_name'].isin(controls['shuai_data']), 'sgRNA_target_name'] = 'control'


In [162]:
sequence_ref

Unnamed: 0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name
0,FTH1-1,CACCATGGACAGGTAAACGT,FTH1
1,FTH1-2,TGTTCACCTTGATATCCTGA,FTH1
2,FTH1-3,AATCTTCCTTCAGGATATCA,FTH1
3,ANAPC10-1,GACATACCCGAATTTCTTGA,ANAPC10
4,ANAPC10-2,ATTTAGTGAACATCCAATTC,ANAPC10
...,...,...,...
225,0SAFE-SAFE-DTKP-204550.5816-1,GTGGAGTGGCGGAATCAGAT,control
226,0SAFE-SAFE-U1-204550.4097-1,GCTTCAATATGACAGAACTC,control
227,0SAFE-SAFE-U1-204550.4192-1,GGATGATCAAAAATACTGTA,control
228,0SAFE-SAFE-U1-204550.4450-1,GCTTATTTAGTTTGGTTCAA,control


In [163]:
(sequence_ref['sgRNA_target_name'] == 'control').sum()

17

### Counts

In [164]:
counts_file = lib_file.merge(counts_file, how = 'left', left_on = 'construct_id', right_on = 'construct_id')

In [165]:
counts_file

Unnamed: 0,construct_id,target_a_id,probe_a_id,probe_a_seq,target_b_id,probe_b_id,probe_b_seq,F01_S4_001_trimmed53_len_filtered_counts,F02_S5_001_trimmed53_len_filtered_counts,F03_S6_001_trimmed53_len_filtered_counts,T01_S1_001_trimmed53_len_filtered_counts,T02_S2_001_trimmed53_len_filtered_counts,T03_S3_001_trimmed53_len_filtered_counts
0,FTH1_1__FTH1_1,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,25,30,48,24,43,41
1,FTH1_1__FTH1_2,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-2,TGTTCACCTTGATATCCTGA,79,77,77,26,57,46
2,FTH1_1__FTH1_3,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-3,AATCTTCCTTCAGGATATCA,74,89,119,3,1,4
3,FTH1_1__ANAPC10_1,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,ANAPC10,ANAPC10-1,GACATACCCGAATTTCTTGA,147,201,235,63,73,84
4,FTH1_1__ANAPC10_2,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,ANAPC10,ANAPC10-2,ATTTAGTGAACATCCAATTC,70,127,119,6,7,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52895,0Safe_safe_U1_204550.4461_1__0Safe_safe_DTKP_2...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-DTKP-204550.5816-1,GTGGAGTGGCGGAATCAGAT,65,104,114,83,97,122
52896,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4097-1,GCTTCAATATGACAGAACTC,2,13,3,0,0,1
52897,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4192-1,GGATGATCAAAAATACTGTA,28,46,41,23,34,55
52898,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4450-1,GCTTATTTAGTTTGGTTCAA,20,43,42,0,0,1


In [166]:
study_conditions['shuai_data']

[['T1', 'T2', 'T3'], ['F1', 'F2', 'F3']]

In [167]:
counts_file['Count Replicates'] = counts_file[['T01_S1_001_trimmed53_len_filtered_counts', 'T02_S2_001_trimmed53_len_filtered_counts', 'T03_S3_001_trimmed53_len_filtered_counts',
                                              'F01_S4_001_trimmed53_len_filtered_counts', 'F02_S5_001_trimmed53_len_filtered_counts', 'F03_S6_001_trimmed53_len_filtered_counts']].apply(    
    lambda x: ';'.join(x.astype(str)),
    axis=1
)
counts_file['Cell Line'] = ['Shuai_CL'] * counts_file.shape[0]
counts_file["Condition"] = [';'.join(['T1', 'T2', 'T3', 'F1', 'F2', 'F3'])] * counts_file.shape[0]

In [168]:
counts_file

Unnamed: 0,construct_id,target_a_id,probe_a_id,probe_a_seq,target_b_id,probe_b_id,probe_b_seq,F01_S4_001_trimmed53_len_filtered_counts,F02_S5_001_trimmed53_len_filtered_counts,F03_S6_001_trimmed53_len_filtered_counts,T01_S1_001_trimmed53_len_filtered_counts,T02_S2_001_trimmed53_len_filtered_counts,T03_S3_001_trimmed53_len_filtered_counts,Count Replicates,Cell Line,Condition
0,FTH1_1__FTH1_1,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,25,30,48,24,43,41,24;43;41;25;30;48,Shuai_CL,T1;T2;T3;F1;F2;F3
1,FTH1_1__FTH1_2,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-2,TGTTCACCTTGATATCCTGA,79,77,77,26,57,46,26;57;46;79;77;77,Shuai_CL,T1;T2;T3;F1;F2;F3
2,FTH1_1__FTH1_3,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,FTH1,FTH1-3,AATCTTCCTTCAGGATATCA,74,89,119,3,1,4,3;1;4;74;89;119,Shuai_CL,T1;T2;T3;F1;F2;F3
3,FTH1_1__ANAPC10_1,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,ANAPC10,ANAPC10-1,GACATACCCGAATTTCTTGA,147,201,235,63,73,84,63;73;84;147;201;235,Shuai_CL,T1;T2;T3;F1;F2;F3
4,FTH1_1__ANAPC10_2,FTH1,FTH1-1,CACCATGGACAGGTAAACGT,ANAPC10,ANAPC10-2,ATTTAGTGAACATCCAATTC,70,127,119,6,7,14,6;7;14;70;127;119,Shuai_CL,T1;T2;T3;F1;F2;F3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52895,0Safe_safe_U1_204550.4461_1__0Safe_safe_DTKP_2...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-DTKP-204550.5816-1,GTGGAGTGGCGGAATCAGAT,65,104,114,83,97,122,83;97;122;65;104;114,Shuai_CL,T1;T2;T3;F1;F2;F3
52896,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4097-1,GCTTCAATATGACAGAACTC,2,13,3,0,0,1,0;0;1;2;13;3,Shuai_CL,T1;T2;T3;F1;F2;F3
52897,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4192-1,GGATGATCAAAAATACTGTA,28,46,41,23,34,55,23;34;55;28;46;41,Shuai_CL,T1;T2;T3;F1;F2;F3
52898,0Safe_safe_U1_204550.4461_1__0Safe_safe_U1_204...,0Safe,0Safe-safe-U1-204550.4461-1,GTCTTAAAGAGTAAAGTACA,0Safe,0Safe-safe-U1-204550.4450-1,GCTTATTTAGTTTGGTTCAA,20,43,42,0,0,1,0;0;1;20;43;42,Shuai_CL,T1;T2;T3;F1;F2;F3


In [169]:
counts_ref = pd.DataFrame(columns = ["Guide 1", "Guide 2", "Gene 1", "Gene 2", "Count Replicates", "Type", "Sequencing", "Cell Line", "Condition"])

In [170]:
study_name_to_pubmed_id.keys()

dict_keys(['diehl_data', 'han_data', 'horlbeck_data', 'ito_data', 'parrish_data', 'shen_data', 'thompson_data', 'wong_data', 'zhao_data', 'shantang_data', 'najm_data', 'shantang_2CL_data', 'shuai_data'])

In [171]:
counts_ref['Guide 1'] = counts_file['probe_a_id'].tolist()
counts_ref['Guide 2'] = counts_file['probe_b_id'].tolist()
counts_ref['Gene 1'] = counts_file['target_a_id'].tolist()
counts_ref['Gene 2'] = counts_file['target_b_id'].tolist()
counts_ref['Count Replicates'] = counts_file['Count Replicates'].tolist()
counts_ref['Cell Line'] = counts_file['Cell Line'].tolist()
counts_ref['Condition'] = counts_file['Condition'].tolist()
counts_ref['Sequencing'] = ['Combinatorial CRISPR'] * counts_ref.shape[0]
counts_ref["Study"] = [study_name_to_pubmed_id['shuai_data']] * counts_ref.shape[0]

In [172]:
counts_ref

Unnamed: 0,Guide 1,Guide 2,Gene 1,Gene 2,Count Replicates,Type,Sequencing,Cell Line,Condition,Study
0,FTH1-1,FTH1-1,FTH1,FTH1,24;43;41;25;30;48,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai
1,FTH1-1,FTH1-2,FTH1,FTH1,26;57;46;79;77;77,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai
2,FTH1-1,FTH1-3,FTH1,FTH1,3;1;4;74;89;119,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai
3,FTH1-1,ANAPC10-1,FTH1,ANAPC10,63;73;84;147;201;235,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai
4,FTH1-1,ANAPC10-2,FTH1,ANAPC10,6;7;14;70;127;119,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai
...,...,...,...,...,...,...,...,...,...,...
52895,0Safe-safe-U1-204550.4461-1,0Safe-safe-DTKP-204550.5816-1,0Safe,0Safe,83;97;122;65;104;114,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai
52896,0Safe-safe-U1-204550.4461-1,0Safe-safe-U1-204550.4097-1,0Safe,0Safe,0;0;1;2;13;3,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai
52897,0Safe-safe-U1-204550.4461-1,0Safe-safe-U1-204550.4192-1,0Safe,0Safe,23;34;55;28;46;41,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai
52898,0Safe-safe-U1-204550.4461-1,0Safe-safe-U1-204550.4450-1,0Safe,0Safe,0;0;1;20;43;42,,Combinatorial CRISPR,Shuai_CL,T1;T2;T3;F1;F2;F3,Shuai


## SL Scores

In [173]:
# there are no scores available, generate them instead

In [174]:
# prepare the files for insertion
db_inserts = prepare_study_for_export(sequence_ref = sequence_ref.copy(), counts_ref = counts_ref.copy(), score_ref = None, study_controls = controls['shuai_data'], study_conditions = study_conditions['shuai_data'])

There are no scores, but there are counts...Generating Placeholder...
Starting processing...
Score reference...
Controls within SL score that are removed: 
71
---
No scores/stats cutoffs are available, possibly generated. Setting all to be NOT SL
Counts reference...
Number of double pairs: 44730
Number of controls: 289
Number of singles: 7881
Sequence reference...
Done! Returning...


In [175]:
# insert to the database
insert_study_to_db(engine_link = SLKB_engine, db_inserts = db_inserts)

Final QC...
Beginning transaction...
Done sequence
Done counts
Done score
Successfully inserted!
Added Record stats...
Sequence insert: 230
Counts insert: 52900
Score insert: 2485
Done!


## Done!