In [1]:
import numpy as np
import pandas as pd
import os
import math
from itertools import chain
import pickle
import sqlalchemy
from sqlalchemy.orm import sessionmaker
from scipy import optimize
from scipy.stats import sem
import subprocess
import shlex

In [2]:
from sqlalchemy.engine import Engine
from sqlalchemy import event

@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    cursor = dbapi_connection.cursor()
    cursor.execute("PRAGMA foreign_keys=ON")
    cursor.close()

## Extract Original SL Results

In [3]:
# Load in the datasets
data_locs = "/users/PAS1376/bg12/SyntheticLethality/SyntheticLethalityReview/Project/ml_inputs"
learning_goals_loc_general =  os.path.join(data_locs, "learning_goals")
learning_goals_loc_general = '/users/PAS1376/bg12/SyntheticLethality - NewDB/data'


In [4]:
# read the database
SLKB_engine = sqlalchemy.create_engine('sqlite:///SLKB_sqlite3')
#SLKB_engine_session = sessionmaker(bind=SLKB_engine)

In [5]:
db_metadata = sqlalchemy.MetaData(bind=SLKB_engine)
db_metadata.reflect(SLKB_engine)

In [6]:
db_metadata.tables

FacadeDict({'CDKO_EXPERIMENT_DESIGN': Table('CDKO_EXPERIMENT_DESIGN', MetaData(bind=Engine(sqlite:///SLKB_sqlite3)), Column('sgRNA_id', INTEGER(), table=<CDKO_EXPERIMENT_DESIGN>, primary_key=True), Column('sgRNA_guide_name', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('sgRNA_guide_seq', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('sgRNA_target_name', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('study_origin', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), schema=None), 'CDKO_ORIGINAL_SL_RESULTS': Table('CDKO_ORIGINAL_SL_RESULTS', MetaData(bind=Engine(sqlite:///SLKB_sqlite3)), Column('id', INTEGER(), table=<CDKO_ORIGINAL_SL_RESULTS>, primary_key=True), Column('gene_pair_id', INTEGER(), table=<CDKO_ORIGINAL_SL_RESULTS>), Column('gene_pair', TEXT(), table=<CDKO_ORIGINAL_SL_RESULTS>, nullable=False), Column('study_origin', TEXT(), table=<CDKO_ORIGINAL_SL_RESULTS>, nullable=False), Column('cell_line_origin', TEXT(), table

In [7]:
# db_metadata = sqlalchemy.MetaData(bind=SLKB_engine)
# db_metadata.reflect(SLKB_engine)

# all_results_tables = ['HORLBECK_SCORE', 
#                       'MAGECK_SCORE', 
#                       'MEDIAN_NB_SCORE', 
#                       'MEDIAN_NB_SCORE_FULL_NORM', 
#                       'MEDIAN_B_SCORE', 
#                       'MEDIAN_B_SCORE_FULL_NORM',
#                       'SGRA_DERIVED_NB_SCORE', 
#                       'SGRA_DERIVED_NB_SCORE_FULL_NORM', 
#                       'SGRA_DERIVED_B_SCORE', 
#                       'SGRA_DERIVED_B_SCORE_FULL_NORM', 
#                       'GEMINI_SCORE']

# for table_name in all_results_tables:
#     curr_table = db_metadata.tables[table_name]
#     curr_table.drop()

In [8]:
## store pubmed IDs
study_name_to_pubmed_id = {}

study_name_to_pubmed_id['diehl_data'] = '33956155'

study_name_to_pubmed_id['han_data'] = '28319085'

study_name_to_pubmed_id['horlbeck_data'] = '30033366'

study_name_to_pubmed_id['ito_data'] = '34857952'

study_name_to_pubmed_id['parrish_data'] = '34469736'

study_name_to_pubmed_id['shen_data'] = '28319113'

study_name_to_pubmed_id['thompson_data'] = '33637726'

study_name_to_pubmed_id['wong_data'] = '26864203'

study_name_to_pubmed_id['zhao_data'] = '29452643'

study_name_to_pubmed_id['shantang_data'] = '36060092'

study_name_to_pubmed_id['najm_data'] = '29251726'



In [9]:
rev_study_name_to_pubmed_id = {}
for item in study_name_to_pubmed_id:
    rev_study_name_to_pubmed_id[study_name_to_pubmed_id[item]] = item

In [10]:
## controls
controls = {}

# diehl et al
controls['diehl_data'] = ["wildtype-I-CeuI", "wildtype-I-SceI", "nan"] + ["Non-Human-Target-114", "Non-Human-Target-122",
                                    "Non-Human-Target-144", "Non-Human-Target-148", "Non-Human-Target-161", "Non-Human-Target-178", "Non-Human-Target-185", "Non-Human-Target-222", "Non-Human-Target-223",
                                    "Non-Human-Target-224", "Non-Human-Target-23", "Non-Human-Target-243", "Non-Human-Target-245", "Non-Human-Target-249", "Non-Human-Target-292", "Non-Human-Target-298",
                                    "Non-Human-Target-31", "Non-Human-Target-311", "Non-Human-Target-313", "Non-Human-Target-327", "Non-Human-Target-333", "Non-Human-Target-335", "Non-Human-Target-339",
                                    "Non-Human-Target-341", "Non-Human-Target-343", "Non-Human-Target-389", "Non-Human-Target-39", "Non-Human-Target-397", "Non-Human-Target-398", "Non-Human-Target-40", 
                                    "Non-Human-Target-402", "Non-Human-Target-42", "Non-Human-Target-432", "Non-Human-Target-444", "Non-Human-Target-466", "Non-Human-Target-479", "Non-Human-Target-495",
                                    "Non-Human-Target-512", "Non-Human-Target-515", "Non-Human-Target-526", "Non-Human-Target-532", "Non-Human-Target-542", "Non-Human-Target-547", "Non-Human-Target-55", 
                                    "Non-Human-Target-551", "Non-Human-Target-560", "Non-Human-Target-565", "Non-Human-Target-584", "Non-Human-Target-588", "Non-Human-Target-595", "Non-Human-Target-602",
                                    "Non-Human-Target-622", "Non-Human-Target-636", "Non-Human-Target-637", "Non-Human-Target-654", "Non-Human-Target-659", "Non-Human-Target-668", "Non-Human-Target-676",
                                    "Non-Human-Target-678", "Non-Human-Target-681", "Non-Human-Target-692", "Non-Human-Target-719", "Non-Human-Target-732", "Non-Human-Target-736", "Non-Human-Target-748",
                                    "Non-Human-Target-752", "Non-Human-Target-766", "Non-Human-Target-782", "Non-Human-Target-789", "Non-Human-Target-798", "Non-Human-Target-799", "Non-Human-Target-801",
                                    "Non-Human-Target-805", "Non-Human-Target-807", "Non-Human-Target-808", "Non-Human-Target-814", "Non-Human-Target-816", "Non-Human-Target-822", "Non-Human-Target-824",
                                    "Non-Human-Target-827", "Non-Human-Target-828", "Non-Human-Target-835", "Non-Human-Target-85", "Non-Human-Target-857", "Non-Human-Target-863", "Non-Human-Target-864",
                                    "Non-Human-Target-877", "Non-Human-Target-88", "Non-Human-Target-884", "Non-Human-Target-900", "Non-Human-Target-902", "Non-Human-Target-905", "Non-Human-Target-941",
                                    "Non-Human-Target-942", "Non-Human-Target-953", "Non-Human-Target-954", "Non-Human-Target-958", "Non-Human-Target-959", "Non-Human-Target-965", "Non-Human-Target-969",
                                    "Non-Human-Target-970", "Non-Human-Target-99", "Non-Human-Target-995", "Non-Human-Target-997"]

# horlbeck et al
controls['horlbeck_data'] = ["negative"]

# parrish et al
controls['parrish_data'] = ["nt" + str(i+1) for i in range(975)] + ["FAKE_GENE_" + str(i+1) for i in range(50)]
controls['parrish_data_original'] = ["nt" + str(i+1) for i in range(975)] + ["FAKE_GENE_" + str(i+1) for i in range(50)]

# wong et al
controls['wong_data'] = ["DUMMYGUIDE"]

# zhao et al
controls['zhao_data'] = ["0", 'CONTROL']

# tang et al
controls['shantang_data'] = ['0SAFE-SAFE-GE',
                             '0SAFE-SAFE-SP',
                             '0SAFE-SAFE-MP',
                             '0SAFE-SAFE-U2',
                             '0SAFE-SAFE-DTKP',
                             '0SAFE-SAFE-ACOC',
                             '0SAFE-SAFE-TMM',
                             '0SAFE-SAFE-U1',
                             '0SAFE-SAFE-U3']

controls['najm_data'] = ["HPRT INTRON",
                         "6T",
                         "EEF2",
                         "CD81"]

for study in controls:
    controls[study] = [i.upper() for i in controls[study]]# + [i for i in controls[study]]

In [11]:
## conditions
study_conditions = {}

# diehl et al
study_conditions['diehl_data'] = [['ctrl_1',
                                    'ctrl_2'],
                                   ['rep_1',
                                    'rep_2',
                                    'rep_3']]

# ctrl_1;ctrl_2;rep_1;rep_2;rep_3
# ["wildtype-I-CeuI", "wildtype-I-SceI", "nan"]

# horlbeck et al
study_conditions['horlbeck_data'] = {}
# study_conditions['horlbeck_data']['JURKAT'] = [['JURKAT_barcode,T0,rep1',
#                                                 'JURKAT_barcode,T0,rep2'],
#                                                ['JURKAT_barcode,cyc,rep1',
#                                                 'JURKAT_barcode,cyc,rep2',]]
study_conditions['horlbeck_data']['JURKAT'] = [['JURKAT_tripleseq,T0,rep1',
                                                'JURKAT_tripleseq,T0,rep2'],
                                               ['JURKAT_tripleseq,cyc,rep1',
                                                'JURKAT_tripleseq,cyc,rep2',]]

# study_conditions['horlbeck_data']['K562'] = [['K562_barcode,T0,rep1',
#                                         'K562_barcode,T0,rep2'],
#                                        ['K562_barcode,cyc,rep1',
#                                         'K562_barcode,cyc,rep2',]]
study_conditions['horlbeck_data']['K562'] = [['K562_barcode,T0,rep1',
                                        'K562_barcode,T0,rep2'],
                                       ['K562_tripleseq,cyc,rep1',
                                        'K562_tripleseq,cyc,rep2',]]

# parrish et al
study_conditions['parrish_data'] = [["plasmid_1", 
                                     "plasmid_2",
                                     "plasmid_3"],
                                    ["LTP_1",
                                     "LTP_2",
                                     "LTP_3"]]
study_conditions['parrish_data_original'] = [["plasmid_1", 
                                     "plasmid_2",
                                     "plasmid_3"],
                                    ["LTP_1",
                                     "LTP_2",
                                     "LTP_3"]]

# wong et al
study_conditions['wong_data'] = [["day5 (Replicate 1)", 
                                  "day5 (Replicate 2)"],
                                 ["day20 (Replicate 1)",
                                  "day20 (Replicate 2)"]]

# zhao et al
study_conditions['zhao_data'] = {}
study_conditions['zhao_data']["HELA"] = [["Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts",
                                          "Hela_MV4_d3_2_S2_trimmed53_len_filtered_counts"],
                                         ["Hela_MV4_d28_1_S5_trimmed53_len_filtered_counts",
                                          "Hela_MV4_d28_2_S6_trimmed53_len_filtered_counts"]]

study_conditions['zhao_data']["A549"] = [["A549_MV4_d3_1_S1_trimmed53_len_filtered_counts",
                                          "A549_MV4_d3_2_S2_trimmed53_len_filtered_counts"],
                                         ["A549_MV4_d28_1_S7_trimmed53_len_filtered_counts",
                                          "A549_MV4_d28_2_S8_trimmed53_len_filtered_counts"]]

study_conditions['shantang_data'] = [["T0_1", 
                                     "T0_2"],
                                    ["T12_1",
                                     "T12_2"]]

study_conditions['najm_data'] = [['pDNA_Reads'],
                                 ['Rep_A_Reads',
                                  'Rep_B_Reads',
                                  'Rep_C_Reads']]


In [12]:
# read the data

# experiment design
experiment_design = pd.read_sql_table('CDKO_EXPERIMENT_DESIGN', SLKB_engine, index_col = 'sgRNA_id')#
experiment_design.reset_index(drop = True, inplace = True)
experiment_design.index.rename('sgRNA_id', inplace = True)

# counts
counts = pd.read_sql_table('CDKO_SGRNA_COUNTS', SLKB_engine, index_col = 'sgRNA_pair_id')
counts.reset_index(drop = True, inplace = True)
counts.index.rename('sgRNA_pair_id', inplace = True)

# scores
scores = pd.read_sql_table('CDKO_ORIGINAL_SL_RESULTS', SLKB_engine, index_col = 'id')
scores.reset_index(drop = True, inplace = True)
scores.index.rename('gene_pair_id', inplace = True)


In [13]:
# join the tables together
#counts = counts.merge(scores, how = 'left', left_on = 'gene_pair_id_original', right_index = True)
counts = counts.merge(experiment_design, how = 'left', left_on = 'guide_1_id', right_index = True, suffixes = ('', '_g1'))
counts = counts.merge(experiment_design, how = 'left', left_on = 'guide_2_id', right_index = True, suffixes = ('', '_g2'))
# rename
counts = counts.rename({'sgRNA_guide_name': 'sgRNA_guide_name_g1',
                        'sgRNA_guide_seq': 'sgRNA_guide_seq_g1',
                        'sgRNA_target_name': 'sgRNA_target_name_g1',
                        'study_origin_x': 'study_origin',
                        'cell_line_origin_x': 'cell_line_origin'}, axis = 1)

In [14]:
# subset is sufficient
#counts = counts.loc[:,['guide_1_id', 'guide_2_id', 'sgRNA_guide_name_g1', 'sgRNA_guide_name_g2', 'sgRNA_target_name_g1', 'sgRNA_target_name_g2', 'T0_counts', 'T0_replicate_names', 'TEnd_counts', 'TEnd_replicate_names', 'target_type', 'study_origin', 'cell_line_origin']]

## Analysis

In [15]:
def get_raw_counts(curr_counts):
    
    print('Getting raw counts...')
    # get counts
    T0_counts = curr_counts['T0_counts'].apply(    
        lambda x: np.array(x.split(";"), dtype = np.float64)
    )

    T0_counts = pd.DataFrame(data = T0_counts.tolist(),
                   index = T0_counts.index, columns = curr_counts['T0_replicate_names'].iloc[0].split(';'))

    TEnd_counts = curr_counts['TEnd_counts'].apply(    
        lambda x: np.array(x.split(";"), dtype = np.float64)
    )

    TEnd_counts = pd.DataFrame(data = TEnd_counts.tolist(),
                   index = TEnd_counts.index, columns = curr_counts['TEnd_replicate_names'].iloc[0].split(';'))
    
    # make sure no columns are filled with NAs completely
    NA_replicate = T0_counts.isna().sum()
    if (NA_replicate == T0_counts.shape[0]).sum() > 0:
        print('Removing NA replicate from T0...')
        T0_counts.drop(NA_replicate.index[NA_replicate == T0_counts.shape[0]], axis = 1, inplace = True)
    
    NA_replicate = TEnd_counts.isna().sum()
    if (NA_replicate == T0_counts.shape[0]).sum() > 0:
        print('Removing NA replicate from TEnd...')
        TEnd_counts.drop(NA_replicate.index[NA_replicate == TEnd_counts.shape[0]], axis = 1, inplace = True)

    T0_counts = T0_counts.fillna(0)
    TEnd_counts = TEnd_counts.fillna(0)
    
    return((T0_counts, TEnd_counts))

In [16]:
def filter_counts(curr_counts, filtering_counts = 100):
    
    print(' '.join(["Filtering enabled... Condition:", str(filtering_counts), "counts"]))
    
    curr_counts[curr_counts < filtering_counts] = np.nan
    
    # drop the entire sgRNAs
    curr_counts = curr_counts.dropna()    
    
    return(curr_counts)

In [17]:
def normalize_counts(curr_counts, set_normalization = 1e6):
    
    print("Normalization enabled...")
    
    if set_normalization is not None:
        print("Current counts:")
        print(curr_counts.sum(axis = 0))
        
        norm_value = set_normalization
        print(' '.join(["Normalize based on a specific value...", str(set_normalization), "counts"]))
    else:
        print("Normalize based on sample counts... Current counts:")
        print(curr_counts.sum(axis = 0))
        
        norm_value = np.median(curr_counts.sum(axis = 0))
        print(' '.join(["Normalize value...", str(norm_value), "counts"]))    
    
    filt_locations = curr_counts.isna()
    #print(curr_counts.isna().sum())
    
    curr_counts = (curr_counts * norm_value) / curr_counts.sum(axis = 0)
    curr_counts[filt_locations] = np.nan
    
    return(curr_counts)

In [18]:
def sort_pairs_and_guides(curr_counts):
    # sort the genes and guides based on gene ordering
    print('Sorting gene pairs and guides based on ordering gene ordering...')
    gene_pairs = []
    gene_pair_guides = []
    for i in range(curr_counts.shape[0]):

        guide_1 = curr_counts['sgRNA_guide_name_g1'].iloc[i]
        guide_2 = curr_counts['sgRNA_guide_name_g2'].iloc[i]

        gene_1 = curr_counts['sgRNA_target_name_g1'].iloc[i]
        gene_2 = curr_counts['sgRNA_target_name_g2'].iloc[i]

        t_gene_1, t_gene_2 = sorted([gene_1, gene_2])


        if (t_gene_1 == gene_1) and (t_gene_2 == gene_2):
            gene_1 = t_gene_1
            gene_2 = t_gene_2
        else:
            gene_1 = t_gene_1
            gene_2 = t_gene_2

            # swap the guides accordingly
            temp = guide_1
            guide_1 = guide_2
            guide_2 = temp

        gene_pairs.append('|'.join([gene_1, gene_2]))
        gene_pair_guides.append('|'.join([guide_1, guide_2]))

    return(gene_pairs, gene_pair_guides)

In [19]:
# taken from Horlbeck et al., https://github.com/mhorlbeck/GImap_tools/blob/601cd22126432edadb30202e952859195c73a841/GImap_analysis.py
def quadFitForceIntercept(xdata, ydata, bdata):
    m1 = optimize.fmin(lambda m, x, y: ((m[0]*(x**2) + m[1]*x + bdata - y)**2).sum(), x0=[0.1,0.1], args=(xdata, ydata), disp=0)
    
    return lambda x1: m1[0]*(np.array(x1)**2) + m1[1]*np.array(x1) + bdata

In [20]:
def run_horlbeck_preprocessing(curr_counts, filterThreshold = 35, pseudocount = 10):
        
    T0_counts, TEnd_counts = get_raw_counts(curr_counts.copy())
    
    # horlbeck uses single x single as double, proceed to move them to dual instead
    replace_idx = (curr_counts['target_type'] == 'Single') & (curr_counts['sgRNA_target_name_g1'] == curr_counts['sgRNA_target_name_g2'])
    curr_counts.loc[replace_idx, 'target_type'] = 'Dual'

    if T0_counts.shape[1] != TEnd_counts.shape[1]:
        print("Mismatch times, averaging...")

        T0_counts = pd.DataFrame(data = T0_counts.apply(lambda x: np.mean(x), axis = 1).values,
                             index = T0_counts.index)

        TEnd_counts = pd.DataFrame(data = TEnd_counts.apply(lambda x: np.mean(x), axis = 1).values,
                 index = TEnd_counts.index)

    T0_counts = pd.concat([T0_counts, curr_counts['sgRNA_guide_name_g1'], curr_counts['sgRNA_guide_name_g2']], axis = 1)
    TEnd_counts = pd.concat([TEnd_counts, curr_counts['sgRNA_guide_name_g1'], curr_counts['sgRNA_guide_name_g2']], axis = 1)
    all_sgRNAs = set(TEnd_counts['sgRNA_guide_name_g1']).union(set(TEnd_counts['sgRNA_guide_name_g2']))

    # add sorted targets
    sorted_gene_pairs, sorted_gene_guides = sort_pairs_and_guides(curr_counts.copy())
    curr_counts['sgRNA_pair'] = sorted_gene_guides
    curr_counts['gene_pair'] = sorted_gene_pairs
    
    replicate_list = []
    for replicate_i in range(len(T0_counts.columns)-2):
        print("For replicate " + str(replicate_i + 1))
        meanCounts = pd.concat((TEnd_counts.iloc[:,replicate_i].groupby(TEnd_counts['sgRNA_guide_name_g1']).agg(np.median),TEnd_counts.iloc[:,replicate_i].groupby(TEnd_counts['sgRNA_guide_name_g2']).agg(np.median)),axis=1, keys=['sgRNA_guide_name_g1', 'sgRNA_guide_name_g2'])
        sgsToFilter = set(meanCounts.loc[meanCounts.loc[:,'sgRNA_guide_name_g1'] < filterThreshold].index).union(set(meanCounts.loc[meanCounts.loc[:,'sgRNA_guide_name_g2'] < filterThreshold].index))
        print(" ".join(["Total of", str(len(sgsToFilter)), 'sgRNAs were filtered out of', str(len(all_sgRNAs))]))

        chosen_idx = np.array([True if i not in sgsToFilter else False for i in TEnd_counts['sgRNA_guide_name_g1']]) & np.array([True if i not in sgsToFilter else False for i in TEnd_counts['sgRNA_guide_name_g2']])
        TEnd_counts_curr = TEnd_counts.iloc[chosen_idx, replicate_i]
        T0_counts_curr = T0_counts.iloc[chosen_idx, replicate_i]

        counts_ratio = ((T0_counts_curr + pseudocount).sum()*1.0)/(TEnd_counts_curr + pseudocount).sum()

        # calculate FC like in horlbeck
        replicate_FC = np.log2((TEnd_counts_curr + pseudocount)/(T0_counts_curr + pseudocount)/counts_ratio)
        replicate_FC.columns = ['Replicate_' + str(replicate_i+1) + "_FC"]
        replicate_FC.name = 'Replicate_' + str(replicate_i+1) + "_FC"

        # get control
        control_effect = 0
        if 'Control' in set(curr_counts['target_type']):
            control_index = curr_counts['target_type'] == 'Control'
            if control_index.sum() != 0:
                control_effect = replicate_FC.loc[control_index].median()

        replicate_FC -= control_effect

        # doubling differences, taken from original code
        replicate_FC /= 6.3

        curr_counts = curr_counts.join(replicate_FC)

        replicate_list.append(replicate_FC)

    # save the results to original data
    replicate_list = pd.concat(replicate_list, axis = 1)
    replicate_list = replicate_list.dropna()

    replicate_list = replicate_list.mean(axis = 1)

    replicate_list.columns = ['FC_Averaged']
    replicate_list.name = 'FC_Averaged'

    curr_counts = curr_counts.join(replicate_list)

    average_of_transpose = curr_counts.groupby('sgRNA_pair')['FC_Averaged'].apply(np.nanmean)
    curr_counts = curr_counts.join(average_of_transpose,
                             on = 'sgRNA_pair',
                             rsuffix = "_abbaAveraged")
    
    return(curr_counts)

In [21]:
def run_horlbeck_score(curr_counts, do_preprocessing = True):
    
    print('Running horlbeck score...')
    
    ######### preprocessing
    
    print('Running preprocessing...')

    if do_preprocessing:
        curr_counts = run_horlbeck_preprocessing(curr_counts)


    #########/ preprocessing
    
    ######### original horlbeck scoring
    
    print('Started scoring')

    # first, drop the rows with nan replicateFCname
    curr_counts.dropna(subset = ['FC_Averaged_abbaAveraged'], inplace = True)

    # get ab/ba
    a_average, b_average = curr_counts.loc[curr_counts['target_type'] != 'Dual'].copy(), curr_counts.loc[curr_counts['target_type'] != 'Dual'].copy()
    #curr_counts.loc[curr_counts['target_type'] == 'Single'].copy(), curr_counts.loc[curr_counts['target_type'] == 'Single'].copy()
    #
    a_average = a_average[a_average['sgRNA_target_name_g2'] == "CONTROL"]
    b_average = b_average[b_average['sgRNA_target_name_g1'] == "CONTROL"]

    a_average = a_average.groupby('sgRNA_guide_name_g1')['FC_Averaged_abbaAveraged'].apply(np.mean)
    b_average = b_average.groupby('sgRNA_guide_name_g2')['FC_Averaged_abbaAveraged'].apply(np.mean)

    # single, control, and dual phenotypes are used in calculation
    all_pairs = set(curr_counts['sgRNA_guide_name_g1']).union(set(curr_counts['sgRNA_guide_name_g2']))
    curr_counts['GI_Averaged'] = 0

    # for missing pairs, update a_average, b_average
    a_average_0s = list(all_pairs.difference(set(a_average.index)))
    a_average = pd.concat([a_average, pd.Series(data = np.zeros(len(a_average_0s)), index = a_average_0s)])

    b_average_0s = list(all_pairs.difference(set(b_average.index)))
    b_average = pd.concat([b_average, pd.Series(data = np.zeros(len(b_average_0s)), index = b_average_0s)])

    # store in a matrix
    GI_Score_1 = pd.DataFrame(0, index = sorted(list(all_pairs)), columns = sorted(list(all_pairs)))
    GI_Score_2 = pd.DataFrame(0, index = sorted(list(all_pairs)), columns = sorted(list(all_pairs)))

    ## A orientation ()

    ## go through all query sgRNAs
    for query_sgRNA in all_pairs:

        ## get all the pairs with the given query
        idx_loc = (curr_counts['sgRNA_guide_name_g2'] == query_sgRNA)

        if len(idx_loc) == 0:
            continue

        ## all pairs 
        curr_filtered_pairs = curr_counts.loc[idx_loc, :]

        ## get sgRNAs assayed together with the query sgRNA
        selected_sgRNAs = curr_filtered_pairs['sgRNA_guide_name_g1'].values

        if 'Control' in set(curr_counts['target_type']):
            control_sgRNAs = np.where(curr_filtered_pairs['sgRNA_target_name_g1'] == "CONTROL")[0]

        # Fit to a quadratic formula, where the x is the single phenotypes and y is the pair phenotypes

        xs = a_average.loc[selected_sgRNAs].values # a -> b
        ys = curr_filtered_pairs['FC_Averaged_abbaAveraged'].values
        bs = b_average.loc[query_sgRNA] # b -> a

        res_fn = quadFitForceIntercept(xs, ys, bs)

        # get expected
        expected_phenotype = res_fn(xs)

        # the difference is the GI score
        GI_Score = ys - expected_phenotype

        if ('Control' in set(curr_counts['target_type'])) and len(control_sgRNAs) > 0:
            if GI_Score[control_sgRNAs].std() != 0:
                GI_Score /= GI_Score[control_sgRNAs].std()

        GI_Score_1.loc[query_sgRNA, selected_sgRNAs] = GI_Score

    ## B orientation ()
    ## go through all query sgRNAs
    for query_sgRNA in all_pairs:

        ## get all the pairs with the given query
        idx_loc = (curr_counts['sgRNA_guide_name_g1'] == query_sgRNA)

        if len(idx_loc) == 0:
            continue

        ## all pairs 
        curr_filtered_pairs = curr_counts.loc[idx_loc, :]

        ## get sgRNAs assayed together with the query sgRNA
        selected_sgRNAs = curr_filtered_pairs['sgRNA_guide_name_g2'].values

        if 'Control' in set(curr_counts['target_type']):
            control_sgRNAs = np.where(curr_filtered_pairs['sgRNA_target_name_g2'] == "CONTROL")[0]

        # Fit to a quadratic formula, where the x is the single phenotypes and y is the pair phenotypes

        xs = b_average.loc[selected_sgRNAs].values # b -> a
        ys = curr_filtered_pairs['FC_Averaged_abbaAveraged'].values
        bs = a_average.loc[query_sgRNA] # a -> b

        res_fn = quadFitForceIntercept(xs, ys, bs)

        # get expected
        expected_phenotype = res_fn(xs)

        # the difference is the GI score
        GI_Score = ys - expected_phenotype

        if ('Control' in set(curr_counts['target_type'])) and len(control_sgRNAs) > 0:
            if GI_Score[control_sgRNAs].std() != 0:
                GI_Score /= GI_Score[control_sgRNAs].std()

        # set the 
        #curr_res['sgRNA_level']['dual'].loc[idx_loc, replicate_GI_name] += GI_Score
        GI_Score_2.loc[query_sgRNA, selected_sgRNAs] = GI_Score


    # average between A and B orientations
    #curr_res['sgRNA_level']['dual'][replicate_GI_name] /= 2
    GI_Score_avg = (GI_Score_1 + GI_Score_2)/2
    GI_Score_avg = (GI_Score_avg + GI_Score_avg.T)/2

    for i in range(len(curr_counts['GI_Averaged'])):
        guide_1 = curr_counts['sgRNA_guide_name_g1'].iloc[i]
        guide_2 = curr_counts['sgRNA_guide_name_g2'].iloc[i]

        curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]

    
    ######### /original horlbeck scoring
    
    
    # store results
    SL_score = curr_counts.groupby('gene_pair')['GI_Averaged'].apply(lambda x: np.mean(x))
    SE = curr_counts.groupby('gene_pair')['GI_Averaged'].apply(lambda x: sem(x, ddof=1))

    genes_1 = [i.split('|')[0] for i in SL_score.index]
    genes_2 = [i.split('|')[1] for i in SL_score.index]

    horlbeck_results = pd.DataFrame(data = {'SL_score' : SL_score.values,
                                             'standard_error' : SE.values,
                                             'Gene 1' : genes_1,
                                             'Gene 2' : genes_2}, index = SL_score.index)
    
    
    # remove possible controls
    control_idx = np.array([True if 'CONTROL' in i else False for i in horlbeck_results.index])
    horlbeck_results = horlbeck_results.loc[~control_idx]
    
    results = {}
    results['HORLBECK_SCORE'] = horlbeck_results

    
    
    
    
    return(results)

In [22]:
def run_median_scores(curr_counts, full_normalization = False):

    # for standard error
    median_SE_constant = 1.25
    
    print('Running median scores...')
    
    ######### preprocessing
    t_0_comb, t_end_comb = get_raw_counts(curr_counts)
    
    # filter counts, only at T0
    t_0_comb = filter_counts(t_0_comb, filtering_counts = 35)
    print(' '.join(['Filtered a total of', str(t_end_comb.shape[0] - t_0_comb.shape[0]), "out of", str(t_end_comb.shape[0]), "sgRNAs."]))
    print("\n---\n")
        
    # add pseudocount of 10 after filtering
    t_0_comb = t_0_comb + 10
    t_end_comb = t_end_comb + 10
    
    # some sgRNAs were filtered out
    overlapping_sgRNAs = sorted(list(set(t_0_comb.index).intersection(set(t_end_comb.index))))
    
    t_0_comb = t_0_comb.loc[overlapping_sgRNAs,:]
    t_end_comb = t_end_comb.loc[overlapping_sgRNAs,:]
    curr_counts = curr_counts.loc[overlapping_sgRNAs,:]
        
    # normalize to the median of the all time points
    if full_normalization:
        print('Full normalization...')
        normalization_value = np.median(pd.concat([t_0_comb, t_end_comb], axis = 1).sum(axis = 0))

        t_0_comb = normalize_counts(t_0_comb, set_normalization = normalization_value)
        t_end_comb = normalize_counts(t_end_comb, set_normalization = normalization_value)
        
    else:
        print('Not full normalization...')
        for subset in set(curr_counts['target_type']):
            idx = curr_counts.loc[curr_counts['target_type'] == subset,:].index

            # normalize to the median of the all time points
            normalization_value = np.median(pd.concat([t_0_comb.loc[idx,:], t_end_comb.loc[idx,:]], axis = 1).sum(axis = 0))

            t_0_comb.loc[idx,:] = normalize_counts(t_0_comb.loc[idx,:], set_normalization = normalization_value)
            t_end_comb.loc[idx,:] = normalize_counts(t_end_comb.loc[idx,:], set_normalization = normalization_value)


    
    # get median of counts 
    t_0_comb = t_0_comb.apply(lambda x: np.median(x), axis = 1)
    t_end_comb = t_end_comb.apply(lambda x: np.median(x), axis = 1)

    # get LFC
    FC = np.log2(t_end_comb) - np.log2(t_0_comb)
    
    # set FC
    curr_counts['FC'] = FC
    
    # add sorted targets
    sorted_gene_pairs, sorted_gene_guides = sort_pairs_and_guides(curr_counts.copy())
    curr_counts['sgRNA_pair'] = sorted_gene_guides
    curr_counts['gene_pair'] = sorted_gene_pairs
    
    ######### /preprocessing
    
    # store results
    results = {}
    results['MEDIAN_B_SCORE'] = None
    results['MEDIAN_NB_SCORE'] = None
    
    ######### scoring
    
    # get the three target categories
    single = curr_counts.loc[curr_counts['target_type'] == 'Single']
    dual = curr_counts.loc[curr_counts['target_type'] == 'Dual']
    control = curr_counts.loc[curr_counts['target_type'] == 'Control']
    
    print('Available singles: ' + str(single.shape[0]))
    print('Available duals: ' + str(dual.shape[0]))
    print('Available control: ' + str(control.shape[0]))
    
    temp_repeat = single.copy()
    temp_repeat['sgRNA_guide_name_g1'] = single["sgRNA_guide_name_g2"]
    temp_repeat['sgRNA_target_name_g1'] = single["sgRNA_target_name_g2"]
    temp_repeat['sgRNA_guide_name_g2'] = single["sgRNA_guide_name_g1"]
    temp_repeat['sgRNA_target_name_g1'] = single["sgRNA_target_name_g2"]
    
    single_repeat = pd.concat([single, temp_repeat])

    # get single sgRNA impact
    EC_single = single_repeat.groupby("sgRNA_guide_name_g1")['FC'].apply(
            lambda x: np.median(x))
    
    # get control sgRNA impact
    EC_control = None
    if control.shape[0] != 0:

        temp_repeat = control.copy()
        temp_repeat['sgRNA_guide_name_g1'] = control["sgRNA_guide_name_g2"]
        temp_repeat['sgRNA_guide_name_g2'] = control["sgRNA_guide_name_g1"]

        EC_control = pd.concat([control, temp_repeat]).groupby("sgRNA_guide_name_g1")['FC'].apply(
            lambda x: np.median(x)
        )

        EC_single = EC_single.drop(set(EC_control.index).intersection(set(EC_single.index)))
    
    # all available dual sgRNAs
    all_pairs = set(dual['sgRNA_guide_name_g1']).union(set(dual['sgRNA_guide_name_g2']))
    
    # fill for empty
    missing_pairs = np.array(list(all_pairs.difference(set(EC_single.index))))

    print(' '.join(["Filtered single sgRNA count:", str(len(set(missing_pairs)))]))
    
    # add them as 0s
    EC_single = pd.concat([EC_single, pd.Series(index = missing_pairs, data = np.zeros(len(missing_pairs)))])
    
    # get EC for each
    EC_1 = EC_single[dual['sgRNA_guide_name_g1']]
    EC_2 = EC_single[dual['sgRNA_guide_name_g2']]
    
    # calculate Impact Scores (sgRNA level)

    dual['Median-NB-dual-IS'] = dual['FC'].values
    dual['Median-NB-single-IS-Guide-1'] = EC_1.values
    dual['Median-NB-single-IS-Guide-2'] = EC_2.values
    dual['Median-NB-dual-SL-sgRNA'] = dual['FC'].values - EC_1.values - EC_2.values
    
    ## calculate SL scores (sgRNA)
    gene_pair_SL = dual.groupby('gene_pair')['Median-NB-dual-IS'].apply(lambda x: np.median(x))
    gene_pair_SE = dual.groupby('gene_pair')['Median-NB-dual-IS'].apply(lambda x: np.var(x) / np.size(x))
    
    ## calculate SL scores (gene)
    gene_SL = single_repeat.groupby("sgRNA_target_name_g1")['FC'].apply(
    lambda x: np.median(x))
    gene_SE = single_repeat.groupby("sgRNA_target_name_g1")['FC'].apply(
        lambda x: np.var(x) / np.size(x))

    genes_1 = np.array([i.split('|')[0] for i in gene_pair_SL.index])
    genes_2 = np.array([i.split('|')[1] for i in gene_pair_SL.index])

    all_genes = set(genes_1).union(set(genes_2))
    missing_genes = all_genes.difference(set(gene_SL.index))
    print(' '.join(["Filtered gene count:", str(len(set(missing_genes)))]))

    # add them as 0s
    gene_SL = pd.concat([gene_SL, pd.Series(index = missing_genes, data = np.zeros(len(missing_genes)))])
    gene_SE = pd.concat([gene_SE, pd.Series(index = missing_genes, data = np.zeros(len(missing_genes)))])

    median_nb_SL = gene_pair_SL.values - gene_SL[genes_1].values - gene_SL[genes_2].values
    median_nb_SE = np.sqrt(gene_pair_SE.values + gene_SE[genes_1].values + gene_SE[genes_2].values) * median_SE_constant
    median_nb_Z = median_nb_SL/median_nb_SE

    median_nb_results = pd.DataFrame(data = {'SL_score' : median_nb_SL,
                                             'standard_error' : median_nb_SE,
                                             'Z_SL_score' : median_nb_Z,
                                             'Gene 1' : genes_1,
                                             'Gene 2' : genes_2}, index = gene_pair_SL.index)
    
    results['MEDIAN_NB_SCORE'] = median_nb_results
    
    if EC_control is not None:
        control_median = np.median(EC_control)

        dual['Median-B-dual-IS'] = dual['FC'].values - control_median
        dual['Median-B-single-IS-Guide-1'] = EC_1.values - control_median
        dual['Median-B-single-IS-Guide-2'] = EC_2.values - control_median
        dual['Median-B-dual-SL-sgRNA'] = (dual['FC'].values - control_median) - (EC_1.values - control_median) - (EC_2.values - control_median)

        ## calculate SL scores (sgRNA)
        gene_pair_SL = dual.groupby('gene_pair')['Median-B-dual-IS'].apply(lambda x: np.median(x))
        gene_pair_SE = dual.groupby('gene_pair')['Median-B-dual-IS'].apply(lambda x: np.var(x) / np.size(x))

        # remove controls first
        single_repeat['FC'] = single_repeat['FC'] - control_median
        ## calculate SL scores (gene)
        gene_SL = single_repeat.groupby("sgRNA_target_name_g1")['FC'].apply(
        lambda x: np.median(x))
        gene_SE = single_repeat.groupby("sgRNA_target_name_g1")['FC'].apply(
            lambda x: np.var(x) / np.size(x))

        genes_1 = np.array([i.split('|')[0] for i in gene_pair_SL.index])
        genes_2 = np.array([i.split('|')[1] for i in gene_pair_SL.index])

        all_genes = set(genes_1).union(set(genes_2))
        missing_genes = all_genes.difference(set(gene_SL.index))
        print(' '.join(["Filtered gene count:", str(len(set(missing_genes)))]))

        # add them as 0s
        gene_SL = pd.concat([gene_SL, pd.Series(index = missing_genes, data = np.zeros(len(missing_genes)))])
        gene_SE = pd.concat([gene_SE, pd.Series(index = missing_genes, data = np.zeros(len(missing_genes)))])

        median_b_SL = gene_pair_SL.values - gene_SL[genes_1].values - gene_SL[genes_2].values
        median_b_SE = np.sqrt(gene_pair_SE.values + gene_SE[genes_1].values + gene_SE[genes_2].values) * median_SE_constant
        median_b_Z = median_b_SL/median_b_SE

        median_b_results = pd.DataFrame(data = {'SL_score' : median_b_SL,
                                                 'standard_error' : median_b_SE,
                                                 'Z_SL_score' : median_b_Z,
                                                 'Gene 1' : genes_1,
                                                 'Gene 2' : genes_2}, index = gene_pair_SL.index)
        
        results['MEDIAN_B_SCORE'] = median_b_results
    
    ######### /scoring
    
    
    # return computed scores
    return(results)

In [23]:
def run_sgrna_scores(curr_counts, full_normalization = False):
        
    # for standard error
    median_SE_constant = 1.25

    print('Running sgrna derived score...')

    ######### preprocessing
    t_0_comb, t_end_comb = get_raw_counts(curr_counts)
    
    # filter counts, only at T0
    t_0_comb = filter_counts(t_0_comb, filtering_counts = 35)
    print(' '.join(['Filtered a total of', str(t_end_comb.shape[0] - t_0_comb.shape[0]), "out of", str(t_end_comb.shape[0]), "sgRNAs."]))
    print("\n---\n")
        
    # add pseudocount of 10 after filtering
    t_0_comb = t_0_comb + 10
    t_end_comb = t_end_comb + 10
    
    # some sgRNAs were filtered out
    overlapping_sgRNAs = sorted(list(set(t_0_comb.index).intersection(set(t_end_comb.index))))
    
    t_0_comb = t_0_comb.loc[overlapping_sgRNAs,:]
    t_end_comb = t_end_comb.loc[overlapping_sgRNAs,:]
    curr_counts = curr_counts.loc[overlapping_sgRNAs,:]
    
    if full_normalization:
        print('Full normalization...')
        
        # normalize to the median of the all time points
        normalization_value = np.median(pd.concat([t_0_comb, t_end_comb], axis = 1).sum(axis = 0))

        t_0_comb = normalize_counts(t_0_comb, set_normalization = normalization_value)
        t_end_comb = normalize_counts(t_end_comb, set_normalization = normalization_value)
        
    else:
        print('Not full normalization...')
    
        for subset in set(curr_counts['target_type']):
            idx = curr_counts.loc[curr_counts['target_type'] == subset,:].index

            # normalize to the median of the all time points
            normalization_value = np.median(pd.concat([t_0_comb.loc[idx,:], t_end_comb.loc[idx,:]], axis = 1).sum(axis = 0))

            t_0_comb.loc[idx,:] = normalize_counts(t_0_comb.loc[idx,:], set_normalization = normalization_value)
            t_end_comb.loc[idx,:] = normalize_counts(t_end_comb.loc[idx,:], set_normalization = normalization_value)
    
    # if mismatch, average
    if t_0_comb.shape[1] != t_end_comb.shape[1]:
        print("Mismatch times, averaging...")
        t_0_comb = pd.DataFrame(data = t_0_comb.apply(lambda x: np.mean(x), axis = 1).values,
                     index = t_0_comb.index)
        t_end_comb = pd.DataFrame(data = t_end_comb.apply(lambda x: np.mean(x), axis = 1).values,
             index = t_end_comb.index)
    
    # set FC
    curr_counts['FC'] = 0
    
    # add NOT sorted targets
#     curr_counts['sgRNA_pair'] = ['|'.join(sorted([curr_counts['sgRNA_guide_name_g1'].iloc[i], curr_counts['sgRNA_guide_name_g2'].iloc[i]])) for i in range(curr_counts.shape[0])]
#     curr_counts['gene_pair'] = ['|'.join(sorted([curr_counts['sgRNA_target_name_g1'].iloc[i], curr_counts['sgRNA_target_name_g2'].iloc[i]])) for i in range(curr_counts.shape[0])]
    
    # add sorted targets
    sorted_gene_pairs, sorted_gene_guides = sort_pairs_and_guides(curr_counts.copy())
    curr_counts['sgRNA_pair'] = sorted_gene_guides
    curr_counts['gene_pair'] = sorted_gene_pairs
    
    # get count annotations
    count_annotations = curr_counts.loc[:,['sgRNA_guide_name_g1', 'sgRNA_guide_name_g2', 'sgRNA_target_name_g1', 'sgRNA_target_name_g2', 'target_type', 'sgRNA_pair', 'gene_pair']].copy()

    
    ######### /preprocessing
    
    print('Starting scoring..')
    
    replicate_results = []
    for i in range(t_0_comb.shape[1]):
        print('calculating for replicate ' + str(i))

        replicate_fc = pd.DataFrame(data = np.log2(t_end_comb.iloc[:, i]/t_0_comb.iloc[:, i]).values,
                                    index = t_end_comb.index,
                                    columns = ['FC'])

        # merge
        replicate_fc = replicate_fc.merge(count_annotations, left_index = True, right_index = True)

        # get the three target categories
        single = replicate_fc.loc[curr_counts['target_type'] == 'Single']
        dual = replicate_fc.loc[curr_counts['target_type'] == 'Dual']
        control = replicate_fc.loc[curr_counts['target_type'] == 'Control']

        ## proceed with GI calculation
        temp_repeat = single.copy()
        temp_repeat['sgRNA_guide_name_g1'] = single["sgRNA_guide_name_g2"]
        temp_repeat['sgRNA_target_name_g1'] = single["sgRNA_target_name_g2"]
        temp_repeat['sgRNA_guide_name_g2'] = single["sgRNA_guide_name_g1"]
        temp_repeat['sgRNA_target_name_g1'] = single["sgRNA_target_name_g2"]

        single_repeat = pd.concat([single, temp_repeat])

        # get single sgRNA impact
        EC_single = single_repeat.groupby("sgRNA_guide_name_g1")['FC'].apply(
                lambda x: np.median(x))
        sgRNA_SE = single_repeat.groupby("sgRNA_guide_name_g1")['FC'].apply(
            lambda x: median_SE_constant * np.sqrt(np.var(x) / np.size(x)))


        EC_control = None
        if control.shape[0] != 0:# and (study != 'parrish_data')

            temp_repeat = control.copy()
            temp_repeat['sgRNA_guide_name_g1'] = control["sgRNA_guide_name_g2"]
            temp_repeat['sgRNA_guide_name_g2'] = control["sgRNA_guide_name_g1"]

            EC_control = np.median(pd.concat([control, temp_repeat])['FC'])

        ## get all pairs
        all_pairs = set(dual['sgRNA_guide_name_g1']).union(set(dual['sgRNA_guide_name_g2']))

        missing_pairs = np.array(list(all_pairs.difference(set(EC_single.index))))

        print(' '.join(["Filtered single sgRNA count:", str(len(set(missing_pairs)))]))

        # add them as 0s

        EC_single = pd.concat([EC_single, pd.Series(index = missing_pairs, data = np.zeros(len(missing_pairs)))])
        sgRNA_SE = pd.concat([sgRNA_SE, pd.Series(index = missing_pairs, data = np.zeros(len(missing_pairs)))])

        sgRNA_level_scores = dual.groupby(['gene_pair', 'sgRNA_pair'], as_index = False)['FC'].apply(lambda x: np.mean(x))
        sgRNA_level_SE = dual.groupby(['gene_pair', 'sgRNA_pair'], as_index = False)['FC'].apply(lambda x: np.sqrt(np.var(x) / np.size(x)))

        guide_1 = np.array([i.split('|')[0] for i in sgRNA_level_scores['sgRNA_pair']])
        guide_2 = np.array([i.split('|')[1] for i in sgRNA_level_scores['sgRNA_pair']])
        EC_1 = EC_single[guide_1]
        EC_2 = EC_single[guide_2]

        SE_1 = sgRNA_SE[guide_1]
        SE_2 = sgRNA_SE[guide_2]

        sgRNA_level_scores['SL'] = sgRNA_level_scores['FC'].values - EC_1.values - EC_2.values
        sgRNA_level_scores['SE'] = np.sqrt(np.square(sgRNA_level_SE['FC'].values) + np.square(SE_1.values) + np.square(SE_2.values))
        sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
        sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
        sgRNA_level_scores['Z-Score'] = sgRNA_level_scores['SL'].values/sgRNA_level_scores['SE'].values

        gene_SL_scores_nobackground = sgRNA_level_scores.groupby('gene_pair')['Z-Score'].apply(lambda x: np.median(x))
        gene_SL_scores_SE = sgRNA_level_scores.groupby('gene_pair')['Z-Score'].apply(lambda x:  median_SE_constant * np.sqrt(np.var(x) / np.size(x)))
        gene_SL_scores_SE.loc[gene_SL_scores_SE.isna()] = 1
        gene_SL_scores_SE.loc[gene_SL_scores_SE == 0] = 1
        gene_SL_scores_nobackground_Z = gene_SL_scores_nobackground/gene_SL_scores_SE

        results_nb = pd.concat([gene_SL_scores_nobackground, gene_SL_scores_SE, gene_SL_scores_nobackground_Z], axis = 1)
        results_nb.columns = ['sgRNA-Score-NB_' + str(i), 'sgRNA-Score-NB SE_' + str(i), 'sgRNA-Score-NB SL_' + str(i)]

        replicate_results.append(results_nb)

        if EC_control is not None:

            single['FC'] = single['FC'] - EC_control
            dual['FC'] = dual['FC'] - EC_control

            ## proceed with GI calculation
            temp_repeat = single.copy()
            temp_repeat['sgRNA_guide_name_g1'] = single["sgRNA_guide_name_g2"]
            temp_repeat['sgRNA_target_name_g1'] = single["sgRNA_target_name_g2"]
            temp_repeat['sgRNA_guide_name_g2'] = single["sgRNA_guide_name_g1"]
            temp_repeat['sgRNA_target_name_g1'] = single["sgRNA_target_name_g2"]

            single_repeat = pd.concat([single, temp_repeat])

            # get single sgRNA impact
            EC_single = single_repeat.groupby("sgRNA_guide_name_g1")['FC'].apply(
                    lambda x: np.median(x))
            sgRNA_SE = single_repeat.groupby("sgRNA_guide_name_g1")['FC'].apply(
                lambda x: median_SE_constant * np.sqrt(np.var(x) / np.size(x)))


            EC_control = None
            if control.shape[0] != 0:# and (study != 'parrish_data')

                temp_repeat = control.copy()
                temp_repeat['sgRNA_guide_name_g1'] = control["sgRNA_guide_name_g2"]
                temp_repeat['sgRNA_guide_name_g2'] = control["sgRNA_guide_name_g1"]

                EC_control = np.median(pd.concat([control, temp_repeat])['FC'])

            ## get all pairs
            all_pairs = set(dual['sgRNA_guide_name_g1']).union(set(dual['sgRNA_guide_name_g2']))

            missing_pairs = np.array(list(all_pairs.difference(set(EC_single.index))))

            print(' '.join(["Filtered single sgRNA count:", str(len(set(missing_pairs)))]))

            # add them as 0s

            EC_single = pd.concat([EC_single, pd.Series(index = missing_pairs, data = np.zeros(len(missing_pairs)))])
            sgRNA_SE = pd.concat([sgRNA_SE, pd.Series(index = missing_pairs, data = np.zeros(len(missing_pairs)))])

            sgRNA_level_scores = dual.groupby(['gene_pair', 'sgRNA_pair'], as_index = False)['FC'].apply(lambda x: np.mean(x))
            sgRNA_level_SE = dual.groupby(['gene_pair', 'sgRNA_pair'], as_index = False)['FC'].apply(lambda x: np.sqrt(np.var(x) / np.size(x)))

            guide_1 = np.array([i.split('|')[0] for i in sgRNA_level_scores['sgRNA_pair']])
            guide_2 = np.array([i.split('|')[1] for i in sgRNA_level_scores['sgRNA_pair']])
            EC_1 = EC_single[guide_1]
            EC_2 = EC_single[guide_2]

            SE_1 = sgRNA_SE[guide_1]
            SE_2 = sgRNA_SE[guide_2]

            sgRNA_level_scores['SL'] = sgRNA_level_scores['FC'].values - EC_1.values - EC_2.values
            sgRNA_level_scores['SE'] = np.sqrt(np.square(sgRNA_level_SE['FC'].values) + np.square(SE_1.values) + np.square(SE_2.values))
            sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
            sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
            sgRNA_level_scores['Z-Score'] = sgRNA_level_scores['SL'].values/sgRNA_level_scores['SE'].values

            gene_SL_scores_w_background = sgRNA_level_scores.groupby('gene_pair')['Z-Score'].apply(lambda x: np.median(x))
            gene_SL_scores_SE = sgRNA_level_scores.groupby('gene_pair')['Z-Score'].apply(lambda x:  median_SE_constant * np.sqrt(np.var(x) / np.size(x)))
            gene_SL_scores_SE.loc[gene_SL_scores_SE.isna()] = 1
            gene_SL_scores_SE.loc[gene_SL_scores_SE == 0] = 1
            gene_SL_scores_w_background_Z = gene_SL_scores_w_background/gene_SL_scores_SE


            results_b = pd.concat([gene_SL_scores_w_background, gene_SL_scores_SE, gene_SL_scores_w_background_Z], axis = 1)
            results_b.columns = ['sgRNA-Score-B_' + str(i), 'sgRNA-Score-B SE_' + str(i), 'sgRNA-Score-B SL_' + str(i)]

            replicate_results.append(results_b)

    # save results
    results = {}
    results['SGRA_DERIVED_NB_SCORE'] = None
    results['SGRA_DERIVED_B_SCORE'] = None

    merged = pd.concat(replicate_results, axis = 1)
    
    # sort the names
    merged.index = ['|'.join(sorted(i.split('|'))) for i in merged .index]

    merged['sgRNA-Score_Average_NB'] = merged.loc[:,['sgRNA-Score-NB SL_' + str(i) for i in range(t_end_comb.shape[1])]].mean(axis = 1)
    results['SGRA_DERIVED_NB_SCORE'] = pd.DataFrame(merged['sgRNA-Score_Average_NB'])
    results['SGRA_DERIVED_NB_SCORE'].columns = ['SL_score']
    results['SGRA_DERIVED_NB_SCORE']['Gene 1'] = [i.split('|')[0] for i in results['SGRA_DERIVED_NB_SCORE'].index]
    results['SGRA_DERIVED_NB_SCORE']['Gene 2'] = [i.split('|')[1] for i in results['SGRA_DERIVED_NB_SCORE'].index]

    if 'sgRNA-Score-B_0' in merged.columns:
        merged['sgRNA-Score_Average_B'] = merged.loc[:,['sgRNA-Score-B SL_' + str(i) for i in range(t_end_comb.shape[1])]].mean(axis = 1)
        results['SGRA_DERIVED_B_SCORE'] = pd.DataFrame(merged['sgRNA-Score_Average_B'])
        results['SGRA_DERIVED_B_SCORE'].columns = ['SL_score']
        results['SGRA_DERIVED_B_SCORE']['Gene 1'] = [i.split('|')[0] for i in results['SGRA_DERIVED_B_SCORE'].index]
        results['SGRA_DERIVED_B_SCORE']['Gene 2'] = [i.split('|')[1] for i in results['SGRA_DERIVED_B_SCORE'].index]

    return(results)


In [24]:
def add_to_fp_and_list(command, cmd_list, fp):
    # write, and add to list
    fp.write(command + '\n')
    cmd_list.append(command)

In [25]:
def run_mageck_score(curr_counts, curr_study, curr_cl, save_dir = 'MAGECK_Files'):

    print('Running mageck score...')

    # !no preprocessing!
    T0_counts, TEnd_counts = get_raw_counts(curr_counts)

    # due to mageck, don't have any comma on columns
    T0_counts.columns = ['T0_' + str(i) for i in range(T0_counts.shape[1])]
    TEnd_counts.columns = ['TEnd_' + str(i) for i in range(TEnd_counts.shape[1])]

    # get the annotations
    curr_counts['sgRNA_pair'] = ['|'.join([curr_counts['sgRNA_guide_name_g1'].iloc[i], curr_counts['sgRNA_guide_name_g2'].iloc[i]]) for i in range(curr_counts.shape[0])]
    curr_counts['gene_pair'] = ['|'.join([curr_counts['sgRNA_target_name_g1'].iloc[i], curr_counts['sgRNA_target_name_g2'].iloc[i]]) for i in range(curr_counts.shape[0])]

    curr_counts['sgRNA_pair_mageck_id'] = curr_counts['sgRNA_pair'].values + "|" + np.array(range(curr_counts.shape[0]), dtype = str)

    # combine them
    comb = pd.concat([curr_counts.loc[:, ['sgRNA_pair_mageck_id', 'gene_pair']], T0_counts, TEnd_counts], axis = 1)
    comb = comb.fillna(0)

    ######### save

    # get save location 
    save_loc = os.path.join(os.getcwd(), save_dir, curr_study, curr_cl)
    os.makedirs(save_loc, exist_ok = True)

    # save the counts
    comb.to_csv(os.path.join(save_loc, "counts.csv"), sep = ',', index = False)

    ######### /save

    ######### create script and run

    file_loc = os.path.join(save_loc, 'MAGECK_commands.sh')
    control_loc = '../../../controls.txt'

    fp = open(file_loc, '+w')

    cmd_list = []

    add_to_fp_and_list("#!/bin/sh", cmd_list, fp)
    add_to_fp_and_list("module load python/3.9-2022.05", cmd_list, fp)
    add_to_fp_and_list("source activate cnn_training", cmd_list, fp)

    # get index of last time point columns

    t_end_col_locs = []
    for i in range(comb.shape[1]):
        if comb.columns[i] in TEnd_counts.columns:
            t_end_col_locs.append(str(i-2))

    add_to_fp_and_list("cd \"" + os.path.join(os.getcwd(), save_loc) + "\"", cmd_list, fp)

    command = "mageck test -k counts.csv -t \"" + ','.join(t_end_col_locs) + "\" --normcounts-to-file -n out --pdf-report"
    if 'Control' in set(curr_counts['target_type']):
        command = "mageck test -k counts.csv -t \"" + ','.join(t_end_col_locs) + "\" --norm-method control --control-gene " + control_loc + " --normcounts-to-file -n out --pdf-report"

    add_to_fp_and_list(command, cmd_list, fp)
    fp.close()

    # set chmod
    os.chmod(file_loc, 0o0777)
    
    # scores have already been computed
    if os.path.exists(os.path.join(save_loc, "out.sgrna_summary.txt")):
        print('Scores exist!')
    else:
        print("Running mageck...")
        process = subprocess.run([file_loc], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if process.returncode == 1:
            print("Error in mageck!!!")
            return(process.stdout.splitlines())
        else:
            print("Finished running mageck!")


    ######### load results

    print('Loading computed results...')

    res = pd.read_csv(os.path.join(save_loc, "out.sgrna_summary.txt"), index_col = 0, sep = "\t")
    res.index = ['|'.join(i.split('|')[:len(i.split('|'))-1]) for i in res.index]

    gene1 = np.array([genes.split('|')[0] for genes in res['Gene']])
    gene2 = np.array([genes.split('|')[1] for genes in res['Gene']])

    res_df = pd.DataFrame(data = {'Gene Pair': res['Gene'],
                                  'Gene 1' : gene1,
                                  'Gene 2' : gene2,
                                  'MAGECK-FC': res['LFC']})

    # add a column for gene pairs, but sorted
    sorted_pairs = []
    sorted_gene_1 = []
    sorted_gene_2 = []
    for i in range(res_df.shape[0]):

        gene_1 = str(res_df["Gene 1"].iloc[i])
        gene_2 = str(res_df["Gene 2"].iloc[i])

        t_gene_1, t_gene_2 = sorted([gene_1, gene_2])


        if (t_gene_1 == gene_1) and (t_gene_2 == gene_2):
            gene_1 = t_gene_1
            gene_2 = t_gene_2
        else:
            gene_1 = t_gene_1
            gene_2 = t_gene_2

        pair = '|'.join([gene_1, gene_2])

        sorted_gene_1.append(gene_1)
        sorted_gene_2.append(gene_2)

        sorted_pairs.append(pair)
    res_df['Gene Pair'] = sorted_pairs
    res_df['Gene 1'] = sorted_gene_1
    res_df['Gene 2'] = sorted_gene_2

    # get dual controls and remove them
    dual_controls_idx = np.array([True if i == 'CONTROL|CONTROL' else False for i in res_df['Gene Pair']])
    dual_controls = res_df.loc[dual_controls_idx]
    res_df = res_df.loc[~dual_controls_idx]

    # get single targets and remove them
    singles_idx = np.array([True if i == 'CONTROL' else False for i in res_df['Gene 1']]) | np.array([True if i == 'CONTROL' else False for i in res_df['Gene 2']]) | (res_df['Gene 1'] == res_df['Gene 2'])
    singles = res_df.loc[singles_idx]
    res_df = res_df.loc[~singles_idx]

    temp_repeat = singles.copy()
    temp_repeat['Gene 1'] = singles["Gene 2"]
    temp_repeat['Gene 2'] = singles["Gene 1"]

    single_repeat = pd.concat([singles, temp_repeat])

    # now res_df contains strictly dual targets
    ## calculate SL scores
    gene_pair_SL = res_df.groupby('Gene Pair')['MAGECK-FC'].apply(lambda x: np.median(x))
    gene_pair_SE = res_df.groupby('Gene Pair')['MAGECK-FC'].apply(lambda x: np.var(x) / np.size(x))

    gene_SL = single_repeat.groupby("Gene 1")['MAGECK-FC'].apply(
        lambda x: np.median(x))
    gene_SE = single_repeat.groupby("Gene 1")['MAGECK-FC'].apply(
        lambda x: np.var(x) / np.size(x))

    genes_1 = np.array([i.split('|')[0] for i in gene_pair_SL.index])
    genes_2 = np.array([i.split('|')[1] for i in gene_pair_SL.index])

    all_genes = set(genes_1).union(set(genes_2))
    missing_genes = all_genes.difference(set(gene_SL.index))
    print(' '.join(["Filtered gene count:", str(len(set(missing_genes)))]))

    # add them as 0s
    gene_SL = pd.concat([gene_SL, pd.Series(index = missing_genes, data = np.zeros(len(missing_genes)))])
    gene_SE = pd.concat([gene_SE, pd.Series(index = missing_genes, data = np.zeros(len(missing_genes)))])

    mageck_SL = gene_pair_SL.values - gene_SL[genes_1].values - gene_SL[genes_2].values
    mageck_SE = np.sqrt(gene_pair_SE.values + gene_SE[genes_1].values + gene_SE[genes_2].values) * math.sqrt(2)
    mageck_Z = mageck_SL/mageck_SE

    mageck_results = pd.DataFrame(data = {'SL_score' : mageck_SL,
                                             'standard_error' : mageck_SE,
                                             'Z_SL_score' : mageck_Z,
                                             'Gene 1' : genes_1,
                                             'Gene 2' : genes_2}, index = ['|'.join(sorted(i.split('|'))) for i in gene_pair_SL.index])

    ######### /load results

    results = {}
    results['MAGECK_SCORE'] = mageck_results

    return(results)

In [26]:
def run_gemini_score(curr_counts, curr_study, curr_cl, save_dir = 'GEMINI_Files'):

    print('Running gemini score...')
    
    # !no preprocessing!
    T0_counts, TEnd_counts = get_raw_counts(curr_counts)

    T0_counts.columns = ['T0_' + str(i) for i in range(T0_counts.shape[1])]
    TEnd_counts.columns = ['TEnd_' + str(i) for i in range(TEnd_counts.shape[1])]
    
    # get save location 
    save_loc = os.path.join(os.getcwd(), save_dir, curr_study, curr_cl)
    os.makedirs(save_loc, exist_ok = True)
    
    # save the sequences
    study_sequences = pd.DataFrame({'Guide_ID' : curr_counts['sgRNA_guide_name_g1'].tolist() + curr_counts['sgRNA_guide_name_g2'].tolist(),
                                    'Sequence' : curr_counts['sgRNA_guide_seq_g1'].tolist() + curr_counts['sgRNA_guide_seq_g2'].tolist()})

    study_sequences.drop_duplicates(subset = 'Sequence', inplace = True)
    study_sequences.index = study_sequences['Guide_ID']
    study_sequences.to_csv(os.path.join(save_loc, "sequences.csv"), sep = ',', index = False)

    # save guidexgene annotation
    guide_gene_annotation = pd.DataFrame({'Sequences Comb': curr_counts[['sgRNA_guide_seq_g1', 'sgRNA_guide_seq_g2']].agg(';'.join, axis = 1),
                                          'Gene 1': curr_counts['sgRNA_target_name_g1'],
                                          'Gene 2': curr_counts['sgRNA_target_name_g2']})

    gemini_counts = pd.DataFrame(guide_gene_annotation['Sequences Comb'].copy())
    gemini_counts = gemini_counts.merge(T0_counts, how = 'left', right_index = True, left_index = True)
    gemini_counts = gemini_counts.merge(TEnd_counts, how = 'left', right_index = True, left_index = True)

    guide_gene_annotation.reset_index(drop = True, inplace = True)
    guide_gene_annotation.to_csv(os.path.join(save_loc, "guide_gene_annotation.csv"), sep = ',', index = False)

    # save counts
    gemini_counts.reset_index(drop = True, inplace = True)
    gemini_counts.to_csv(os.path.join(save_loc, "counts.csv"), sep = ',', index = False)
    
    # write a gemini bash file
    file_loc = os.path.join(save_loc, 'GEMINI_commands.sh')
    fp = open(file_loc, '+w')
    fp.write("#!/bin/sh\n")
    fp.write('module load R/4.1.0-gnu9.1\n')
    fp.write('Rscript --vanilla GEMINI.R --args ' + os.path.join(save_dir, curr_study, curr_cl) + '\n')
    fp.close()

    # set chmod
    os.chmod(file_loc, 0o0777)

    
    #### scoring
    
    # scores have already been computed
    if os.path.exists(os.path.join(save_loc, 'GEMINI_Scores.csv')):
        print('Scores exist!')
    else:
        print("Running GEMINI...")
        process = subprocess.run([file_loc], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if process.returncode == 1:
            print("Error in GEMINI!!!")
            return(process.stdout.splitlines())
        else:
            print("Finished running GEMINI!")

        
    ######### load results
    
    res = pd.read_csv(os.path.join(save_loc, 'GEMINI_Scores.csv'), index_col = 0)

    res.index = ['|'.join(sorted(i.split(';'))) for i in res.index]
    res.columns = ['GEMINI Score']

    # # get only dual SL
    only_dual_idx = [False if 'CONTROL' in i else True for i in res.index]
    res = res.loc[only_dual_idx]

    # set results
    gemini_results = pd.DataFrame(data = {'SL_score' : res['GEMINI Score'].values,
                                             'Gene 1' : [i.split('|')[0] for i in res.index],
                                             'Gene 2' : [i.split('|')[1] for i in res.index]}, index = ['|'.join(sorted(i.split('|'))) for i in res.index])

    
    
    ######### /load results

    results = {}
    results['GEMINI_SCORE'] = gemini_results
    
    return(results)

In [27]:
def add_table_to_db(curr_counts, curr_results, table_name, engine_link):
    
    print('---------ADDING-TO-DB---------')
    
    # print table
    print('Processing table for: ' + table_name)
    
    # add sorted targets
    # add a sorted gene pair column
    curr_counts['gene_pair'] = ['|'.join(sorted([curr_counts['sgRNA_target_name_g1'].iloc[i], curr_counts['sgRNA_target_name_g2'].iloc[i]])) for i in range(curr_counts.shape[0])]

    # remove the same ones
    curr_results = curr_results.loc[curr_results['Gene 1'] != curr_results['Gene 2'],:]

    # keep only score columns
    curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')

    # make the index
    # curr_counts['gene_pair'] = ['|'.join(sorted([curr_counts['sgRNA_target_name_g1'].iloc[i], curr_counts['sgRNA_target_name_g2'].iloc[i]])) for i in range(curr_counts.shape[0])]

    # merge and get final table
    curr_results = curr_results.merge(curr_counts.drop_duplicates(subset = 'gene_pair'), how = 'left', left_index = True, right_on ='gene_pair').loc[:, ['gene_pair_id'] + list(curr_results.columns)]

    # first, get the metadata
    db_metadata = sqlalchemy.MetaData(bind=engine_link)
    db_metadata.reflect(engine_link)

    # access the tables
    curr_table = db_metadata.tables[table_name]

    # then, start the session
    engine_session = sessionmaker(bind=engine_link)
    curr_session = engine_session()

    # get number of records in each table
    curr_records_num = curr_session.query(curr_table).count()

    curr_results.reset_index(drop = True, inplace = True)
    curr_results.index += curr_records_num
    # set index
    curr_results['id'] = curr_results.index
    
    if curr_results['gene_pair_id'].isna().sum() > 0:
        print('NA found')
        return()
        
    with engine_link.begin() as transaction:
        # insert sequence
        print('Beginning transaction...')

        # insert scores
        curr_results.to_sql(name = table_name, con = transaction, if_exists = 'append', index = False, index_label = 'id')

        print('Successfully inserted!')

        print('Added Record stats...')
        print(' '.join(['Score insert:', str(curr_results.shape[0])]))

In [28]:
def check_if_added_to_table(curr_counts, table_name, engine_link):
    print('Checking if score already computed: ' + table_name)
    
    # get available results
    res = pd.read_sql_table(table_name, engine_link, index_col = 'id')
    
    if res.shape[0] == 0:
        # none added, so proceed
        return(False)
    else:
        # get all IDs
        available_ids = set(res['gene_pair_id']).difference(set(curr_counts['gene_pair_id']))
        
        # already added
        if len(available_ids) == 0:
            print('Scores already in database!')
            print('Inserted scores: ' + str(len(set(curr_counts['gene_pair_id']))))
            print('---------NOT-TO-DB---------')
            return(True)
        elif len(available_ids) == len(set(res['gene_pair_id'])):
            # none added, so proceed
            return(False)
        else:
            print('Scores already in database?')
            print('Inserted scores?: ' + str(res.shape[0]))
            print('---------NOT-TO-DB---------')
            return(True)

In [29]:
def query_result_table(curr_counts, table_name, curr_study, curr_cl, engine_link):
    
    print('Accessing table: ' + table_name)
    
    # get available results
    res = pd.read_sql_table(table_name, engine_link, index_col = 'id')
    
    # possible gene pairs
    curr_counts['gene_pair'] = ['|'.join(sorted([curr_counts['sgRNA_target_name_g1'].iloc[i], curr_counts['sgRNA_target_name_g2'].iloc[i]])) for i in range(curr_counts.shape[0])]

    # get results
    query_res = curr_counts.loc[curr_counts['target_type'] == 'Dual', ['gene_pair', 'gene_pair_id']].drop_duplicates(subset = ['gene_pair_id'])
    query_res = query_res.merge(res, left_on = 'gene_pair_id', right_on = 'gene_pair_id').drop('gene_pair_id', axis = 1)
    
    # add column names to the front
    names_dict = {i: table_name + '_' + i for i in query_res.columns[1:]}
    query_res.rename(columns = names_dict, inplace = True)
    #query_res.columns[1:] = table_name + '_' + query_res.columns[1:]
    
    print('Available gene pairs: ' + str(query_res.shape[0]))
    
    # add name of study
    query_res['study_origin'] = curr_study
    
    # add name of cell line
    query_res['cell_line_origin'] = curr_cl
    
    return(query_res)

In [30]:
available_studies = set(counts['study_origin'])
available_studies = sorted([rev_study_name_to_pubmed_id[i] for i in set(counts['study_origin'])])

In [31]:
available_studies

['diehl_data',
 'horlbeck_data',
 'najm_data',
 'parrish_data',
 'shantang_data',
 'wong_data',
 'zhao_data']

In [32]:
# store_results = {}
# horlbeck_processed = False
# for curr_study in available_studies:
#     store_results[curr_study] = {}
#     print('Working on study: ' + curr_study)

#     # get study counts and seq
#     study_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study]].copy()

#     curr_seq_ids = np.array(sorted(list(set(study_counts['guide_1_id'].tolist() + study_counts['guide_2_id'].tolist()))))
#     study_sequences = experiment_design.loc[curr_seq_ids]

#     # the analysis runs for each individual cell line
#     available_cell_lines = set(study_counts['cell_line_origin'])

#     for curr_cl in available_cell_lines:
#         store_results[curr_study][curr_cl] = {}
#         print('Working on cell line: ' + curr_cl)
#         curr_counts = study_counts.loc[study_counts['cell_line_origin'] == curr_cl].copy()
        
#         # run horlbeck score
# #         if curr_study == 'horlbeck_data' and (not horlbeck_processed):
# #             # Horlbeck does additional filtering on JURKAT based on K562
# #             JURKAT_counts = study_counts.loc[study_counts['cell_line_origin'] == 'JURKAT'].copy()
# #             K562_counts = study_counts.loc[study_counts['cell_line_origin'] == 'K562'].copy()

# #             #if (not check_if_added_to_table(JURKAT_counts.copy(), 'HORLBECK_SCORE', SLKB_engine)) and (not check_if_added_to_table(K562_counts.copy(), 'HORLBECK_SCORE', SLKB_engine)):

# #             # do the preprocessing
# #             JURKAT_counts = run_horlbeck_preprocessing(JURKAT_counts, filterThreshold = 35, pseudocount = 10)
# #             K562_counts = run_horlbeck_preprocessing(K562_counts, filterThreshold = 35, pseudocount = 10)

# #             # do the additional filtering
# #             singles_K562 = JURKAT_counts.loc[JURKAT_counts['target_type'] == 'Single'].copy().dropna()
# #             singles_JURKAT = K562_counts.loc[K562_counts['target_type'] == 'Single'].copy().dropna()

# #             a_average_K562 = singles_K562[singles_K562['sgRNA_target_name_g2'] == "CONTROL"]
# #             a_average_K562 = pd.DataFrame(a_average_K562.groupby('sgRNA_guide_name_g1')['FC_Averaged_abbaAveraged'].apply(np.mean))

# #             a_average_JURKAT = singles_JURKAT[singles_JURKAT['sgRNA_target_name_g2'] == "CONTROL"]
# #             a_average_JURKAT = pd.DataFrame(a_average_JURKAT.groupby('sgRNA_guide_name_g1')['FC_Averaged_abbaAveraged'].apply(np.mean))

# #             a_average_JURKAT = a_average_JURKAT.join(a_average_K562, rsuffix = '_K562')

# #             filter_criteria = a_average_JURKAT.loc[((a_average_JURKAT['FC_Averaged_abbaAveraged'] > -0.025) & (a_average_JURKAT['FC_Averaged_abbaAveraged_K562'] < -0.05))]

# #             additionally_filtered = list(filter_criteria.index)

# #             to_be_dropped = JURKAT_counts['sgRNA_guide_name_g1'].isin(additionally_filtered) | JURKAT_counts['sgRNA_guide_name_g2'].isin(additionally_filtered)

# #             # drop them
# #             JURKAT_counts = JURKAT_counts.loc[~to_be_dropped, :]

# #             # now, proceed to run horlbeck score
# #             JURKAT_res = run_horlbeck_score(JURKAT_counts.copy(), do_preprocessing = False)
# #             K562_res = run_horlbeck_score(K562_counts.copy(), do_preprocessing = False)

# #             store_results[curr_study]['JURKAT'].update(JURKAT_res)
# #             store_results[curr_study]['K562'].update(K562_res)

# #             add_table_to_db(JURKAT_counts.copy(), JURKAT_res['HORLBECK_SCORE'], 'HORLBECK_SCORE', SLKB_engine)
# #             add_table_to_db(K562_counts.copy(), K562_res['HORLBECK_SCORE'], 'HORLBECK_SCORE', SLKB_engine)

# #             horlbeck_processed = True
# # #             else:
# # #                 print('Skipping horlbeck score...horlbeck case')

# #         elif curr_study == 'diehl_data':
# # #             if not check_if_added_to_table(curr_counts.copy(), 'HORLBECK_SCORE', SLKB_engine):
# #                 # very low counts, set the threshold to be 0
# #             temp_counts = run_horlbeck_preprocessing(curr_counts.copy(), filterThreshold = 0, pseudocount = 10)
# #             horlbeck_res = run_horlbeck_score(temp_counts.copy(), do_preprocessing = False)
# #             store_results[curr_study][curr_cl].update(horlbeck_res)
# #             add_table_to_db(curr_counts.copy(), horlbeck_res['HORLBECK_SCORE'], 'HORLBECK_SCORE', SLKB_engine)
# # #             else:
# # #                 print('Skipping horlbeck score...')
# #         else:
# #             if curr_study != 'horlbeck_data':
# # #                 if not check_if_added_to_table(curr_counts.copy(), 'HORLBECK_SCORE', SLKB_engine):
# #                 horlbeck_res = run_horlbeck_score(curr_counts.copy(), do_preprocessing = True)
# #                 store_results[curr_study][curr_cl].update(horlbeck_res)
# #                 add_table_to_db(curr_counts.copy(), horlbeck_res['HORLBECK_SCORE'], 'HORLBECK_SCORE', SLKB_engine)
# # #                 else:
# # #                     print('Skipping horlbeck score...')
    

#         # run MAGeCK
# #         if not check_if_added_to_table(curr_counts.copy(), 'MAGECK_SCORE', SLKB_engine):
#         mageck_res = run_mageck_score(curr_counts.copy(), curr_study, curr_cl, save_dir = 'MAGECK_Files')
#         store_results[curr_study][curr_cl].update(mageck_res)
#         add_table_to_db(curr_counts.copy(), mageck_res['MAGECK_SCORE'], 'MAGECK_SCORE', SLKB_engine)
            
# #         else:
# #             print('Skipping MAGeCK score...')

#         # run median scores
# #         if not check_if_added_to_table(curr_counts.copy(), 'MEDIAN_NB_SCORE', SLKB_engine):
#         median_res = run_median_scores(curr_counts.copy(), full_normalization = False)
#         store_results[curr_study][curr_cl].update(median_res)
#         add_table_to_db(curr_counts.copy(), median_res['MEDIAN_NB_SCORE'], 'MEDIAN_NB_SCORE', SLKB_engine)
#         if median_res['MEDIAN_B_SCORE'] is not None:
#             add_table_to_db(curr_counts.copy(), median_res['MEDIAN_B_SCORE'], 'MEDIAN_B_SCORE', SLKB_engine)
# #         else:
# #             print('Skipping median score...')

#         # run median scores
#         #if not check_if_added_to_table(curr_counts.copy(), 'MEDIAN_NB_SCORE_FULL_NORM', SLKB_engine):
#         median_res2 = run_median_scores(curr_counts.copy(), full_normalization = True)
#         store_results[curr_study][curr_cl].update(median_res2)
#         add_table_to_db(curr_counts.copy(), median_res2['MEDIAN_NB_SCORE'], 'MEDIAN_NB_SCORE_FULL_NORM', SLKB_engine)
#         if median_res2['MEDIAN_B_SCORE'] is not None:
#             add_table_to_db(curr_counts.copy(), median_res2['MEDIAN_B_SCORE'], 'MEDIAN_B_SCORE_FULL_NORM', SLKB_engine)
# #         else:
# #             print('Skipping median score...')

#         # run sgRNA scores
# #         if not check_if_added_to_table(curr_counts.copy(), 'SGRA_DERIVED_NB_SCORE', SLKB_engine):
#         sgRNA_res = run_sgrna_scores(curr_counts.copy(), full_normalization = False)
#         store_results[curr_study][curr_cl].update(sgRNA_res)
#         add_table_to_db(curr_counts.copy(), sgRNA_res['SGRA_DERIVED_NB_SCORE'], 'SGRA_DERIVED_NB_SCORE', SLKB_engine)
#         if sgRNA_res['SGRA_DERIVED_B_SCORE'] is not None:
#             add_table_to_db(curr_counts.copy(), sgRNA_res['SGRA_DERIVED_B_SCORE'], 'SGRA_DERIVED_B_SCORE', SLKB_engine)
# #         else:
# #             print('Skipping sgRNA score...')

# #         if not check_if_added_to_table(curr_counts.copy(), 'SGRA_DERIVED_NB_SCORE_FULL_NORM', SLKB_engine):
#         sgRNA_res2 = run_sgrna_scores(curr_counts.copy(), full_normalization = True)
#         store_results[curr_study][curr_cl].update(sgRNA_res2)
#         add_table_to_db(curr_counts.copy(), sgRNA_res2['SGRA_DERIVED_NB_SCORE'], 'SGRA_DERIVED_NB_SCORE_FULL_NORM', SLKB_engine)
#         if sgRNA_res2['SGRA_DERIVED_B_SCORE'] is not None:
#             add_table_to_db(curr_counts.copy(), sgRNA_res2['SGRA_DERIVED_B_SCORE'], 'SGRA_DERIVED_B_SCORE_FULL_NORM', SLKB_engine)
# #         else:
# #             print('Skipping sgRNA score...')

#         # run GEMINI
#         #if not check_if_added_to_table(curr_counts.copy(), 'GEMINI_SCORE', SLKB_engine):
#         gemini_res = run_gemini_score(curr_counts.copy(), curr_study, curr_cl, save_dir = 'GEMINI_Files')
#         store_results[curr_study][curr_cl].update(gemini_res)
#         add_table_to_db(curr_counts.copy(), gemini_res['GEMINI_SCORE'], 'GEMINI_SCORE', SLKB_engine)
# #         else:
# #             print('Skipping GEMINI score...')


In [33]:
for curr_study in available_studies:
    print('Working on study: ' + curr_study)

    # get study counts and seq
    study_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study]].copy()

    curr_seq_ids = np.array(sorted(list(set(study_counts['guide_1_id'].tolist() + study_counts['guide_2_id'].tolist()))))
    study_sequences = experiment_design.loc[curr_seq_ids]

    # the analysis runs for each individual cell line
    available_cell_lines = set(study_counts['cell_line_origin'])

    for curr_cl in available_cell_lines:
        print('Working on cell line: ' + curr_cl)
        curr_counts = study_counts.loc[study_counts['cell_line_origin'] == curr_cl].copy()

        # run median scores
        if not check_if_added_to_table(curr_counts.copy(), 'MEDIAN_NB_SCORE', SLKB_engine):
            median_res = run_median_scores(curr_counts.copy(), full_normalization = False)
            add_table_to_db(curr_counts.copy(), median_res['MEDIAN_NB_SCORE'], 'MEDIAN_NB_SCORE', SLKB_engine)
            if median_res['MEDIAN_B_SCORE'] is not None:
                add_table_to_db(curr_counts.copy(), median_res['MEDIAN_B_SCORE'], 'MEDIAN_B_SCORE', SLKB_engine)
        else:
            print('Skipping median score...')
            
        print('---------------------')


Working on study: diehl_data
Working on cell line: RPE1
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 9757 out of 248191 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
ctrl_1    3434902.0
ctrl_2    2905388.0
dtype: float64
Normalize based on a specific value... 59882.0 counts
Normalization enabled...
Current counts:
rep_1    43897.0
rep_2    59882.0
rep_3    23578.0
dtype: float64
Normalize based on a specific value... 59882.0 counts
Normalization enabled...
Current counts:
ctrl_1    49753319.0
ctrl_2    41904226.0
dtype: float64
Normalize based on a specific value... 1621160.0 counts
Normalization enabled...
Current counts:
rep_1    1621160.0
rep_2    1589753.0
rep_3     550919.0
dtype: float64
Normalize based on a specific value... 1621160.0 counts
Normalization enabled...
Current counts:
ctrl_1    171179130.0
ctrl_2    143161060.0
dtype

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Filtered gene count: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-dual-IS'] = dual['FC'].values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-1'] = EC_1.values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-2'] = EC_2.values - control_median
A value is 

Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MEDIAN_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 10720
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 10720
---------------------
Working on study: horlbeck_data
Working on cell line: JURKAT
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 128359 out of 1044484 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
JURKAT_tripleseq,T0,rep1    67829.0
JURKAT_tripleseq,T0,rep2    52320.0
dtype: float64
Normalize based on a specific value... 89585.5 counts
Normalization enabled...
Current counts:
JURKAT_tripleseq,cyc,rep1    111342.0
JURKAT_tripleseq,cyc,rep2    130002.0
dtype: float64
Normalize based on a specific value... 8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Filtered single sgRNA count: 4
Filtered gene count: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-dual-IS'] = dual['FC'].values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-1'] = EC_1.values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-2'] = EC_2.values - control_median
A value is 

Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MEDIAN_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 110614
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 110614
---------------------
Working on cell line: K562
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 92141 out of 1044484 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
K562_barcode,T0,rep1     90567.0
K562_barcode,T0,rep2    105269.0
dtype: float64
Normalize based on a specific value... 339731.0 counts
Normalization enabled...
Current counts:
K562_tripleseq,cyc,rep1    627562.0
K562_tripleseq,cyc,rep2    574193.0
dtype: float64
Normalize based on a specific value... 339731.0 counts
Normalization enabled...
Cu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Filtered gene count: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-dual-IS'] = dual['FC'].values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-1'] = EC_1.values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-2'] = EC_2.values - control_median
A value is 

Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MEDIAN_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 110684
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 110684
---------------------
Working on study: najm_data
Working on cell line: HT29
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Removing NA replicate from TEnd...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 3609228.0 counts
Normalization enabled...
Current counts:
Rep_A_Reads    3009073.0
Rep_B_Reads    4006083.0
dtype: float64
Normalize based on a specific value... 3609228.0 counts
Normalization enabled...
Current 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: A549
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Removing NA replicate from TEnd...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 3933111.0 counts
Normalization enabled...
Current counts:
Rep_A_Reads    3933111.0
Rep_B_Reads    4567319.0
dtype: float64
Normalize based on a specific value... 3933111.0 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
dtype: float64
Normalize based on a specific value... 3650146.0 counts
Normalization enabled..

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: OVCAR8
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 2499117.5 counts
Normalization enabled...
Current counts:
Rep_A_Reads    1540891.0
Rep_B_Reads    1515861.0
Rep_C_Reads    3457344.0
dtype: float64
Normalize based on a specific value... 2499117.5 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
dtype: float64
Normalize based on a specific value... 1809065.5 counts
Normalization enabled...
Curren

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: A375
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 3141784.5 counts
Normalization enabled...
Current counts:
Rep_A_Reads    3399698.0
Rep_B_Reads    2883871.0
Rep_C_Reads    2112357.0
dtype: float64
Normalize based on a specific value... 3141784.5 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
dtype: float64
Normalize based on a specific value... 2540828.5 counts
Normalization enabled...
Current 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: 786O
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 4237644.0 counts
Normalization enabled...
Current counts:
Rep_A_Reads    6257808.0
Rep_B_Reads    4094431.0
Rep_C_Reads    4380857.0
dtype: float64
Normalize based on a specific value... 4237644.0 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
dtype: float64
Normalize based on a specific value... 3228455.0 counts
Normalization enabled...
Current 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: MELJUSO
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Removing NA replicate from TEnd...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 3643736.0 counts
Normalization enabled...
Current counts:
Rep_A_Reads    3643736.0
Rep_B_Reads    4225805.0
dtype: float64
Normalize based on a specific value... 3643736.0 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
dtype: float64
Normalize based on a specific value... 3020501.0 counts
Normalization enable

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on study: parrish_data
Working on cell line: HELA
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 0 out of 31833 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
plasmid_1    468996.67
plasmid_2    522803.80
plasmid_3    524885.28
dtype: float64
Normalize based on a specific value... 669420.108 counts
Normalization enabled...
Current counts:
LTP_1    813954.936
LTP_2    930812.115
LTP_3    936990.738
dtype: float64
Normalize based on a specific value... 669420.108 counts
Normalization enabled...
Current counts:
plasmid_1    1.583331e+07
plasmid_2    1.765058e+07
plasmid_3   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Filtered gene count: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-dual-IS'] = dual['FC'].values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-1'] = EC_1.values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-2'] = EC_2.values - control_median
A value is 

Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MEDIAN_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------------------
Working on cell line: PC9
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 0 out of 32740 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
plasmid_1    422252.993
plasmid_2    410554.421
plasmid_3    378813.398
dtype: float64
Normalize based on a specific value... 445240.2355 counts
Normalization enabled...
Current counts:
LTP_1    502102.581
LTP_2    515471.323
LTP_3    468227.478
dtype: float64
Normalize based on a specific value... 445240.2355 counts
Normalization enabled...
Current count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Filtered gene count: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-dual-IS'] = dual['FC'].values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-1'] = EC_1.values - control_median
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-B-single-IS-Guide-2'] = EC_2.values - control_median
A value is 

Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MEDIAN_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------------------
Working on study: shantang_data
Working on cell line: 22RV1
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 8134 out of 48931 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
T0_1    4174896.0
T0_2    4200469.0
dtype: float64
Normalize based on a specific value... 4771163.0 counts
Normalization enabled...
Current counts:
T12_1    5341857.0
T12_2    6456429.0
dtype: float64
Normalize based on a specific value... 4771163.0 counts
Normalization enabled...
Current counts:
T0_1    877394.0
T0_2 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Filtered gene count: 0
Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MEDIAN_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------------------
Working on study: wong_data
Working on cell line: OVCAR8
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 812 out of 23409 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
day5 (Replicate 1)    28372036.0
day5 (Replicate 2)    27787859.0
dtype: float64
Normalize based on a specific value... 28079947.5 counts
Normalization enabled...
Current counts:
day20 (Replicate 1)    32287131.0
day20 (Replicate 2)    23231108.0
dtype: float64
Normalize based on a specific value... 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MEDIAN_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------ADDING-TO-DB---------
Processing table for: MEDIAN_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------------------
Working on study: zhao_data
Working on cell line: HELA
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 662 out of 11934 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts    263437.0
Hela_MV4_d3_2_S2_trimmed53_len_filtered_counts    240792.0
dtype: float64
Normalize based on a specific value... 279991.0 counts
Normalization enabled...
Current counts:
Hela_MV4_d28_1_S5_trimmed53_len_filtered_counts    311450.0
Hela_MV4_d28_2_S6_trimmed53_len_filte

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1275
---------------------
Working on cell line: A549
Checking if score already computed: MEDIAN_NB_SCORE
Running median scores...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 445 out of 11934 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
A549_MV4_d3_1_S1_trimmed53_len_filtered_counts    318714.0
A549_MV4_d3_2_S2_trimmed53_len_filtered_counts    366817.0
dtype: float64
Normalize based on a specific value... 386171.0 counts
Normalization enabled...
Current counts:
A549_MV4_d28_1_S7_trimmed53_len_filtered_counts    405525.0
A549_MV4_d28_2_S8_trimmed53_len_filtered_counts    416768.0
dtype: float64
Normalize based on a specific value... 386171.0 counts
Normalization enabled...
Current counts:
A549_MV4_d3_1_S1_trimmed53_len_filtered_counts    6193814.0
A549_MV4_d3_2_S2_trimmed53_len_filtered_counts    6832663.0
dtype: float64
Normaliz

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-dual-IS'] = dual['FC'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-1'] = EC_1.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dual['Median-NB-single-IS-Guide-2'] = EC_2.values
A value is trying to be set on a copy of a slice from a Dat

Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1275
---------------------


In [34]:
for curr_study in available_studies:
    print('Working on study: ' + curr_study)

    # get study counts and seq
    study_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study]].copy()

    curr_seq_ids = np.array(sorted(list(set(study_counts['guide_1_id'].tolist() + study_counts['guide_2_id'].tolist()))))
    study_sequences = experiment_design.loc[curr_seq_ids]

    # the analysis runs for each individual cell line
    available_cell_lines = set(study_counts['cell_line_origin'])

    for curr_cl in available_cell_lines:
        print('Working on cell line: ' + curr_cl)
        curr_counts = study_counts.loc[study_counts['cell_line_origin'] == curr_cl].copy()

        # run sgRNA scores
        if not check_if_added_to_table(curr_counts.copy(), 'SGRA_DERIVED_NB_SCORE', SLKB_engine):
            sgRNA_res = run_sgrna_scores(curr_counts.copy(), full_normalization = False)
            add_table_to_db(curr_counts.copy(), sgRNA_res['SGRA_DERIVED_NB_SCORE'], 'SGRA_DERIVED_NB_SCORE', SLKB_engine)
            if sgRNA_res['SGRA_DERIVED_B_SCORE'] is not None:
                add_table_to_db(curr_counts.copy(), sgRNA_res['SGRA_DERIVED_B_SCORE'], 'SGRA_DERIVED_B_SCORE', SLKB_engine)
        else:
            print('Skipping sgRNA score...')
            
        print('---------------------')


Working on study: diehl_data
Working on cell line: RPE1
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 9757 out of 248191 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
ctrl_1    3434902.0
ctrl_2    2905388.0
dtype: float64
Normalize based on a specific value... 59882.0 counts
Normalization enabled...
Current counts:
rep_1    43897.0
rep_2    59882.0
rep_3    23578.0
dtype: float64
Normalize based on a specific value... 59882.0 counts
Normalization enabled...
Current counts:
ctrl_1    49753319.0
ctrl_2    41904226.0
dtype: float64
Normalize based on a specific value... 1621160.0 counts
Normalization enabled...
Current counts:
rep_1    1621160.0
rep_2    1589753.0
rep_3     550919.0
dtype: float64
Normalize based on a specific value... 1621160.0 counts
Normalization enabled...
Current counts:
ctrl_1    171179130.0
ctrl_2    14316

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 10720
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 10720
---------------------
Working on study: horlbeck_data
Working on cell line: JURKAT
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 128359 out of 1044484 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
JURKAT_tripleseq,T0,rep1    67829.0
JURKAT_tripleseq,T0,rep2    52320.0
dtype: float64
Normalize based on a specific value... 89585.5 counts
Normalization enabled...
Current counts:
JURKAT_tripleseq,cyc,rep1    111342.0
JURKAT_tripleseq,cyc,rep2    130002.0
dtype: float64
Normalize based on a specific value... 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 1
Filtered single sgRNA count: 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 110614
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 110614
---------------------
Working on cell line: K562
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 92141 out of 1044484 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
K562_barcode,T0,rep1     90567.0
K562_barcode,T0,rep2    105269.0
dtype: float64
Normalize based on a specific value... 339731.0 counts
Normalization enabled...
Current counts:
K562_tripleseq,cyc,rep1    627562.0
K562_tripleseq,cyc,rep2    574193.0
dtype: float64
Normalize based on a specific value... 339731.0 counts
Normalization enabled...
C

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 9


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 1
Filtered single sgRNA count: 9


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 9


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 110684
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 110684
---------------------
Working on study: najm_data
Working on cell line: HT29
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Removing NA replicate from TEnd...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 3609228.0 counts
Normalization enabled...
Current counts:
Rep_A_Reads    3009073.0
Rep_B_Reads    4006083.0
dtype: float64
Normalize based on a specific value... 3609228.0 counts
Normalization enabled...
Current

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: A549
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Removing NA replicate from TEnd...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 3933111.0 counts
Normalization enabled...
Current counts:
Rep_A_Reads    3933111.0
Rep_B_Reads    4567319.0
dtype: float64
Normalize based on a specific value... 3933111.0 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
d

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: OVCAR8
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 2499117.5 counts
Normalization enabled...
Current counts:
Rep_A_Reads    1540891.0
Rep_B_Reads    1515861.0
Rep_C_Reads    3457344.0
dtype: float64
Normalize based on a specific value... 2499117.5 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
dtype: fl

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: A375
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 3141784.5 counts
Normalization enabled...
Current counts:
Rep_A_Reads    3399698.0
Rep_B_Reads    2883871.0
Rep_C_Reads    2112357.0
dtype: float64
Normalize based on a specific value... 3141784.5 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
dtype: floa

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: 786O
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 4237644.0 counts
Normalization enabled...
Current counts:
Rep_A_Reads    6257808.0
Rep_B_Reads    4094431.0
Rep_C_Reads    4380857.0
dtype: float64
Normalize based on a specific value... 4237644.0 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.0
dtype: floa

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: MELJUSO
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Removing NA replicate from TEnd...
Filtering enabled... Condition: 35 counts
Filtered a total of 1 out of 9216 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
pDNA_Reads    3609228.0
dtype: float64
Normalize based on a specific value... 3643736.0 counts
Normalization enabled...
Current counts:
Rep_A_Reads    3643736.0
Rep_B_Reads    4225805.0
dtype: float64
Normalize based on a specific value... 3643736.0 counts
Normalization enabled...
Current counts:
pDNA_Reads    2407466.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on study: parrish_data
Working on cell line: HELA
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 0 out of 31833 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
plasmid_1    468996.67
plasmid_2    522803.80
plasmid_3    524885.28
dtype: float64
Normalize based on a specific value... 669420.108 counts
Normalization enabled...
Current counts:
LTP_1    813954.936
LTP_2    930812.115
LTP_3    936990.738
dtype: float64
Normalize based on a specific value... 669420.108 counts
Normalizatio

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 45


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 1
Filtered single sgRNA count: 45


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 45


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 2
Filtered single sgRNA count: 45


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 45


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------------------
Working on cell line: PC9
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 0 out of 32740 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
plasmid_1    422252.993
plasmid_2    410554.421
plasmid_3    378813.398
dtype: float64
Normalize based on a specific value... 445240.2355 counts
Normalization enabled...
Current counts:
LTP_1    502102.581
LTP_2    515471.323
LTP_3    468227.478
dtype: float64
Normalize based on a specific value... 445240.2355 counts
Normalization enabled...
Current coun

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 15


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 1
Filtered single sgRNA count: 15


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 15


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 2
Filtered single sgRNA count: 15


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 15


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------------------
Working on study: shantang_data
Working on cell line: 22RV1
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 8134 out of 48931 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
T0_1    4174896.0
T0_2    4200469.0
dtype: float64
Normalize based on a specific value... 4771163.0 counts
Normalization enabled...
Current counts:
T12_1    5341857.0
T12_2    6456429.0
dtype: float64
Normalize based on a specific value... 4771163.0 counts
Normalization enabled...
Current counts:
T0_1    877394.0
T0_2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 1
Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------------------
Working on study: wong_data
Working on cell line: OVCAR8
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 812 out of 23409 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
day5 (Replicate 1)    28372036.0
day5 (Replicate 2)    27787859.0
dtype: float64
Normalize based on a specific value... 28079947.5 counts
Normalization enabled...
Current counts:
day20 (Replicate 1)    32287131.0
day20 (Replicate 2)    23231108.0
dtype: float64
Normalize based on a specific value... 28079947.5 counts
Norm

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 1
Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single['FC'] = single['FC'] - EC_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Filtered single sgRNA count: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_B_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------------------
Working on study: zhao_data
Working on cell line: HELA
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 662 out of 11934 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
Hela_MV4_d3_1_S1_trimmed53_len_filtered_counts    263437.0
Hela_MV4_d3_2_S2_trimmed53_len_filtered_counts    240792.0
dtype: float64
Normalize based on a specific value... 279991.0 counts
Normalization enabled...
Current counts:
Hela_MV4_d28_1_S5_trimmed53_len_filtered_counts    311450.0
Hela_MV4_d28_2_S6_trimmed53_len_filt

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 1
Filtered single sgRNA count: 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1275
---------------------
Working on cell line: A549
Checking if score already computed: SGRA_DERIVED_NB_SCORE
Running sgrna derived score...
Getting raw counts...
Filtering enabled... Condition: 35 counts
Filtered a total of 445 out of 11934 sgRNAs.

---

Not full normalization...
Normalization enabled...
Current counts:
A549_MV4_d3_1_S1_trimmed53_len_filtered_counts    318714.0
A549_MV4_d3_2_S2_trimmed53_len_filtered_counts    366817.0
dtype: float64
Normalize based on a specific value... 386171.0 counts
Normalization enabled...
Current counts:
A549_MV4_d28_1_S7_trimmed53_len_filtered_counts    405525.0
A549_MV4_d28_2_S8_trimmed53_len_filtered_counts    416768.0
dtype: float64
Normalize based on a specific value... 386171.0 counts
Normalization enabled...
Current counts:
A549_MV4_d3_1_S1_trimmed53_len_filtered_counts    619381

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


calculating for replicate 1
Filtered single sgRNA count: 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'].isna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgRNA_level_scores['SE'].loc[sgRNA_level_scores['SE'] == 0] = 1


---------ADDING-TO-DB---------
Processing table for: SGRA_DERIVED_NB_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1275
---------------------


In [37]:
horlbeck_processed = False

In [38]:
for curr_study in available_studies:
    print('Working on study: ' + curr_study)

    # get study counts and seq
    study_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study]].copy()

    curr_seq_ids = np.array(sorted(list(set(study_counts['guide_1_id'].tolist() + study_counts['guide_2_id'].tolist()))))
    study_sequences = experiment_design.loc[curr_seq_ids]

    # the analysis runs for each individual cell line
    available_cell_lines = set(study_counts['cell_line_origin'])

    for curr_cl in available_cell_lines:
        print('Working on cell line: ' + curr_cl)
        curr_counts = study_counts.loc[study_counts['cell_line_origin'] == curr_cl].copy()
        
        # run horlbeck score
        if curr_study == 'horlbeck_data' and (not horlbeck_processed):
            # Horlbeck does additional filtering on JURKAT based on K562
            JURKAT_counts = study_counts.loc[study_counts['cell_line_origin'] == 'JURKAT'].copy()
            K562_counts = study_counts.loc[study_counts['cell_line_origin'] == 'K562'].copy()

            if (not check_if_added_to_table(JURKAT_counts.copy(), 'HORLBECK_SCORE', SLKB_engine)) and (not check_if_added_to_table(K562_counts.copy(), 'HORLBECK_SCORE', SLKB_engine)):

                # do the preprocessing
                JURKAT_counts = run_horlbeck_preprocessing(JURKAT_counts, filterThreshold = 35, pseudocount = 10)
                K562_counts = run_horlbeck_preprocessing(K562_counts, filterThreshold = 35, pseudocount = 10)

                singles_JURKAT = JURKAT_counts.loc[JURKAT_counts['target_type'] == 'Single'].copy().dropna()
                singles_K562 = K562_counts.loc[K562_counts['target_type'] == 'Single'].copy().dropna()

                a_average_K562 = singles_K562[singles_K562['sgRNA_target_name_g2'] == "CONTROL"]
                a_average_K562 = pd.DataFrame(a_average_K562.groupby('sgRNA_guide_name_g1')['FC_Averaged_abbaAveraged'].apply(np.mean))
                a_average_K562.columns = [a_average_K562.columns[0] + '_K562']

                a_average_JURKAT = singles_JURKAT[singles_JURKAT['sgRNA_target_name_g2'] == "CONTROL"]
                a_average_JURKAT = pd.DataFrame(a_average_JURKAT.groupby('sgRNA_guide_name_g1')['FC_Averaged_abbaAveraged'].apply(np.mean))
                a_average_JURKAT.columns = [a_average_JURKAT.columns[0] + '_JURKAT']

                a_average_JURKAT = a_average_JURKAT.merge(a_average_K562, how = 'left', left_index = True, right_index = True)

                filter_criteria = a_average_JURKAT.loc[((a_average_JURKAT['FC_Averaged_abbaAveraged_JURKAT'] > -0.025) & (a_average_JURKAT['FC_Averaged_abbaAveraged_K562'] < -0.05))]

                additionally_filtered = list(filter_criteria.index)

                to_be_dropped = JURKAT_counts['sgRNA_guide_name_g1'].isin(additionally_filtered) | JURKAT_counts['sgRNA_guide_name_g2'].isin(additionally_filtered)

                # drop them
                JURKAT_counts = JURKAT_counts.loc[~to_be_dropped, :]

                # now, proceed to run horlbeck score
                JURKAT_res = run_horlbeck_score(JURKAT_counts.copy(), do_preprocessing = False)
                K562_res = run_horlbeck_score(K562_counts.copy(), do_preprocessing = False)

                add_table_to_db(JURKAT_counts.copy(), JURKAT_res['HORLBECK_SCORE'], 'HORLBECK_SCORE', SLKB_engine)
                add_table_to_db(K562_counts.copy(), K562_res['HORLBECK_SCORE'], 'HORLBECK_SCORE', SLKB_engine)

                horlbeck_processed = True
            else:
                print('Skipping horlbeck score...horlbeck case')

        elif curr_study == 'diehl_data':
            if not check_if_added_to_table(curr_counts.copy(), 'HORLBECK_SCORE', SLKB_engine):
                # very low counts, set the threshold to be 0
                temp_counts = run_horlbeck_preprocessing(curr_counts.copy(), filterThreshold = 0, pseudocount = 10)
                horlbeck_res = run_horlbeck_score(temp_counts.copy(), do_preprocessing = False)
                add_table_to_db(curr_counts.copy(), horlbeck_res['HORLBECK_SCORE'], 'HORLBECK_SCORE', SLKB_engine)
            else:
                print('Skipping horlbeck score...')
        else:
            if curr_study != 'horlbeck_data':
                if not check_if_added_to_table(curr_counts.copy(), 'HORLBECK_SCORE', SLKB_engine):
                    horlbeck_res = run_horlbeck_score(curr_counts.copy(), do_preprocessing = True)
                    add_table_to_db(curr_counts.copy(), horlbeck_res['HORLBECK_SCORE'], 'HORLBECK_SCORE', SLKB_engine)
                else:
                    print('Skipping horlbeck score...')
                    
        print('---------------------')

Working on study: diehl_data
Working on cell line: RPE1
Checking if score already computed: HORLBECK_SCORE
Getting raw counts...
Mismatch times, averaging...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 0 sgRNAs were filtered out of 908
Running horlbeck score...
Running preprocessing...
Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 10720
---------------------
Working on study: horlbeck_data
Working on cell line: JURKAT
Checking if score already computed: HORLBECK_SCORE
Checking if score already computed: HORLBECK_SCORE
Getting raw counts...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 188 sgRNAs were filtered out of 1022
For replicate 2
Total of 172 sgRNAs were filtered out of 1022


  res = f(group)


Getting raw counts...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 149 sgRNAs were filtered out of 1022
For replicate 2
Total of 160 sgRNAs were filtered out of 1022


  res = f(group)


Running horlbeck score...
Running preprocessing...
Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Running horlbeck score...
Running preprocessing...
Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 75078
---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 100128
---------------------
Working on cell line: K562
---------------------
Working on study: najm_data
Working on cell line: HT29
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Removing NA replicate from TEnd...
Mismatch times, averaging...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 6 sgRNAs were filtered out of 192


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: A549
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Removing NA replicate from TEnd...
Mismatch times, averaging...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 1 sgRNAs were filtered out of 192


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: OVCAR8
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Mismatch times, averaging...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 2 sgRNAs were filtered out of 192


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: A375
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Mismatch times, averaging...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 2 sgRNAs were filtered out of 192


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: 786O
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Mismatch times, averaging...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 1 sgRNAs were filtered out of 192


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on cell line: MELJUSO
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Removing NA replicate from TEnd...
Mismatch times, averaging...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 1 sgRNAs were filtered out of 192


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 300
---------------------
Working on study: parrish_data
Working on cell line: HELA
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 23 sgRNAs were filtered out of 9168
For replicate 2
Total of 29 sgRNAs were filtered out of 9168
For replicate 3
Total of 35 sgRNAs were filtered out of 9168


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1028
---------------------
Working on cell line: PC9
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 1 sgRNAs were filtered out of 9190
For replicate 2
Total of 2 sgRNAs were filtered out of 9190
For replicate 3
Total of 6 sgRNAs were filtered out of 9190


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1030
---------------------
Working on study: shantang_data
Working on cell line: 22RV1
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 12 sgRNAs were filtered out of 222
For replicate 2
Total of 7 sgRNAs were filtered out of 222


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------------------
Working on study: wong_data
Working on cell line: OVCAR8
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 3 sgRNAs were filtered out of 153
For replicate 2
Total of 3 sgRNAs were filtered out of 153


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_results.drop(['Gene 1', 'Gene 2'], axis = 1, inplace = True, errors = 'ignore')


Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1225
---------------------
Working on study: zhao_data
Working on cell line: HELA
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 6 sgRNAs were filtered out of 156
For replicate 2
Total of 8 sgRNAs were filtered out of 156


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1275
---------------------
Working on cell line: A549
Checking if score already computed: HORLBECK_SCORE
Running horlbeck score...
Running preprocessing...
Getting raw counts...
Sorting gene pairs and guides based on ordering gene ordering...
For replicate 1
Total of 3 sgRNAs were filtered out of 156
For replicate 2
Total of 2 sgRNAs were filtered out of 156


  res = f(group)


Started scoring


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_counts['GI_Averaged'].iloc[i] = GI_Score_avg.loc[guide_1, guide_2]


---------ADDING-TO-DB---------
Processing table for: HORLBECK_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 1275
---------------------


In [39]:
for curr_study in available_studies:
    print('Working on study: ' + curr_study)

    # get study counts and seq
    study_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study]].copy()

    curr_seq_ids = np.array(sorted(list(set(study_counts['guide_1_id'].tolist() + study_counts['guide_2_id'].tolist()))))
    study_sequences = experiment_design.loc[curr_seq_ids]

    # the analysis runs for each individual cell line
    available_cell_lines = set(study_counts['cell_line_origin'])

    for curr_cl in available_cell_lines:
        print('Working on cell line: ' + curr_cl)
        curr_counts = study_counts.loc[study_counts['cell_line_origin'] == curr_cl].copy()

        # run MAGeCK
        if not check_if_added_to_table(curr_counts.copy(), 'MAGECK_SCORE', SLKB_engine):
            mageck_res = run_mageck_score(curr_counts.copy(), curr_study, curr_cl, save_dir = 'MAGECK_Files')
            add_table_to_db(curr_counts.copy(), mageck_res['MAGECK_SCORE'], 'MAGECK_SCORE', SLKB_engine)
        else:
            print('Skipping MAGeCK score...')
            
        print('---------------------')


Working on study: diehl_data
Working on cell line: RPE1
Checking if score already computed: MAGECK_SCORE
Running mageck score...
Getting raw counts...
Scores exist!
Loading computed results...
Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MAGECK_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 10720
---------------------
Working on study: horlbeck_data
Working on cell line: JURKAT
Checking if score already computed: MAGECK_SCORE
Running mageck score...
Getting raw counts...
Scores exist!
Loading computed results...
Filtered gene count: 0
---------ADDING-TO-DB---------
Processing table for: MAGECK_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 111156
---------------------
Working on cell line: K562
Checking if score already computed: MAGECK_SCORE
Running mageck score...
Getting raw counts...
Scores exist!
Loading computed results...
Filtered gene count: 0
---------ADDING-TO-

In [40]:
for curr_study in available_studies:
    print('Working on study: ' + curr_study)

    # get study counts and seq
    study_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study]].copy()

    curr_seq_ids = np.array(sorted(list(set(study_counts['guide_1_id'].tolist() + study_counts['guide_2_id'].tolist()))))
    study_sequences = experiment_design.loc[curr_seq_ids]

    # the analysis runs for each individual cell line
    available_cell_lines = set(study_counts['cell_line_origin'])

    for curr_cl in available_cell_lines:
        print('Working on cell line: ' + curr_cl)
        curr_counts = study_counts.loc[study_counts['cell_line_origin'] == curr_cl].copy()
        
        # run GEMINI
        if not check_if_added_to_table(curr_counts.copy(), 'GEMINI_SCORE', SLKB_engine):
            gemini_res = run_gemini_score(curr_counts.copy(), curr_study, curr_cl, save_dir = 'GEMINI_Files')
            add_table_to_db(curr_counts.copy(), gemini_res['GEMINI_SCORE'], 'GEMINI_SCORE', SLKB_engine)
        else:
            print('Skipping GEMINI score...')
            
        print('---------------------')


Working on study: diehl_data
Working on cell line: RPE1
Checking if score already computed: GEMINI_SCORE
Running gemini score...
Getting raw counts...
Scores exist!
---------ADDING-TO-DB---------
Processing table for: GEMINI_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 10720
---------------------
Working on study: horlbeck_data
Working on cell line: JURKAT
Checking if score already computed: GEMINI_SCORE
Running gemini score...
Getting raw counts...
Scores exist!
---------ADDING-TO-DB---------
Processing table for: GEMINI_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 111156
---------------------
Working on cell line: K562
Checking if score already computed: GEMINI_SCORE
Running gemini score...
Getting raw counts...
Scores exist!
---------ADDING-TO-DB---------
Processing table for: GEMINI_SCORE
Beginning transaction...
Successfully inserted!
Added Record stats...
Score insert: 111156
---------------