In [1]:
import numpy as np
import pandas as pd
import os
import math
from itertools import chain
import pickle
import sqlalchemy
from sqlalchemy.orm import sessionmaker
import stellargraph as sg
import tensorflow as tf
import networkx as nx
import keras
import json
from sklearn.metrics import f1_score
import multiprocessing
import shap
import copy

2023-02-09 01:24:29.926576: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-09 01:24:31.117080: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/cuda/9.0.176/lib64:/apps/cuda/9.0.176/lib
2023-02-09 01:24:31.117118: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-09 01:24:34.696126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvi

In [2]:
from sqlalchemy.engine import Engine
from sqlalchemy import event

@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Turn on SQLite foreign-key enforcement for each new DBAPI connection.

    Registered as a SQLAlchemy ``connect`` listener on ``Engine``.

    Parameters
    ----------
    dbapi_connection
        The raw DBAPI connection handed over by the listener.
    connection_record
        Supplied by the listener; not used here.
    """
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute("PRAGMA foreign_keys=ON")
    finally:
        # Release the cursor even when the PRAGMA statement raises.
        cursor.close()

## Create an SL Database with Wet-Lab Experiments

All SL data sources:

* Diehl
* Han
* Horlbeck (Done)
* Ito
* Laufer (Done)
* Parrish
* Shen
* Thompson
* Wong
* Zhao (Done)

In [3]:
# Locations of the input data.
# NOTE: these are absolute, user-specific paths; adjust them when running elsewhere.
data_locs = "/users/PAS1376/bg12/SyntheticLethality/SyntheticLethalityReview/Project/ml_inputs"
# This variable used to be set to os.path.join(data_locs, "learning_goals") and
# then immediately overwritten; only the value that actually takes effect is kept.
learning_goals_loc_general = '/users/PAS1376/bg12/SyntheticLethality - NewDB/data'


In [4]:
db_location =  "/users/PAS1376/bg12/SyntheticLethality - NewDB/Python_Clean/SLKB_sqlite3"

In [5]:
# read the database
SLKB_engine = sqlalchemy.create_engine('sqlite:///' + db_location)
#SLKB_engine_session = sessionmaker(bind=SLKB_engine)SLKB_sqlite3

In [6]:
db_metadata = sqlalchemy.MetaData(bind=SLKB_engine)
db_metadata.reflect(SLKB_engine)

In [7]:
db_metadata.tables

FacadeDict({'CDKO_EXPERIMENT_DESIGN': Table('CDKO_EXPERIMENT_DESIGN', MetaData(bind=Engine(sqlite:////users/PAS1376/bg12/SyntheticLethality - NewDB/Python_Clean/SLKB_sqlite3)), Column('sgRNA_id', INTEGER(), table=<CDKO_EXPERIMENT_DESIGN>, primary_key=True), Column('sgRNA_guide_name', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('sgRNA_guide_seq', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('sgRNA_target_name', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), Column('study_origin', TEXT(), table=<CDKO_EXPERIMENT_DESIGN>, nullable=False), schema=None), 'CDKO_ORIGINAL_SL_RESULTS': Table('CDKO_ORIGINAL_SL_RESULTS', MetaData(bind=Engine(sqlite:////users/PAS1376/bg12/SyntheticLethality - NewDB/Python_Clean/SLKB_sqlite3)), Column('id', INTEGER(), table=<CDKO_ORIGINAL_SL_RESULTS>, primary_key=True), Column('gene_pair_id', INTEGER(), table=<CDKO_ORIGINAL_SL_RESULTS>), Column('gene_pair', TEXT(), table=<CDKO_ORIGINAL_SL_RESULTS>, nullable=False), Co

In [8]:
## store pubmed IDs
# Study dataset name -> PubMed ID (kept as a string), in the original insertion order.
study_name_to_pubmed_id = {
    'diehl_data': '33956155',
    'han_data': '28319085',
    'horlbeck_data': '30033366',
    'ito_data': '34857952',
    'parrish_data': '34469736',
    'shen_data': '28319113',
    'thompson_data': '33637726',
    'wong_data': '26864203',
    'zhao_data': '29452643',
    'shantang_data': '36060092',
    'najm_data': '29251726',
}



In [9]:
# Inverse lookup: PubMed ID -> study dataset name.
rev_study_name_to_pubmed_id = {
    pubmed_id: study_name
    for study_name, pubmed_id in study_name_to_pubmed_id.items()
}

In [10]:
raw_scores = pd.read_csv('SLKB_original_raw_reported_SL.csv', index_col = 0)
raw_scores['study_origin'] = raw_scores['study_origin'].astype(str)
raw_scores['SL_or_not'] = raw_scores['SL_or_not'].astype(str)

In [11]:
all_results_tables = ['HORLBECK_SCORE', 
                      'MAGECK_SCORE', 
                      'MEDIAN_NB_SCORE', 
                      #'MEDIAN_NB_SCORE_FULL_NORM', 
                      'MEDIAN_B_SCORE', 
                      #'MEDIAN_B_SCORE_FULL_NORM',
                      'SGRA_DERIVED_NB_SCORE', 
                      #'SGRA_DERIVED_NB_SCORE_FULL_NORM', 
                      'SGRA_DERIVED_B_SCORE', 
                      #'SGRA_DERIVED_B_SCORE_FULL_NORM', 
                      'GEMINI_SCORE']

In [12]:
# read the data
# Loads three tables from the SLKB database into DataFrames:
# the sgRNA library, the per-sgRNA-pair counts and the originally reported scores.

# experiment design
experiment_design = pd.read_sql_table('CDKO_EXPERIMENT_DESIGN', SLKB_engine, index_col = 'sgRNA_id')
# study_origin is dropped here so it does not collide with the counts' own column when merged later
experiment_design.drop(['study_origin'], axis = 1, inplace = True)
# NOTE(review): reset_index(drop = True) discards the sgRNA_id values read from the
# database and replaces them with 0-based positions; the later merges on
# guide_1_id / guide_2_id therefore match against positions — confirm that the
# guide ids stored in the counts table are positional.
experiment_design.reset_index(drop = True, inplace = True)
experiment_design.index.rename('sgRNA_id', inplace = True)

# counts
counts = pd.read_sql_table('CDKO_SGRNA_COUNTS', SLKB_engine, index_col = 'sgRNA_pair_id')
# same treatment as above: the stored sgRNA_pair_id is replaced by a 0-based position
counts.reset_index(drop = True, inplace = True)
counts.index.rename('sgRNA_pair_id', inplace = True)

# scores
# indexed by gene_pair_id so it can be joined to counts via right_index
scores = pd.read_sql_table('CDKO_ORIGINAL_SL_RESULTS', SLKB_engine, index_col = 'gene_pair_id')

In [13]:
# join the tables together
# 1) attach the originally reported score of each gene pair (default suffixes, so
#    columns present on both sides come out as *_x / *_y)
counts = counts.merge(scores, how = 'left', left_on = 'gene_pair_id', right_index = True)
# 2) attach the guide annotation for the first guide; nothing overlaps yet, so
#    these columns arrive without a suffix
counts = counts.merge(experiment_design, how = 'left', left_on = 'guide_1_id', right_index = True, suffixes = ('', '_g1'))
# 3) attach the guide annotation for the second guide; the overlapping names from
#    step 2 stay unsuffixed and the new ones receive '_g2'
counts = counts.merge(experiment_design, how = 'left', left_on = 'guide_2_id', right_index = True, suffixes = ('', '_g2'))
# rename
# gives the unsuffixed first-guide columns their '_g1' names and restores the
# counts-side study / cell-line columns from their '_x' names
counts = counts.rename({'sgRNA_guide_name': 'sgRNA_guide_name_g1',
                        'sgRNA_guide_seq': 'sgRNA_guide_seq_g1',
                        'sgRNA_target_name': 'sgRNA_target_name_g1',
                        'study_origin_x': 'study_origin',
                        'cell_line_origin_x': 'cell_line_origin'}, axis = 1)

# re-read the library so that study_origin and the database sgRNA_id index are available again
# NOTE(review): this overwrites the 0-based version used for the merges above — confirm later
# lookups by guide id expect the database index.
experiment_design = pd.read_sql_table('CDKO_EXPERIMENT_DESIGN', SLKB_engine, index_col = 'sgRNA_id')

In [14]:
def query_result_table(curr_counts, table_name, curr_study, curr_cl, engine_link):
    """Read one score table and attach it to the dual-target gene pairs in ``curr_counts``.

    Parameters
    ----------
    curr_counts : pandas.DataFrame
        Count rows providing the columns ``target_type``, ``gene_pair_id``,
        ``sgRNA_target_name_g1`` and ``sgRNA_target_name_g2``. The frame is
        only read; it is not modified.
    table_name : str
        Table to read through ``pd.read_sql_table``. It is also used as the
        prefix of every score column in the result.
    curr_study, curr_cl
        Values written to the ``study_origin`` and ``cell_line_origin`` columns.
    engine_link
        Connection / engine handed to ``pd.read_sql_table``.

    Returns
    -------
    pandas.DataFrame
        One row per matched ``gene_pair_id``, indexed by ``gene_pair`` (the two
        target names sorted and joined with ``|``), with the table's columns
        prefixed by ``table_name + '_'`` plus the two annotation columns.
    """

    print('Accessing table: ' + table_name)

    # get available results
    res = pd.read_sql_table(table_name, engine_link, index_col = 'id')

    # Keep one row per gene_pair_id among the dual-target rows *before* building
    # the pair label, so the label is computed only for the rows actually used.
    dual_pairs = curr_counts.loc[curr_counts['target_type'] == 'Dual',
                                 ['sgRNA_target_name_g1', 'sgRNA_target_name_g2', 'gene_pair_id']].drop_duplicates(subset = ['gene_pair_id'])

    # possible gene pairs (order-independent label); built on a fresh frame so the
    # caller's curr_counts is left untouched
    query_res = dual_pairs[['gene_pair_id']].copy()
    query_res.insert(0, 'gene_pair',
                     ['|'.join(sorted(pair)) for pair in zip(dual_pairs['sgRNA_target_name_g1'],
                                                             dual_pairs['sgRNA_target_name_g2'])])

    # get results
    query_res = query_res.merge(res, left_on = 'gene_pair_id', right_on = 'gene_pair_id').drop('gene_pair_id', axis = 1)

    # add column names to the front (everything except the leading gene_pair column)
    names_dict = {i: table_name + '_' + i for i in query_res.columns[1:]}
    query_res.rename(columns = names_dict, inplace = True)

    print('Available gene pairs: ' + str(query_res.shape[0]))

    # add name of study
    query_res['study_origin'] = curr_study

    # add name of cell line
    query_res['cell_line_origin'] = curr_cl

    # set the gene pair as index
    query_res.index = query_res['gene_pair']

    return(query_res)

In [26]:
# Study names (sorted) for the PubMed IDs that actually occur in the counts table.
# The earlier plain-set assignment was overwritten immediately and has been dropped.
available_studies = sorted(rev_study_name_to_pubmed_id[i] for i in set(counts['study_origin']))

In [27]:
available_studies

['diehl_data',
 'horlbeck_data',
 'najm_data',
 'parrish_data',
 'shantang_data',
 'wong_data',
 'zhao_data']

In [28]:
# Collect every calculated score table for every (study, cell line) combination
# into a single wide DataFrame, one row per gene pair per cell line.
all_scores = []
for curr_study in available_studies:
    print('Working on study: ' + curr_study)

    # get study counts and seq
    study_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study]].copy()

    # NOTE(review): curr_seq_ids / study_sequences are not used anywhere else in this cell.
    # NOTE(review): experiment_design was last re-read with the database sgRNA_id index,
    # whereas the merge into counts used a 0-based reset index — confirm this .loc
    # selects the intended guides.
    curr_seq_ids = np.array(sorted(list(set(study_counts['guide_1_id'].tolist() + study_counts['guide_2_id'].tolist()))))
    study_sequences = experiment_design.loc[curr_seq_ids]

    # the analysis runs for each individual cell line
    available_cell_lines = set(study_counts['cell_line_origin'])


    for curr_cl in available_cell_lines:
        # store results here
        study_scores = []
    
        print('Working on cell line: ' + curr_cl)
        curr_counts = study_counts.loc[study_counts['cell_line_origin'] == curr_cl].copy()
        
        # one frame per score table, each indexed by gene_pair
        for table_name in all_results_tables:
            study_scores.append(query_result_table(curr_counts.copy(), table_name, curr_study, curr_cl, SLKB_engine))
    
        # remove duplicate annotation columns
        # (concat along axis 1 aligns the frames on the gene_pair index; pairs absent
        # from a table get NaN in that table's columns)
        study_scores = pd.concat(study_scores, axis = 1, ignore_index = False)
        study_scores = study_scores.loc[:,~study_scores.columns.duplicated(keep = 'last')].copy()
        
        # make sure the annotations are all filled
        # (the surviving annotation columns come from the last table and can hold NaN
        # for pairs that table lacks; study_origin is stored as the PubMed ID)
        study_scores['gene_pair'] = study_scores.index
        study_scores['study_origin'] = study_name_to_pubmed_id[curr_study]
        study_scores['cell_line_origin'] = curr_cl
        
        # reset the index, gene_pair -> id
        study_scores.reset_index(drop = True, inplace = True)

        # add to big table 
        all_scores.append(study_scores)
    
    print('-----')
    
print('Done getting all data!')
    
# combine the scores at the end
all_scores = pd.concat(all_scores, axis = 0, ignore_index = True)

# add individual genes
# (gene_pair is 'A|B', as built by query_result_table)
all_scores['gene_1'] = [i.split('|')[0] for i in all_scores['gene_pair']]
all_scores['gene_2'] = [i.split('|')[1] for i in all_scores['gene_pair']]

# sort such that all annotations are at the front
all_columns = sorted(list(all_scores.columns))
annotation_columns = ['gene_pair', 'gene_1', 'gene_2', 'study_origin', 'cell_line_origin']

# get the final scores
# (annotation columns first, then the remaining score columns alphabetically)
all_scores = all_scores.loc[:, annotation_columns + [i for i in all_columns if i not in annotation_columns]]

Working on study: diehl_data
Working on cell line: RPE1
Accessing table: HORLBECK_SCORE
Available gene pairs: 10720
Accessing table: MAGECK_SCORE
Available gene pairs: 10720
Accessing table: MEDIAN_NB_SCORE
Available gene pairs: 10720
Accessing table: MEDIAN_B_SCORE
Available gene pairs: 10720
Accessing table: SGRA_DERIVED_NB_SCORE
Available gene pairs: 10720
Accessing table: SGRA_DERIVED_B_SCORE
Available gene pairs: 10720
Accessing table: GEMINI_SCORE
Available gene pairs: 10720
-----
Working on study: horlbeck_data
Working on cell line: K562
Accessing table: HORLBECK_SCORE
Available gene pairs: 100128
Accessing table: MAGECK_SCORE
Available gene pairs: 111156
Accessing table: MEDIAN_NB_SCORE
Available gene pairs: 110684
Accessing table: MEDIAN_B_SCORE
Available gene pairs: 110684
Accessing table: SGRA_DERIVED_NB_SCORE
Available gene pairs: 110684
Accessing table: SGRA_DERIVED_B_SCORE
Available gene pairs: 110684
Accessing table: GEMINI_SCORE
Available gene pairs: 111156
Working on c

In [29]:
all_scores.to_csv('SLKB_calculated_scores.csv')

In [30]:
#curr_counts['target_type'].value_counts()

Dual      11475
Single      459
Name: target_type, dtype: int64

In [81]:
available_genes = set(curr_counts['sgRNA_target_name_g1'].tolist()).union(set(curr_counts['sgRNA_target_name_g2'].tolist()))

In [38]:
def get_stats_for_study(curr_sequences, curr_counts, curr_calculated_scores, curr_original_scores, curr_raw_scores):
    """Print a five-section summary of what is available for one study.

    Each argument is a DataFrame already filtered to the study of interest; an
    empty frame produces a "No ... information" line for its section.

    Parameters
    ----------
    curr_sequences
        sgRNA library rows; reads ``sgRNA_target_name``.
    curr_counts
        Count rows; reads ``target_type``, ``sgRNA_target_name_g1`` and
        ``sgRNA_target_name_g2``.
    curr_calculated_scores
        Calculated scores; reads ``gene_pair``, ``gene_1`` and ``gene_2``.
    curr_original_scores
        Processed original scores; additionally reads ``SL_or_not``
        (compared against the strings "1" and "0").
    curr_raw_scores
        Raw reported scores; additionally reads ``SL_or_not``
        (compared against "SL" and "Not SL").

    Returns
    -------
    None
        Everything is written to stdout.
    """
    divider = '++++++'

    def genes_in(frame, left_col, right_col):
        # Distinct values found in either of the two columns.
        return set(frame[left_col].tolist()).union(set(frame[right_col].tolist()))

    # ---- sgRNA library ----
    print('sgRNA information:')
    if len(curr_sequences) > 0:
        guide_targets = curr_sequences['sgRNA_target_name']
        print('Total available sgRNAs: ' + str(len(curr_sequences)))
        print('sgRNAs that target controls: ' + str((guide_targets == 'CONTROL').sum()))
        print('sgRNAs that target NOT controls: ' + str((guide_targets != 'CONTROL').sum()))
    else:
        print('No sgRNA information in SLKB')

    # ---- counts ----
    print(divider)
    print('Counts information:')
    if len(curr_counts) > 0:
        kind = curr_counts['target_type']
        print('Total available sgRNAs counts ' + str(len(curr_counts)))
        print('sgRNA counts that target controls: ' + str((kind == 'Control').sum()))
        print('sgRNA counts that target single genes: ' + str((kind == 'Single').sum()))
        print('sgRNA counts that target dual genes: ' + str((kind == 'Dual').sum()))
        counted_genes = genes_in(curr_counts, 'sgRNA_target_name_g1', 'sgRNA_target_name_g2')
        # the control placeholder is not a gene
        counted_genes.discard('CONTROL')
        print('Total genes with counts ' + str(len(counted_genes)))
    else:
        print('No counts information in SLKB')

    # ---- calculated scores ----
    print(divider)
    print('Calculated scores information:')
    if len(curr_calculated_scores) > 0:
        print('Total calculated SL scores ' + str(len(curr_calculated_scores)))
        print('Total unique number of gene pairs ' + str(len(set(curr_calculated_scores['gene_pair']))))
        print('Total genes with SL scores ' + str(len(genes_in(curr_calculated_scores, 'gene_1', 'gene_2'))))
    else:
        print('No calculated scores information in SLKB')

    # ---- processed original scores ----
    print(divider)
    print('Original SL scores information:')
    if len(curr_original_scores) > 0:
        original_label = curr_original_scores['SL_or_not']
        print('Total calculated SL scores ' + str(len(curr_original_scores)))
        print('Total unique number of gene pairs ' + str(len(set(curr_original_scores['gene_pair']))))
        print('Total genes with SL scores ' + str(len(genes_in(curr_original_scores, 'gene_1', 'gene_2'))))
        print('Total SL: ' + str((original_label == "1").sum()))
        print('Total Not SL: ' + str((original_label == "0").sum()))
    else:
        print('No processed original scores information in SLKB (processed)')

    # ---- raw reported scores ----
    print(divider)
    print('Raw SL scores information:')
    if len(curr_raw_scores) > 0:
        raw_label = curr_raw_scores['SL_or_not']
        # rows whose two genes differ
        two_gene_rows = curr_raw_scores['gene_1'] != curr_raw_scores['gene_2']
        print('Total raw SL scores ' + str(len(curr_raw_scores)))
        print('Total raw SL scores (Only Dual) ' + str(two_gene_rows.sum()))
        print('Total unique number of gene pairs ' + str(len(set(curr_raw_scores['gene_pair']))))
        print('Total genes with SL scores ' + str(len(genes_in(curr_raw_scores, 'gene_1', 'gene_2'))))
        print('Total SL: ' + str((raw_label == "SL").sum()))
        print('Total Not SL: ' + str((raw_label == "Not SL").sum()))
    else:
        print('No raw scores information in SLKB (processed)')

    print('\n\n---------------\n\n')

In [40]:
study_name_to_pubmed_id

{'diehl_data': '33956155',
 'han_data': '28319085',
 'horlbeck_data': '30033366',
 'ito_data': '34857952',
 'parrish_data': '34469736',
 'shen_data': '28319113',
 'thompson_data': '33637726',
 'wong_data': '26864203',
 'zhao_data': '29452643',
 'shantang_data': '36060092',
 'najm_data': '29251726'}

In [41]:
for curr_study in study_name_to_pubmed_id.keys():
    print('Working on study: ' + curr_study)
    get_stats_for_study(#curr_study = study_name_to_pubmed_id[curr_study], 
                        curr_sequences = experiment_design.loc[experiment_design['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy(), 
                        curr_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy(),
                        curr_calculated_scores = all_scores.loc[all_scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy(), 
                        curr_original_scores = scores.loc[scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy(),
                        curr_raw_scores = raw_scores.loc[raw_scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy())

Working on study: diehl_data
sgRNA information:
Total available sgRNAs: 1176
sgRNAs that target controls: 188
sgRNAs that target NOT controls: 988
++++++
Counts information:
Total available sgRNAs counts 323551
sgRNA counts that target controls: 2187
sgRNA counts that target single genes: 43236
sgRNA counts that target dual genes: 278128
Total genes with counts 200
++++++
Calculated scores information:
Total calculated SL scores 10720
Total unique number of gene pairs 10720
Total genes with SL scores 200
++++++
Original SL scores information:
Total calculated SL scores 10720
Total unique number of gene pairs 10720
Total genes with SL scores 200
Total SL: 6776
Total Not SL: 3944
++++++
Raw SL scores information:
Total raw SL scores 12736
Total raw SL scores (Only Dual) 12673
Total unique number of gene pairs 10783
Total genes with SL scores 200
Total SL: 7810
Total Not SL: 4926


---------------


Working on study: han_data
sgRNA information:
No sgRNA information in SLKB
++++++
Counts i

In [134]:
curr_raw_scores

Unnamed: 0,gene_1,gene_2,study_origin,cell_line_origin,SL_score,SL_score_cutoff,statistical_score,statistical_score_cutoff,gene_pair,SL_or_not
284131,ALDOA,ALDOB,29452643,A549,0.169437,-3.0,0.761518,0.0,ALDOA_ALDOB,Not SL
284132,ALDOA,ALDOC,29452643,A549,-2.418505,-3.0,0.205590,0.0,ALDOA_ALDOC,Not SL
284133,ALDOA,DLAT,29452643,A549,-2.947731,-3.0,0.178348,0.0,ALDOA_DLAT,Not SL
284134,ALDOA,DLD,29452643,A549,-2.719313,-3.0,0.184315,0.0,ALDOA_DLD,Not SL
284135,ALDOA,ENO1,29452643,A549,-2.028083,-3.0,0.251583,0.0,ALDOA_ENO1,Not SL
...,...,...,...,...,...,...,...,...,...,...
286676,RPIA,TKT,29452643,HELA,-1.324924,-3.0,0.402030,0.0,RPIA_TKT,Not SL
286677,RPIA,TPI1,29452643,HELA,-0.918003,-3.0,0.479355,0.0,RPIA_TPI1,Not SL
286678,TALDO1,TKT,29452643,HELA,-0.178284,-3.0,0.689172,0.0,TALDO1_TKT,Not SL
286679,TALDO1,TPI1,29452643,HELA,-1.373376,-3.0,0.401973,0.0,TALDO1_TPI1,Not SL


In [135]:
#curr_raw_scores
(curr_raw_scores['statistical_score_cutoff'].iloc[0] != 0) and (curr_raw_scores['SL_score_cutoff'].iloc[0] != 0)

False

In [137]:
curr_raw_scores['statistical_score_cutoff'].iloc[0] != 0

False

In [138]:
(curr_raw_scores['SL_score_cutoff'].iloc[0] != 0)

True

In [93]:
curr_raw_scores = raw_scores.loc[raw_scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy()

In [96]:
(curr_raw_scores['gene_1'] != curr_raw_scores['gene_2']).sum()

284131    False
284132    False
284133    False
284134    False
284135    False
          ...  
286676    False
286677    False
286678    False
286679    False
286680    False
Length: 2550, dtype: bool

In [45]:
study_name_to_pubmed_id[curr_study]

'29452643'

In [37]:
for curr_study in available_studies:
    print('Working on study: ' + curr_study)

Working on study: diehl_data
Working on study: horlbeck_data
Working on study: najm_data
Working on study: parrish_data
Working on study: shantang_data
Working on study: wong_data
Working on study: zhao_data


In [72]:
curr_sequences = experiment_design.loc[experiment_design['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy()
curr_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy()
curr_calculated_scores = all_scores.loc[all_scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy()
curr_original_scores = scores.loc[scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy()
curr_raw_scores = raw_scores.loc[raw_scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy()

In [75]:
(curr_sequences['sgRNA_target_name'] == 'CONTROL').sum()

3

In [None]:
get_stats_for_study(#curr_study = study_name_to_pubmed_id[curr_study], 
                    curr_sequences = experiment_design.loc[experiment_design['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy(), 
                    curr_counts = counts.loc[counts['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy(),
                    curr_calculated_scores = all_scores.loc[all_scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy(), 
                    curr_original_scores = scores.loc[scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy(),
                    curr_raw_scores = raw_scores.loc[raw_scores['study_origin'] == study_name_to_pubmed_id[curr_study], :].copy())

In [44]:
scores

Unnamed: 0_level_0,id,gene_pair,study_origin,cell_line_origin,gene_1,gene_2,SL_or_not,SL_score,statistical_score,SL_score_cutoff,statistical_score_cutoff
gene_pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,0,AKT1_AMBRA1,33956155,RPE1,AKT1,AMBRA1,1,-0.010982,0.0,-1.00,0.0
91.0,1,AKT3_AMBRA1,33956155,RPE1,AKT3,AMBRA1,1,2.159344,0.0,-1.00,0.0
183.0,2,AMBRA1_ARF6,33956155,RPE1,ARF6,AMBRA1,1,-0.564699,0.0,-1.00,0.0
184.0,3,AMBRA1_ATF4,33956155,RPE1,ATF4,AMBRA1,1,0.999030,0.0,-1.00,0.0
185.0,4,AMBRA1_ATG10,33956155,RPE1,ATG10,AMBRA1,1,3.916281,0.0,-1.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...
303415.0,279291,UBB_UBC,29251726,HT29,UBB,UBC,1,1.000000,0.0,0.01,0.0
303413.0,279292,UBB_UBC,29251726,A375,UBB,UBC,1,0.938604,0.0,0.01,0.0
303412.0,279293,UBB_UBC,29251726,786O,UBB,UBC,1,1.000000,0.0,0.01,0.0
303417.0,279294,UBB_UBC,29251726,OVCAR8,UBB,UBC,1,0.967425,0.0,0.01,0.0


In [41]:
all_scores

Unnamed: 0,gene_pair,gene_1,gene_2,study_origin,cell_line_origin,GEMINI_SCORE_SL_score,HORLBECK_SCORE_SL_score,HORLBECK_SCORE_standard_error,MAGECK_SCORE_SL_score,MAGECK_SCORE_Z_SL_score,MAGECK_SCORE_standard_error,MEDIAN_B_SCORE_SL_score,MEDIAN_B_SCORE_Z_SL_score,MEDIAN_B_SCORE_standard_error,MEDIAN_NB_SCORE_SL_score,MEDIAN_NB_SCORE_Z_SL_score,MEDIAN_NB_SCORE_standard_error,SGRA_DERIVED_B_SCORE_SL_score,SGRA_DERIVED_NB_SCORE_SL_score
0,AKT1|AMBRA1,AKT1,AMBRA1,33956155,RPE1,-1.457454,-0.178004,0.155417,1.01035,1.683856,0.600022,0.780423,2.614573,0.298490,1.035966,3.470691,0.298490,2.981263,2.813702
1,AKT1|ATG10,AKT1,ATG10,33956155,RPE1,-4.339744,-0.215414,0.083963,1.25735,4.267072,0.294663,0.446131,2.169452,0.205642,0.701674,3.412108,0.205642,2.859691,2.714971
2,AKT1|ATG101,AKT1,ATG101,33956155,RPE1,-2.912154,0.141539,0.160130,1.58630,2.126466,0.745980,1.240603,2.667042,0.465161,1.496145,3.216406,0.465161,3.582877,3.462986
3,AKT1|ATG12,AKT1,ATG12,33956155,RPE1,-3.021675,-0.168705,0.176245,1.66625,2.028864,0.821273,0.899307,1.909342,0.471003,1.154849,2.451892,0.471003,2.169773,2.025237
4,AKT1|ATG13,AKT1,ATG13,33956155,RPE1,-2.216132,0.193127,0.220762,1.65970,3.181299,0.521705,0.826477,2.378947,0.347413,1.082020,3.114506,0.347413,1.502438,1.395536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241887,RPIA|TKT,RPIA,TKT,29452643,HELA,-0.134070,-0.074928,0.031160,-1.46593,-2.293561,0.639150,,,,-0.311912,-0.526281,0.592671,,-0.141378
241888,RPIA|TPI1,RPIA,TPI1,29452643,HELA,-0.448128,-0.028457,0.025095,-0.89331,-1.316672,0.678460,,,,0.318737,0.612123,0.520708,,0.716199
241889,TALDO1|TPI1,TALDO1,TPI1,29452643,HELA,-0.366839,-0.067253,0.017028,-1.21965,-1.482102,0.822919,,,,0.195485,0.343784,0.568628,,-0.404442
241890,TALDO1|TKT,TALDO1,TKT,29452643,HELA,-0.404770,-0.010693,0.046912,-1.32243,-1.870724,0.706908,,,,0.260096,0.410191,0.634084,,0.634473


In [38]:
print('Accessing stats for ' + curr_study)

Accessing stats for zhao_data


In [35]:
experiment_design

Unnamed: 0_level_0,sgRNA_guide_name,sgRNA_guide_seq,sgRNA_target_name,study_origin
sgRNA_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,H7SK-1N,CCAGCATAGCTCTTAAACCAGTGATGATNACTGAGGCTCAGTTAGA...,H7SK,33956155
2,H7SK-2N,CCAGCATAGCTCTTAAACCAGTGATGATNNCTGAGGCTCAGTTAGA...,H7SK,33956155
3,H7SK-3N,CCAGCATAGCTCTTAAACCAGTGATGANNNCTGAGGCTCAGTTAGA...,H7SK,33956155
4,H7SK-4N,CCAGCATAGCTCTTAAACCAGTGATGANNNNTGAGGCTCAGTTAGA...,H7SK,33956155
5,HU6-1N,GCTATTTCTAGCTCTAAAACCAGTGATGATNACTGAGGCTCGTTTC...,HU6,33956155
...,...,...,...,...
12144,AKT1_6,TGTCATGGAGTACGCCAACG,AKT1,29251726
12145,BCL2_6,TGTCGCAGAGGGGCTACGAG,BCL2,29251726
12146,BCL2L10_6,TGTTGCTGGCCGACTACCTG,BCL2L10,29251726
12147,UBB_5,TGTTGTAGTCAGAAAGAGTA,UBB,29251726


In [36]:
counts

Unnamed: 0_level_0,guide_1_id,guide_2_id,gene_pair_id,gene_pair_orientation,T0_counts,T0_replicate_names,TEnd_counts,TEnd_replicate_names,target_type,study_origin,...,SL_score,statistical_score,SL_score_cutoff,statistical_score_cutoff,sgRNA_guide_name_g1,sgRNA_guide_seq_g1,sgRNA_target_name_g1,sgRNA_guide_name_g2,sgRNA_guide_seq_g2,sgRNA_target_name_g2
sgRNA_pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,268,276,0,A_B,906.0;869.0,ctrl_1;ctrl_2,6.0;195.0;0.0,rep_1;rep_2;rep_3,Dual,33956155,...,-0.010982,0.0,-1.00,0.0,AKT1-KO-1-R,ACGTGAGGCTCCCCTCAACA,AKT1,AMBRA1-KO-1-R,TTCTAGGTATCACCGAGAAA,AMBRA1
1,268,277,0,A_B,1140.0;1069.0,ctrl_1;ctrl_2,3.0;5.0;0.0,rep_1;rep_2;rep_3,Dual,33956155,...,-0.010982,0.0,-1.00,0.0,AKT1-KO-1-R,ACGTGAGGCTCCCCTCAACA,AKT1,AMBRA1-KO-2-R,TGAGAGATACTGGATCATCC,AMBRA1
2,268,278,0,A_B,703.0;583.0,ctrl_1;ctrl_2,14.0;13.0;19.0,rep_1;rep_2;rep_3,Dual,33956155,...,-0.010982,0.0,-1.00,0.0,AKT1-KO-1-R,ACGTGAGGCTCCCCTCAACA,AKT1,AMBRA1-KO-3-R,CTTGGCAGGTCCCCAGCTCC,AMBRA1
3,268,279,0,A_B,698.0;656.0,ctrl_1;ctrl_2,25.0;19.0;21.0,rep_1;rep_2;rep_3,Dual,33956155,...,-0.010982,0.0,-1.00,0.0,AKT1-KO-1-R,ACGTGAGGCTCCCCTCAACA,AKT1,AMBRA1-KO-4-R,CCGTAATATAGATATTATGG,AMBRA1
4,268,288,1,A_B,721.0;660.0,ctrl_1;ctrl_2,161.0;75.0;21.0,rep_1;rep_2;rep_3,Dual,33956155,...,2.470796,0.0,-1.00,0.0,AKT1-KO-1-R,ACGTGAGGCTCCCCTCAACA,AKT1,ATG10-KO-1-R,ACGTTATTGTGCAGAATTCA,ATG10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553231,12147,12048,303393,B_A,431.0,pDNA_Reads,140.0;93.0;291.0,Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,Dual,29251726,...,0.404221,0.0,0.01,0.0,UBB_6,TTCTGACTACAACATCCAGA,UBB,PARP2_3,TTGCTTTCATTTAATGCTACA,PARP2
2553232,12147,12049,303033,B_A,399.0,pDNA_Reads,185.0;270.0;418.0,Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,Dual,29251726,...,0.566849,0.0,0.01,0.0,UBB_6,TTCTGACTACAACATCCAGA,UBB,IMPDH1_3,TTGGGACGAGTCCTGTGAGAA,IMPDH1
2553233,12147,12050,302883,B_A,496.0,pDNA_Reads,215.0;208.0;337.0,Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,Single,29251726,...,,,,,UBB_6,TTCTGACTACAACATCCAGA,UBB,HPRT INTRON_6,TTTAGGAATTGCTGTTGGGAC,CONTROL
2553234,12147,12051,301623,B_A,548.0,pDNA_Reads,178.0;186.0;486.0,Rep_A_Reads;Rep_B_Reads;Rep_C_Reads,Dual,29251726,...,0.192682,0.0,0.01,0.0,UBB_6,TTCTGACTACAACATCCAGA,UBB,BCL2A1_3,TTTCCATCACTTGGTTGAATA,BCL2A1


In [None]:
print('Accessing stats for ' + curr_study)


In [18]:
SLKB_raw_score = pd.read_csv('SLKB_original_raw_reported_SL.csv')
SLKB_raw_score = SLKB_raw_score.loc[SLKB_raw_score['SL_or_not'] == 'SL',:]

In [19]:
def get_n_hop_interactions(curr_scores, n = 2):
    """Build direct and n-hop interaction partner sets for every gene in ``curr_scores``.

    Parameters
    ----------
    curr_scores : pandas.DataFrame
        Rows of interacting pairs; the columns ``gene_1`` and ``gene_2`` are read.
    n : int, default 2
        Number of hops. ``n - 1`` expansion rounds are applied on top of the
        direct partners, each round adding the direct partners of everything
        reached so far.

    Returns
    -------
    tuple(dict, dict)
        ``(first_target_list, n_hop_targets)``; both map gene -> set of genes.
        ``first_target_list`` holds the direct partners exactly as listed in
        the rows. ``n_hop_targets`` holds everything reached within ``n`` hops,
        with the gene itself excluded. For ``n <= 1`` the second dict is a
        separate copy of the direct partners (previously this branch returned
        an undefined name).
    """
    # first, create gene target dict in a single pass over the rows
    first_target_list = {}
    for gene_a, gene_b in zip(curr_scores['gene_1'].tolist(), curr_scores['gene_2'].tolist()):
        first_target_list.setdefault(gene_a, set()).add(gene_b)
        first_target_list.setdefault(gene_b, set()).add(gene_a)

    # start the expansion from independent copies of the direct partners
    n_hop_targets = {gene: set(partners) for gene, partners in first_target_list.items()}

    # create n hop
    if n > 1:
        for i in range(n - 1):
            print('Hop: ' + str(i + 1))
            expanded = {}
            for gene, reached in n_hop_targets.items():
                # next hop interactions: add the direct partners of every gene reached so far
                grown = set(reached)
                for hop in reached:
                    grown |= first_target_list[hop]
                # a gene is never counted as its own n-hop partner
                grown.discard(gene)
                expanded[gene] = grown
            n_hop_targets = expanded
    else:
        print('No hops!')
        # keep the same "gene itself excluded" convention as the n > 1 branch
        for gene, reached in n_hop_targets.items():
            reached.discard(gene)

    return((first_target_list, n_hop_targets))

In [24]:
def average_interactions(target_dict):
    """Print and return the mean number of targets per gene.

    ``target_dict`` maps each gene to a sized collection of its targets.
    An empty mapping raises ``ZeroDivisionError``.
    """
    gene_count = len(target_dict.keys())
    mean_targets = sum(len(targets) for targets in target_dict.values()) / gene_count
    print('Average interactions per gene: ' + str(mean_targets))
    return mean_targets
    
def intersect_two_targets(target_dict):
    """Print and return the mean number of shared targets over all unordered gene pairs.

    Parameters
    ----------
    target_dict : dict
        Maps each gene to a ``set`` of its targets.

    Returns
    -------
    float
        Sum of ``len(targets[a] & targets[b])`` over every unordered pair of
        distinct genes, divided by the number of such pairs. NaN when fewer
        than two genes are present (there are no pairs to average over).
    """
    genes = list(target_dict)
    n_genes = len(genes)

    # Accumulate the overlap sizes directly instead of filling a gene-by-gene
    # DataFrame cell by cell and summing it afterwards.
    shared_total = 0
    for idx in range(n_genes):
        targets_a = target_dict[genes[idx]]
        for jdx in range(idx):
            shared_total += len(targets_a.intersection(target_dict[genes[jdx]]))

    n_pairs = ((n_genes ** 2) - n_genes) / 2
    if n_pairs:
        average_intersecting_interactions = shared_total / n_pairs
    else:
        # no pairs: report NaN rather than dividing by zero
        average_intersecting_interactions = float('nan')
    print('Average intersections between two genes: ' + str(average_intersecting_interactions))
    return(average_intersecting_interactions)
    
def analyze_n_hop(initial_pairs, n_hops):
    """Print interaction statistics for the direct and n-hop target dicts.

    Returns ``[direct_average, n_hop_average]`` — the two values produced by
    ``average_interactions``. The pairwise-overlap averages are only printed
    (by ``intersect_two_targets``) and are not part of the return value.
    """
    # average number of interactions per gene (these are the returned values)
    print('Initial interactions:')
    direct_average = average_interactions(initial_pairs)
    print('N hop interactions interactions:')
    hop_average = average_interactions(n_hops)

    # average number of shared interactions between two genes (printed only)
    for heading, targets in (('Initial interactions:', initial_pairs),
                             ('N hop interactions interactions:', n_hops)):
        print(heading)
        intersect_two_targets(targets)

    return [direct_average, hop_average]
    
    
    
    

In [23]:
res1

NameError: name 'res1' is not defined

In [25]:
# Accumulate the per-gene interaction averages (direct and 2-hop) over every
# study / cell-line subset of SLKB_raw_score; divide by count_i afterwards
# to get the mean across subsets.
average_i = 0
average_hop = 0
count_i = 0
for study in set(SLKB_raw_score['study_origin']):
    print(study)
    curr_subset = SLKB_raw_score.loc[SLKB_raw_score['study_origin'] == study,:]
    for cell_line in set(curr_subset['cell_line_origin']):
        print(cell_line)
        curr_scores = curr_subset.loc[curr_subset['cell_line_origin'] == cell_line,:]
        initial_pairs, n_hops = get_n_hop_interactions(curr_scores, n = 2)
        # res1: average direct partners per gene, res2: average n-hop partners per gene
        res1, res2 = analyze_n_hop(initial_pairs, n_hops)
        average_i += res1
        average_hop += res2
        count_i += 1
        print('-----')
    print('+++++')

34857952
MELJUSO
Hop: 1
Initial interactions:
Average interactions per gene: 1.9919517102615694
N hop interactions interactions:
Average interactions per gene: 4.140845070422535
Initial interactions:
Average intersections between two genes: 0.013922243136236776
N hop interactions interactions:
Average intersections between two genes: 0.0598023625624716
-----
HS944T
Hop: 1
Initial interactions:
Average interactions per gene: 1.9951338199513382
N hop interactions interactions:
Average interactions per gene: 3.9854014598540144
Initial interactions:
Average intersections between two genes: 0.016153344015191976
N hop interactions interactions:
Average intersections between two genes: 0.07207880837932466
-----
HS936T
Hop: 1
Initial interactions:
Average interactions per gene: 1.6563706563706564
N hop interactions interactions:
Average interactions per gene: 3.0617760617760617
Initial interactions:
Average intersections between two genes: 0.006698879039304571
N hop interactions interactions:


In [28]:
average_i/count_i

4.539659705589646

In [29]:
average_hop/count_i

14.814239156356528

In [30]:
# Same accumulation as for the 2-hop run, but expanding to 3 hops (n = 3).
# The accumulators are reset here, so the results of the 2-hop run are overwritten.
average_i = 0
average_hop = 0
count_i = 0
for study in set(SLKB_raw_score['study_origin']):
    print(study)
    curr_subset = SLKB_raw_score.loc[SLKB_raw_score['study_origin'] == study,:]
    for cell_line in set(curr_subset['cell_line_origin']):
        print(cell_line)
        curr_scores = curr_subset.loc[curr_subset['cell_line_origin'] == cell_line,:]
        initial_pairs, n_hops = get_n_hop_interactions(curr_scores, n = 3)
        # res1: average direct partners per gene, res2: average n-hop partners per gene
        res1, res2 = analyze_n_hop(initial_pairs, n_hops)
        average_i += res1
        average_hop += res2
        count_i += 1
        print('-----')
    print('+++++')

34857952
MELJUSO
Hop: 1
Hop: 2
Initial interactions:
Average interactions per gene: 1.9919517102615694
N hop interactions interactions:
Average interactions per gene: 4.277665995975855
Initial interactions:
Average intersections between two genes: 0.013922243136236776
N hop interactions interactions:
Average intersections between two genes: 0.06441877068864801
-----
HS944T
Hop: 1
Hop: 2
Initial interactions:
Average interactions per gene: 1.9951338199513382
N hop interactions interactions:
Average interactions per gene: 4.1654501216545015
Initial interactions:
Average intersections between two genes: 0.016153344015191976
N hop interactions interactions:
Average intersections between two genes: 0.07725357545546259
-----
HS936T
Hop: 1
Hop: 2
Initial interactions:
Average interactions per gene: 1.6563706563706564
N hop interactions interactions:
Average interactions per gene: 3.413127413127413
Initial interactions:
Average intersections between two genes: 0.006698879039304571
N hop intera

In [31]:
average_i/count_i

4.539659705589646

In [32]:
average_hop/count_i

23.967609649943597

In [35]:
initial_pairs, n_hops = get_n_hop_interactions(curr_scores, n = 2)

Hop: 1


In [36]:
analyze_n_hop(initial_pairs, n_hops)

Initial interactions:
Average interactions per gene: 3.312
N hop interactions interactions:
Average interactions per gene: 22.144
Initial interactions:
Average intersections between two genes: 0.10184738955823293
N hop interactions interactions:
Average intersections between two genes: 4.051951807228916


In [37]:
initial_pairs, n_hops = get_n_hop_interactions(curr_scores, n = 3)

Hop: 1
Hop: 2


In [38]:
analyze_n_hop(initial_pairs, n_hops)

Initial interactions:
Average interactions per gene: 3.312
N hop interactions interactions:
Average interactions per gene: 66.872
Initial interactions:
Average intersections between two genes: 0.10184738955823293
N hop interactions interactions:
Average intersections between two genes: 28.532144578313254
