# Basic network analysis of CCLE and GDSC joint drug response matrix

In [1]:
import os

def find_project_root(path, marker='project'):
    """Return `path` truncated just after the first occurrence of `marker`.

    Parameters
    ----------
    path : str
        Directory path to search, e.g. the current working directory.
    marker : str, default 'project'
        Substring marking the end of the project root path.

    Returns
    -------
    str or None
        Everything in `path` up to and including the first `marker`, or
        ``None`` when `marker` does not occur in `path`.
    """
    marker_start = path.find(marker)
    if marker_start == -1:
        # str.find signals "not found" with -1; slicing with that value would
        # silently produce an unrelated prefix of the path.
        return None
    return path[:marker_start + len(marker)]


path = os.getcwd()
project_path = find_project_root(path)

if project_path is not None and os.path.isdir(project_path):
    # set the working directory
    os.chdir(project_path)
else:
    # Either the marker is missing (e.g. the repository was cloned under a
    # different name) or the sliced prefix is not a real directory. Stay where
    # we are instead of changing into a bogus location.
    print(f"Warning: could not locate a project root containing 'project' in {path!r}; "
          'keeping the current working directory.')
    project_path = path
print(f'Project path set to: {os.getcwd()}')

# PathLoader is a project-local module, so it is imported only after the
# working directory has been set above.
# NOTE(review): the two .env file names are presumably resolved relative to the
# current working directory — confirm against PathLoader.
from PathLoader import PathLoader
path_loader = PathLoader('data_config.env', 'current_user.env')

Project path set to: c:\Github\ode-biomarker-project


In [2]:
# DataLink is a project-local module; it is constructed from the PathLoader
# created in the setup cell and a CSV of data codes.
# NOTE(review): 'data_codes.csv' presumably maps loading codes to files — confirm.
from DataLink import DataLink
data_link = DataLink(path_loader, 'data_codes.csv')

In [3]:
# Load the original CCLE feature matrix and its labels through DataLink.
# The loading code selects which dataset is returned.
loading_code = 'ccle-gdsc-2-Palbociclib-LN_IC50-sin'
feature_data, label_data = data_link.get_data_using_code(loading_code)

# Report what was loaded and the dimensions of both objects.
print(f'Data loaded for code {loading_code}')
print(f'feature data shape: {feature_data.shape} label data shape: {label_data.shape}')

Data loaded for code ccle-gdsc-2-Palbociclib-LN_IC50-sin
feature data shape: (584, 19221) label data shape: (584,)


## Correlation matrix using `pandas.DataFrame.corr`

In [4]:
# Pairwise correlation between all feature columns (pandas default method: Pearson).
# With the 19221 feature columns reported by the loading cell, the result is a
# 19221 x 19221 frame, so this cell is expensive in both time and memory.
corr_matrix = feature_data.corr()

In [5]:
# Preview the first rows of the correlation matrix (the diagonal is 1.0 by construction).
corr_matrix.head()

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,H3C2,H3C3,AC098582.1,DUS4L-BCAP29,C8orf44-SGK3,ELOA3B,NPBWR1,ELOA3D,ELOA3,CDR1
TSPAN6,1.0,0.163946,0.252582,-0.19466,-0.05533,-0.487432,0.130018,0.41461,0.213403,-0.141286,...,-0.100573,-0.111479,-0.177692,-0.075463,-0.176298,-0.055795,0.199724,-0.066443,-0.074276,0.018926
TNMD,0.163946,1.0,0.044927,0.034197,0.055193,-0.045083,-0.022945,-0.061861,-0.042361,-0.046662,...,-0.020918,-0.078262,-0.08531,-0.065824,0.027621,-0.008905,-0.058801,-0.015246,-0.042875,0.032538
DPM1,0.252582,0.044927,1.0,-0.050684,0.139284,-0.194997,-0.039977,0.272837,-0.037828,-0.090364,...,0.066507,0.022239,-0.069403,0.147007,-0.052367,-0.092502,0.01979,-0.140743,-0.08744,-0.03178
SCYL3,-0.19466,0.034197,-0.050684,1.0,0.444081,0.203733,-0.056618,-0.202712,0.221475,0.377699,...,0.048807,0.008578,0.134482,-0.001459,0.111781,0.114585,-0.077774,0.100753,0.105715,-0.125245
C1orf112,-0.05533,0.055193,0.139284,0.444081,1.0,0.053456,-0.094124,-0.187769,0.051586,0.392347,...,0.172606,0.120762,0.115496,0.225627,0.023268,0.168303,0.004911,0.136021,0.183258,-0.037329


In [6]:
folder_name = 'project-drugnet' # always take the file name of the script after '_'

# Results for this notebook are written under <data path>/data/results/<folder_name>/.
# The path is built once and keeps its trailing separator so that file names can be
# appended to it directly.
# NOTE(review): path_loader.get_data_path() presumably already ends with a path
# separator, since nothing is inserted after it — confirm.
file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'

# exist_ok=True creates the folder (and any missing parents) and does nothing when it
# is already there, which avoids the separate existence check.
os.makedirs(file_save_path, exist_ok=True)



In [8]:
# Persist the correlation matrix to the results folder, presumably so it can be
# reloaded later instead of being recomputed.
# NOTE(review): depends on file_save_path from the results-folder cell; pickle files
# should only ever be loaded from trusted locations.
corr_matrix.to_pickle(f'{file_save_path}corr_matrix_ccle_palbociclib.pkl')

In [3]:
# Reload the correlation matrix through DataLink.
# NOTE(review): this rebinds both loading_code and corr_matrix from earlier cells, so
# the result of re-running later cells depends on execution order.
# NOTE(review): the code presumably refers to the pickle written by the save cell —
# confirm it is registered in data_codes.csv.
loading_code = 'corr_matrix_ccle_palbociclib'

# NOTE(review): the feature/label load uses get_data_using_code while this call uses
# get_data_from_code — confirm both methods exist on DataLink and that the difference
# is intentional.
corr_matrix = data_link.get_data_from_code(loading_code)

# Preview the first rows of the reloaded matrix.
corr_matrix.head()

Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,H3C2,H3C3,AC098582.1,DUS4L-BCAP29,C8orf44-SGK3,ELOA3B,NPBWR1,ELOA3D,ELOA3,CDR1
TSPAN6,1.0,0.163946,0.252582,-0.19466,-0.05533,-0.487432,0.130018,0.41461,0.213403,-0.141286,...,-0.100573,-0.111479,-0.177692,-0.075463,-0.176298,-0.055795,0.199724,-0.066443,-0.074276,0.018926
TNMD,0.163946,1.0,0.044927,0.034197,0.055193,-0.045083,-0.022945,-0.061861,-0.042361,-0.046662,...,-0.020918,-0.078262,-0.08531,-0.065824,0.027621,-0.008905,-0.058801,-0.015246,-0.042875,0.032538
DPM1,0.252582,0.044927,1.0,-0.050684,0.139284,-0.194997,-0.039977,0.272837,-0.037828,-0.090364,...,0.066507,0.022239,-0.069403,0.147007,-0.052367,-0.092502,0.01979,-0.140743,-0.08744,-0.03178
SCYL3,-0.19466,0.034197,-0.050684,1.0,0.444081,0.203733,-0.056618,-0.202712,0.221475,0.377699,...,0.048807,0.008578,0.134482,-0.001459,0.111781,0.114585,-0.077774,0.100753,0.105715,-0.125245
C1orf112,-0.05533,0.055193,0.139284,0.444081,1.0,0.053456,-0.094124,-0.187769,0.051586,0.392347,...,0.172606,0.120762,0.115496,0.225627,0.023268,0.168303,0.004911,0.136021,0.183258,-0.037329


## Building an edge list instead 

In [4]:
# Preview the first rows of the feature matrix.
# NOTE(review): feature_data is defined by the data-loading cell, which must have been
# run in the current kernel before this cell.
feature_data.head()

Unnamed: 0_level_0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,H3C2,H3C3,AC098582.1,DUS4L-BCAP29,C8orf44-SGK3,ELOA3B,NPBWR1,ELOA3D,ELOA3,CDR1
SANGER_MODEL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SIDM00872,5.29609,0.0,6.794416,3.452859,5.260778,0.042644,4.339137,5.833143,7.313246,4.764474,...,1.827819,0.0,0.847997,1.704872,0.411426,0.0,1.063503,0.0,0.0,0.0
SIDM00866,5.214125,0.0,6.328047,3.168321,4.654206,0.042644,0.432959,6.574102,5.140779,5.114783,...,0.0,0.0,0.298658,1.584963,0.695994,0.0,2.693766,0.0,0.0,0.0
SIDM00885,5.24184,0.201634,5.615299,3.090853,3.732269,0.084064,0.111031,4.0268,5.471513,6.01948,...,0.632268,0.0,0.422233,1.250962,0.505891,0.111031,0.070389,0.0,0.111031,0.0
SIDM00884,3.481557,0.0,7.070389,2.341986,3.679199,0.176323,3.420887,5.854494,5.976134,5.112283,...,2.327687,0.226509,0.650765,2.477677,0.0,0.0,0.042644,0.0,0.084064,0.0
SIDM00874,4.903038,0.0,7.177719,2.744161,4.648465,0.15056,0.070389,5.819924,4.374344,3.81455,...,2.440952,1.695994,0.367371,1.505891,0.214125,0.137504,0.704872,0.0,0.0,0.0


In [10]:
from tqdm import tqdm

# Upper bound on the number of edges computed by this cell (positive int or None).
# The full run over every ordered pair of columns was estimated to take about
# 18 hours, so by default only the first edge is computed as a smoke test.
# Set to None to run all pairs (not recommended).
# The pairwise values are the same ones `feature_data.corr()` produces in a single
# vectorised call in the correlation-matrix section.
MAX_EDGES = 1

edge_list = []
columns = feature_data.columns

# for each ordered pair of distinct columns in feature_data
for col in tqdm(columns):
    for col2 in columns:
        if col == col2:
            # skip self-correlation
            continue
        # Compute the correlation once and reuse it for storing and reporting.
        corr_value = feature_data[col].corr(feature_data[col2])
        edge_list.append((col, col2, corr_value))
        if MAX_EDGES is not None:
            # Only echo edges in the capped smoke-test mode; printing every pair
            # of a full run would flood the cell output.
            print(col, col2, corr_value)
            if len(edge_list) >= MAX_EDGES:
                break
    if MAX_EDGES is not None and len(edge_list) >= MAX_EDGES:
        # Cap reached: leave the outer loop as well.
        break

  0%|          | 0/19221 [00:00<?, ?it/s]

TSPAN6 TNMD 0.16394561071127337



