In [82]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [83]:
import os
os.chdir('/Users/zhenyamordan/PyCharmProjects/Kinome-Regularization/')

In [84]:
import pandas as pd
import numpy as np
import plotly.express as px
from scipy.optimize import curve_fit
import plotly.graph_objects as go
import subprocess

## Load data

In [85]:
# data_folder = './fof/data/data_03_12_23'
data_folder = './fof/data/data_06_10_23'

In [86]:
!ls {data_folder}

KIR3KinaseCategories.xlsx         good_doses_mask.csv
drug_phenotype_data.csv           kinase_data.csv
final_mask_1_point_on_plateau.csv y_plateau_preprocessed.csv


In [87]:
!ls ./fof/data/data_03_12_23

KIR3KinaseCategories.xlsx         good_doses_mask.csv
drug_phenotype_data.csv           kinase_data.csv
final_mask_1_point_on_plateau.csv y_plateau_preprocessed.csv


In [88]:
y = pd.read_csv(os.path.join(data_folder, 'drug_phenotype_data.csv'))

In [89]:
X = pd.read_csv(os.path.join(data_folder, 'kinase_data.csv'))

In [90]:
y_plateau = pd.read_csv(os.path.join(data_folder, 'y_plateau_preprocessed.csv'))

## Process

### Merge

In [91]:
def merge_data(X, y, y_plateau):
    data = pd.merge(y, X, left_index=True, right_index=True)
    data = pd.merge(data, y_plateau, left_index=True, right_index=True)
    return data

In [92]:
data = merge_data(X, y, y_plateau)

In [93]:
mask = pd.read_csv('./fof/data/good_doses_mask.csv')

In [94]:
data = data[mask['if_used_in_final']].reset_index(drop=True)

In [95]:
data.shape

(347, 377)

### Drop control

In [99]:
data = data[data['dose (uM)'] != 0.]

### Prepare for R

In [104]:
kinases = data.columns[7:].to_numpy()
print(kinases[:5])
drugs = data['drugNames'].unique()
print(drugs[:5])

['AKT1' 'AKT2' 'AKT3' 'ALK1_ACVRL1' 'ALK2_ACVR1']
['NCGC00344999_A1' 'NCGC00241102_E1' 'NCGC00249389_H1' 'NCGC00345784_C2'
 'NCGC00346673_G2']


In [105]:
number_drug_dict = dict(zip(range(len(data)), data['drugNames'].unique()))
drug_number_dict = {v: k for k, v in number_drug_dict.items()}

In [106]:
number_kinase_dict = dict(zip(range(1, len(kinases)+1), kinases))
kinase_number_dict = {v: k for k, v in number_kinase_dict.items()}

In [111]:
dose_column = 'dose (uM)'
response_column = 'Y'

In [112]:
y = data[response_column].to_numpy().reshape(-1,1)
X = data[kinases].values
t = data[dose_column].to_numpy().reshape(-1,1)
drug = data['drugNames'].apply(lambda x: drug_number_dict[x]).to_numpy().reshape(-1,1)

### Save

In [113]:
r_data = np.concatenate([drug, y, t, X], axis=1)
r_columns = ['subj', 'Y', 'time'] + [f'Cov_{i+1}' for i in range(X.shape[1])]

In [114]:
r_df = pd.DataFrame(data=r_data, columns=r_columns)
r_df

Unnamed: 0,subj,Y,time,Cov_1,Cov_2,Cov_3,Cov_4,Cov_5,Cov_6,Cov_7,...,Cov_361,Cov_362,Cov_363,Cov_364,Cov_365,Cov_366,Cov_367,Cov_368,Cov_369,Cov_370
0,0.0,0.276545,1.11,0.825562,0.958012,0.929195,0.873394,0.999992,0.978198,1.000000,...,0.027721,0.302221,0.000411,0.000200,0.025107,0.343913,0.228633,0.273383,0.001528,0.276545
1,0.0,0.526242,0.37,0.877345,0.984761,0.979264,0.959281,1.000000,0.991725,1.000000,...,0.068760,0.577477,0.000973,0.000201,0.064854,0.566269,0.428022,0.534281,0.004563,0.526242
2,0.0,0.700000,0.12,0.915214,0.994564,0.994147,0.987177,1.000000,0.996885,1.000000,...,0.160831,0.811382,0.003018,0.000210,0.157508,0.764517,0.653849,0.777644,0.014427,0.700000
3,0.0,0.700000,0.04,0.942042,0.998072,0.998365,0.995500,1.000000,0.998830,1.000000,...,0.332354,0.930753,0.010412,0.000296,0.335096,0.889445,0.826574,0.914221,0.045557,0.700000
4,0.0,0.700000,0.01,0.960575,0.999315,0.999543,0.997916,1.000000,0.999560,1.000000,...,0.563155,0.976166,0.036337,0.001109,0.575069,0.951738,0.922977,0.970018,0.134958,0.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,46.0,0.074273,1.11,0.960433,0.996855,0.928235,0.915098,1.000000,0.998901,0.998901,...,0.002261,0.020236,0.348805,0.300735,0.001050,0.165440,0.479133,1.000000,0.001413,0.074273
296,46.0,0.251452,0.37,0.975974,0.999325,0.951123,0.958103,1.000000,0.998901,0.998901,...,0.007330,0.066630,0.618058,0.529689,0.006249,0.368352,0.722819,1.000000,0.005303,0.251452
297,46.0,0.700000,0.12,0.985499,0.999855,0.966849,0.979502,1.000000,0.998901,0.998901,...,0.024551,0.198193,0.830142,0.746503,0.038751,0.631744,0.880451,1.000000,0.020475,0.700000
298,46.0,0.700000,0.04,0.991281,0.999969,0.977512,0.989781,1.000000,0.998901,0.998901,...,0.079986,0.461122,0.936530,0.884694,0.207089,0.834582,0.953693,1.000000,0.076450,0.700000


In [115]:
r_df.to_csv('./fof/data/r/prepared_data.csv', index=False)

## Best kinase

In [118]:
kinase_number_dict['EPHB2']

252

In [122]:
kinase_number_dict['FYN']

264

In [123]:
kinase_number_dict['LATS2']

281

In [124]:
kinase_number_dict['LYN']

285

In [128]:
kinase_number_dict['MINK_MINK1']

294

In [129]:
kinase_number_dict['YES_YES1']

369