In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, SelectFromModel, VarianceThreshold
from methods import pre


## Implementing New DAs and Standardization/Normalization

In [4]:
gdsc_ge = pd.read_csv('data/Processed/gdsc_cell_ge.csv').fillna(0).set_index('CCL')

In [9]:
import OnPLS
import numpy as np

In [11]:
# Fix the global NumPy seed so the synthetic blocks below are reproducible.
np.random.seed(42)

# n is the number of rows shared by all three blocks; p_1, p_2 and p_3 are the
# column counts of X1, X2 and X3 respectively.
n, p_1, p_2, p_3 = 4, 3, 4, 5
# t: a single sorted (n, 1) column of standard-normal draws, reused by every block.
t = np.sort(np.random.randn(n, 1), axis=0)
# p1..p3: one sorted column vector per block, with p_1 / p_2 / p_3 entries.
p1 = np.sort(np.random.randn(p_1, 1), axis=0)
p2 = np.sort(np.random.randn(p_2, 1), axis=0)
p3 = np.sort(np.random.randn(p_3, 1), axis=0)
# Each block is the outer product t @ p_i.T plus Gaussian noise scaled by 0.1,
# so all three blocks share the same underlying column t.
X1 = np.dot(t, p1.T) + 0.1 * np.random.randn(n, p_1)
X2 = np.dot(t, p2.T) + 0.1 * np.random.randn(n, p_2)
X3 = np.dot(t, p3.T) + 0.1 * np.random.randn(n, p_3)

In [12]:
# Define the connections between blocks
predComp = [[0, 1, 1], [1, 0, 1], [1, 1, 0]]
# Define the numbers of non-global components
orthComp = [1, 1, 1]

# Create the estimator
onpls = OnPLS.estimators.OnPLS(predComp, orthComp)

In [13]:
# Fit a model
onpls.fit([X1, X2, X3])

# Perform prediction of all matrices from all connected matrices
Xhat = onpls.predict([X1, X2, X3])

# Compute prediction score
score = onpls.score([X1, X2, X3])

cv_scores = OnPLS.resampling.cross_validation(onpls, [X1, X2, X3], cv_rounds=4)



In [17]:
onpls.precomputedW

[array([[-0.15363395],
        [-0.18393189],
        [ 0.97085821]]),
 array([[-0.38025657],
        [-0.45758902],
        [ 0.47166374],
        [ 0.65080762]]),
 array([[ 0.69783882],
        [ 0.64528545],
        [ 0.22216816],
        [ 0.20472423],
        [-0.07319133]])]

## Checking ElasticNet RandomizedSearch

In [9]:
gdsc_ge = pd.read_csv('data/Processed/gdsc_cell_ge.csv').fillna(0).set_index('CCL')
gdsc_dr = pd.read_csv('data/Processed/gdsc_poz_dr.csv').fillna(0)

In [None]:
gdsc

In [11]:
from sklearn import linear_model
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV
# Small bundled dataset used to exercise the search end to end.
# NOTE(review): iris.target is a class label, so the r2 scores here are
# presumably only a smoke test of the search setup — confirm.
iris = load_iris()
# Candidate ElasticNet hyper-parameters: 5 alphas x 5 l1_ratios x 1 max_iter
# = 25 distinct combinations.
t3 = {
    'alpha': [0.5, 1, 1.5, 2, 5],
    'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
    'max_iter': [5000]
}

# NOTE(review): n_iter=50 is larger than the 25 combinations available in t3,
# so the search cannot draw 50 distinct candidates — verify how the installed
# scikit-learn version handles this (it is expected to warn and evaluate the
# whole grid). No random_state is passed, so the sampling order is not pinned.
lo = RandomizedSearchCV(linear_model.ElasticNet(), t3, n_iter=50, scoring='r2')
lo.fit(iris.data, iris.target)
# Tabulate the search results (cv_results_) for inspection.
pe = pd.DataFrame(lo.cv_results_)



## Reducing memory usage by using indices

In [1]:
from classes import drug
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, SelectFromModel, VarianceThreshold

In [8]:
%%time
gdsc_ge = pd.read_csv('data/Processed/gdsc_cell_ge.csv').fillna(0).set_index('CCL')
ctrp_ge = pd.read_csv('data/Processed/ctrp_cell_ge.csv').fillna(0).set_index('CCL')
gdsc_dr = pd.read_csv('data/Processed/gdsc_poz_dr.csv').fillna(0)
ctrp_dr = pd.read_csv('data/Processed/ctrp_poz_dr.csv').fillna(0)

CPU times: user 35.9 s, sys: 579 ms, total: 36.5 s
Wall time: 36.4 s


In [9]:
%%time
aag = drug('17-AAG', {'ctrp': ctrp_ge, 'gdsc': gdsc_ge}, {'ctrp': ctrp_dr, 'gdsc': gdsc_dr})

CPU times: user 1.44 s, sys: 4.97 ms, total: 1.45 s
Wall time: 1.45 s


In [10]:
aag.pre()

In [11]:
'ge' in aag.__dict__

False

In [12]:
aag.combine()

In [13]:
aag.split()

In [14]:
aag.get('X', 'train').shape

(827, 20600)

'mean is 1, std is 2'

In [15]:

aag.fs(mutual_info_regression, n=0.01)

206 features
[[ 8.22625117  7.9240769  11.25370085 ...  0.          0.
   0.        ]
 [ 7.38502456  9.18294846 11.28107417 ...  0.          0.
   0.        ]
 [ 6.48340108 10.20096264  9.33630783 ...  6.24645304  3.41559208
   3.64098972]
 ...
 [ 6.48211485  9.75738726 10.56100587 ...  8.97084604  3.28393378
   3.57814007]
 [ 5.82768649  9.15923494 10.25319984 ...  3.9957859   3.14716415
   3.25962857]
 [ 5.93190027 10.3220042   9.9263591  ...  3.53349924  3.26787564
   3.32453528]]
After fs (276, 206) (827, 206)


In [16]:
aag.get('X', 'train')

Unnamed: 0_level_0,Unnamed: 1_level_0,AGGF1,ANAPC5,AP3S1,ARHGAP27,ASPH,ATP2B2,ATP2B3,BAG3,BCAP29,BMP1,...,SLC35G5,SNHG7,SNORA26,SNORA74B,TCEAL9,TEPSIN,TMEM189,TMSB15B,ZNF695,ZNF767P
Unnamed: 0_level_1,CCL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ctrp,LN229,8.226251,7.924077,11.253701,4.407883,7.413361,4.001565,3.813327,9.755886,6.798963,5.722995,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ctrp,KCL22,7.385025,9.182948,11.281074,4.634485,7.294799,4.041923,3.748326,8.800843,7.170124,5.992913,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
gdsc,HUPT4,6.483401,10.200963,9.336308,4.412368,5.471657,2.943845,2.905130,6.888100,5.273164,3.394748,...,3.899702,5.399728,3.295038,3.570888,8.303709,5.853637,5.446821,6.246453,3.415592,3.640990
gdsc,SIMA,5.733640,9.590814,9.940340,3.210771,3.790866,3.668956,4.087474,4.446468,4.865038,3.497878,...,3.247251,6.442336,3.298268,3.345386,7.676533,5.152282,6.072026,7.388342,3.343255,3.669435
gdsc,RPMI6666,5.988554,9.614634,10.509346,3.716671,3.724674,3.006337,2.978083,4.613674,5.822691,3.192930,...,3.819218,5.382215,3.178846,3.586889,4.388571,5.577618,5.299369,6.523783,3.119636,4.471245
ctrp,NCIH520,7.554070,8.654919,11.247098,4.837946,6.880808,4.022821,3.944314,9.031478,5.932876,4.895386,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ctrp,SIGM5,7.505783,9.508165,11.261540,5.623851,5.387787,4.059507,3.744212,7.919789,6.870298,4.657048,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
gdsc,HUPT3,5.927602,9.884768,10.187934,4.142101,6.135774,3.155070,2.718358,7.357486,5.364319,3.452619,...,3.556778,6.348721,2.876397,3.524512,7.612203,5.014556,6.085385,5.008135,3.100723,3.617822
gdsc,CAL27,6.039181,9.712396,10.369503,3.897784,5.095961,3.179379,3.018270,9.268381,6.008715,4.245253,...,3.733616,5.942575,2.561820,3.478759,8.144564,5.933112,8.073544,3.777850,3.197866,3.642748
gdsc,CCFSTTG1,6.448939,9.764155,10.411066,3.218237,6.006615,2.917954,2.832533,8.124702,6.642247,4.954824,...,3.473340,4.585441,2.750643,3.511647,8.447538,5.100174,5.820138,6.858185,3.225966,3.913996


In [10]:
aag.get('X', 'train').shape

(827, 2060)

In [2]:
from methods import pre
lo = 34


AttributeError: 'int' object has no attribute '__name__'

In [None]:
aag.X['fs_test']

## Speeding up combine

In [12]:
gdsc_dr = pd.read_csv('data/Processed/gdsc_poz_dr.csv').fillna(0)
gdsc_ge = pd.read_csv('data/Processed/gdsc_cell_ge.csv').fillna(0).set_index('CCL')

In [13]:
gdsc_ge.shape[1]

19562

In [15]:
dr = gdsc_dr
ge = gdsc_ge

In [35]:
ge.join(dr[dr['Drug_name'] == '17-AAG'][['CCL', 'AUC_IC50']].set_index('CCL'), how='right')

Unnamed: 0_level_0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A3GALT2P,A4GALT,A4GNT,AAAS,AACS,...,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,AUC_IC50
CCL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22RV1,3.537942,6.364651,5.332441,2.923030,2.815383,2.815383,3.241125,3.262633,4.722157,4.942126,...,7.526319,9.274166,3.785757,5.201929,2.742575,4.635015,3.940190,4.658663,8.088672,0.169156
2313287,3.370950,6.284884,3.485675,2.831562,2.913369,2.913369,3.313028,3.096527,4.873621,4.213177,...,6.526713,8.021952,3.520584,4.957371,2.636933,4.350120,4.395806,4.598583,7.951146,0.577314
5637,2.927335,2.892365,3.181651,2.926549,2.677943,2.677943,4.295357,3.205598,5.249042,4.495021,...,7.057572,9.261486,3.492602,4.813163,2.558705,4.580977,4.810975,4.371501,7.974367,0.828724
639V,3.953010,2.858072,2.892599,2.828334,2.729762,2.729762,2.960059,3.121154,4.445200,4.017422,...,6.310088,8.021780,3.919468,5.923131,2.782005,4.969272,5.481422,5.205446,8.118109,0.726141
647V,2.804009,2.944390,3.178071,2.675572,2.949402,2.949402,3.029297,3.024326,4.636786,4.724453,...,8.657911,9.426794,3.657576,6.082990,3.148358,5.182059,5.203430,4.806941,8.530606,0.008665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YAPC,3.043994,3.055992,3.532297,2.946112,2.835091,2.835091,3.273226,3.182387,4.474313,4.238488,...,8.619045,10.392332,3.704526,5.128979,3.082707,4.061633,3.664478,4.276836,7.238275,0.502122
YH13,5.167861,3.072842,3.049070,3.098071,3.029716,3.029716,2.990092,3.318574,4.561866,3.581945,...,8.048126,8.235304,3.305898,4.745612,2.577626,5.215836,7.003537,4.203823,7.919283,0.881166
YKG1,4.711624,3.067945,3.306930,3.037960,2.713442,2.713442,3.154398,2.988924,4.586814,3.781220,...,7.978261,9.150385,3.972405,5.166082,2.813449,5.117194,5.085635,4.918170,8.208343,0.803193
YT,3.143593,3.779792,5.095163,2.879333,2.749091,2.749091,3.186225,3.359589,4.511721,4.241912,...,8.196766,9.205680,3.515323,5.152341,2.472749,5.046783,6.016707,5.168183,7.063949,0.513477


In [28]:
dr[dr['Drug_name'] == '17-AAG'][['CCL', 'AUC_IC50']].set_index('CCL')

Unnamed: 0_level_0,AUC_IC50
CCL,Unnamed: 1_level_1
22RV1,0.169156
2313287,0.577314
5637,0.828724
639V,0.726141
647V,0.008665
...,...
YAPC,0.502122
YH13,0.881166
YKG1,0.803193
YT,0.513477


In [16]:
%%time
drug_dr = dr[dr['Drug_name'] == '17-AAG'][['CCL', 'AUC_IC50']]

CPU times: user 7.94 ms, sys: 1.2 ms, total: 9.14 ms
Wall time: 15.4 ms


In [17]:
%%time
# Keep only the drug-response rows whose cell line has an expression profile,
# then fetch all of those profiles with a single label-based lookup instead of
# one ge.loc[...] call per cell line inside a Python loop.
matched_ccl = drug_dr.loc[drug_dr['CCL'].isin(ge.index), 'CCL']
# Rows of X follow the order of matched_ccl; columns follow ge's columns.
# With no matching cell line this yields shape (0, ge.shape[1]) rather than a
# 1-D empty array.
X = ge.loc[matched_ccl].to_numpy()

CPU times: user 2.9 s, sys: 258 ms, total: 3.16 s
Wall time: 3.33 s


In [18]:
%%time
X = X.reshape(drug_dr.shape[0], ge.shape[1])

CPU times: user 31 µs, sys: 1 µs, total: 32 µs
Wall time: 35 µs


In [19]:
%%time
# pd.DataFrame(X) gets a fresh 0..n-1 index, while drug_dr keeps the row labels
# it had inside dr. Passing the Series itself to assign() would align on those
# labels and silently produce NaN / shifted values whenever they are not
# exactly 0..n-1, so the response values are attached positionally instead.
data = pd.DataFrame(X).assign(DR = drug_dr['AUC_IC50'].to_numpy())

CPU times: user 48.3 ms, sys: 9.44 ms, total: 57.7 ms
Wall time: 66.9 ms


In [66]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19553,19554,19555,19556,19557,19558,19559,19560,19561,DR
0,3.537942,6.364651,5.332441,2.923030,2.815383,2.815383,3.241125,3.262633,4.722157,4.942126,...,7.526319,9.274166,3.785757,5.201929,2.742575,4.635015,3.940190,4.658663,8.088672,0.169156
1,3.370950,6.284884,3.485675,2.831562,2.913369,2.913369,3.313028,3.096527,4.873621,4.213177,...,6.526713,8.021952,3.520584,4.957371,2.636933,4.350120,4.395806,4.598583,7.951146,0.577314
2,2.927335,2.892365,3.181651,2.926549,2.677943,2.677943,4.295357,3.205598,5.249042,4.495021,...,7.057572,9.261486,3.492602,4.813163,2.558705,4.580977,4.810975,4.371501,7.974367,0.828724
3,3.953010,2.858072,2.892599,2.828334,2.729762,2.729762,2.960059,3.121154,4.445200,4.017422,...,6.310088,8.021780,3.919468,5.923131,2.782005,4.969272,5.481422,5.205446,8.118109,0.726141
4,2.804009,2.944390,3.178071,2.675572,2.949402,2.949402,3.029297,3.024326,4.636786,4.724453,...,8.657911,9.426794,3.657576,6.082990,3.148358,5.182059,5.203430,4.806941,8.530606,0.008665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,3.043994,3.055992,3.532297,2.946112,2.835091,2.835091,3.273226,3.182387,4.474313,4.238488,...,8.619045,10.392332,3.704526,5.128979,3.082707,4.061633,3.664478,4.276836,7.238275,0.502122
656,5.167861,3.072842,3.049070,3.098071,3.029716,3.029716,2.990092,3.318574,4.561866,3.581945,...,8.048126,8.235304,3.305898,4.745612,2.577626,5.215836,7.003537,4.203823,7.919283,0.881166
657,4.711624,3.067945,3.306930,3.037960,2.713442,2.713442,3.154398,2.988924,4.586814,3.781220,...,7.978261,9.150385,3.972405,5.166082,2.813449,5.117194,5.085635,4.918170,8.208343,0.803193
658,3.143593,3.779792,5.095163,2.879333,2.749091,2.749091,3.186225,3.359589,4.511721,4.241912,...,8.196766,9.205680,3.515323,5.152341,2.472749,5.046783,6.016707,5.168183,7.063949,0.513477


## Speeding up pre-processing

In [2]:
gdsc_ge = pd.read_csv('data/Processed/gdsc_cell_ge.csv').fillna(0).set_index('CCL')

In [3]:
print(type(gdsc_ge.keys()))

<class 'pandas.core.indexes.base.Index'>


In [5]:
gdsc_ge.shape

(706, 19562)

In [6]:
gdsc_ge.shape[0]

706

In [8]:
%%time
# One row per column of gdsc_ge, with 1 where the value is greater than 5.
under = np.transpose(gdsc_ge.to_numpy() > 5).astype(np.int8)

# A column is kept when more than 10% of the rows of gdsc_ge exceed the threshold.
n = [np.count_nonzero(flags) > 0.1 * gdsc_ge.shape[0] for flags in under]
# Map every column label to its keep / drop decision, in column order.
names = dict(zip(gdsc_ge.keys(), n))
indices = [label for label, keep in names.items() if keep]
# Despite its name, `index` is the filtered DataFrame, not an Index object.
index = gdsc_ge[indices]

CPU times: user 117 ms, sys: 6.79 ms, total: 124 ms
Wall time: 125 ms


In [9]:
%%time
index2 = pre(gdsc_ge,t=5, p=0.1)

CPU times: user 6.17 s, sys: 127 ms, total: 6.3 s
Wall time: 6.36 s


In [10]:
index.shape

(706, 9919)

In [11]:
index2.shape

(706, 9919)

## Domain Adaptation experiments

In [113]:
X_a = np.random.rand(3, 2) * 3

In [114]:
X_b = np.random.rand(3, 2) * 4

In [115]:
X_c = np.random.rand(3, 2) * 5

In [116]:
def jump(domain, n, data):
    """Expand one sample into its domain-augmented layout.

    Every value in ``data`` is emitted twice: once at the start of its group
    and once more after ``domain`` zeros; the group is then padded with
    ``n - domain - 1`` further zeros. For ``0 <= domain < n`` each value
    therefore occupies ``n + 1`` consecutive entries of the result.

    :param domain: zero-based position of the sample's domain.
    :param n: total number of domains.
    :param data: iterable of feature values for a single sample.
    :return: flat list holding the expanded groups in input order.
    """
    zeros_before = [0 for _ in range(domain)]
    zeros_after = [0 for _ in range(n - domain - 1)]

    result = []
    for ele in data:
        # value, gap up to this domain's slot, value again, trailing padding
        result.extend([ele] + zeros_before + [ele] + zeros_after)
    return result
    

In [117]:
def feda(domains):
    """Stack several domains into one domain-augmented feature matrix.

    With ``n = len(domains)``, every input feature becomes ``n + 1`` adjacent
    output columns: a shared column followed by one column per domain. For a
    sample taken from domain ``d`` the shared column and the column of domain
    ``d`` both hold the feature value; the other columns stay zero. Rows keep
    the order of ``domains`` and, within a domain, the order of its samples.

    The output is preallocated and filled one domain at a time, so no array
    is re-copied per row, and an input with zero rows in total yields an
    empty ``(0, features * (n + 1))`` matrix.

    :param domains: sequence of 2-D array-likes (samples x features), one per
        domain, all with the same number of features.
    :return: float array of shape ``(total_samples, features * (n + 1))``.
    :raises IndexError: if ``domains`` is empty.
    :raises ValueError: if a domain's feature count differs from the first one.
    """
    n = len(domains)
    blocks = [np.asarray(block) for block in domains]
    n_features = blocks[0].shape[1]
    total = sum(block.shape[0] for block in blocks)

    # The last axis holds the n + 1 slots of each feature: slot 0 is the shared
    # copy, slot i + 1 belongs to domain i. Flattening the last two axes at the
    # end puts each feature's slots side by side.
    new = np.zeros((total, n_features, n + 1))
    start = 0
    for i, block in enumerate(blocks):
        stop = start + block.shape[0]
        new[start:stop, :, 0] = block
        new[start:stop, :, i + 1] = block
        start = stop

    return new.reshape(total, n_features * (n + 1))

In [118]:
feda([X_a, X_b, X_c])

array([[2.39552445, 2.39552445, 0.        , 0.        , 0.8923882 ,
        0.8923882 , 0.        , 0.        ],
       [2.09713092, 2.09713092, 0.        , 0.        , 1.99024727,
        1.99024727, 0.        , 0.        ],
       [2.17421497, 2.17421497, 0.        , 0.        , 2.66618415,
        2.66618415, 0.        , 0.        ],
       [3.77029806, 0.        , 3.77029806, 0.        , 0.52768015,
        0.        , 0.52768015, 0.        ],
       [3.03560501, 0.        , 3.03560501, 0.        , 1.3673342 ,
        0.        , 1.3673342 , 0.        ],
       [2.41889914, 0.        , 2.41889914, 0.        , 3.77142649,
        0.        , 3.77142649, 0.        ],
       [0.70066211, 0.        , 0.        , 0.70066211, 3.89969596,
        0.        , 0.        , 3.89969596],
       [1.67451535, 0.        , 0.        , 1.67451535, 0.97901028,
        0.        , 0.        , 0.97901028],
       [0.52645206, 0.        , 0.        , 0.52645206, 1.13973433,
        0.        , 0.      

## Mess

In [18]:
from methods import fs

In [19]:
fs.__doc__

' Returns a subset of {X_train} and {X_test} with features being selected by the method {model}\n    :param int n: it can be the variance thereshold or the number of chosen features \n    :\n    '

In [45]:
from classes import drug
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import config as c
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, SelectFromModel, VarianceThreshold

In [46]:
gdsc_ge = pd.read_csv(c.dir + 'gdsc_cell_ge.csv').fillna(0).set_index('CCL')
ctrp_ge = pd.read_csv(c.dir + 'ctrp_cell_ge.csv').fillna(0).set_index('CCL')
gdsc_dr = pd.read_csv(c.dir + 'gdsc_poz_dr.csv').fillna(0)
ctrp_dr = pd.read_csv(c.dir + 'ctrp_poz_dr.csv').fillna(0)

In [47]:
%%time
aag = drug('17-AAG', {'ctrp': ctrp_ge, 'gdsc': gdsc_ge}, {'ctrp': ctrp_dr, 'gdsc': gdsc_dr})

CPU times: user 973 ms, sys: 119 ms, total: 1.09 s
Wall time: 1.1 s


In [48]:
aag.to_json()

col
<class 'list'>
da
<class 'dict'>
data
<class 'pandas.core.frame.DataFrame'>
dr
<class 'pandas.core.frame.DataFrame'>
ge
<class 'pandas.core.frame.DataFrame'>
name
<class 'str'>
predicted
<class 'list'>


In [49]:
%%time
aag.pre()

CPU times: user 496 ms, sys: 297 ms, total: 794 ms
Wall time: 888 ms


In [50]:
%%time
aag.combine()

CPU times: user 478 ms, sys: 398 ms, total: 876 ms
Wall time: 898 ms


In [51]:
%%time
aag.split()

CPU times: user 141 ms, sys: 109 ms, total: 250 ms
Wall time: 254 ms


In [52]:
%%time
aag.fs(f_regression, n=0.01)

After fs (276, 206) (827, 206)
CPU times: user 364 ms, sys: 185 ms, total: 550 ms
Wall time: 466 ms


In [9]:
%%time
aag.feda()

CPU times: user 938 ms, sys: 702 ms, total: 1.64 s
Wall time: 1.69 s


In [10]:
aag.get('X', 'train')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,608,609,610,611,612,613,614,615,616,617
0,4.031054,4.031054,0.0,10.708475,10.708475,0.000000,3.620436,3.620436,0.0,5.459297,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
1,4.070159,4.070159,0.0,10.684955,10.684955,0.000000,3.457989,3.457989,0.0,6.579006,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
2,4.254251,4.254251,0.0,10.563919,10.563919,0.000000,3.448932,3.448932,0.0,4.778966,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
3,4.292504,4.292504,0.0,6.804169,6.804169,0.000000,3.800755,3.800755,0.0,4.509346,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
4,4.086229,4.086229,0.0,12.579925,12.579925,0.000000,3.383222,3.383222,0.0,5.262325,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
823,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
824,0.000000,0.000000,0.0,9.771689,0.000000,9.771689,0.000000,0.000000,0.0,0.000000,...,5.976329,3.557185,0.0,3.557185,3.452922,0.0,3.452922,4.479922,0.0,4.479922
825,0.000000,0.000000,0.0,12.743566,0.000000,12.743566,0.000000,0.000000,0.0,0.000000,...,5.782401,3.402952,0.0,3.402952,3.210286,0.0,3.210286,4.563701,0.0,4.563701


In [10]:
aag.data.drop(aag.metric, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A4GALT,A4GNT,AA06,...,ZNF773,ZNF812,ZNF816,ZNF833P,ZNF841,ZNF845,ZNF860,ZNRD1ASP,ZP3,ZPR1
Unnamed: 0_level_1,CCL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ctrp,22RV1,4.753564,4.766881,7.445061,4.824879,5.714456,3.696364,4.500844,4.386674,3.917018,4.372562,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ctrp,42MGBA,7.290132,5.591604,4.030574,4.113296,3.277104,3.985902,4.839848,4.578237,3.898615,4.481621,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ctrp,5637,4.293473,4.510732,4.157908,4.110104,4.021707,4.169669,5.138686,6.044395,4.319823,4.457162,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ctrp,639V,6.627165,4.951915,4.094668,3.980078,4.311624,4.105991,4.640191,4.547413,3.960633,4.682899,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ctrp,697,6.991375,4.836096,4.027624,4.069556,4.350773,4.054177,4.691261,4.582222,3.989948,4.433872,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gdsc,YAPC,3.043994,0.000000,3.055992,3.532297,0.000000,2.946112,0.000000,3.273226,3.182387,0.000000,...,5.275694,2.827617,3.113137,3.413018,3.004792,2.991953,4.212500,4.518800,5.097525,7.202968
gdsc,YH13,5.167861,0.000000,3.072842,3.049070,0.000000,3.098071,0.000000,2.990092,3.318574,0.000000,...,5.819260,3.036440,2.874351,3.359950,3.557884,3.183011,2.841925,4.917730,4.791946,6.943269
gdsc,YKG1,4.711624,0.000000,3.067945,3.306930,0.000000,3.037960,0.000000,3.154398,2.988924,0.000000,...,6.671234,2.813230,3.140576,3.274792,3.544649,3.422049,2.635027,4.711590,4.786302,6.520695
gdsc,YT,3.143593,0.000000,3.779792,5.095163,0.000000,2.879333,0.000000,3.186225,3.359589,0.000000,...,6.043061,2.985051,2.627298,3.170128,3.165902,3.156890,2.589527,3.915674,4.119448,6.867354


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(aag.data.drop(aag.metric, axis=1), aag.data[aag.metric])

In [12]:
from methods import fs
X_train, X_test, co = fs(f_regression, X_train, X_test, y_train, n=0.01)

In [13]:
from methods import drp
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score

mod = drp(DecisionTreeRegressor(), X_train, y_train)
ypred= mod.predict(X_test)
r2_score(y_test, ypred)

-0.3029939981101124

In [53]:
%%time
aag.train(DecisionTreeRegressor())

CPU times: user 203 ms, sys: 101 ms, total: 303 ms
Wall time: 314 ms


In [40]:
# Deal the model's feature importances (as percentages) round-robin into three
# buckets keyed 0, 1, 2; the following cell labels them Global / CTRP / GDSC.
importances = {0: [], 1: [], 2: []}
e = 0
for j in aag.model.feature_importances_:
    importances[e].append(j*100)
    # advance to the next bucket, wrapping from 2 back to 0
    e = (e + 1) % 3

In [41]:
pd.DataFrame.from_dict(importances).rename(columns={0:'Global', 1:'CTRP', 2:'GDSC'}).describe()

Unnamed: 0,Global,CTRP,GDSC
count,206.0,206.0,206.0
mean,0.23466,0.107341,0.143436
std,0.643431,0.70276,0.357749
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.001635,0.0,0.001708
75%,0.081482,0.0,0.057856
max,4.915703,6.327331,2.8653


In [54]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

aag.metrics([r2_score, mean_absolute_error, mean_squared_error, median_absolute_error])

{'r2_score': -0.589656604796021,
 'mean_absolute_error': 0.29517072848360304,
 'mean_squared_error': 0.1373123651256232,
 'median_absolute_error': 0.2491798193489835}

In [16]:
aag.scores

AttributeError: 'drug' object has no attribute 'scores'

In [None]:
aag.to_json()

## Runs

In [17]:
from runs import run
from sklearn.svm import SVR
from classes import tuning

In [18]:
from classes import drug
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, SelectFromModel, VarianceThreshold

In [19]:

#ge = {'ctrp': ctrp_ge, 'gdsc': gdsc_ge}
#dr = {'ctrp': ctrp_dr, 'gdsc': gdsc_dr}

t2 = {
    'degree': [2, 3, 4, 5],
    'epsilon' : [0.1, 0.2, 0.3, 0.9],
    'C':[0.01, 0.1, 1, 10, 100],
    'gamma':['scale']
}
tune = tuning(t2, iterations=50, cv=3, scoring='r2')

In [20]:
ge={'gdsc':True, 'ctrp':False, 'ccle':False}


In [21]:
%%time
r1, scores = run(ge, 'ElasticNet', False, 'DecisionTreeRegressor',drugs=3, n = 0.1, tuning = None)

CPU times: user 27.4 s, sys: 1.93 s, total: 29.4 s
Wall time: 54.9 s


In [34]:
for i in r1.values():
    print(len(i.col))


1607
1607
1607


In [26]:
model = {k:v.model.get_params() for k,v in r1.items()}

In [27]:
for i in r1.values():
    print(i.metrics([r2_score]))

{'r2_score': -0.4402398209397902}
{'r2_score': -1.1516677873876828}
{'r2_score': -0.9183615617612619}


In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
scores = pd.DataFrame.from_dict(scores, orient='index')

In [35]:
scores = scores.join(pd.DataFrame.from_dict(model, orient='index'))

In [36]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, SelectFromModel, VarianceThreshold

In [37]:
VarianceThreshold.__name__

'VarianceThreshold'

In [38]:
scores

Unnamed: 0,r2_score,mean_absolute_error,mean_squared_error,median_absolute_error,criterion,max_depth,max_features,max_leaf_nodes,min_impurity_decrease,min_impurity_split,min_samples_leaf,min_samples_split,min_weight_fraction_leaf,presort,random_state,splitter
PD-173074,0.998486,0.001222,3e-05,0.000151,mse,,,,0.0,,1,2,0.0,False,,best
S-Trityl-L-cysteine,0.999911,0.001753,6e-06,0.001241,mse,,,,0.0,,1,2,0.0,False,,best
Vinorelbine,0.999976,0.000921,2e-06,0.000581,mse,,,,0.0,,1,2,0.0,False,,best


In [7]:
a = [1,2,3]
b = [2,3,4]
x = [2,7,9]

In [8]:
c = set(a) & set(b)

In [9]:
list(c & set(x))

[2]

In [23]:
import pandas as pd
import config as c

In [24]:
ccle_ge = pd.read_csv(c.dir + 'ccle_cell_ge.csv').fillna(0).set_index('CCL')
ctrp_ge = pd.read_csv(c.dir + 'ctrp_cell_ge.csv').fillna(0).set_index('CCL')
ccle_dr = pd.read_csv(c.dir + 'ccle_poz_dr.csv').fillna(0)
ctrp_dr = pd.read_csv(c.dir + 'ctrp_poz_dr.csv').fillna(0)

In [26]:
# Drug-response tables keyed by dataset name.
drug_data = {
    'ccle': ccle_dr,
    'ctrp': ctrp_dr,
}
# Flat list of each table's distinct drug names, table by table; a name that
# appears in more than one table is listed once per table.
[name for table in drug_data.values() for name in table['Drug_name'].unique()]

['17-AAG',
 'AEW541',
 'Saracatinib',
 'Selumetinib',
 'Erlotinib',
 'Irinotecan',
 'L-685458',
 'Lapatinib',
 'LBW242',
 'Nilotinib',
 'Nutlin-3',
 'Paclitaxel',
 'Panobinostat',
 'PD-0325901',
 'PD-0332991',
 'Crizotinib',
 'PHA-665752',
 'PLX-4720',
 'RAF265',
 'Sorafenib',
 'TAE684',
 'Dovitinib',
 'Topotecan',
 'Vandetanib',
 '16-beta-bromoandrosterone',
 '1S,3R-RSL-3',
 '3-Cl-AHPC',
 '968',
 'A-804598',
 'AA-COCF3',
 'Abiraterone',
 'ABT-199',
 'ABT-737',
 'AC55649',
 'Afatinib',
 'AGK-2',
 'Alisertib',
 'Alisertib_navitoclax 2 to 1 ratio mol by mol',
 'Flavopiridol',
 'AM-580',
 'Apicidin',
 'AT-406',
 'AT13387',
 'AT7867',
 'Austocystin D',
 'Avicin D',
 'Avrainvillamide',
 'Axitinib',
 'AZ-3146',
 'Azacitidine',
 'AZD1480',
 'AZD4547',
 'AZD6482',
 'AZD7545',
 'AZD7762',
 'AZD8055',
 'B02',
 'Bafilomycin A1',
 'Barasertib',
 'Bardoxolone methyl',
 'Bax channel blocker',
 'BCL-LZH-4',
 'BEC',
 'Belinostat',
 'Bendamustine',
 'Betulinic acid',
 'Bexarotene',
 'BI-2536',
 'BIBR-1

## Fixing feda

In [35]:
from methods import feda
from classes import drug
import pandas as pd

In [34]:
ccle_ge = pd.read_csv(c.dir + 'ccle_cell_ge.csv').fillna(0).set_index('CCL')
ctrp_ge = pd.read_csv(c.dir + 'ctrp_cell_ge.csv').fillna(0).set_index('CCL')
ccle_dr = pd.read_csv(c.dir + 'ccle_poz_dr.csv').fillna(0)
ctrp_dr = pd.read_csv(c.dir + 'ctrp_poz_dr.csv').fillna(0)

In [48]:
a = ccle_ge[:3][['A1BG', 'A1CF']]
b = ctrp_ge[:3][['A1BG', 'A1CF']]
c = ctrp_ge[3:6][['A1BG', 'A1CF']]

In [49]:
a

Unnamed: 0_level_0,A1BG,A1CF
CCL,Unnamed: 1_level_1,Unnamed: 2_level_1
1321N1,5.54272,3.921564
22RV1,4.753564,7.445061
42MGBA,7.290132,4.030574


In [50]:
b

Unnamed: 0_level_0,A1BG,A1CF
CCL,Unnamed: 1_level_1,Unnamed: 2_level_1
22RV1,4.753564,7.445061
42MGBA,7.290132,4.030574
5637,4.293473,4.157908


In [51]:
c

Unnamed: 0_level_0,A1BG,A1CF
CCL,Unnamed: 1_level_1,Unnamed: 2_level_1
639V,6.627165,4.094668
697,6.991375,4.027624
769P,4.477092,5.060515


In [52]:
pd.DataFrame(feda([a.to_numpy(), b.to_numpy(), c.to_numpy()]))

Unnamed: 0,0,1,2,3,4,5,6,7
0,5.54272,5.54272,0.0,0.0,3.921564,3.921564,0.0,0.0
1,4.753564,4.753564,0.0,0.0,7.445061,7.445061,0.0,0.0
2,7.290132,7.290132,0.0,0.0,4.030574,4.030574,0.0,0.0
3,4.753564,0.0,4.753564,0.0,7.445061,0.0,7.445061,0.0
4,7.290132,0.0,7.290132,0.0,4.030574,0.0,4.030574,0.0
5,4.293473,0.0,4.293473,0.0,4.157908,0.0,4.157908,0.0
6,6.627165,0.0,0.0,6.627165,4.094668,0.0,0.0,4.094668
7,6.991375,0.0,0.0,6.991375,4.027624,0.0,0.0,4.027624
8,4.477092,0.0,0.0,4.477092,5.060515,0.0,0.0,5.060515
