In [1]:
import pandas as pd
import glob
import numpy as np
from scipy import stats
from sklearn import cluster
import networkx as nx
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score



In [2]:
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

Using TensorFlow backend.


In [896]:
#path to celllines folder where I have 44 celllines
path = './celllines' # use your path
all_files = glob.glob(path + "/*.csv")


In [943]:
emb = pd.read_csv('./network_embedding.csv', index_col=None, header=0)

In [945]:
emb.set_index('53',inplace=True)

In [989]:
EGF=emb.loc['EGF'].values
EGFR=emb.loc['EGFR'].values
MEK=emb.loc['MEK12'].values
PI3K=emb.loc['PI3K'].values
PKC=emb.loc['PKC'].values
MTOR=emb.loc['mTOR'].values



In [1009]:
def prepare_data(all_files,EGF,EGFR,MEK,PI3K,PKC,MTOR):
    df_values=list()
    for i in range (len(all_files)):    #len(all_files)
        filename = all_files[i]

        print(filename)
        df = pd.read_csv(filename, index_col=None, header=0)
        # drop markers not found in test data
        df.drop(['p.HER2','p.PLCg2'], axis=1,inplace=True)
        # fill na
        df.fillna(method='ffill', inplace=True)
        # remove dublicate
        df["id"] = df["treatment"] + df["cell_line"] + df["time"].map(str) + df["cellID"].map(str)
        df = df.drop_duplicates(subset='id', keep="first")
        df.drop(['id'], axis=1,inplace=True)
        # set full time to -1
        df.at[df['treatment']=='full', 'time'] = -1
        # drop some times 
        indexNames =df[ df['time'] == 5.5 ].index
        indexNames2=df[ df['time'] == 23 ].index
        indexNames3=df[ df['time'] == 30 ].index

        # Delete these row indexes from dataFrame
        df.drop(indexNames , inplace=True)
        df.drop(indexNames2 , inplace=True)
        df.drop(indexNames3 , inplace=True)

        data=df.copy() #EGF,iEGFR,iMEK,iPI3K,iPKC
        data_EGF=data[data['treatment']=='EGF']
        data_EGF=data_EGF.append(data[data['treatment']=='full'])
        #keep only cells with all 8 time steps
        data_EGF=data_EGF.groupby('cellID').filter(lambda x : len(x)==8)
        if not data_EGF.empty:
            data_EGF.drop(['treatment','cell_line','fileID'], axis=1,inplace=True)
            data_EGF.set_index('cellID',inplace=True)
            data_EGF_sort= data_EGF.sort_values(['cellID', 'time'], ascending=[True, True])
            data_EGF_sort.drop(['time'], axis=1,inplace=True)
            #to put all time steps in one raw 
            df_EGF = data_EGF_sort.groupby(['cellID'])['b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67',
               'p.4EBP1', 'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK',
               'p.CREB', 'p.ERK', 'p.FAK', 'p.GSK3b', 'p.H3', 'p.JNK',
               'p.MAP2K3', 'p.MAPKAPK2', 'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB',
               'p.p38', 'p.p53', 'p.p90RSK', 'p.PDPK1', 'p.RB', 'p.S6',
               'p.S6K', 'p.SMAD23', 'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5'].apply(lambda x: pd.DataFrame(x.values, columns=x.columns))
            df_EGF=df_EGF.unstack()#.sort_index(axis=1, level=1)
            m=np.repeat([EGF], df_EGF.shape[0], axis=0)
            concat=np.concatenate((m, df_EGF.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)
       

        data_iEGFR=data[data['treatment']=='iEGFR']
        data_iEGFR=data_iEGFR.append(data[data['treatment']=='full'])
        #keep only cells with all 8 time steps
        data_iEGFR=data_iEGFR.groupby('cellID').filter(lambda x : len(x)==8)
        if not data_iEGFR.empty:
            data_iEGFR.drop(['treatment','cell_line','fileID'], axis=1,inplace=True)
            data_iEGFR.set_index('cellID',inplace=True)
            data_iEGFR_sort= data_iEGFR.sort_values(['cellID', 'time'], ascending=[True, True])
            data_iEGFR_sort.drop(['time'], axis=1,inplace=True)
            df_iEGFR = data_iEGFR_sort.groupby(['cellID'])['b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67',
               'p.4EBP1', 'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK',
               'p.CREB', 'p.ERK', 'p.FAK', 'p.GSK3b', 'p.H3', 'p.JNK',
               'p.MAP2K3', 'p.MAPKAPK2', 'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB',
               'p.p38', 'p.p53', 'p.p90RSK', 'p.PDPK1', 'p.RB', 'p.S6',
               'p.S6K', 'p.SMAD23', 'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5'].apply(lambda x: pd.DataFrame(x.values, columns=x.columns))
            df_iEGFR=df_iEGFR.unstack()#.sort_index(axis=1, level=1)
            m=np.repeat([EGFR], df_iEGFR.shape[0], axis=0)
            concat=np.concatenate((m, df_iEGFR.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)
        

        data_iMEK=data[data['treatment']=='iMEK']
        data_iMEK=data_iMEK.append(data[data['treatment']=='full'])
        #keep only cells with all 8 time steps
        data_iMEK=data_iMEK.groupby('cellID').filter(lambda x : len(x)==8)
        if not data_iMEK.empty:
            data_iMEK.drop(['treatment','cell_line','fileID'], axis=1,inplace=True)
            data_iMEK.set_index('cellID',inplace=True)
            data_iMEK_sort= data_iMEK.sort_values(['cellID', 'time'], ascending=[True, True])
            data_iMEK_sort.drop(['time'], axis=1,inplace=True)
            df_iMEK= data_iMEK_sort.groupby(['cellID'])['b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67',
               'p.4EBP1', 'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK',
               'p.CREB', 'p.ERK', 'p.FAK', 'p.GSK3b', 'p.H3', 'p.JNK',
               'p.MAP2K3', 'p.MAPKAPK2', 'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB',
               'p.p38', 'p.p53', 'p.p90RSK', 'p.PDPK1', 'p.RB', 'p.S6',
               'p.S6K', 'p.SMAD23', 'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5'].apply(lambda x: pd.DataFrame(x.values, columns=x.columns))
            df_iMEK=df_iMEK.unstack()#.sort_index(axis=1, level=1)
            m=np.repeat([MEK], df_iMEK.shape[0], axis=0)
            concat=np.concatenate((m, df_iMEK.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)
        


        data_iPI3K=data[data['treatment']=='iPI3K']
        data_iPI3K=data_iPI3K.append(data[data['treatment']=='full'])
        #keep only cells with all 8 time steps
        data_iPI3K=data_iPI3K.groupby('cellID').filter(lambda x : len(x)==8)
        if not data_iPI3K.empty:
            data_iPI3K.drop(['treatment','cell_line','fileID'], axis=1,inplace=True)
            data_iPI3K.set_index('cellID',inplace=True)
            data_iPI3K_sort= data_iPI3K.sort_values(['cellID', 'time'], ascending=[True, True])
            data_iPI3K_sort.drop(['time'], axis=1,inplace=True)
            df_iPI3K= data_iPI3K_sort.groupby(['cellID'])['b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67',
               'p.4EBP1', 'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK',
               'p.CREB', 'p.ERK', 'p.FAK', 'p.GSK3b', 'p.H3', 'p.JNK',
               'p.MAP2K3', 'p.MAPKAPK2', 'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB',
               'p.p38', 'p.p53', 'p.p90RSK', 'p.PDPK1', 'p.RB', 'p.S6',
               'p.S6K', 'p.SMAD23', 'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5'].apply(lambda x: pd.DataFrame(x.values, columns=x.columns))
            df_iPI3K=df_iPI3K.unstack()#.sort_index(axis=1, level=1)
            m=np.repeat([PI3K], df_iPI3K.shape[0], axis=0)
            concat=np.concatenate((m, df_iPI3K.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)
       

        data_iPKC=data[data['treatment']=='iPKC']
        data_iPKC=data_iPKC.append(data[data['treatment']=='full'])
        #keep only cells with all 8 time steps
        data_iPKC=data_iPKC.groupby('cellID').filter(lambda x : len(x)==8)
        if not data_iPKC.empty:
            data_iPKC.drop(['treatment','cell_line','fileID'], axis=1,inplace=True)
            data_iPKC.set_index('cellID',inplace=True)
            data_iPKC_sort= data_iPKC.sort_values(['cellID', 'time'], ascending=[True, True])
            data_iPKC_sort.drop(['time'], axis=1,inplace=True)
            df_iPKC= data_iPKC_sort.groupby(['cellID'])['b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67',
               'p.4EBP1', 'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK',
               'p.CREB', 'p.ERK', 'p.FAK', 'p.GSK3b', 'p.H3', 'p.JNK',
               'p.MAP2K3', 'p.MAPKAPK2', 'p.MEK', 'p.MKK3.MKK6', 'p.MKK4', 'p.NFkB',
               'p.p38', 'p.p53', 'p.p90RSK', 'p.PDPK1', 'p.RB', 'p.S6',
               'p.S6K', 'p.SMAD23', 'p.SRC', 'p.STAT1', 'p.STAT3', 'p.STAT5'].apply(lambda x: pd.DataFrame(x.values, columns=x.columns))
            df_iPKC=df_iPKC.unstack()#.sort_index(axis=1, level=1)

            m=np.repeat([PKC], df_iPKC.shape[0], axis=0)
            concat=np.concatenate((m, df_iPKC.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)
        
        
    return(df_values)

    

In [1010]:
data=prepare_data(all_files,EGF,EGFR,MEK,PI3K,PKC,MTOR)

./celllines/MFM223.csv
(6969, 296)
(4187, 296)
(4659, 296)
(6962, 296)
(4202, 296)
./celllines/MCF7.csv
(7155, 296)
(5195, 296)
(1000, 296)
(6535, 296)
(4793, 296)
./celllines/MDAMB175VII.csv
(2883, 296)
(2222, 296)
(1113, 296)
(2831, 296)
(170, 296)
./celllines/HCC3153.csv
(5722, 296)
(6033, 296)
(4106, 296)
(4472, 296)
(2977, 296)
./celllines/HCC1395.csv
(6082, 296)
(5268, 296)
(4471, 296)
(3250, 296)
./celllines/HBL100.csv
(3821, 296)
(3627, 296)
(3438, 296)
(4287, 296)
(4287, 296)
./celllines/DU4475.csv
(803, 296)
(1946, 296)
(2155, 296)
(1256, 296)
(1197, 296)
./celllines/MDAkb2.csv
(5315, 296)
(3823, 296)
(3775, 296)
(2901, 296)
./celllines/CAL148.csv
(3649, 296)
(3760, 296)
(3550, 296)
./celllines/T47D.csv
(6215, 296)
(3562, 296)
(3850, 296)
(4294, 296)
(3889, 296)
./celllines/HDQP1.csv
(6942, 296)
(6476, 296)
(5807, 296)
(6942, 296)
(3737, 296)
./celllines/HCC2157.csv
(6307, 296)
(5123, 296)
(4111, 296)
(5056, 296)
./celllines/HCC1500.csv
(4692, 296)
(2501, 296)
(3717, 296)
(18

In [1015]:
train=np.asarray(data)

In [1016]:
train.shape

(988929, 296)

In [1017]:
np.save('train_emb.npy', train)    # .npy extension is added if not given
#train= np.load('train.npy')

In [1018]:
train.shape

(988929, 296)

In [1081]:

 
# inverse scaling for a forecasted value
def invert_scale(scaler, X, yhat):
	new_row = [x for x in X] + [yhat]
	array = np.array(new_row)
	array = array.reshape(1, len(array))
	inverted = scaler.inverse_transform(array)
	return inverted[0, -1]
 

In [1042]:
# fit an LSTM network to training data
def fit_lstm(train, batch_size, nb_epoch, neurons, timesteps):
    n_lag=35+16
    X, y = train[:, 0:n_lag], train[:, n_lag:]
    X = X.reshape(X.shape[0], 1, X.shape[1])
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dense(y.shape[1]))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(nb_epoch):
        model.fit(X, y, epochs=1, batch_size=batch_size, verbose=1,validation_split=0.2, shuffle=False)
        model.reset_states()
    return model

In [1040]:
# scale train and test data to [-1, 1]
# fit scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train)
# transform train
#train = train.reshape(train.shape[0], train.shape[1])
train_scaled = scaler.transform(train)


In [1174]:
# fit model

model = fit_lstm(train_scaled, 164, 10, 50, 1)

    

Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1
Train on 791143 samples, validate on 197786 samples
Epoch 1/1


In [1181]:
################TESt
#path to subchallenge folder where I have 12 celllines 
#'184B5' 'BT483' 'HCC1428' 
#'HCC1806' 'HCC202' 'Hs578T' 
#'MCF12A' 'MDAMB231' 'MDAMB468'
#'SKBR3' 'UACC3199' 'ZR751'
path = './sub2' # use your path
test_files = [path + "/184B5.csv",path + "/BT483.csv", path + "/HCC1428.csv",
             path + "/HCC1806.csv",path + "/HCC202.csv",  path + "/Hs578T.csv",
             path + "/MCF12A.csv",path + "/MDAMB231.csv", path + "/MDAMB468.csv",
             path + "/SKBR3.csv",path + "/UACC3199.csv",  path + "/ZR751.csv"]


In [1182]:
def prepare_data(all_files):
    df_values=list()
    for i in range (len(all_files)):    #len(all_files)
        filename = all_files[i]

        print(filename)
        df = pd.read_csv(filename, index_col=None, header=0)
        # drop markers not found in test data
        df.drop(['p.HER2','p.PLCg2'], axis=1,inplace=True)
        # fill na
        df.fillna(method='ffill', inplace=True)
        # set full time to -1
        df.at[df['treatment']=='full', 'time'] = -1
        inputdata=df[df['treatment']=='full']
        inputdata.drop(['treatment','cell_line','fileID','time'], axis=1,inplace=True)
        inputdata.set_index('cellID',inplace=True)
        inputdata=inputdata[:10000]
        if inputdata.shape[0]<10000:
            inputdata= inputdata.append(inputdata)
            inputdata= inputdata.append(inputdata)
        inputdata=inputdata[:10000]
        #print(inputdata.shape)
        name=filename.split('/')[2].split('.')[0]
        if (name =='BT483' or name == 'MCF12A' or name == 'MDAMB468' ):
            m=np.repeat([EGFR], 10000, axis=0)
            concat=np.concatenate((m, inputdata.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)
            
        if (name =='184B5' or name == 'ZR751' or name == 'HCC202' ):
            m=np.repeat([MEK], 10000, axis=0)
            concat=np.concatenate((m, inputdata.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)
            
        if (name =='UACC3199' or name == 'SKBR3' or name == 'MDAMB231' ):
            m=np.repeat([PI3K], 10000, axis=0)
            concat=np.concatenate((m, inputdata.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)
            
        if (name =='HCC1806' or name == 'Hs578T' or name == 'HCC1428' ):
            m=np.repeat([PI3K], 10000, axis=0)
            concat=np.concatenate((m, inputdata.values), axis=1)
            #print(concat)
            print(concat.shape)
            df_values.extend(concat)

    return df_values


In [1183]:
data=prepare_data(test_files)

./sub2/184B5.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(10000, 51)
./sub2/BT483.csv
(10000, 51)
./sub2/HCC1428.csv
(10000, 51)
./sub2/HCC1806.csv
(10000, 51)
./sub2/HCC202.csv
(10000, 51)
./sub2/Hs578T.csv
(10000, 51)
./sub2/MCF12A.csv
(10000, 51)
./sub2/MDAMB231.csv
(10000, 51)
./sub2/MDAMB468.csv
(10000, 51)
./sub2/SKBR3.csv
(10000, 51)
./sub2/UACC3199.csv
(10000, 51)
./sub2/ZR751.csv
(10000, 51)


In [1184]:
test=np.asarray(data)

In [1185]:
test.shape

(120000, 51)

In [1186]:
# transform test
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(test)
# transform train
#test = test.reshape(test.shape[0], test.shape[1])
test_scaled = scaler.transform(test)

In [1187]:
test_scaled.shape

(120000, 51)

In [1188]:
# make a prediction
n_lag=35+16
X, y = test_scaled[:, 0:n_lag], test_scaled[:, n_lag:]
print(X.shape)
X = X.reshape(X.shape[0], 1, X.shape[1])
yhat = model.predict(X)




(120000, 51)


In [1189]:
# invert scaling
#yhat = invert_scale(scaler, X, yhat)

In [1190]:
tempdata=yhat.reshape(yhat.shape[0]*7,35)

In [1191]:
###### TEMP

#path to cellline 
#MCF12A miss time 13
filename = './subchallenge_2_template_data.csv' # use your path

print(filename)
df_temp = pd.read_csv(filename, index_col=None, header=0)

./subchallenge_2_template_data.csv


In [1192]:
dfpred=pd.DataFrame(tempdata)

In [1193]:
# drop time 13 in cell MC...
indx= np.arange(450000,450000+10000)
# Delete these row indexes from dataFrame
dfpred.drop(indx , inplace=True)

In [1194]:
dfpred.shape

(830000, 35)

In [1195]:
dfpred.insert (0, "cell_line", df_temp.cell_line.values)

In [1196]:
dfpred.insert (1, "treatment", df_temp.treatment.values)

In [1197]:
dfpred.insert (2, "time", df_temp.time.values)

In [1198]:
dfpred.insert (3, "cellID", df_temp.cellID.values)

In [1199]:
dfpred.columns = df_temp.columns

In [1200]:
dfpred

Unnamed: 0,cell_line,treatment,time,cellID,b.CATENIN,cleavedCas,CyclinB,GAPDH,IdU,Ki.67,...,p.p90RSK,p.PDPK1,p.RB,p.S6,p.S6K,p.SMAD23,p.SRC,p.STAT1,p.STAT3,p.STAT5
0,184B5,iMEK,0,1,0.387833,-0.193119,0.486817,0.230707,-0.542230,0.319631,...,0.170828,0.464226,0.322583,0.225618,0.057457,0.076021,-0.283974,-0.085928,0.266834,0.347443
1,184B5,iMEK,0,2,0.241438,0.342096,0.438373,0.111240,-0.058427,0.293601,...,-0.713351,-0.390907,-1.035321,-1.609302,-1.054076,-0.892049,-0.655364,-0.860273,-0.803724,0.095706
2,184B5,iMEK,0,3,-0.225576,-0.658965,-0.367058,-0.077770,-0.003269,0.143232,...,-0.322883,0.371133,0.477205,0.674337,0.365431,0.159284,-0.017106,-0.403989,-0.706290,-0.340120
3,184B5,iMEK,0,4,-0.323865,0.230783,-0.238179,0.037095,0.181815,-0.342467,...,0.824958,0.362221,0.116568,0.337017,0.110883,-0.621409,0.288924,0.534538,1.021670,0.532385
4,184B5,iMEK,0,5,0.534068,0.292007,0.025827,-0.126765,0.213322,0.246093,...,-0.102828,-0.400977,-1.213874,-0.399810,-0.325879,-0.027919,-0.202220,-0.248646,-0.154157,-0.021578
5,184B5,iMEK,0,6,-0.670698,0.102846,0.237835,0.299122,-0.050684,0.085723,...,-0.023056,0.099238,0.479202,0.278492,0.078821,-0.078265,-0.074810,-0.648662,0.148038,-0.181062
6,184B5,iMEK,0,7,-0.081100,-0.026857,-0.237648,-0.011317,-0.339229,-0.659566,...,-0.026043,-0.132270,0.275472,-0.292780,-0.252053,-0.144752,-0.086509,0.284686,-0.015108,-0.039691
7,184B5,iMEK,0,8,0.195643,-0.322151,-0.394128,-0.114179,-0.450463,-0.362535,...,-0.459917,-0.287636,-0.402393,-0.376254,-0.208458,-0.294319,-0.333013,-0.388548,-0.447788,-0.276408
8,184B5,iMEK,0,9,-0.317866,-0.201621,-0.149658,-0.373430,-0.369810,-0.293051,...,-0.895935,-0.525373,-0.895801,-0.898010,-0.784670,-0.952773,-0.736004,-0.827964,-0.686250,-0.555413
9,184B5,iMEK,0,10,-0.616021,-0.703095,-0.657960,-0.696258,-0.578295,-0.559404,...,-0.463488,-0.275962,-0.524637,-0.257246,-0.381654,-0.340867,-0.568214,-0.729151,-0.752525,-0.691021


In [1201]:
dfpred.to_csv(r'filled_subchallenge_2_template_data.csv')

In [1202]:
#        'BT20', 'BT474', 'BT549', 'CAL148', 'CAL51', 'CAL851', 'DU4475',
#       'EFM192A', 'EVSAT', 'HBL100', 'HCC1187', 'HCC1395', 'HCC1419',
#       'HCC1500', 'HCC1569', 'HCC1599', 'HCC1937', 'HCC1954', 'HCC2185',
#       'HCC3153', 'HCC38', 'HCC70', 'HDQP1', 'JIMT1', 'MCF7',
#       'MDAMB134VI', 'MDAMB157', 'MDAMB175VII', 'MDAMB361', 'MDAMB415',
#       'MDAMB453', 'MFM223', 'MPE600', 'MX1', 'OCUBM', 'T47D', 'UACC812',
#       'UACC893', 'ZR7530'

path = './celllines' # use your path
test_files = [path + "/BT20.csv",path + "/BT474.csv", path + "/BT549.csv",path + "/CAL148.csv",
              path + "/CAL51.csv", path + "/CAL851.csv",
              path + "/DU4475.csv",path + "/EFM192A.csv",path + "/EVSAT.csv",path + "/HBL100.csv",path + "/HCC1187.csv",
             path + "/HCC1395.csv",path + "/HCC1419.csv",  path + "/HCC1500.csv",path + "/HCC1569.csv",path + "/HCC1599.csv",
             path + "/HCC1937.csv",path + "/HCC1954.csv", path + "/HCC2185.csv",path + "/HCC3153.csv",path + "/HCC38.csv",
             path + "/HCC70.csv",path + "/HDQP1.csv",  path + "/JIMT1.csv",path + "/MCF7.csv",path + "/MDAMB134VI.csv",
              path +'/MDAMB157.csv', path +'/MDAMB175VII.csv',path + '/MDAMB361.csv',path + '/MDAMB415.csv', path + '/MDAMB453.csv',
             path + "/MFM223.csv",path + "/MPE600.csv",path + "/MX1.csv",path + "/OCUBM.csv",path + "/T47D.csv",
             path + "/UACC812.csv",path + "/UACC893.csv",path + "/ZR7530.csv"]

In [1203]:
def prepare_data(all_files):
    df_values=list()
    for i in range (len(all_files)):    #len(all_files)
        filename = all_files[i]

        print(filename)
        df = pd.read_csv(filename, index_col=None, header=0)
        # drop markers not found in test data
        df.drop(['p.HER2','p.PLCg2'], axis=1,inplace=True)
        # fill na
        df.fillna(method='ffill', inplace=True)
        # set full time to -1
        df.at[df['treatment']=='full', 'time'] = -1
        inputdata=df[df['treatment']=='full']
        inputdata.drop(['treatment','cell_line','fileID','time'], axis=1,inplace=True)
        inputdata.set_index('cellID',inplace=True)
        inputdata=inputdata[:10000]
        if inputdata.shape[0]<10000:
            inputdata= inputdata.append(inputdata)
            inputdata= inputdata.append(inputdata)
            inputdata= inputdata.append(inputdata)
        inputdata=inputdata[:10000]
        print(inputdata.shape)
        
        
        m=np.repeat([MTOR], 10000, axis=0)
        concat=np.concatenate((m, inputdata.values), axis=1)
        #print(concat)
        print(concat.shape)
        df_values.extend(concat)
            

    return df_values


In [1204]:
data=prepare_data(test_files)

./celllines/BT20.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(10000, 35)
(10000, 51)
./celllines/BT474.csv
(10000, 35)
(10000, 51)
./celllines/BT549.csv
(10000, 35)
(10000, 51)
./celllines/CAL148.csv
(10000, 35)
(10000, 51)
./celllines/CAL51.csv
(10000, 35)
(10000, 51)
./celllines/CAL851.csv
(10000, 35)
(10000, 51)
./celllines/DU4475.csv
(10000, 35)
(10000, 51)
./celllines/EFM192A.csv
(10000, 35)
(10000, 51)
./celllines/EVSAT.csv
(10000, 35)
(10000, 51)
./celllines/HBL100.csv
(10000, 35)
(10000, 51)
./celllines/HCC1187.csv
(10000, 35)
(10000, 51)
./celllines/HCC1395.csv
(10000, 35)
(10000, 51)
./celllines/HCC1419.csv
(10000, 35)
(10000, 51)
./celllines/HCC1500.csv
(10000, 35)
(10000, 51)
./celllines/HCC1569.csv
(10000, 35)
(10000, 51)
./celllines/HCC1599.csv
(10000, 35)
(10000, 51)
./celllines/HCC1937.csv
(10000, 35)
(10000, 51)
./celllines/HCC1954.csv
(10000, 35)
(10000, 51)
./celllines/HCC2185.csv
(10000, 35)
(10000, 51)
./celllines/HCC3153.csv
(10000, 35)
(10000, 51)
./celllines/HCC38.csv
(10000, 35)
(10000, 51)
./celllines/HCC70.csv
(10000, 

In [1205]:
test=np.asarray(data)

In [1206]:
# transform test
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(test)
# transform train
#test = test.reshape(test.shape[0], test.shape[1])
test_scaled = scaler.transform(test)

In [1207]:
# make a prediction
n_lag=35+16
X, y = test_scaled[:, 0:n_lag], test_scaled[:, n_lag:]
print(X.shape)
X = X.reshape(X.shape[0], 1, X.shape[1])
yhat = model.predict(X)

(390000, 51)


In [1208]:
tempdata=yhat.reshape(yhat.shape[0]*7,35)

In [1209]:
dfpred3=pd.DataFrame(tempdata)

In [1210]:
###### TEMP

#path to cellline 
#MDAMB175VII miss time 60
filename = './subchallenge_3_template_data.csv' # use your path

print(filename)
df_temp3 = pd.read_csv(filename, index_col=None, header=0)

./subchallenge_3_template_data.csv


In [1211]:
df_temp3

Unnamed: 0,cell_line,treatment,time,cellID,b.CATENIN,cleavedCas,CyclinB,GAPDH,IdU,Ki.67,...,p.p90RSK,p.PDPK1,p.RB,p.S6,p.S6K,p.SMAD23,p.SRC,p.STAT1,p.STAT3,p.STAT5
0,BT20,imTOR,0,1,,,,,,,...,,,,,,,,,,
1,BT20,imTOR,0,2,,,,,,,...,,,,,,,,,,
2,BT20,imTOR,0,3,,,,,,,...,,,,,,,,,,
3,BT20,imTOR,0,4,,,,,,,...,,,,,,,,,,
4,BT20,imTOR,0,5,,,,,,,...,,,,,,,,,,
5,BT20,imTOR,0,6,,,,,,,...,,,,,,,,,,
6,BT20,imTOR,0,7,,,,,,,...,,,,,,,,,,
7,BT20,imTOR,0,8,,,,,,,...,,,,,,,,,,
8,BT20,imTOR,0,9,,,,,,,...,,,,,,,,,,
9,BT20,imTOR,0,10,,,,,,,...,,,,,,,,,,


In [1212]:

# drop time 60 in cell MDAMB175VII...
indx= np.arange(1950000,1950000+10000)
# Delete these row indexes from dataFrame
dfpred3.drop(indx , inplace=True)


In [1213]:
dfpred3.insert (0, "cell_line", df_temp3.cell_line.values)
dfpred3.insert (1, "treatment", df_temp3.treatment.values)
dfpred3.insert (2, "time", df_temp3.time.values)
dfpred3.insert (3, "cellID", df_temp3.cellID.values)

In [1214]:
dfpred3.columns = df_temp3.columns

In [1215]:
dfpred3.to_csv(r'filled_subchallenge_3_template_data.csv')

In [1216]:
dfpred3

Unnamed: 0,cell_line,treatment,time,cellID,b.CATENIN,cleavedCas,CyclinB,GAPDH,IdU,Ki.67,...,p.p90RSK,p.PDPK1,p.RB,p.S6,p.S6K,p.SMAD23,p.SRC,p.STAT1,p.STAT3,p.STAT5
0,BT20,imTOR,0,1,0.332962,-0.436013,1.957267,-0.607413,-0.765878,0.076522,...,-0.012252,0.057731,0.288160,-0.046659,0.004536,-0.069728,-0.380509,-0.275683,0.269616,-0.059849
1,BT20,imTOR,0,2,0.337543,-0.105127,-0.006116,-0.033746,-0.316392,-0.132823,...,-0.710612,-0.329315,-0.448906,-0.378404,-0.186311,0.256996,0.681693,0.530837,0.206297,-0.359870
2,BT20,imTOR,0,3,-0.164994,-0.700933,-0.514319,0.038464,-0.231255,0.223699,...,-0.556243,0.003984,0.329007,0.119779,0.624066,-0.349594,-0.600084,-0.522734,-0.850078,-0.585768
3,BT20,imTOR,0,4,-0.380286,-0.295291,-0.299862,-0.570987,-0.594385,0.014517,...,-0.166445,0.244153,-0.528571,-0.018584,0.238661,-0.199611,0.249790,0.832501,1.148405,0.943492
4,BT20,imTOR,0,5,0.155575,0.173917,-0.105037,-0.363726,-0.169160,0.023698,...,-0.497414,-0.217108,-0.451798,0.131708,0.440934,0.411095,0.820111,0.034669,-0.636535,-0.256274
5,BT20,imTOR,0,6,-0.690955,-0.363808,0.057355,-0.401748,-0.075690,-0.685491,...,-0.121740,0.255343,0.206905,0.631094,-0.119909,-0.312474,-0.264533,-0.819194,-0.293778,-0.219319
6,BT20,imTOR,0,7,-0.801288,-0.096823,-0.717812,-0.553742,-0.269176,-0.655497,...,-0.371545,-0.746244,-0.058045,-0.280350,-0.219694,-0.118403,0.028415,0.398591,-0.017006,0.052456
7,BT20,imTOR,0,8,0.719292,-0.760361,-0.735199,-0.733802,0.151388,-0.153555,...,-0.016703,-0.092870,0.104667,0.026162,0.191871,-0.126644,-0.242218,-0.589067,-0.244971,-0.290172
8,BT20,imTOR,0,9,-0.187888,-0.026459,0.057536,-0.211490,-0.478586,-0.548718,...,-0.724924,-0.237334,-0.668149,-0.273685,-0.506554,-0.470501,-0.426303,-0.482794,-0.448370,-0.169862
9,BT20,imTOR,0,10,-0.197765,-0.593284,-0.708365,-0.459342,-0.624239,-0.322962,...,-0.506071,-0.740281,-0.699031,-0.675699,-0.342040,-0.184926,-0.359360,-0.628258,-0.768098,-0.982188
