# Train a baseline model for Subchallenge 1


### First, download training data.

This only needs to be run once, to populate the `training/` directory.

In [None]:
# Optional one-time download of the challenge training data from Synapse.
# Every statement below is intentionally commented out; uncomment them to
# install the Synapse client, log in interactively, and sync the Synapse
# entity 'syn21212904' into the local 'training' directory.
# NOTE(review): the loading cell further down reads from '../training/', so
# confirm the download path matches the directory this notebook is run from.
#%pip install synapseclient

#import getpass
#import pandas
#import synapseclient
#import synapseutils

#syn = synapseclient.Synapse()
#syn.login(input(prompt="Enter Synapse Username"), getpass.getpass("Enter Synapse Password"))
#downloaded_files = synapseutils.syncFromSynapse(syn, 'syn21212904', path='training') 

### Now, load the data, and train a model!

In [1]:
import pandas

# Read the RNA-seq table and the AUC table for each data split.  The
# generator expressions are consumed in order, so the files are read in the
# same sequence as four separate read_csv calls would read them.
rnaseq, aucs = (
    pandas.read_csv(csv_path)
    for csv_path in ('../training/rnaseq.csv', '../training/aucs.csv')
)
rnaseq_lb, aucs_lb = (
    pandas.read_csv(csv_path)
    for csv_path in ('../leaderboard/rnaseq.csv', '../leaderboard/aucs.csv')
)

In [2]:
# Report (rows, columns) for the training and leaderboard copy of each table.
print(*(table.shape for table in (rnaseq, rnaseq_lb)))
print(*(table.shape for table in (aucs, aucs_lb)))

(63677, 215) (63677, 82)
(20242, 3) (7850, 3)


In [11]:
# Stack the leaderboard AUC rows underneath the training AUC rows.
# NOTE: `aucs` is rebound here, so running this cell more than once appends
# the leaderboard rows again.
aucs = pandas.concat([aucs, aucs_lb], axis='index')
print(aucs.shape)

(28092, 3)

In [8]:
import pickle

In [12]:
import os

from util import TransposeRnaSeqTable

# Transpose both expression tables with the project helper and stack the
# results row-wise.
# NOTE(review): presumably this yields one row per specimen and one column
# per gene -- the helper lives in util.py, confirm there.
specimens = TransposeRnaSeqTable(rnaseq)
specimens_lb = TransposeRnaSeqTable(rnaseq_lb)
specimens = pandas.concat((specimens, specimens_lb), axis=0)
print(specimens.shape)

# Keep the 20000 columns with the largest variance across the pooled rows.
selected_genes = specimens.var().nlargest(20000).index.tolist()

# Persist the selected column names.  The output directory is created first
# so that a checkout without model/ does not fail with FileNotFoundError.
os.makedirs('model', exist_ok=True)
with open('model/selected_genes.pkl', 'wb') as f:
    pickle.dump(selected_genes, f)

(293, 63677)


In [13]:
import os

import numpy
from sklearn.linear_model import RidgeCV
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler

# Every fitted artefact below is pickled into model/; create the directory
# up front so a checkout without it does not fail with FileNotFoundError.
os.makedirs('model', exist_ok=True)

# Normalize each specimen: divide every row by its Euclidean norm (computed
# over all columns), then keep only the selected genes.
X = specimens
X = X.div(numpy.linalg.norm(X, axis=1), axis=0)
X = X[selected_genes]

# Compute z-score: StandardScaler centres each column and scales it to unit
# variance.  The fitted scaler is pickled (presumably so the same transform
# can be re-applied at prediction time).
scaler = StandardScaler()
X = pandas.DataFrame(scaler.fit_transform(X), index=X.index)

with open('model/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Apply kernel PCA with an RBF kernel; the row index is carried over so the
# projected rows can still be looked up by label below.
pca = KernelPCA(kernel='rbf')
X_pca = pandas.DataFrame(pca.fit_transform(X), index=X.index)
X = X_pca

with open('model/pca.pkl', 'wb') as f:
    pickle.dump(pca, f)

# For each inhibitor, train a regressor.
# groupby(sort=False) visits the inhibitors in order of first appearance
# (the same order as Series.unique()) in a single pass over `aucs`, instead
# of re-scanning the whole table with a boolean mask per inhibitor.  Rows
# whose inhibitor is missing are skipped rather than producing an empty
# training set.
alphas = numpy.logspace(-1, 5, num=40)
regressors = {}
for inhibitor, auc_subset in aucs.groupby('inhibitor', sort=False):
    # NOTE(review): newer scikit-learn releases rename `store_cv_values` to
    # `store_cv_results`; kept as written for the version this notebook
    # targets -- confirm against the pinned scikit-learn version.
    regr = RidgeCV(alphas=alphas, store_cv_values=True)
    # Features are selected by label: each lab_id must be present in X.index.
    regr = regr.fit(X.loc[auc_subset.lab_id], auc_subset.auc)
    regressors[inhibitor] = regr

# Store the model information in model/
with open('model/regressors.pkl', 'wb') as f:
    pickle.dump(regressors, f)


### Submission to Synapse

```bash
# Machine-specific checkout location; change this to wherever the example
# repository lives on your machine.
cd Documents/Synapse/beataml_example1-master
# Synapse project ID used in the image name below; replace with your own.
SYNAPSE_PROJECT_ID=syn21789396
# Build the image from the current directory and tag it for the Synapse
# Docker registry ("new_tag" is a placeholder tag).
docker build -t docker.synapse.org/$SYNAPSE_PROJECT_ID/sc1_model:new_tag .
# List local images to confirm the tagged image was built.
docker images
# Log in to the Synapse registry; -u supplies the username (use your own).
docker login docker.synapse.org -u Clyde_Dixon
# Upload the tagged image to the registry.
docker push docker.synapse.org/$SYNAPSE_PROJECT_ID/sc1_model:new_tag
```

_Verify the Docker image was successfully pushed:_ `https://www.synapse.org/#!Synapse:<Your project ID>/docker`
    
_Submit your Docker image:_ To submit your Docker image, navigate to the image uploaded on Synapse and click on the __Docker Repository Tools__ button in the upper-right corner. Select __Submit Docker Repository to Challenge__ from the options.