# Neural Networks to predict protein activation

### Improvement ideas
* Look at the one-hot encoding: are we dropping one of the 21 features? Do we need to do so?
* One vector with four 1s: we might be losing information
* Class weights
* Number of epochs
* batch size
* Neural Network (number of layers, where to put drop out layer, activations)
* Optimizer, loss function for f1

#### Set up the directories, load libraries

In [0]:
import pandas as pd
import numpy as np

import pickle
%tensorflow_version 1.x
import tensorflow as tf
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

import keras
import keras.backend as K
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.layers import BatchNormalization
from keras import regularizers
from keras.optimizers import SGD
from keras.callbacks import Callback,ModelCheckpoint

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder

In [0]:
#!if [ ! -f Archive.zip ]; then wget -nv https://drive.google.com/open?id=1g7aT8cMkFAFlk6wxkiEH3mgFVp2Xa1l9 -O Archive.zip; fi

In [3]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab VM (a fresh mount is requested each run).
drive.mount('/content/drive', force_remount=True)

# Work from the project folder so the relative paths used by later cells
# ("./Data/...", "./models/...", "./prediction.csv") resolve.
os.chdir("/content/drive/My Drive/IML/IML_Projects/task_3")

# Last expression of the cell: display the working directory as a sanity check.
os.getcwd()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


'/content/drive/My Drive/IML/IML_Projects/task_3'

In [4]:
# Sanity check: display the current working directory.
# (os was already imported in the Drive-mount cell; repeating the import is harmless.)
import os
os.getcwd()

'/content/drive/My Drive/IML/IML_Projects/task_3'

In [5]:
! ls

 Data		 models		      'on the server'
 iml_task_3.py	 ns_IML_Task_3.ipynb   prediction.csv


#### Load data & data inspection

In [0]:
# Read the training and test tables; the paths are relative to the working
# directory set by the Drive-mount cell. Later cells read column 0 as the
# sequence string and, for the training table, column 1 as the label.
dat_train = pd.read_csv("./Data/train.csv")
dat_test = pd.read_csv("./Data/test.csv")

In [7]:
# check class balance on activation
# (counts of each value in the 'Active' column; the training cells further
# down compensate for imbalance with 'balanced' class weights)
dat_train['Active'].value_counts()

0    107787
1      4213
Name: Active, dtype: int64

#### Pre-process data

In [0]:
import re

def split_convert(word_inp):
    """Return the Unicode code point of every character of ``word_inp``, in order."""
    return list(map(ord, word_inp))

In [0]:
# Turn every sequence string (column 0) into a list of character codes;
# the training labels are taken from column 1 of the training table.
train_seqs = list(map(split_convert, dat_train.iloc[:, 0]))
train_labels = list(dat_train.iloc[:, 1])
test_seqs = list(map(split_convert, dat_test.iloc[:, 0]))

In [0]:
# One-hot encode every sequence position (dense output, sparse=False).
# The encoder learns its categories from the training sequences only and is
# then applied unchanged to both the training and the test sequences.
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(train_seqs)
train_seqs_onehot = onehot_encoder.transform(train_seqs)
test_seqs_onehot = onehot_encoder.transform(test_seqs)

#### Define Neural Network Architecture and Model

In [0]:
# functions to determine metrics f1, precision and recall
# taken from: https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model

def get_recall(y_true, y_pred):
    """Recall on Keras tensors: true positives / (actual positives + epsilon).

    The product ``y_true * y_pred`` and ``y_true`` itself are each clipped to
    [0, 1] and rounded before being summed; ``K.epsilon()`` in the denominator
    keeps the division defined when there are no positives.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hit_count / (actual_count + K.epsilon())

def get_precision(y_true, y_pred):
    """Precision on Keras tensors: true positives / (predicted positives + epsilon).

    The product ``y_true * y_pred`` and ``y_pred`` itself are each clipped to
    [0, 1] and rounded before being summed; ``K.epsilon()`` in the denominator
    keeps the division defined when nothing is predicted positive.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hit_count / (flagged_count + K.epsilon())

def get_f1(y_true, y_pred):
    """F1 on Keras tensors: 2 * P * R / (P + R + epsilon).

    P and R come from ``get_precision`` and ``get_recall``; ``K.epsilon()``
    keeps the ratio defined when both are zero.
    """
    p = get_precision(y_true, y_pred)
    r = get_recall(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [0]:
# determine class imbalance
# 'balanced' weights each class inversely to its frequency, so the rare class
# counts more in the loss. `classes` and `y` are passed by keyword because
# newer scikit-learn releases no longer accept them positionally; the keyword
# form also works on older releases.
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
# Keras expects {class_index: weight}; enumerate matches the sorted 0/1 labels.
class_weight_dict = dict(enumerate(class_weights))

In [0]:
# Training hyper-parameters shared by the cells below.
# class_weight = {0:1, 1:12}
NEPOCHS = 60  # epochs per model.fit call
BATCHSIZE = 64  # batch size passed to model.fit and model.predict_classes
VALIDATIONSPLIT = 0.2  # fraction of the data intended as validation hold-out
HIDDENSIZE = 80  # NOTE(review): not read by create_model — confirm whether it is still needed

In [0]:
# opt = SGD(lr=0.01, momentum=0.9)
opt = 'adam'

def create_model(input_dim=80, hidden_units=60, optimizer=None):
  """Build and compile the single-hidden-layer binary classifier.

  Args:
    input_dim: Number of input features. Defaults to 80, the width this
      notebook was written for; pass ``train_seqs_onehot.shape[1]`` if the
      encoding changes.
    hidden_units: Width of the ReLU hidden layer (default 60).
    optimizer: Keras optimizer name or instance. ``None`` (default) uses the
      module-level ``opt``, looked up when the function is called.

  Returns:
    A compiled ``Sequential`` model with one sigmoid output unit, using
    binary cross-entropy loss and reporting accuracy.
  """
  model = Sequential()
  model.add(Dense(hidden_units, input_dim=input_dim, activation='relu', kernel_initializer='he_normal'))
  # Only the first layer needs an explicit input size; later layers take it
  # from the previous layer's output.
  model.add(Dense(1, activation='sigmoid'))

  model.compile(optimizer=opt if optimizer is None else optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy'])

  return model


#### Model Selection / training

In [0]:
# Stratified folds: the splitter is given train_labels so each fold is built
# per class; shuffling with a fixed random_state keeps the assignment repeatable.
kfold_splits = 5
fold_splitter = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=1)
folds = list(fold_splitter.split(train_seqs, train_labels))

In [107]:
# Cross-validate the network and remember the fold with the best validation F1.

mode_path = './models/mlp_v2.h5'

# Show the architecture once; every fold below trains its own fresh copy.
model = create_model()
model.summary()

best_fold = -1
best_score = 0
best_model = None

# Labels as an array so they can be indexed with the fold index arrays
# (built once instead of twice per fold).
train_labels_arr = np.asarray(train_labels)

for index, (train_indices, val_indices) in enumerate(folds):
  print("Training on fold " + str(index+1) + "/" + str(len(folds)) + "...")
  xtrain, xval = train_seqs_onehot[train_indices], train_seqs_onehot[val_indices]
  ytrain, yval = train_labels_arr[train_indices], train_labels_arr[val_indices]

  # Fresh, untrained model so no weights carry over from one fold to the next.
  model = create_model()

  # Class weights are derived from this fold's training labels only.
  # (keyword arguments: newer scikit-learn rejects positional classes / y)
  class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(ytrain), y=ytrain)
  class_weight_dict = dict(enumerate(class_weights))

  # NOTE: every fit writes its checkpoint to the same mode_path, so each fold
  # overwrites the file left by the previous one.
  callbacks = [ModelCheckpoint(filepath=mode_path, save_best_only=True)]
  model.fit(xtrain, ytrain, validation_data=(xval, yval), epochs=NEPOCHS, batch_size=BATCHSIZE, verbose=0,
            callbacks=callbacks, class_weight=class_weight_dict)  # starts training

  # F1 on the held-out fold and, for comparison, on the fold's training part.
  y_pred = model.predict_classes(xval, batch_size=BATCHSIZE, verbose=1)
  y_train = model.predict_classes(xtrain, batch_size=BATCHSIZE, verbose=1)
  tmp_score = metrics.f1_score(yval, y_pred)
  score_train = metrics.f1_score(ytrain, y_train)
  print("F1 score for this fold is : ", tmp_score, score_train)

  # Keep the fold with the highest validation F1. best_score must be updated
  # as well, otherwise any later fold with a non-zero F1 would replace the
  # current best regardless of its score.
  if tmp_score > best_score:
    best_score = tmp_score
    best_fold = index
    best_model = model


Model: "sequential_121"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_285 (Dense)            (None, 60)                4860      
_________________________________________________________________
dense_286 (Dense)            (None, 1)                 61        
Total params: 4,921
Trainable params: 4,921
Non-trainable params: 0
_________________________________________________________________
Training on fold 1/5...
F1 score for this fold is :  0.8539915966386554 0.8850091887634549
Training on fold 2/5...
F1 score for this fold is :  0.817258883248731 0.8778645833333333
Training on fold 3/5...
F1 score for this fold is :  0.8545551411827383 0.8987983978638184
Training on fold 4/5...
F1 score for this fold is :  0.8443519000520562 0.8934252386002121
Training on fold 5/5...
F1 score for this fold is :  0.8568387440127727 0.9004676018704075


In [108]:
# Train the final model on the full training set.
# A new model is created so this fit starts from untrained weights instead of
# silently continuing from whatever `model` the cross-validation cell left behind.
model = create_model()

# Class weights for the full training set (keyword arguments: newer
# scikit-learn releases reject positional classes / y).
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weight_dict = dict(enumerate(class_weights))

# NOTE: validation_split holds out a VALIDATIONSPLIT fraction of the rows for
# validation (Keras takes them from the end of the arrays, before shuffling),
# so those rows are monitored by the checkpoint but not trained on.
callbacks = [ModelCheckpoint(filepath=mode_path, save_best_only=True)]
# Labels are passed as an ndarray rather than a Python list.
model.fit(train_seqs_onehot, np.asarray(train_labels), validation_split=VALIDATIONSPLIT, epochs=NEPOCHS,
          batch_size=BATCHSIZE, verbose=0, callbacks=callbacks, class_weight=class_weight_dict)  # starts training

<keras.callbacks.callbacks.History at 0x7fdcf5938470>

In [109]:
# Training Error
# predict_classes already returns hard 0/1 labels for the single sigmoid
# output, so they go straight into the report; no argmax is needed.
y_pred = model.predict_classes(train_seqs_onehot, batch_size=BATCHSIZE, verbose=1)

print(classification_report(train_labels, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    107787
           1       0.88      0.99      0.93      4213

    accuracy                           0.99    112000
   macro avg       0.94      0.99      0.96    112000
weighted avg       1.00      0.99      0.99    112000



#### Prediction on test data

In [110]:
# Hard 0/1 predictions for the one-hot encoded test sequences.
y_pred = model.predict_classes(test_seqs_onehot, batch_size=BATCHSIZE, verbose=1)

# Sum of the 0/1 labels, i.e. how many test rows were predicted positive.
print(y_pred.sum())

# Wrap the predictions in a DataFrame so they can be written to disk.
res = pd.DataFrame(y_pred)

2080


#### Save results

In [0]:
# Write the predictions to prediction.csv in the working directory, without
# an index column or a header row.
res.to_csv("./prediction.csv", index=False, header=False)