<a href="https://colab.research.google.com/github/Ahmadrezauf/IML_Projects/blob/master/task_3/ns_IML_Task_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Networks to predict protein activation

#### Set up the directories, load libraries

In [0]:
import pandas as pd
import numpy as np

import pickle
%tensorflow_version 1.x
import tensorflow as tf
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

import keras
import keras.backend as K
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.layers import BatchNormalization
from keras.callbacks import Callback,ModelCheckpoint

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

In [0]:
#!if [ ! -f Archive.zip ]; then wget -nv https://drive.google.com/open?id=1g7aT8cMkFAFlk6wxkiEH3mgFVp2Xa1l9 -O Archive.zip; fi

In [75]:
# Mount Google Drive (Colab only) and switch into the project folder, because
# the data, model and prediction paths used in later cells are relative ("./...").
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/My Drive/IML/IML_Projects/task_3")
# Last expression of the cell: echoes the new working directory.
os.getcwd()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/My Drive/IML/IML_Projects/task_3'

In [76]:
# Sanity check: show the directory that the relative "./Data" and "./models"
# paths in later cells are resolved against.
import os
os.getcwd()

'/content/drive/My Drive/IML/IML_Projects/task_3'

In [77]:
# List the working directory (shell command run through IPython's `!` escape).
! ls

 Data		 models		      'on the server'	     prediction.csv
 iml_task_3.py	 ns_IML_Task_3.ipynb  'prediction (1).csv'


#### Load data & data inspection

In [0]:
# Read the training and test tables from the project's Data folder.
dat_train, dat_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./Data/train.csv", "./Data/test.csv")
)

In [79]:
# Class balance: number of training rows per value of the 'Active' label.
dat_train.loc[:, 'Active'].value_counts()

0    107787
1      4213
Name: Active, dtype: int64

#### Pre-process data

In [0]:
import re

def split_convert(word_inp):
    """Return the Unicode code point of every character in ``word_inp`` as a list."""
    return list(map(ord, word_inp))

In [0]:
# Column 0 holds the sequence strings, column 1 of the training table the labels.
# Every sequence becomes a list of per-character code points.
train_seqs = list(map(split_convert, dat_train.iloc[:, 0]))
train_labels = list(dat_train.iloc[:, 1])
test_seqs = list(map(split_convert, dat_test.iloc[:, 0]))


In [0]:
# binary encode
# Learn the per-position categories from the training sequences only, then
# apply the same dense one-hot mapping to both the training and test sequences.
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(train_seqs)
train_seqs_onehot = onehot_encoder.transform(train_seqs)
test_seqs_onehot = onehot_encoder.transform(test_seqs)

#### Define Neural Network Architecture and Model

In [0]:
# functions to determine metrics f1, precision and recall
# taken from: https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model

def get_recall(y_true, y_pred):
    """Batch-wise recall computed with Keras backend ops.

    Counts the rounded, clipped elements of ``y_true * y_pred`` and divides by
    the rounded, clipped elements of ``y_true``. Both sums run over every
    element of the tensors (no axis is given); ``K.epsilon()`` keeps the
    division defined when ``y_true`` contains no positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def get_precision(y_true, y_pred):
    """Batch-wise precision computed with Keras backend ops.

    Counts the rounded, clipped elements of ``y_true * y_pred`` and divides by
    the rounded, clipped elements of ``y_pred``. Both sums run over every
    element of the tensors (no axis is given); ``K.epsilon()`` keeps the
    division defined when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_positives = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged_positives) + K.epsilon())

def get_f1(y_true, y_pred):
    """Batch-wise F1: harmonic mean of ``get_precision`` and ``get_recall``.

    ``K.epsilon()`` in the denominator avoids a division by zero when both
    precision and recall are zero.

    NOTE(review): the underlying sums cover all elements of the tensors, so
    with multi-column one-hot targets this is not the F1 of a single class —
    confirm that this is the intended training metric.
    """
    p = get_precision(y_true, y_pred)
    r = get_recall(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

In [0]:
# determine class imbalance
# Arguments are passed by keyword: recent scikit-learn releases reject the
# positional form, while older releases accept keywords just the same.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
# Map class index -> weight. The weights come back in the order of `classes`,
# and np.unique returns the labels sorted, so position i belongs to the i-th
# smallest label (this equals the label itself for labels 0..k-1).
class_weight_dict = dict(enumerate(class_weights))

In [0]:
# Training hyper-parameters.
# NOTE(review): the commented-out line below looks like an earlier manual
# class weighting — confirm whether it can be deleted.
# class_weight = {0:1, 1:12}
NEPOCHS = 60    # passed as `epochs` to model.fit
BATCHSIZE = 64  # batch size used for model.fit and model.predict
VALIDATIONSPLIT = 0.2  # NOTE(review): not referenced in the visible cells — confirm still needed
HIDDENSIZE = 80  # number of units in every hidden Dense layer of create_model

In [0]:
def _add_dense_stack(model, units, n_layers, **first_layer_kwargs):
  """Append `n_layers` ReLU Dense layers; extra kwargs go to the first one only."""
  for position in range(n_layers):
    extra = first_layer_kwargs if position == 0 else {}
    model.add(Dense(units, activation='relu', **extra))


def create_model(input_dim=80, hidden_size=None):
  """Build and compile the fully connected classifier.

  The network consists of three stacks of four ReLU ``Dense`` layers, each
  stack followed by batch normalisation and 50% dropout, and ends in a
  two-unit softmax layer.

  Args:
    input_dim: Width of the input vectors. Defaults to 80, the value that
      used to be hard-coded.
    hidden_size: Units per hidden layer. ``None`` (default) uses the
      notebook-level ``HIDDENSIZE`` constant, looked up at call time.

  Returns:
    A compiled ``Sequential`` model (rmsprop optimiser, binary cross-entropy
    loss) that reports ``get_f1`` and ``get_recall`` as metrics.
  """
  if hidden_size is None:
    hidden_size = HIDDENSIZE

  model = Sequential()

  # Only the first layer needs the input width; later layers take their input
  # shape from the layer before them.
  _add_dense_stack(model, hidden_size, 4, input_dim=input_dim)
  model.add(BatchNormalization())
  model.add(Dropout(0.5))

  _add_dense_stack(model, hidden_size, 4)
  model.add(BatchNormalization())
  model.add(Dropout(0.5))

  _add_dense_stack(model, hidden_size, 4)
  # NOTE(review): this stack applies Dropout *before* BatchNormalization,
  # unlike the two stacks above. The order is kept as it was so the
  # architecture is unchanged — confirm whether it is intentional.
  model.add(Dropout(0.5))
  model.add(BatchNormalization())

  model.add(Dense(2, activation='softmax'))

  model.compile(optimizer='rmsprop',
                loss='binary_crossentropy',
                metrics=[get_f1, get_recall])
  return model


#### Model Selection / training

In [0]:
kfold_splits = 5
# Materialise the stratified, seeded (train_indices, val_indices) pairs once
# so they can be iterated and indexed later.
folds = [
    *StratifiedKFold(
        n_splits=kfold_splits,
        shuffle=True,
        random_state=1,
    ).split(train_seqs, train_labels)
]

In [88]:
# Convert labels to categorical one-hot encoding
train_labels_onehot = keras.utils.to_categorical(train_labels, num_classes=2)

mode_path = './models/mlp_v2.h5'

# Build one model up front only to print the architecture; every fold below
# trains its own freshly initialised copy.
model = create_model()
model.summary()

best_fold = -1
best_score = 0
best_model = None

for index, (train_indices, val_indices) in enumerate(folds):
  print("Training on fold " + str(index+1) + "/" + str(len(folds)) + "...")
  # Slice the pre-encoded features and labels for this fold
  xtrain, xval = train_seqs_onehot[train_indices], train_seqs_onehot[val_indices]
  ytrain, yval = train_labels_onehot[train_indices], train_labels_onehot[val_indices]

  # Fresh weights for every fold so folds do not influence each other.
  model = create_model()

  # NOTE: all folds checkpoint to the same file, so after the loop the file
  # holds the checkpoint written by the last fold only.
  callbacks = [ModelCheckpoint(filepath=mode_path, save_best_only=True)]
  model.fit(xtrain, ytrain, validation_data = (xval, yval), epochs = NEPOCHS, batch_size=BATCHSIZE, verbose = 1 ,
            callbacks=callbacks, class_weight = class_weight_dict)  # starts training

  # get the best fold based on the best f1 score
  y_pred = model.predict(xval, batch_size=BATCHSIZE, verbose=1)
  y_pred_bool = np.argmax(y_pred, axis=1)
  tmp_score = metrics.f1_score(np.argmax(yval, axis=1),y_pred_bool)
  print("F1 score for this fold is : ", tmp_score)
  # Track the running best score; without updating it the comparison would
  # always be against 0 and the last fold would win. The `is None` guard
  # guarantees a model is selected even if every fold scores 0.
  if best_model is None or tmp_score > best_score:
    best_score = tmp_score
    best_fold = index
    best_model = model


Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_105 (Dense)            (None, 80)                6480      
_________________________________________________________________
dense_106 (Dense)            (None, 80)                6480      
_________________________________________________________________
dense_107 (Dense)            (None, 80)                6480      
_________________________________________________________________
dense_108 (Dense)            (None, 80)                6480      
_________________________________________________________________
batch_normalization_25 (Batc (None, 80)                320       
_________________________________________________________________
dropout_25 (Dropout)         (None, 80)                0         
_________________________________________________________________
dense_109 (Dense)            (None, 80)               

In [89]:
# Training Error
# Score the selected model on the complete training set and print the
# per-class precision / recall / F1 table.
y_pred = best_model.predict(train_seqs_onehot, verbose=1, batch_size=BATCHSIZE)
y_pred_bool = y_pred.argmax(axis=1)

print(classification_report(train_labels, y_pred_bool))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    107787
           1       0.69      0.80      0.74      4213

    accuracy                           0.98    112000
   macro avg       0.84      0.89      0.86    112000
weighted avg       0.98      0.98      0.98    112000



#### Prediction on test data

In [90]:
# Predict class probabilities for the test set and keep the most likely class.
y_pred = best_model.predict(test_seqs_onehot, verbose=1, batch_size=BATCHSIZE)
res = y_pred.argmax(axis=1)
# Number of test rows predicted as class 1.
print(res.sum())

res = pd.DataFrame(res)

2111


#### Save results

In [0]:
# Write the predictions to disk without the index column and without a header row.
res.to_csv("./prediction.csv", index=False, header=False)