In [4]:
import argparse
import sys
import os
import random
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import KFold,StratifiedKFold
import cv2
import gc
import math
import matplotlib.pyplot as plt
from scipy.stats.mstats import gmean

from tensorflow.keras.layers import Input,Dense,Dropout,Embedding,Concatenate,Flatten,LSTM ,Bidirectional,GRU
from tensorflow.keras.activations import relu ,sigmoid,softmax
from tensorflow.keras.losses import CategoricalCrossentropy
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow_addons.optimizers import AdamW

ModuleNotFoundError: No module named 'tensorflow'

In [8]:
def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Python's ``random``, NumPy and PyTorch are seeded with ``seed_value``;
    TensorFlow is seeded with ``seed_value + 1000``. When CUDA is available,
    the GPU generators are seeded too and cuDNN is put into deterministic,
    non-benchmarking mode.

    Args:
        seed_value: Integer base seed.
    """
    tf_seed = seed_value + 1000  # TensorFlow is seeded with an offset of 1000

    random.seed(seed_value)        # Python stdlib
    np.random.seed(seed_value)     # NumPy
    torch.manual_seed(seed_value)  # PyTorch, CPU generator
    tf.random.set_seed(tf_seed)    # TensorFlow global seed

    # The PYTHONHASHSEED / TF_DETERMINISTIC_OPS / TF_KERAS environment
    # variables are intentionally left unset here (they were commented out).
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)     # every GPU
    torch.backends.cudnn.deterministic = True  # needed for repeatable GPU runs
    torch.backends.cudnn.benchmark = False

# Seed all generators once, before any data is split or any model is built.
seed_all(42)

# Config

In [9]:
class Config:
    """Static settings shared by the cells of this notebook."""

    # --- cross-validation ---
    n_folds = 10          # number of stratified folds
    random_state = 42     # seed handed to the fold splitter

    # --- batch sizes ---
    tbs = 1024            # training batch size
    vbs = 512             # validation / test batch size

    # --- directories (relative to the working directory) ---
    data_path = "data"
    result_path = "results"
    models_path = "models"

# Plotting and utility helpers

In [10]:
def write_to_txt(file_name,column):
    """Write the items of ``column`` to ``file_name``, one item per line.

    The file is opened in ``'w'`` mode, so any existing content is replaced.

    Args:
        file_name: Path of the text file to create or overwrite.
        column: Iterable of items; each one is formatted with ``%s`` and
            followed by a newline.
    """
    formatted_lines = ("%s\n" % entry for entry in column)
    with open(file_name, 'w') as handle:
        handle.writelines(formatted_lines)

# Load data

In [11]:
# Read the four competition CSV files from Config.data_path, in this order:
# training set, test set, amino-acid embeddings, sample submission.
train, test, aae, submission = (
    pd.read_csv(os.path.join(Config.data_path, csv_name))
    for csv_name in (
        "train.csv",
        "test.csv",
        "amino_acid_embeddings.csv",
        "SampleSubmission.csv",
    )
)

# Prepare and split data

In [12]:
# Record the length (number of characters) of every sequence in both frames.
train["Sequence_len"] = train["Sequence"].map(len)
test["Sequence_len"] = test["Sequence"].map(len)

In [13]:
# Upper bound on the number of characters kept per sequence.
# Per the original author's note, the longest sequence in this data set has 550.
max_seq_length = 550

In [14]:
#stratified k fold
# Stratified K-fold assignment: after this cell, train["folds"] holds, for each
# row, the index of the fold in which that row belongs to the validation split.
train["folds"] = -1  # -1 marks rows that have not been assigned yet
kf = StratifiedKFold(
    n_splits=Config.n_folds,
    shuffle=True,
    random_state=Config.random_state,
)
for fold, (_, val_index) in enumerate(kf.split(train, train["target"])):
    train.loc[val_index, "folds"] = fold

In [15]:
# Preview the first rows, including the new Sequence_len and folds columns.
train.head()

Unnamed: 0,ID,Sequence,target,Sequence_len,folds
0,ID_train_0,MVDGVMILPVLVMIAFPFPSMEDEKPKVNPKLYMCVCEGLSCGDEA...,0,509,9
1,ID_train_1,MAQKENAYPWPYGSKTSQSGLNTLSQRVLRKEPATTSALALVNRFN...,1,345,1
2,ID_train_2,MRLWPRSLFGRLVLILVSGMLAAQILTSSIWYDVRHSQVLEIPTRL...,2,462,9
3,ID_train_3,MNSIVKIMKMKQITYKLFMTTSLILLSFAVLIYLTLYFFLPTFYEQ...,2,490,6
4,ID_train_4,MKLIYQNVLSFLLIIVTTISIIGYSEIGYARNQAYTQNYQRMESYA...,2,484,2


In [16]:
# reduce seq length
# Truncate every sequence to at most max_seq_length characters.
#
# The previous guard (`if max_seq_length > 550`) was inverted: it only fired
# when the limit was larger than the longest sequence, which is exactly when
# truncation changes nothing, so lowering max_seq_length had no effect at all.
# Slicing is already a no-op for strings shorter than the limit, so no guard
# is needed.
#
# Note: the Sequence_len column computed earlier keeps the pre-truncation
# lengths.
train["Sequence"] = train["Sequence"].str[:max_seq_length]
test["Sequence"] = test["Sequence"].str[:max_seq_length]

In [17]:
# Alphabet of the 23 symbols that may occur in a sequence.
voc_set = set(['P', 'V', 'I', 'K', 'N', 'B', 'F', 'Y', 'E', 'W', 'R', 'D', 'X', 'S', 'C', 'U', 'Q', 'A', 'M', 'H', 'L', 'G', 'T'])

# Symbol -> integer id, starting at 1 (id 0 is never assigned here).
# The symbols are sorted before numbering so that the mapping is the same in
# every Python process. Iterating the set directly depends on string hash
# randomisation, so the ids could change after a kernel restart and silently
# stop matching the embedding weights saved by an earlier session.
voc_set_map = {symbol: idx for idx, symbol in enumerate(sorted(voc_set), start=1)}
# Number of distinct target labels; used as the one-hot depth of the labels
# and as the width of the model's output layer.
number_of_class = train["target"].nunique()

In [18]:
def encode(text_tensor, label):
    """Convert one sequence tensor into a list of integer ids.

    Args:
        text_tensor: Eager string tensor; its ``.numpy()`` bytes are decoded
            to text.
        label: Returned unchanged.

    Returns:
        ``(ids, label)`` where ``ids`` holds the ``voc_set_map`` id of every
        character of the decoded text.

    Raises:
        KeyError: If the text contains a character missing from
            ``voc_set_map``.
    """
    sequence = text_tensor.numpy().decode()
    token_ids = [voc_set_map[symbol] for symbol in sequence]
    return token_ids, label
def encode_map_fn(text, label):
    """Dataset ``map`` function: encode the text and one-hot encode the label.

    Args:
        text: Scalar string tensor holding one sequence.
        label: Integer class label tensor.

    Returns:
        ``(token_ids, one_hot_label)``: an int64 vector of unknown length and
        a vector of length ``number_of_class``.
    """
    token_ids, raw_label = tf.py_function(
        encode,
        inp=[text, label],
        Tout=(tf.int64, tf.int64),
    )
    # py_function does not set the static shape of its outputs, so it is
    # declared by hand: a 1-D vector of variable length.
    token_ids.set_shape([None])

    one_hot_label = tf.one_hot(raw_label, number_of_class)
    one_hot_label.set_shape([number_of_class])

    return token_ids, one_hot_label
def get_data_loader(file,batch_size,labels):
    """Build the shuffled, endlessly repeating input pipeline used for fitting.

    Args:
        file: Path of a text file with one sequence per line.
        batch_size: Number of examples per padded batch.
        labels: Integer class labels, aligned line by line with ``file``.

    Returns:
        A ``tf.data.Dataset`` of ``(token_ids, one_hot_label)`` batches.
        Sequences in a batch are padded to the longest one in that batch. The
        dataset never ends, so callers must bound it with an explicit number
        of steps.
    """
    label_data = tf.data.Dataset.from_tensor_slices(labels)
    data_set = tf.data.TextLineDataset(file)
    data_set = tf.data.Dataset.zip((data_set, label_data))

    # Shuffle BEFORE repeat. With repeat() first, the shuffle buffer mixes
    # elements of consecutive epochs, so within one epoch some rows can be
    # drawn twice while others are skipped. Shuffling first yields every row
    # exactly once per pass, reshuffled on each repetition.
    data_set = data_set.shuffle(len(labels))
    data_set = data_set.repeat()
    data_set = data_set.map(encode_map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    data_set = data_set.padded_batch(batch_size)
    data_set = data_set.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data_set


def get_data_loader_test(file,batch_size,labels):
    """Build the ordered, single-pass input pipeline used for inference.

    Unlike ``get_data_loader``, nothing is shuffled or repeated, so the
    batches follow the line order of ``file``.

    Args:
        file: Path of a text file with one sequence per line.
        batch_size: Number of examples per padded batch.
        labels: Object exposing a ``target`` attribute (the caller passes a
            DataFrame) whose values are aligned line by line with ``file``.

    Returns:
        A ``tf.data.Dataset`` of ``(token_ids, one_hot_label)`` batches.
    """
    target_ds = tf.data.Dataset.from_tensor_slices(labels.target)
    text_ds = tf.data.TextLineDataset(file)
    pipeline = (
        tf.data.Dataset.zip((text_ds, target_ds))
        .map(encode_map_fn, tf.data.experimental.AUTOTUNE)
        .padded_batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )
    return pipeline

# Model

In [19]:
def model():
    """Build and compile the sequence classifier.

    Architecture: embedding (64-d) -> bidirectional GRU (256 units per
    direction, last state only) -> Dense(512, relu) -> Dropout(0.1) ->
    Dense(256, relu) -> Dense(number_of_class, softmax).

    The network is compiled with Adam (learning rate 0.001), categorical
    cross-entropy and a categorical-accuracy metric named ``"Acc"``. Its
    summary is printed as a side effect.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    prefix = "seq"
    drop_prob = 0.1
    lr = 0.001

    # --- layer definitions -------------------------------------------------
    seq_input = Input([None], name="sequnce")

    # One extra row because the ids in voc_set_map start at 1.
    # NOTE(review): mask_zero is not set, so the zero padding added by
    # padded_batch is presumably processed by the GRU like any other id.
    # Confirm this is intended.
    embedding = Embedding(input_dim=len(voc_set) + 1, output_dim=64, name="emb_layer")

    gru_cell = GRU(units=256, name="gru_2", return_sequences=False)
    bi_gru = Bidirectional(gru_cell, name="bidirectional_2")

    hidden_1 = Dense(units=512, activation=relu, kernel_regularizer=None,
                     bias_regularizer=None, name=f"{prefix}_dense_layer_1")
    hidden_2 = Dense(units=256, activation=relu, kernel_regularizer=None,
                     bias_regularizer=None, name=f"{prefix}_dense_layer_2")
    classifier = Dense(units=number_of_class, activation=softmax, kernel_regularizer=None,
                       bias_regularizer=None, name=f"{prefix}_dense_layer_output")

    dropout = Dropout(drop_prob)

    # --- forward graph -----------------------------------------------------
    features = embedding(seq_input)
    features = bi_gru(features)
    features = hidden_1(features)
    features = dropout(features)
    features = hidden_2(features)
    probabilities = classifier(features)

    net = tf.keras.Model(inputs={"sequnce": seq_input}, outputs=probabilities)

    # The disabled alternative loss was tfa.losses.SigmoidFocalCrossEntropy.
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss=CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.CategoricalAccuracy(name="Acc")],
    )
    net.summary()
    return net


# Training

In [None]:
def trainn(fold):
    """Train a fresh model on every fold except ``fold`` and validate on it.

    The training and validation sequences are written to text files under
    ``Config.data_path``, streamed through ``get_data_loader``, and the
    weights with the lowest ``val_loss`` are saved to
    ``Config.models_path/model_{fold}.h5``.

    Args:
        fold: Index of the fold held out for validation.

    Returns:
        The ``History`` object returned by ``Model.fit``.
    """
    model_path = f"model_{fold}.h5"
    df_train = train[train["folds"] != fold].reset_index(drop=True)
    df_valid = train[train["folds"] == fold].reset_index(drop=True)

    # Build the paths from Config.data_path rather than a hard-coded "data/".
    train_file = os.path.join(Config.data_path, f"train_{fold}.txt")
    valid_file = os.path.join(Config.data_path, f"valid_{fold}.txt")
    write_to_txt(train_file, df_train.Sequence)
    write_to_txt(valid_file, df_valid.Sequence)

    train_dl = get_data_loader(train_file, Config.tbs, df_train["target"])
    valid_dl = get_data_loader(valid_file, Config.vbs, df_valid["target"])

    # The checkpoint callback does not need to create the directory itself.
    os.makedirs(Config.models_path, exist_ok=True)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(Config.models_path, model_path),
        save_weights_only=True,
        monitor='val_loss',
        save_best_only=True,
        mode="min",
        verbose=1,
    )

    my_model = model()

    # Both loaders are already batched and repeat forever, so:
    #  * batch_size / validation_batch_size are not passed (Keras documents
    #    that they must not be given for dataset inputs);
    #  * the step counts must be explicit integers. ceil() covers the final
    #    partial training batch, and max(1, ...) keeps validation running
    #    even when the held-out fold is smaller than one batch.
    history = my_model.fit(
        train_dl,
        validation_data=valid_dl,
        epochs=15,
        verbose=1,
        steps_per_epoch=math.ceil(len(df_train) / Config.tbs),
        validation_steps=max(1, len(df_valid) // Config.vbs),
        callbacks=[checkpoint],
    )
    return history
    

In [None]:
def predict(fold):
    """Predict class probabilities for the test set with the weights of ``fold``.

    The test sequences are written to a text file under ``Config.data_path``,
    a fresh model is built, and the weights saved at
    ``Config.models_path/model_{fold}.h5`` are loaded into it.

    Side effect: a constant ``target`` column (all zeros) is added to the
    global ``test`` frame, because the data loader expects a label per row.

    Args:
        fold: Index of the fold whose saved weights are used.

    Returns:
        The array returned by ``Model.predict`` for the test loader.
    """
    model_path = f"model_{fold}.h5"

    # Build the path from Config.data_path rather than a hard-coded "data/".
    test_file = os.path.join(Config.data_path, f"test_{fold}.txt")
    write_to_txt(test_file, test.Sequence)

    # Placeholder labels: the loader zips sequences with labels, but the
    # values are never used for prediction.
    test["target"] = 0
    test_dl = get_data_loader_test(test_file, Config.vbs, test)

    my_model = model()
    my_model.load_weights(os.path.join(Config.models_path, model_path))
    return my_model.predict(test_dl)

In [None]:
# Train a single model, holding out fold 1 for validation.
trainn(1)

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequnce (InputLayer)         [(None, None)]            0         
_________________________________________________________________
emb_layer (Embedding)        (None, None, 128)         3072      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               592896    
_________________________________________________________________
seq_dense_layer_1 (Dense)    (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
seq_dense_layer_2 (Dense)    (None, 256)               131328    
_________________________________________________________________
seq_dense_layer_output (Dens (None, 8)                

In [None]:
# Test-set predictions from the weights saved for fold 1.
p=predict(1)

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequnce (InputLayer)         [(None, None)]            0         
_________________________________________________________________
emb_layer (Embedding)        (None, None, 128)         3072      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               592896    
_________________________________________________________________
seq_dense_layer_1 (Dense)    (None, 512)               262656    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
seq_dense_layer_2 (Dense)    (None, 256)               131328    
_________________________________________________________________
seq_dense_layer_output (Dens (None, 8)                

In [20]:
# Submission frame: the test IDs plus one "target_<k>" column per class,
# filled with the k-th column of the prediction array.
sub = test[["ID"]].assign(
    **{"target_{}".format(k): p[:, k] for k in range(number_of_class)}
)

In [21]:
# Preview the submission frame before it is written to disk.
sub.head()

Unnamed: 0,ID
0,ID_test_0
1,ID_test_1
2,ID_test_2
3,ID_test_3
4,ID_test_4


In [22]:
# Create the results directory first so the write does not fail on a fresh
# checkout where it does not exist yet.
os.makedirs(Config.result_path, exist_ok=True)
sub.to_csv(os.path.join(Config.result_path,"sub_p1_epoch15.csv"),index=False)