# Fine-tune the model for the signal peptide benchmark

In [None]:
import os
import random
import numpy as np
import pandas as pd
from IPython.display import display
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Set random seeds for reproducibility (Python's random, NumPy and TensorFlow)
seed = 100
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Benchmark name, directory containing the benchmark CSV files, and output specification
BENCHMARK_NAME = 'ProFET_NP_SP_Cleaved'
BENCHMARKS_DIR = '/Users/yeyatiprasher/Coding/Internship/protein_bert/protein_benchmarks'
# 'binary' output type with the two labels 0 and 1
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

# Loading the dataset: rows with missing values and duplicated rows are dropped
train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()
# Hold out 10% of the training rows for validation, stratified on the label.
# The split reuses `seed` rather than a second hard-coded 100, so changing the
# seed above also changes the split (same convention as the Optuna objective).
train_set, valid_set = train_test_split(train_set, stratify=train_set['label'], test_size=0.1, random_state=seed)

test_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.test.csv' % BENCHMARK_NAME)
test_set = pd.read_csv(test_set_file_path).dropna().drop_duplicates()

# Report the size of each split and preview the first validation rows
print(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')
print(valid_set.head(10))

# Fine-tuning settings; the final stage runs at twice the initial sequence length
seq_len = 512
batch_size = 26
final = 2 * seq_len

# Loading the pre-trained model together with its input encoder
pretrained_model_generator, input_encoder = load_pretrained_model()

# get_model_with_hidden_layers_as_outputs gives the model output access to the hidden layers (on top of the output)
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
    dropout_rate=0.17681099042260753,
)

# Callbacks handed to finetune: learning-rate reduction on plateau and early stopping
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-05, verbose=1),
    keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
]

# Fine-tuning on the training split, with the held-out split passed as validation data
finetune(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    train_set['seq'],
    train_set['label'],
    valid_set['seq'],
    valid_set['label'],
    seq_len=seq_len,
    batch_size=batch_size,
    max_epochs_per_stage=1,
    lr=8.741510119145999e-05,
    begin_with_frozen_pretrained_layers=True,
    lr_with_frozen_pretrained_layers=1e-02,
    n_final_epochs=1,
    final_seq_len=final,
    final_lr=1e-05,
    callbacks=training_callbacks,
)

# Evaluating the performance on the test set
results, confusion_matrix = evaluate_by_len(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    test_set['seq'],
    test_set['label'],
    start_seq_len=seq_len,
    start_batch_size=batch_size,
)
print("seq_len = %d" % seq_len)
print('Test-set performance:')
display(results)

print('Confusion matrix:')
display(confusion_matrix)

# Counting the number of sequences per label in each dataset

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Benchmark to inspect and its output specification ('binary' output, labels 0 and 1)
BENCHMARK_NAME = 'ProFET_NP_SP_Cleaved'
BENCHMARKS_DIR = '/Users/yeyatiprasher/Coding/Internship/protein_bert/protein_benchmarks'
UNIQUE_LABELS = [0,1]
OUTPUT_TYPE = OutputType(False, 'binary')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

# Paths of the training and test CSV files for this benchmark
train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
test_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.test.csv' % BENCHMARK_NAME)

# Rows with missing values and duplicated rows are dropped before counting.
# No train/validation split is made in this cell, so the training counts cover
# every remaining row of the training file.
train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()
test_set = pd.read_csv(test_set_file_path).dropna().drop_duplicates()

# For each dataset: rows labelled 1, rows labelled 0, then the total row count
for pos_header, neg_header, split_frame in (
    ("Training Set pos : ", "Training Set neg : ", train_set),
    ("Testing Set pos : ", "Testing Set neg : ", test_set),
):
    print(pos_header, len(split_frame[split_frame["label"] == 1]))
    print(neg_header, len(split_frame[split_frame["label"] == 0]))
    print(len(split_frame))

In [None]:
import os
import random
import numpy as np
import pandas as pd
from IPython.display import display
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Benchmark to inspect and its output specification ('categorical' output, labels 'a' to 'g')
BENCHMARK_NAME = 'scop'
BENCHMARKS_DIR = '/Users/yeyatiprasher/Coding/Internship/protein_bert/protein_benchmarks'
UNIQUE_LABELS = ['a','b','c','d','e','f','g']
OUTPUT_TYPE = OutputType(False, 'categorical')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

# Load the training file (dropping rows with missing values and duplicated rows),
# then hold out 10% of it for validation, stratified on the label
train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()
train_set, valid_set = train_test_split(
    train_set,
    stratify=train_set['label'],
    test_size=0.1,
    random_state=100,
)

# Load the test file with the same cleaning steps
test_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.test.csv' % BENCHMARK_NAME)
test_set = pd.read_csv(test_set_file_path).dropna().drop_duplicates()

# One entry per split: the frame and the header printed for each label, in the
# same order as UNIQUE_LABELS. Headers are spelled out in full so the printed
# text stays exactly as before (including the extra space after "Valid Set b").
label_count_report = (
    (train_set, (
        "Training Set a : ", "Training Set b : ", "Training Set c : ", "Training Set d : ",
        "Training Set e : ", "Training Set f : ", "Training Set g : ",
    )),
    (valid_set, (
        "Valid Set a : ", "Valid Set b  : ", "Valid Set c : ", "Valid Set d : ",
        "Valid Set e : ", "Valid Set f : ", "Valid Set g : ",
    )),
    (test_set, (
        "Testing Set a : ", "Testing Set b : ", "Testing Set c : ", "Testing Set d : ",
        "Testing Set e : ", "Testing Set f : ", "Testing Set g : ",
    )),
)

# For each split: the number of rows carrying each label, then the total row count
for split_frame, headers in label_count_report:
    for label, header in zip(UNIQUE_LABELS, headers):
        print(header, len(split_frame[split_frame["label"] == label]))
    print(len(split_frame))

# Finding Embeddings (tokenizing each training sequence and writing the result to CSV)

In [None]:
from proteinbert import tokenization as tk
import pandas as pd
import csv
# Load the training sequences (rows with missing values and duplicated rows are dropped)
train_set_file_path = "/Users/yeyatiprasher/Coding/Internship/protein_bert/protein_benchmarks/ProFET_NP_SP_Cleaved.train.csv"
train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()

# Write one CSV row per sequence, holding whatever tk.tokenize_seq returns for it.
# NOTE(review): the file is named *_embed.csv, but tokenize_seq presumably yields
# token ids rather than learned embeddings -- confirm.
# The with-block closes the file so every buffered row is flushed to disk, and
# newline="" is how the csv module expects its output file to be opened.
with open("/Users/yeyatiprasher/Coding/Internship/protein_bert/NPCleaved_embed.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(tk.tokenize_seq(seq) for seq in train_set['seq'])

In [None]:
from proteinbert import tokenization as tk
import pandas as pd
import csv
# Load the training sequences (rows with missing values and duplicated rows are dropped)
train_set_file_path = "/Users/yeyatiprasher/Coding/Internship/protein_bert/protein_benchmarks/scop.train.csv"
train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()

# Write one CSV row per sequence, holding whatever tk.tokenize_seq returns for it.
# NOTE(review): the file is named *_embed.csv, but tokenize_seq presumably yields
# token ids rather than learned embeddings -- confirm.
# The with-block closes the file so every buffered row is flushed to disk, and
# newline="" is how the csv module expects its output file to be opened.
with open("/Users/yeyatiprasher/Coding/Internship/protein_bert/scop_embed.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(tk.tokenize_seq(seq) for seq in train_set['seq'])

# Optimizing the model

In [None]:
import os
import pandas as pd
import random
from IPython.display import display
import numpy as np
import tensorflow as tf

import optuna
from tensorflow import keras
from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Where the benchmark CSV files live, and which benchmark this cell tunes on
BENCHMARKS_DIR = '/Users/yeyatiprasher/Coding/Internship/protein_bert/protein_benchmarks'
BENCHMARK_NAME = 'ProFET_NP_SP_Cleaved'

# Output specification: 'binary' output type over the labels 0 and 1.
# The original note calls this "a local (non-global) binary output" -- TODO
# confirm what the first OutputType argument (False) means.
UNIQUE_LABELS = [0, 1]
OUTPUT_TYPE = OutputType(False, 'binary')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)




# Define the Optuna objective function
def objective(trial):
    """Fine-tune once with a trial-chosen random seed and return the test-set AUC.

    Only the seed is suggested by the trial (an integer named 'seed', requested
    with bounds 80 and 110). Batch size, learning rates and dropout rate are
    fixed values in the body.

    Args:
        trial: object providing ``suggest_int`` (presumably an
            ``optuna.trial.Trial`` -- confirm against the caller).

    Returns:
        The entry at row 'All', column 'AUC' of the results table that
        ``evaluate_by_len`` produces for the test set.
    """
    # The seed drives Python's random, NumPy, TensorFlow and the train/validation split
    trial_seed = trial.suggest_int('seed', 80, 110)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(trial_seed)

    # Loading the dataset (rows with missing values and duplicated rows are dropped)
    train_csv_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
    test_csv_path = os.path.join(BENCHMARKS_DIR, '%s.test.csv' % BENCHMARK_NAME)
    all_train_rows = pd.read_csv(train_csv_path).dropna().drop_duplicates()
    train_rows, valid_rows = train_test_split(
        all_train_rows,
        stratify=all_train_rows['label'],
        test_size=0.1,
        random_state=trial_seed,
    )
    test_rows = pd.read_csv(test_csv_path).dropna().drop_duplicates()
    print('%d training set records, %d validation set records, %d test set records.' % (len(train_rows), len(valid_rows), len(test_rows)))

    # A fresh pre-trained model is loaded for every trial
    pretrained_model_generator, input_encoder = load_pretrained_model()
    model_generator = FinetuningModelGenerator(
        pretrained_model_generator,
        OUTPUT_SPEC,
        pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
        dropout_rate=0.17681099042260753,
    )

    # Fixed fine-tuning settings, including the learning-rate and early-stopping callbacks
    finetune_settings = dict(
        seq_len=512,
        batch_size=32,
        max_epochs_per_stage=1,
        lr=8.741510119145999e-05,
        begin_with_frozen_pretrained_layers=True,
        lr_with_frozen_pretrained_layers=1e-02,
        n_final_epochs=1,
        final_seq_len=1024,
        final_lr=1e-05,
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-05, verbose=1),
            keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
        ],
    )
    finetune(
        model_generator, input_encoder, OUTPUT_SPEC,
        train_rows['seq'], train_rows['label'],
        valid_rows['seq'], valid_rows['label'],
        **finetune_settings,
    )

    # NOTE(review): the score handed back to Optuna is computed on the test set,
    # so choosing the best trial selects on test data -- confirm this is intended.
    results, _ = evaluate_by_len(
        model_generator, input_encoder, OUTPUT_SPEC,
        test_rows['seq'], test_rows['label'],
        start_seq_len=512, start_batch_size=32,
    )

    test_auc = results.loc['All', 'AUC']
    print(test_auc)
    return test_auc

# Create an Optuna study that maximizes the objective's return value and run 5 trials
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

# Print the best hyperparameters and the best objective value.
# The objective returns the 'AUC' entry of the evaluation results, so the value
# is labelled as AUC rather than accuracy.
print('Best hyperparameters: ', study.best_params)
print('Best AUC: ', study.best_value)