### <center> Prepare Environment </center>

In [None]:
# Check requirements
#
# Compares the installed version of every package this notebook depends on
# with the version it was developed against and prints one status line each.

import sys
from importlib import metadata  # stdlib replacement for the deprecated pkg_resources API

# Exact versions this notebook was developed with.
required = {
    "h5py": "3.11.0",
    "tqdm": "4.66.4",
    "numpy": "1.26.4",
    "scikit-learn": "1.4.2",
    "tensorflow": "2.10.1",
    "transformers": "4.40.1",
    "tape-proteins": "0.5",
    "torch": "2.3.0",
    "fair-esm": "2.0.0"
}
print('python envirement version:',sys.version)
for pkg, expected_version in required.items():
    try:
        installed_version = metadata.version(pkg)
    except metadata.PackageNotFoundError:
        print(f"{pkg:<15} Not installed ❌")
        continue
    # A local version label (the part after "+", e.g. "2.3.0+cu121") only
    # identifies the build variant, so it is ignored for the comparison.
    match = installed_version.split("+", 1)[0] == expected_version
    print(f"{pkg:<15} Installed: {installed_version:<10} Expected: {expected_version:<10} {'✅' if match else '❌'}")


python envirement version: 3.10.18 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:08:55) [MSC v.1929 64 bit (AMD64)]
h5py            Installed: 3.11.0     Expected: 3.11.0     ✅
tqdm            Installed: 4.66.4     Expected: 4.66.4     ✅
numpy           Installed: 1.26.4     Expected: 1.26.4     ✅
scikit-learn    Installed: 1.4.2      Expected: 1.4.2      ✅
tensorflow      Installed: 2.10.1     Expected: 2.10.1     ✅
transformers    Installed: 4.40.1     Expected: 4.40.1     ✅
tape-proteins   Installed: 0.5        Expected: 0.5        ✅
torch           Installed: 2.3.0      Expected: 2.3.0      ✅
fair-esm        Installed: 2.0.0      Expected: 2.0.0      ✅


In [None]:
# Install requirements

# import subprocess
# import sys
# import pkg_resources

# required = {
#     "h5py": "3.11.0",
#     "tqdm": "4.66.4",
#     "numpy": "1.26.4",
#     "scikit-learn": "1.4.2",
#     "tensorflow": "2.10.1",
#     "transformers": "4.40.1",
#     "tape-proteins": "0.5",
#     "torch": "2.3.0",
#     "fair-esm": "2.0.0"
# }

# def install_package(pkg, version):
#     subprocess.check_call([sys.executable, "-m", "pip", "install", f"{pkg}=={version}"])

# for pkg, expected_version in required.items():
#     try:
#         installed_version = pkg_resources.get_distribution(pkg).version
#         if installed_version != expected_version:
#             print(f"Updating {pkg} from {installed_version} to {expected_version}...")
#             install_package(pkg, expected_version)
#         else:
#             print(f"{pkg} ✅ {installed_version}")
#     except pkg_resources.DistributionNotFound:
#         print(f"{pkg} not found. Installing {expected_version}...")
#         install_package(pkg, expected_version)

h5py ✅ 3.11.0
tqdm ✅ 4.66.4
numpy ✅ 1.26.4
scikit-learn ✅ 1.4.2
tensorflow ✅ 2.10.1
transformers ✅ 4.40.1
tape-proteins ✅ 0.5
torch not found. Installing 2.3.0...
fair-esm not found. Installing 2.0.0...


In [12]:
# Imports
# !pip install huggingface_hub[hf_xet]
# !pip install sentencepiece

# import pandas as pd
import numpy as np
# import tensorflow as tf
from transformers import T5EncoderModel, T5Tokenizer
import torch
# import h5py
import time
import os

# Run on the first CUDA GPU when one is visible to torch, otherwise on the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print('device: ',device)

device:  cpu


In [74]:
# Custom run settings: select which subset the path cells below point at
class_p_n='neg'       # neg/pos
class_tr_te='test'     # train/test

## <center> Preparing Embeddings with pre-Trained Models </center>

In [75]:
# Directory that is scanned for the .fasta files of the selected split/class
path_input=fr'fasta\{class_tr_te}\{class_p_n}'

# Directory the per-sequence .prottrans embedding files are written to
path_output=fr'{class_tr_te}\{class_p_n}'

### Model: ProtTrans

In [76]:
# Read a FASTA file and return sequence as a list of tuples
def read_fasta(fasta_path):
    """Read one FASTA file and return its residues as a single record.

    Header lines (those starting with '>') are skipped; every other line is
    stripped and concatenated, so all records in the file end up in one
    sequence string. The identifier is the file name without its extension.

    Args:
        fasta_path: Path of the FASTA file to read.

    Returns:
        A one-element list ``[(seq_id, sequence)]``.
    """
    with open(fasta_path, 'r') as handle:
        fragments = [row.strip() for row in handle if not row.startswith('>')]

    # File name without directory and without extension
    stem, _extension = os.path.splitext(os.path.basename(fasta_path))
    return [(stem, ''.join(fragments))]

In [77]:
# Load ProtTrans T5 model and tokenizer
def get_T5_model():
    """Load the ProtT5 encoder checkpoint together with its tokenizer.

    The encoder is moved to the notebook-level `device` and switched to
    evaluation mode before it is returned.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    checkpoint = "Rostlab/prot_t5_xl_half_uniref50-enc"
    encoder = T5EncoderModel.from_pretrained(checkpoint)
    encoder = encoder.to(device)   # place the weights on the selected device
    encoder = encoder.eval()       # evaluation mode
    vocab = T5Tokenizer.from_pretrained(checkpoint, do_lower_case=False)
    return encoder, vocab

In [78]:
# Generate embeddings for protein sequences
def get_embeddings(model, tokenizer, seqs, max_residues=4000, max_seq_len=1000, max_batch=100):
    """Compute per-residue embeddings for a list of protein sequences.

    Sequences are processed longest-first in mini-batches. A batch is flushed
    when it holds `max_batch` sequences, when its residue count (with the
    newest sequence counted twice, as a safety margin) reaches `max_residues`,
    when the newest sequence is longer than `max_seq_len`, or when the last
    sequence has been queued.

    Args:
        model: Encoder called as ``model(input_ids, attention_mask=...)``;
            its result must expose ``last_hidden_state``.
        tokenizer: Tokenizer providing ``batch_encode_plus``.
        seqs: Iterable of ``(identifier, sequence)`` tuples.
        max_residues: Residue budget per batch.
        max_seq_len: Sequences longer than this flush the batch immediately.
        max_batch: Maximum number of sequences per batch.

    Returns:
        dict: ``{"residue_embs": {identifier: numpy array}}``. Each array is
        the hidden state sliced to the sequence length and squeezed. A batch
        that raises RuntimeError is reported and skipped, so its identifiers
        are missing from the result; an empty `seqs` yields an empty mapping.
    """
    results = {"residue_embs": dict()}
    # Sort by length (longest first): reduces padding inside a batch
    seq_dict = sorted(seqs, key=lambda x: len(x[1]), reverse=True)
    batch = []

    for seq_idx, (pdb_id, seq) in enumerate(seq_dict, 1):
        seq_len = len(seq)
        seq = ' '.join(list(seq))  # The tokenizer expects space-separated residues
        batch.append((pdb_id, seq, seq_len))

        # Count the residues of the current batch and add the newest sequence
        # length once more, so batches stay below max_residues
        n_res_batch = sum(s_len for _, _, s_len in batch) + seq_len
        if len(batch) >= max_batch or n_res_batch >= max_residues or seq_idx == len(seq_dict) or seq_len > max_seq_len:
            # Local name differs from the `seqs` parameter on purpose
            pdb_ids, batch_seqs, seq_lens = zip(*batch)
            batch = []

            # Tokenize; add_special_tokens appends one extra token per sequence
            token_encoding = tokenizer.batch_encode_plus(batch_seqs, add_special_tokens=True, padding="longest")
            input_ids = torch.tensor(token_encoding['input_ids']).to(device)
            attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)

            try:
                with torch.no_grad():
                    # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
                    embedding_repr = model(input_ids, attention_mask=attention_mask)
            except RuntimeError:
                # Best effort: report and drop this batch, keep going
                print(f"RuntimeError during embedding for {pdb_id} (L={seq_len})")
                continue

            # Store one array per protein of the mini-batch
            for batch_idx, identifier in enumerate(pdb_ids):
                s_len = seq_lens[batch_idx]
                # Slice off padding and the special token --> seq_len x embedding_dim
                emb = embedding_repr.last_hidden_state[batch_idx, :s_len]
                results["residue_embs"][identifier] = emb.detach().cpu().numpy().squeeze()

    # The former average-time computation was removed: its value was never
    # used and it divided by zero whenever no embedding had been produced.
    return results

In [79]:
# Load the encoder and tokenizer once; main() below reads both as globals
model, tokenizer = get_T5_model()

In [80]:
# Save embeddings to a text file
def save_port_map(port_data, output_file):
    """Write `port_data` to `output_file` as plain text using numpy.savetxt.

    Args:
        port_data: Array-like accepted by ``np.savetxt``.
        output_file: Destination path (or file object).
    """
    np.savetxt(fname=output_file, X=port_data)

In [81]:
# Main function to process a single FASTA file
def main(fasta_file, output_file):
    """Embed the sequence of one FASTA file and store the result as text.

    Uses the notebook-level `model` and `tokenizer`.

    Args:
        fasta_file: Path of the FASTA file to embed.
        output_file: Path the embedding matrix is written to.

    Raises:
        KeyError: If no embedding was produced for this file.
    """
    # get_embeddings keys its result by the file name without extension
    stem = os.path.splitext(os.path.basename(fasta_file))[0]
    residue_embs = get_embeddings(model, tokenizer, read_fasta(fasta_file))['residue_embs']
    save_port_map(residue_embs[stem], output_file)

In [None]:
# Entry point: process all .fasta files in the input directory
start = time.time()
# np.savetxt does not create missing directories, so make sure the target exists
os.makedirs(path_output, exist_ok=True)
input_files = os.listdir(path_input)
# Deliberately not named `str`: that would shadow the builtin for every cell
# executed afterwards in this kernel
fasta_ext = ".fasta"
for i in input_files:
    if i.endswith(fasta_ext):
        file_name = "/" + i.split(".")[0]
        main(path_input + "/" + i, path_output + file_name + ".prottrans")
end=time.time()
print("Elapsed time:",(end-start)/60, "minutes")

In [None]:
# Explore a .prottrans file: print every line, decoded from UTF-8 and stripped
with open(r"Train\pos\neo_sequence24.prottrans", "rb") as f:
    for line in f:
        print(line.decode('utf-8').strip())

## <center> Get Dataset </center>
*Run this section after the embeddings have been generated.*

In [84]:
# Path to input directory containing feature files.
# This is the directory the embedding section wrote to. It is rebuilt from the
# run settings instead of copying `path_output`, because `path_output` is
# reassigned just below: copying it made this cell point at the dataset path
# whenever it was executed a second time.
path_input=fr'{class_tr_te}\{class_p_n}'
# Path to save the processed dataset
path_output=fr'dataset\{class_tr_te}\{class_p_n}'
# File extension/type of input features (e.g., .npy, .esm)
data_type='.prottrans'
# Maximum number of residues to keep per sequence
max_sequence=35

In [85]:
# Load plain text feature data (e.g., .prottrans or .esm)
def loadData(path):
    """Return the plain-text numeric matrix stored at `path` (numpy.loadtxt)."""
    return np.loadtxt(path)

In [86]:
# Load binary NumPy feature data (e.g., .npy from TAPE)
def loadData_tape(path):
    """Return the array stored in the binary NumPy file at `path` (numpy.load)."""
    return np.load(path)

In [87]:
# Save the final concatenated dataset as a .npy file
def saveData(path, data):
    """Print the shape of `data`, then store it at `path` with numpy.save.

    Args:
        path: Destination path handed to ``np.save``.
        data: Array to store; must expose ``.shape``.
    """
    shape = data.shape
    print(shape)  # shown so the dataset dimensions can be checked by eye
    np.save(file=path, arr=data)

In [88]:
# Reshape features from ProtTrans or ESM format
def get_series_feature(org_data, maxseq, length):
    """Pad or truncate one feature matrix to a fixed number of rows.

    Args:
        org_data: Feature matrix with one row per residue and `length`
            columns. A 1-D vector is treated as a single row; this is what
            ``np.loadtxt`` returns for a one-residue file and previously
            raised IndexError.
        maxseq: Number of rows in the output; extra rows are dropped and
            missing rows stay zero.
        length: Number of feature columns.

    Returns:
        numpy.ndarray: float16 array of shape ``(1, 1, maxseq, length)``.
    """
    org_data = np.atleast_2d(org_data)
    data = np.zeros((maxseq, length), dtype=np.float16)  # zero padding by default
    n_rows = min(len(org_data), maxseq)
    data[:n_rows, :] = org_data[:n_rows, :]
    return data.reshape((1, 1, maxseq, length))  # Reshape for model input

In [89]:
# Reshape features from TAPE format (stored as [1, seq_len, dim])
def get_series_feature_tape(org_data, maxseq, length):
    """Pad or truncate the first matrix of `org_data` to `maxseq` rows.

    Args:
        org_data: Array whose first element is the per-residue feature matrix.
        maxseq: Number of rows in the output.
        length: Number of feature columns.

    Returns:
        numpy.ndarray: float16 array of shape ``(1, 1, maxseq, length)``.
    """
    residues = org_data[0]
    n_residues = len(residues)
    fits = n_residues < maxseq
    # Short input: copy everything and leave the tail zero; otherwise keep
    # only the first `maxseq` rows
    rows = n_residues if fits else maxseq
    chunk = residues if fits else residues[:maxseq, :]
    padded = np.zeros((maxseq, length), dtype=np.float16)
    padded[:rows, :] = chunk
    return padded.reshape((1, 1, maxseq, length))

In [90]:
# Main processing function
def main(path_input, path_output, data_type, maxseq, length):
    """Build one dataset array from every feature file in a directory.

    NOTE: this definition replaces the earlier ``main(fasta_file, output_file)``
    of the embedding section; re-run that cell before embedding again.

    Args:
        path_input: Directory containing the feature files.
        path_output: Destination handed to ``saveData``.
        data_type: File suffix to select (".npy" files are read as TAPE
            arrays, everything else as plain text).
        maxseq: Number of residues kept per sequence.
        length: Number of feature columns.

    Raises:
        ValueError: If no file in `path_input` ends with `data_type`.
    """
    result = []  # one (1, 1, maxseq, length) array per file
    # os.listdir returns names in arbitrary order; sorting keeps the sample
    # order of the saved dataset reproducible
    for i in sorted(os.listdir(path_input)):
        if not i.endswith(data_type):
            continue
        # Use the listed name as-is: rebuilding it from split(".")[0] pointed
        # at the wrong file whenever the stem itself contained a dot
        file_path = os.path.join(path_input, i)
        if data_type == ".npy":
            result.append(get_series_feature_tape(loadData_tape(file_path), maxseq, length))
        else:
            result.append(get_series_feature(loadData(file_path), maxseq, length))
    if not result:
        raise ValueError(f"No '{data_type}' files found in {path_input!r}")
    data = np.concatenate(result, axis=0)  # Combine all samples
    saveData(path_output, data)  # Save final dataset


In [91]:
# Entry point: build the dataset from the settings of the configuration cell
maxseq = max_sequence

# Feature-vector length per input file type; unknown types fall back to 20
length = {".prottrans": 1024, ".esm": 1280, ".npy": 768}.get(data_type, 20)

main(path_input, path_output, data_type, maxseq, length)


(1343, 1, 35, 1024)


## <center> Loading Data </center>

In [92]:
from sklearn.utils import shuffle
from tqdm import tqdm
import tensorflow as tf
print('(tf.__version__):',tf.__version__)
import gc

(tf.__version__): 2.10.1


In [93]:
def MCNN_data_load():
    """Load the training and testing arrays for the MCNN model.

    Reads the positive/negative .npy files below ``dataset/Train`` and
    ``dataset/Test`` (relative to the working directory) through `data_load`.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)``.
    """
    # (positive file, negative file) for each split
    train_files = ("dataset/Train/pos.npy", "dataset/Train/neg.npy")
    test_files = ("dataset/Test/pos.npy", "dataset/Test/neg.npy")

    x_train, y_train = data_load(*train_files)
    x_test, y_test = data_load(*test_files)

    return x_train, y_train, x_test, y_test

def data_load(folder1,folder2):
    """Load two .npy arrays, stack them and build one-hot labels.

    Args:
        folder1 (str): Path of the .npy file whose samples get label 1.
        folder2 (str): Path of the .npy file whose samples get label 0.

    Returns:
        tuple: ``(x, y)`` where ``x`` holds the samples of `folder1` followed
        by those of `folder2`, and ``y`` is the matching two-column one-hot
        label array.
    """
    positives = np.load(folder1)
    negatives = np.load(folder2)

    # Samples: first file first, second file after it
    x = np.concatenate([positives, negatives], axis=0)

    # Labels in the same order: ones for the first file, zeros for the second
    y = np.concatenate([np.ones(positives.shape[0]), np.zeros(negatives.shape[0])], axis=0)
    y = tf.keras.utils.to_categorical(y, 2)

    # Release memory that is no longer referenced
    gc.collect()

    return x, y

## <center> Main </center>

In [94]:
# !pip install protobuf==3.20.*

import h5py
import pickle
from tqdm import tqdm
from time import gmtime, strftime
import math
from sklearn.decomposition import IncrementalPCA
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.metrics import roc_curve
from tensorflow.keras import layers,Model
from sklearn.model_selection import KFold

# import sys
# sys.path.append("code/")
# import loading_data as load_data

### setting of parameter

In [95]:
MAXSEQ = 35         # Sequence length used for the model input shape and the pooling size.

DATA_TYPE = "prottrans"         # Type of data; only printed in the cells shown here. Options listed by the author: "ProtTrans", "tape", "esm2", "esm1b".

NUM_FEATURE = 1024          # Number of feature dimensions: 1024 for ProtTrans, 768 for tape, 1280 for esm2 and esm1b.

NUM_FILTER = 64         # Number of filters in each convolutional layer.

NUM_HIDDEN = 256            # Number of hidden units in the dense layer.

BATCH_SIZE  = 256       # Batch size for training the model.

WINDOW_SIZES = [4,8,16]     # Window sizes of the convolutional filters (one branch per size).

NUM_CLASSES = 2
CLASS_NAMES = ['Negative','Positive']       # Labels of the dataset.

EPOCHS      = 50        # Number of epochs for training the model.

K_Fold = 5      # Number of folds for cross validation.

VALIDATION_MODE="cross"     # Validation mode. Options are "cross", "independent".

# Echo the configuration so it is recorded next to the results
print("\nMCNN_MC\n")
print("The setting of sequence length: ",MAXSEQ)
print("The number of filters in the convolutional layer: ",NUM_FILTER)
print("The number of hidden units in the dense layer: ",NUM_HIDDEN)
print("The batch size for training the model: ",BATCH_SIZE)
print("The window sizes for convolutional filters: ",WINDOW_SIZES)
print("The validation mode: ",VALIDATION_MODE)
print("The type of data: ",DATA_TYPE)
print("The number of data feature dimensions: ",NUM_FEATURE)


MCNN_MC

The setting of sequence length:  35
The number of filters in the convolutional layer:  64
The number of hidden units in the dense layer:  256
The batch size for training the model:  256
The window sizes for convolutional filters:  [4, 8, 16]
The validation mode:  cross
The type of data:  prottrans
The number of data feature dimensions:  1024


### Data Generator

In [96]:
# Batch generator passed to model.fit
class DataGenerator(tf.keras.utils.Sequence):
    """Serve (data, labels) mini-batches to ``model.fit``.

    Samples are drawn through a shuffled index array that is reshuffled after
    every epoch. Without this, batches are consecutive slices of `data`;
    `data_load` stacks all positives before all negatives, so most slices
    would contain a single class.

    Args:
        data: Indexable collection of samples.
        labels: Indexable collection of labels, aligned with `data`.
        batch_size: Number of samples per batch (the last batch may be smaller).
        shuffle: Shuffle the sample order initially and after each epoch.
            Pass False to get the samples in their stored order.
        seed: Optional seed for the shuffling, for reproducible runs.
    """

    def __init__(self, data, labels, batch_size, shuffle=True, seed=None):
        self.data = data
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self._rng = np.random.default_rng(seed)
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            self._rng.shuffle(self.indexes)

    def __len__(self):
        """Number of batches per epoch."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as ``(data array, label array)``."""
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_data = [self.data[i] for i in batch_indexes]
        batch_labels = [self.labels[i] for i in batch_indexes]
        return np.array(batch_data), np.array(batch_labels)

    def on_epoch_end(self):
        """Keras hook: draw a new sample order for the next epoch."""
        if self.shuffle:
            self._rng.shuffle(self.indexes)

### MCNN model

In [97]:
class DeepScan(Model):
    """Multi-window CNN classifier.

    One Conv2D -> MaxPooling2D -> Flatten branch is built per window size;
    the branch outputs are concatenated, passed through dropout and a hidden
    dense layer, and classified by a softmax layer with NUM_CLASSES outputs.

    Args:
        input_shape: Shape of one sample, ``(1, sequence_length, features)``.
        window_sizes: Convolution widths along the sequence axis, one branch
            each. Every value must not exceed the sequence length.
        num_filters: Number of filters per convolution.
        num_hidden: Number of units in the hidden dense layer.

    Raises:
        ValueError: If a window size is larger than the sequence length.
    """

    def __init__(self,
                 input_shape=(1, MAXSEQ, NUM_FEATURE),
                 window_sizes=[32],
                 num_filters=256,
                 num_hidden=1000):
        # Initialize the parent class
        super(DeepScan, self).__init__()
        # Initialize the input layer
        self.input_layer = tf.keras.Input(input_shape)
        # Copy, so the shared default list (or a caller's list) is never aliased
        self.window_sizes = list(window_sizes)
        # Pooling must span the sequence length of *this* model, which is
        # given by input_shape and not necessarily the global MAXSEQ
        seq_len = input_shape[1]
        # Initialize lists to store convolution, pooling, and flatten layers
        self.conv2d = []
        self.maxpool = []
        self.flatten = []
        # Create corresponding convolution, pooling, and flatten layers for each window size
        for window_size in self.window_sizes:
            if window_size > seq_len:
                raise ValueError(
                    f"window size {window_size} exceeds sequence length {seq_len}")
            self.conv2d.append(
                layers.Conv2D(filters=num_filters,
                              kernel_size=(1, window_size),
                              activation=tf.nn.relu,
                              padding='valid',
                              bias_initializer=tf.constant_initializer(0.1),
                              kernel_initializer=tf.keras.initializers.GlorotUniform())
            )
            # Pool over every valid convolution position (global max per filter)
            self.maxpool.append(
                layers.MaxPooling2D(pool_size=(1, seq_len - window_size + 1),
                                    strides=(1, seq_len),
                                    padding='valid')
            )
            self.flatten.append(
                layers.Flatten()
            )
        # Initialize Dropout layer to prevent overfitting
        self.dropout = layers.Dropout(rate=0.7)
        # Initialize the first fully connected layer
        self.fc1 = layers.Dense(num_hidden,
                                activation=tf.nn.relu,
                                bias_initializer=tf.constant_initializer(0.1),
                                kernel_initializer=tf.keras.initializers.GlorotUniform()
        )
        # Initialize the output layer with softmax activation
        self.fc2 = layers.Dense(NUM_CLASSES,
                                activation='softmax',
                                kernel_regularizer=tf.keras.regularizers.l2(1e-3)
        )
        # Get the output layer by calling the call method
        self.out = self.call(self.input_layer)

    def call(self, x, training=False):
        """Run the forward pass.

        Args:
            x: Batch of inputs shaped like `input_shape` with a leading batch axis.
            training: Enables dropout when True.

        Returns:
            Softmax class probabilities, one row per sample.
        """
        # List to store outputs of convolution, pooling, and flatten layers
        _x = []
        # Perform convolution, pooling, and flatten operations for each window size
        for i in range(len(self.window_sizes)):
            x_conv = self.conv2d[i](x)
            x_maxp = self.maxpool[i](x_conv)
            x_flat = self.flatten[i](x_maxp)
            _x.append(x_flat)
        # Concatenate the outputs of all flatten layers
        x = tf.concat(_x, 1)
        # Apply Dropout layer
        x = self.dropout(x, training=training)
        # Apply the first fully connected layer
        x = self.fc1(x)
        # Apply the output layer
        x = self.fc2(x)
        return x


### Training

In [100]:
# Load the train/test arrays and report their shapes and dtypes.
# Note: x_test / y_test come from the dataset/Test files; the messages below
# label them "validation".
x_train,y_train,x_test,y_test= MCNN_data_load()

print("The shape of training dataset :",x_train.shape)
print("The data type of training dataset :",x_train.dtype)
print("The shape of training label :",y_train.shape)
print("The shape of validation dataset :",x_test.shape)
print("The data type of validation dataset :",x_test.dtype)
print("The shape of validation label :",y_test.shape)
print("\n")

The shape of training dataset : (5608, 1, 35, 1024)
The data type of training dataset : float16
The shape of training label : (5608, 2)
The shape of validation dataset : (1404, 1, 35, 1024)
The data type of validation dataset : float16
The shape of validation label : (1404, 2)




In [101]:
def model_test(model, x_test, y_test):
    """Evaluate a trained model and print/return its classification metrics.

    The decision threshold is the ROC threshold with the highest geometric
    mean of sensitivity and specificity, selected on the very data passed in
    here, so the thresholded metrics are optimistic.

    Args:
        model: Object with ``predict(x)`` returning two-column class scores.
        x_test: Evaluation samples.
        y_test: One-hot labels; column 1 marks the positive class.

    Returns:
        tuple: ``(TP, FP, TN, FN, Sens, Spec, Acc, MCC, AUC, F1, Prec, Recall)``.
        Counts are Python ints. Any ratio whose denominator is zero is
        reported as 0.0.
    """
    # Generate predictions for the test data
    pred_test = model.predict(x_test)

    # Calculate the false positive rate, true positive rate, and thresholds
    fpr, tpr, thresholds = roc_curve(y_test[:, 1], pred_test[:, 1])
    # Calculate the Area Under the Curve (AUC) for the ROC curve
    AUC = metrics.auc(fpr, tpr)
    # Display the ROC curve (skipped during cross-validation)
    if (VALIDATION_MODE!="cross"):
        display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=AUC, estimator_name='mCNN')
        display.plot()

    # Calculate the geometric mean for each threshold
    gmeans = np.sqrt(tpr * (1 - fpr))
    # Locate the index of the largest geometric mean
    ix = np.argmax(gmeans)
    print(f'\nBest Threshold={thresholds[ix]}, G-Mean={gmeans[ix]}')
    # Set the threshold to the one with the highest geometric mean
    threshold = thresholds[ix]
    # Generate binary predictions based on the threshold
    y_pred = (pred_test[:, 1] >= threshold).astype(int)

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs, so
    # the four-way unpacking cannot fail; Python ints avoid fixed-width overflow
    y_true = y_test[:, 1].astype(int)
    TN, FP, FN, TP = (int(v) for v in
                      metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel())
    total = TP + FP + TN + FN
    # Calculate Sensitivity (Recall)
    Sens = TP / (TP + FN) if TP + FN > 0 else 0.0
    # Calculate Specificity
    Spec = TN / (FP + TN) if FP + TN > 0 else 0.0
    # Calculate Accuracy
    Acc = (TP + TN) / total if total > 0 else 0.0
    # Calculate Matthews Correlation Coefficient (MCC)
    mcc_den = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
    MCC = (TP * TN - FP * FN) / math.sqrt(mcc_den) if mcc_den > 0 else 0.0
    # Calculate F1 Score
    F1 = 2 * TP / (2 * TP + FP + FN) if 2 * TP + FP + FN > 0 else 0.0
    # Calculate Precision
    Prec = TP / (TP + FP) if TP + FP > 0 else 0.0
    # Recall is the same quantity as sensitivity
    Recall = Sens

    # Print the performance metrics
    print(f'TP={TP}, FP={FP}, TN={TN}, FN={FN}, Sens={Sens:.4f}, Spec={Spec:.4f}, Acc={Acc:.4f}, MCC={MCC:.4f}, AUC={AUC:.4f}, F1={F1:.4f}, Prec={Prec:.4f}, Recall={Recall:.4f}\n')

    # Return the performance metrics
    return TP, FP, TN, FN, Sens, Spec, Acc, MCC, AUC, F1, Prec, Recall


In [102]:
if(VALIDATION_MODE == "cross"):
    # K-fold cross-validation on the training set: each fold trains a fresh
    # model and evaluates it on the held-out part.
    kfold = KFold(n_splits=K_Fold, shuffle=True, random_state=2)

    results = []  # One row of metrics per fold
    i = 1  # Counter for fold number

    # Iterate over each split of the dataset
    for train_index, test_index in kfold.split(x_train):
        print(f"{i} / {K_Fold}\n")

        # Split the data into training and held-out sets for the current fold
        X_train, X_test = x_train[train_index], x_train[test_index]
        Y_train, Y_test = y_train[train_index], y_train[test_index]

        # Print the shapes of the training and held-out datasets
        print("The shape of training dataset of cross validation:", X_train.shape)
        print("The shape of training label of cross validation:", Y_train.shape)
        print("The shape of validation dataset of cross validation:", X_test.shape)
        print("The shape of validation label of cross validation:", Y_test.shape)
        print("\n")

        # Create a data generator for the training data
        generator = DataGenerator(X_train, Y_train, batch_size=BATCH_SIZE)

        # Initialize the DeepScan model
        model = DeepScan(
            num_filters=NUM_FILTER,
            num_hidden=NUM_HIDDEN,
            window_sizes=WINDOW_SIZES
        )

        # Compile the model with Adam optimizer and binary cross-entropy loss
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # Build the model with the input shape of the training data
        model.build(input_shape=X_train.shape)

        # Print the model summary
        model.summary()

        # Train the model; stop early once the training loss stalls for 10 epochs
        history = model.fit(
            generator,
            epochs=EPOCHS,
            callbacks=[tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)],
            verbose=1,
            shuffle=True
        )

        # Test the model on the held-out fold and get performance metrics
        TP, FP, TN, FN, Sens, Spec, Acc, MCC, AUC, F1, Prec, Recall = model_test(model, X_test, Y_test)

        # Append the results to the list
        results.append([TP, FP, TN, FN, Sens, Spec, Acc, MCC, AUC, F1, Prec, Recall])

        # Increment the fold counter
        i += 1

        # Clear the fold data from memory
        del X_train
        del X_test
        del Y_train
        del Y_test
        gc.collect()

    # Calculate the mean results across all folds
    mean_results = np.mean(results, axis=0)

    # Print the mean results of the cross-validation.
    # Column order matches the list appended above: index 10 is Prec and
    # index 11 is Recall (the Recall field used to repeat index 10).
    print(f"The mean of {K_Fold}-Fold cross-validation results:")
    print(f'TP={mean_results[0]:.4}, FP={mean_results[1]:.4}, TN={mean_results[2]:.4}, FN={mean_results[3]:.4}, '
          f'Sens={mean_results[4]:.4}, Spec={mean_results[5]:.4}, Acc={mean_results[6]:.4}, MCC={mean_results[7]:.4}, AUC={mean_results[8]:.4}, F1={mean_results[9]:.4}, Prec={mean_results[10]:.4}, Recall={mean_results[11]:.4}\n')

1 / 5

The shape of training dataset of cross validation: (4486, 1, 35, 1024)
The shape of training label of cross validation: (4486, 2)
The shape of validation dataset of cross validation: (1122, 1, 35, 1024)
The shape of validation label of cross validation: (1122, 2)


Model: "deep_scan"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 1, 32, 64)         262208    
                                                                 
 conv2d_1 (Conv2D)           (None, 1, 28, 64)         524352    
                                                                 
 conv2d_2 (Conv2D)           (None, 1, 20, 64)         1048640   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, 1, 64)         0         
 )                                                               
                                                

In [103]:
if VALIDATION_MODE == "independent":
    # Independent validation: train on the complete training set, then
    # evaluate once on the separate test set.
    generator = DataGenerator(x_train, y_train, batch_size=BATCH_SIZE)

    # Multi-window CNN configured from the notebook-level hyperparameters
    model = DeepScan(
        window_sizes=WINDOW_SIZES,
        num_filters=NUM_FILTER,
        num_hidden=NUM_HIDDEN,
    )

    # Adam optimizer with binary cross-entropy loss, tracking accuracy
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Create the weights for the training-data shape and show the layer table
    model.build(input_shape=x_train.shape)
    model.summary()

    # Fit for the configured number of epochs
    model.fit(generator, epochs=EPOCHS, shuffle=True)

    # Metrics on the independent test set
    TP, FP, TN, FN, Sens, Spec, Acc, MCC, AUC, F1, Prec, Recall = model_test(model, x_test, y_test)

    # Repeat the metric line at the end of the cell output
    print(f'TP={TP}, FP={FP}, TN={TN}, FN={FN}, Sens={Sens:.4f}, Spec={Spec:.4f}, Acc={Acc:.4f}, MCC={MCC:.4f}, AUC={AUC:.4f}, F1={F1:.4f}, Prec={Prec:.4f}, Recall={Recall:.4f}\n')
