In [36]:
! pip install -U scikit-learn
! pip install -U pyarrow



In [37]:
import numpy as np
import pandas as pd     # For loading and processing the dataset
import gc
import tensorflow as tf   
from tensorflow import keras
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,MinMaxScaler
from sklearn.metrics import mutual_info_score
from sklearn.decomposition import PCA

In [38]:
gc.collect() # For performing a full garbage collection

289

# **Reading and cleaning the input data**
I read the CSV input file using Pandas. Next, I remove irrelevant entries, and prepare the data for our neural network.

In [39]:
# Read the CSV input file and show first 5 rows
data=pd.read_csv('/kaggle/input/affinity-data/binding_affinity_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,MHC_sequence,MHC_type,peptide_sequence,label
0,0,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,ERLKEVQKR,1
1,1,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,KPRKTAEVAGKTL,1
2,2,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,KEARRIIKK,1
3,3,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,EEKITEAKEL,0
4,4,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...,HLA-B*27:05,SLPSSRAARVPG,0


In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1793065 entries, 0 to 1793064
Data columns (total 5 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   Unnamed: 0        int64 
 1   MHC_sequence      object
 2   MHC_type          object
 3   peptide_sequence  object
 4   label             int64 
dtypes: int64(2), object(3)
memory usage: 68.4+ MB


# **Remove unused features**
I should delete unused features from my dataset to increase performance and decrease complexity. some features like *id* or *Unnamed : 0* (in our dataset) is obviously unused so I can easily delete them but for another feature I should use some techniques which name is **feature selection**.
by using feature selection techniques, I can identify and delete the most irrelevant features from the model's features.

In [41]:
# We can't do anything with the 'Unnamed: 0', so we drop it.
data = data.drop(['Unnamed: 0'], axis=1)

before using feature selection, I should do some preprocessing stages for preparing data for that operation.

# **Handling missing values**

My dataset may contains missing values, I should choose to either remove the samples with missing values or impute the missing values using techniques like mean imputation or regression or KNN imputation.

In [42]:
data.isnull().sum()

MHC_sequence        0
MHC_type            0
peptide_sequence    0
label               0
dtype: int64

As you can see I dont have missing values in my dataset.

# **Encoding Stage**
The encoding stage in preprocessing data for neural networks involves transforming the raw input data into a format that can be effectively processed by the neural network model.

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1793065 entries, 0 to 1793064
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   MHC_sequence      object
 1   MHC_type          object
 2   peptide_sequence  object
 3   label             int64 
dtypes: int64(1), object(3)
memory usage: 54.7+ MB


I need to convert three object-type features into numerical representations. Initially, I use ordinal encoding to transform the MHC_type features into a valid structure. This approach is chosen due to the large amount of data and limited resources available. Additionally, it has been proven that ordinal encoding does not affect the performance of the neural network prediction in this case.


In [44]:
encoder = OrdinalEncoder()

In [45]:
data['MHC_type'] = encoder.fit_transform(data[['MHC_type']])

for two other features ("MHC_sequence" and "peptide_sequence") I use character-level encoding approach. I want to encode strings into numeric values while considering their similarity and allowing close strings to have close numeric values because of that I use that approach.

In [46]:
def encode_string(string,length):
    encoding = []
    for char in string:
        encoding.append(ord(char))  # Get the ASCII code for each character
    
    for i in range(length-len(string)):
        encoding.append(0)
        
    return encoding

max_len_mhc=sorted([len(i) for i in data['MHC_sequence']])[-1]
max_len_peptide=sorted([len(i) for i in data['peptide_sequence']])[-1]

mhc_sequence=[]
peptide_sequence=[]

for i in data['MHC_sequence'].values:
    mhc_sequence.append(encode_string(i,max_len_mhc))
    
for i in data['peptide_sequence'].values:
    peptide_sequence.append(encode_string(i,max_len_peptide))

I utilize the Principal Component Analysis (PCA) algorithm as a means to effectively reduce the dimensionality of the dataset by applying it to these two specific features. This allows me to extract the most significant information from the features while minimizing the loss of valuable data.

In [47]:
mhc_sequence=np.array(mhc_sequence).astype(np.float64)
peptide_sequence=np.array(peptide_sequence).astype(np.float64)

pca = PCA(n_components=1)  # Specify the number of components you want to keep
data['MHC_sequence'] = pca.fit_transform(mhc_sequence)
data['peptide_sequence']=pca.fit_transform(peptide_sequence)

In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1793065 entries, 0 to 1793064
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   MHC_sequence      float64
 1   MHC_type          float64
 2   peptide_sequence  float64
 3   label             int64  
dtypes: float64(3), int64(1)
memory usage: 54.7 MB


At this stage, we can see that the two features MHC_sequence and MHC_type are very similar and relevent.  In order to optimize memory consumption, we merge the two columns using the PCA algorithm.

In [49]:
scaler = MinMaxScaler()
data['MHC_sequence']=scaler.fit_transform(data[['MHC_sequence']])
data['peptide_sequence']=scaler.fit_transform(data[['peptide_sequence']])
data['MHC_type']=scaler.fit_transform(data[['MHC_type']])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1793065 entries, 0 to 1793064
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   MHC_sequence      float64
 1   MHC_type          float64
 2   peptide_sequence  float64
 3   label             int64  
dtypes: float64(3), int64(1)
memory usage: 54.7 MB


# **Remove noise values**

To detect outliers in datasets, the z-score method can be utilized. This involves comparing data points with a threshold and subsequently eliminating any noisy values.

In [50]:
y = data['label']

data = data.drop(['label'],axis=1)
z_scores = np.abs(stats.zscore(data))

# Define a threshold for outlier detection
threshold = 3

# Exclude rows with outliers in at least one column
data = data[(z_scores < threshold).all(axis=1)]
y = y[(z_scores < threshold).all(axis=1)]

In [51]:
print(mutual_info_score(data['MHC_sequence'], data['MHC_type']))

3.990176283634478




At first, I thought that two features MHC_sequence and MHC_type are relevant, but after this examination, you can see that the mutual information rate isn't really high. so I cant merge this two features.

# **Train-Test Split**
The purpose of this split is to evaluate the model's performance on unseen data and assess its ability to generalize.

In [52]:
x_train, x_test, y_train, y_test  = train_test_split(data, y, test_size=0.2, random_state=42)

In [53]:
x_train=x_train.to_numpy()
y_train=y_train.to_numpy()

x_test=x_test.to_numpy()
y_test=y_test.to_numpy()

x_train.shape

(1414369, 3)

# **Building Feed Forward Neural Network**
I also use **Regularization techniques** such as L1 and L2 regularization for preventing overfit in my feed-forward neural networks.

In [54]:
model = keras.models.Sequential()

model.add(keras.layers.Dense(units=2,activation='relu',input_shape=(3,))) # Input Layer
model.add(keras.layers.BatchNormalization())

model.add(keras.layers.Dense(units=16, activation='sigmoid'))
model.add(keras.layers.BatchNormalization())

model.add(keras.layers.Dense(units=8, activation='relu',kernel_regularizer=keras.regularizers.l1(0.01)))
model.add(keras.layers.BatchNormalization())

model.add(keras.layers.Dense(units=8, activation='sigmoid'))
model.add(keras.layers.BatchNormalization())

model.add(keras.layers.Dense(units=4, activation='relu',kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(keras.layers.BatchNormalization())

model.add(keras.layers.Dense(units=4, activation='sigmoid'))
model.add(keras.layers.BatchNormalization())

model.add(keras.layers.Dense(units=1, activation='softmax')) # Output Layer

learning_rate = 0.005
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [55]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 2)                 8         
                                                                 
 batch_normalization_6 (Bat  (None, 2)                 8         
 chNormalization)                                                
                                                                 
 dense_8 (Dense)             (None, 16)                48        
                                                                 
 batch_normalization_7 (Bat  (None, 16)                64        
 chNormalization)                                                
                                                                 
 dense_9 (Dense)             (None, 8)                 136       
                                                                 
 batch_normalization_8 (Bat  (None, 8)                

In [56]:
model.fit(
    x_train,
    y_train,
    batch_size=1000,
    epochs=5,
    shuffle=True
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d1160528790>

In [57]:
model.evaluate(x_test, y_test, batch_size=100)



[0.5307197570800781, 0.199757918715477]

My model has a underfitting problem. I can change features structure for example using only character-level encoding without using pca but it doesn't work for me because I have finite resources. So I should provide a deeper understanding and more accurate interpretation of the data for feature selection and encoding data to fix this problem and aslo balance the trade-off between memory and time.