### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


### Data Loading and EDA

In [64]:
# Read the semicolon-delimited dataset and preview its first rows
csv_path = 'df/df.csv'
df = pd.read_csv(csv_path, sep=';')
df.head()

Unnamed: 0,Info_PepID,Info_organism_id,Info_protein_id,Info_pos,Info_AA,Info_epitope_id,Info_nPos,Info_nNeg,Info_cluster,Class,...,feat_esm1b_1271,feat_esm1b_1272,feat_esm1b_1273,feat_esm1b_1274,feat_esm1b_1275,feat_esm1b_1276,feat_esm1b_1277,feat_esm1b_1278,feat_esm1b_1279,feat_esm1b_1280
0,XP_809948.1:15,5693,XP_809948.1,218,E,276453,1,0,18,1,...,-0.177248,-0.275432,0.17578,-2.618115,-0.306478,0.248409,-0.144645,-0.163276,0.101016,0.571763
1,XP_809948.1:15,5693,XP_809948.1,219,E,276453,1,0,18,1,...,-0.089673,-0.258615,0.063662,-2.660741,-0.227916,0.233011,-0.14384,-0.119282,0.146848,0.507656
2,XP_809948.1:15,5693,XP_809948.1,220,D,276453,1,0,18,1,...,-0.037816,-0.205189,0.220458,-2.610207,-0.156326,0.231606,0.251916,0.07703,0.326192,0.071661
3,XP_809948.1:15,5693,XP_809948.1,221,N,276453,1,0,18,1,...,-0.249216,-0.211748,-0.130716,-2.598186,-0.130838,-0.026848,0.167203,-0.394126,0.222274,0.376358
4,XP_809948.1:15,5693,XP_809948.1,222,E,276453,1,0,18,1,...,-0.039318,-0.250912,-0.09387,-2.414023,-0.2535,0.118345,-0.160718,-0.07675,0.448067,0.326883


In [65]:
# List all column names of the loaded frame
df.columns

Index(['Info_PepID', 'Info_organism_id', 'Info_protein_id', 'Info_pos',
       'Info_AA', 'Info_epitope_id', 'Info_nPos', 'Info_nNeg', 'Info_cluster',
       'Class',
       ...
       'feat_esm1b_1271', 'feat_esm1b_1272', 'feat_esm1b_1273',
       'feat_esm1b_1274', 'feat_esm1b_1275', 'feat_esm1b_1276',
       'feat_esm1b_1277', 'feat_esm1b_1278', 'feat_esm1b_1279',
       'feat_esm1b_1280'],
      dtype='object', length=1291)

In [66]:
# Summarise the frame: index range, column count, dtype breakdown and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49606 entries, 0 to 49605
Columns: 1291 entries, Info_PepID to feat_esm1b_1280
dtypes: float64(1281), int64(4), object(6)
memory usage: 488.6+ MB


In [67]:
# (rows, columns) of the frame as loaded
df.shape

(49606, 1291)

In [68]:
# Remove the identifier columns; they are not used for model training.
ids_columns = ['Info_PepID', 'Info_organism_id', 'Info_protein_id', 'Info_epitope_id']
# Reassign instead of using inplace=True, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError after the first execution.
df = df.drop(columns=ids_columns, errors='ignore')

In [69]:
# (rows, columns) after dropping the identifier columns
df.shape

(49606, 1287)

In [70]:
# Count missing values per column
df.isnull().sum()

Info_pos            0
Info_AA             0
Info_nPos           0
Info_nNeg           0
Info_cluster        0
                   ..
feat_esm1b_1276    84
feat_esm1b_1277    85
feat_esm1b_1278    83
feat_esm1b_1279    81
feat_esm1b_1280    82
Length: 1287, dtype: int64

In [71]:
# Names of the columns that contain at least one missing value
has_missing = df.isna().any(axis=0)
null_columns = df.columns[has_missing]
null_columns

Index(['feat_esm1b_0', 'feat_esm1b_1', 'feat_esm1b_2', 'feat_esm1b_3',
       'feat_esm1b_4', 'feat_esm1b_5', 'feat_esm1b_6', 'feat_esm1b_7',
       'feat_esm1b_8', 'feat_esm1b_9',
       ...
       'feat_esm1b_1271', 'feat_esm1b_1272', 'feat_esm1b_1273',
       'feat_esm1b_1274', 'feat_esm1b_1275', 'feat_esm1b_1276',
       'feat_esm1b_1277', 'feat_esm1b_1278', 'feat_esm1b_1279',
       'feat_esm1b_1280'],
      dtype='object', length=1281)

In [72]:
# Spot-check the missing-value count of one feature column
df['feat_esm1b_285'].isnull().sum()

81

In [74]:
# # threshold
# threshold = 50  # Adjust this to your desired threshold

# # Count null values in each column
# null_counts = df.isnull().sum()

# # Filter columns with null values above the threshold
# columns_above_threshold = null_counts[null_counts > threshold].index


# # List columns with null values above the threshold
# print("Columns with null values above", threshold, ":")
# for col in columns_above_threshold:
#     print(col)

In [77]:
# Check the class balance of the target column `Class`
df.Class.value_counts()

-1    48797
 1      809
Name: Class, dtype: int64

In [79]:
# Columns stored with the generic object dtype are treated as categorical
object_frame = df.select_dtypes(include='object')
categorical_columns = object_frame.columns
categorical_columns

Index(['Info_AA', 'Info_nPos', 'Info_nNeg'], dtype='object')

In [84]:
# list of categorical variables (columns stored with object dtype)
# The previous comment called these "numerical", which contradicted the filter below.
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']
len(categorical_features)

3

In [89]:
# Frequency of each raw Info_nPos value (object dtype; some rows hold comma-separated lists)
df.Info_nPos.value_counts()

0              31940
0,0            12672
0,0,0           3317
1                675
0,0,0,0          674
0,0,0,0,0         76
1,1               63
1,0               61
0,1               35
0,1,1             21
1,1,1             18
1,0,0             14
1,1,1,2           10
1,0,1              9
1,1,0              6
0,0,1              4
0,0,0,0,0,0        3
1,0,1,1,2          2
1,0,1,1            2
0,1,1,2            1
1,1,2              1
0,1,0,0            1
0,1,2              1
Name: Info_nPos, dtype: int64

In [90]:
# Frequency of each raw Info_nNeg value (object dtype; some rows hold comma-separated lists)
df.Info_nNeg.value_counts()

1              31940
1,1            12672
1,1,1           3317
0                675
1,1,1,1          674
1,1,1,1,1         76
0,0               63
0,1               61
1,0               35
1,0,0             22
0,0,0             19
0,1,1             14
0,0,0,0           10
0,1,0              9
0,0,1              6
1,1,0              4
1,1,1,1,1,1        3
0,1,0,0            2
0,1,0,0,0          2
1,0,0,0            1
1,0,1,1            1
Name: Info_nNeg, dtype: int64

In [85]:
# Preview the object-dtype columns collected in `categorical_columns`
df[categorical_columns].head()

Unnamed: 0,Info_AA,Info_nPos,Info_nNeg
0,E,1,0
1,E,1,0
2,D,1,0
3,N,1,0
4,E,1,0


In [80]:
# Select every numeric column. 'number' matches all integer and float widths, whereas
# ['int', 'float'] only matches the default-width types and would silently skip
# columns stored as e.g. float32 or unsigned integers.
numerical_columns = df.select_dtypes(include='number').columns
numerical_columns

Index(['Info_pos', 'Info_cluster', 'Class', 'feat_esm1b_0', 'feat_esm1b_1',
       'feat_esm1b_2', 'feat_esm1b_3', 'feat_esm1b_4', 'feat_esm1b_5',
       'feat_esm1b_6',
       ...
       'feat_esm1b_1271', 'feat_esm1b_1272', 'feat_esm1b_1273',
       'feat_esm1b_1274', 'feat_esm1b_1275', 'feat_esm1b_1276',
       'feat_esm1b_1277', 'feat_esm1b_1278', 'feat_esm1b_1279',
       'feat_esm1b_1280'],
      dtype='object', length=1284)

In [83]:
# Collect the names of all columns whose dtype is not object
numerical_features = []
for column_name, column_dtype in df.dtypes.items():
    if column_dtype != 'O':
        numerical_features.append(column_name)
len(numerical_features)

1284

### Feature Engineering

### Model Training

### Model Evaluation