# XGBoost predictor on peptide descriptors

In this notebook we demonstrate the application of `peptidy` in predicting antimicrobial peptides (AMPs) using XGBoost. `peptidy` is used for feature extraction from amino acid sequences, after which the XGBoost algorithm is used for classification.


In [1]:
!pip install peptidy

Collecting peptidy
  Downloading peptidy-0.0.1-py3-none-any.whl.metadata (5.1 kB)
Downloading peptidy-0.0.1-py3-none-any.whl (21 kB)
Installing collected packages: peptidy
Successfully installed peptidy-0.0.1


In [2]:
import pandas as pd
from peptidy.descriptors import compute_descriptors

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb

### Load a dataframe with peptides

In [3]:
csv_url = 'https://raw.githubusercontent.com/AryaVenkatesh2010/AryaAIProject/refs/heads/main/subsample_AMP.csv'
subsample_AMP = pd.read_csv(csv_url)

X=subsample_AMP.drop('active',axis=1)
y=subsample_AMP['active']

### Encode and split the data

In [4]:
# Encode the data using peptidy
X_encoded = subsample_AMP['sequence'].apply(compute_descriptors)
X_encoded = pd.DataFrame(X_encoded.tolist(), index= X.index)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [5]:
X_encoded

Unnamed: 0,aliphatic_index,freq_A,freq_C,freq_D,freq_E,freq_F,freq_G,freq_H,freq_I,freq_K,...,n_C,n_H,n_N,n_O,n_S,n_P,molecular_weight,n_h_donors,n_h_acceptors,topological_polar_surface_area
0,35.0,0.071429,0.0,0.071429,0.0,0.0,0.071429,0.071429,0.0,0.142857,...,67,110,22,21,1,0,1555.755,38,53,1184.2
1,86.666667,0.044444,0.0,0.088889,0.066667,0.022222,0.022222,0.066667,0.066667,0.0,...,247,369,69,74,1,0,5485.97,125,172,3914.5
2,140.555556,0.055556,0.0,0.0,0.166667,0.0,0.055556,0.0,0.111111,0.0,...,85,147,23,30,0,0,1971.215,46,66,1463.5
3,97.5,0.0,0.0,0.15,0.05,0.05,0.05,0.0,0.05,0.1,...,97,159,23,34,1,0,2223.475,51,76,1604.0
4,150.0,0.0,0.0,0.090909,0.090909,0.0,0.090909,0.0,0.090909,0.090909,...,56,102,18,16,0,0,1283.53,29,40,927.1
5,35.454545,0.0,0.045455,0.136364,0.136364,0.090909,0.045455,0.045455,0.0,0.045455,...,116,163,31,41,1,0,2643.755,61,88,1914.2
6,59.722222,0.138889,0.0,0.0,0.194444,0.027778,0.055556,0.027778,0.0,0.055556,...,166,259,45,61,1,0,3857.115,92,136,2935.7
7,75.0,0.076923,0.0,0.115385,0.038462,0.038462,0.0,0.0,0.038462,0.076923,...,132,213,37,44,0,0,3022.315,73,100,2225.4
8,45.882353,0.058824,0.0,0.058824,0.294118,0.058824,0.0,0.0,0.0,0.058824,...,91,138,22,33,0,0,2141.25,47,68,1478.1
9,101.458333,0.0,0.041667,0.041667,0.125,0.0625,0.0625,0.041667,0.104167,0.083333,...,261,401,65,74,3,0,5657.505,125,180,3894.0


### Define the model

In [6]:
def GB_model(n_estimators=10,
             max_depth=10,
             gamma=0.1,
             reg_alpha=0.5,
             min_child_weight=int(5),
             colsample_bytree=0.1,
             learning_rate=0.1,
             subsample=0.9,
             reg_lambda=0.5,
             objective='binary:logistic',
             eval_metric='logloss',):

    model=xgb.XGBClassifier(n_estimators =int(n_estimators), max_depth = int(max_depth), gamma = gamma,
                           reg_alpha = reg_alpha,min_child_weight= min_child_weight,learning_rate=learning_rate,
                           subsample=subsample,reg_lambda=reg_lambda,objective=objective,
                           colsample_bytree=colsample_bytree, eval_metric=eval_metric)
    return model

### Fit the model to the data

In [7]:
model = GB_model(max_depth=10, n_estimators=7, gamma=0.5, learning_rate=0.001,
                 subsample=0.8, reg_alpha=0.8, reg_lambda=1,
                 min_child_weight=2, objective='binary:logistic', eval_metric='logloss',
                 colsample_bytree=0.9)

evaluation = [( X_train, y_train), ( X_test, y_test)]

fitted_model=model.fit(X_train, y_train,
    eval_set=evaluation,
    verbose=0)

### Evaluate the model

In [8]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.3
