In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [2]:
import pandas as pd

# Load datasets
X_train = pd.read_csv("X_train.csv")  # Peptide sequences
train_labels = pd.read_csv("label_train.csv")  # Corresponding labels

# Combine features and labels
train_data = pd.concat([X_train, train_labels], axis=1)
train_data.columns = ['peptide_sequence', 'label']

# Load test data
X_test = pd.read_csv("X_test.csv")
test_labels = pd.read_csv("label_test.csv")
test_data = pd.concat([X_test, test_labels], axis=1)
test_data.columns = ['peptide_sequence', 'label']

In [3]:
train_data.head()

Unnamed: 0,peptide_sequence,label
0,DDRHKIVNVDQRQYG,1
1,EGNRPTNSIVFTKLT,1
2,TRQGGYSNDNTVIFR,1
3,LHGETFPYTAFDNNC,1
4,VMALEPVVGAAIAAP,1


In [4]:
test_data.head()

Unnamed: 0,peptide_sequence,label
0,TPETLFEIGSVSKTFTAT,1
1,HPGNTILHVDTIYNRPSNTT,1
2,YWAGIEFDVTHKGMALLHRL,1
3,EQGLLYMPQELAVSD,1
4,GARGFFQARHLEMDA,1


In [5]:
len(train_data)

2872

In [6]:
len(test_data)

342

In [7]:
test_data.head()

Unnamed: 0,peptide_sequence,label
0,TPETLFEIGSVSKTFTAT,1
1,HPGNTILHVDTIYNRPSNTT,1
2,YWAGIEFDVTHKGMALLHRL,1
3,EQGLLYMPQELAVSD,1
4,GARGFFQARHLEMDA,1


In [8]:
print(train_data['label'].value_counts())
print(test_data['label'].value_counts())

label
0    1627
1    1245
Name: count, dtype: int64
label
1    171
0    171
Name: count, dtype: int64


# TF-IDF Vectorization 

In [9]:
# Split features (peptide_sequence) and target (label)
X_train = train_data['peptide_sequence']
y_train = train_data['label']
X_test = test_data['peptide_sequence']
y_test = test_data['label']

# Further split train into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize TF-IDF Vectorizer (Monogram and Bigram)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

# Fit on training data and transform both train and test data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Apply SMOTE to handle class imbalance

In [10]:

# Apply SMOTE to handle class imbalance in training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)

# Define function for model training and evaluation
def train_and_evaluate_model(model, X_train, X_val, y_train, y_val, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_val_pred = model.predict(X_val)
    print(f"\n{model.__class__.__name__} - Validation Results")
    print(confusion_matrix(y_val, y_val_pred))
    print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
    print(f"Validation MCC: {matthews_corrcoef(y_val, y_val_pred)}")
    print(classification_report(y_val, y_val_pred))

    # Evaluate on test set
    y_test_pred = model.predict(X_test)
    print(f"\n{model.__class__.__name__} - Test Results")
    print(confusion_matrix(y_test, y_test_pred))
    print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)}")
    print(f"Test MCC: {matthews_corrcoef(y_test, y_test_pred)}")
    print(classification_report(y_test, y_test_pred))

# Initialize models
best_svm = SVC(kernel='linear', C=1)
naive_bayes = MultinomialNB()
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train and evaluate models
print("\nTraining and Evaluating SVM Model...")
train_and_evaluate_model(best_svm, X_train_res, X_val_tfidf, y_train_res, y_val, X_test_tfidf, y_test)

print("\nTraining and Evaluating Naive Bayes Model...")
train_and_evaluate_model(naive_bayes, X_train_res, X_val_tfidf, y_train_res, y_val, X_test_tfidf, y_test)

print("\nTraining and Evaluating Random Forest Model...")
train_and_evaluate_model(random_forest, X_train_res, X_val_tfidf, y_train_res, y_val, X_test_tfidf, y_test)

print("\nTraining and Evaluating Gradient Boosting Model...")
train_and_evaluate_model(gradient_boosting, X_train_res, X_val_tfidf, y_train_res, y_val, X_test_tfidf, y_test)



Training and Evaluating SVM Model...

SVC - Validation Results
[[330   0]
 [245   0]]
Validation Accuracy: 0.5739130434782609
Validation MCC: 0.0
              precision    recall  f1-score   support

           0       0.57      1.00      0.73       330
           1       0.00      0.00      0.00       245

    accuracy                           0.57       575
   macro avg       0.29      0.50      0.36       575
weighted avg       0.33      0.57      0.42       575


SVC - Test Results
[[171   0]
 [171   0]]
Test Accuracy: 0.5
Test MCC: 0.0
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       171
           1       0.00      0.00      0.00       171

    accuracy                           0.50       342
   macro avg       0.25      0.50      0.33       342
weighted avg       0.25      0.50      0.33       342


Training and Evaluating Naive Bayes Model...

MultinomialNB - Validation Results
[[330   0]
 [245   0]]
Validation Accurac

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



RandomForestClassifier - Validation Results
[[330   0]
 [245   0]]
Validation Accuracy: 0.5739130434782609
Validation MCC: 0.0
              precision    recall  f1-score   support

           0       0.57      1.00      0.73       330
           1       0.00      0.00      0.00       245

    accuracy                           0.57       575
   macro avg       0.29      0.50      0.36       575
weighted avg       0.33      0.57      0.42       575


RandomForestClassifier - Test Results
[[171   0]
 [171   0]]
Test Accuracy: 0.5
Test MCC: 0.0
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       171
           1       0.00      0.00      0.00       171

    accuracy                           0.50       342
   macro avg       0.25      0.50      0.33       342
weighted avg       0.25      0.50      0.33       342


Training and Evaluating Gradient Boosting Model...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



GradientBoostingClassifier - Validation Results
[[330   0]
 [245   0]]
Validation Accuracy: 0.5739130434782609
Validation MCC: 0.0
              precision    recall  f1-score   support

           0       0.57      1.00      0.73       330
           1       0.00      0.00      0.00       245

    accuracy                           0.57       575
   macro avg       0.29      0.50      0.36       575
weighted avg       0.33      0.57      0.42       575


GradientBoostingClassifier - Test Results
[[171   0]
 [171   0]]
Test Accuracy: 0.5
Test MCC: 0.0
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       171
           1       0.00      0.00      0.00       171

    accuracy                           0.50       342
   macro avg       0.25      0.50      0.33       342
weighted avg       0.25      0.50      0.33       342



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
