In [12]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

In [13]:
df = pd.read_csv('PhageAcr_ML_dataset_cdhit.csv')       # Read dataset
df.set_index('ID', inplace=True)                        # Set the "ID" column as the index
df = df.rename(columns={'Protein Acr': 'Protein_Acr'})  # Renaming columns to avoid issues later

# Extracting features (X) and target variable (y)
X = df.drop('Protein_Acr', axis=1)    # Keep only the features in variable x
y = df['Protein_Acr']                 # Assign our Target variable as y

# Split the dataset into training+validation (80%) and test sets (20%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   # random_state will allow for the reproductibility of the split

# Split the training+validation set into training (70%) and validation sets (10% of the original data)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.125, random_state=42)

# Checking the shape of the resulting sets
print(f'Training Set \t Shape of X_train: {X_train.shape} \n\t\t Shape of y_train: {y_train.shape} \n')
print(f'Validation Set \t Shape of X_val: {X_val.shape} \n\t\t Shape of y_val: {y_val.shape} \n')
print(f'Test Set \t Shape of X_test: {X_test.shape} \n\t\t Shape of y_test: {y_test.shape}')

Training Set 	 Shape of X_train: (1570, 25) 
		 Shape of y_train: (1570,) 

Validation Set 	 Shape of X_val: (225, 25) 
		 Shape of y_val: (225,) 

Test Set 	 Shape of X_test: (449, 25) 
		 Shape of y_test: (449,)


### Hyperparameter Tuning for Decision Tree:

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters grid for Decision Tree
param_grid_dt = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV for Decision Tree
grid_search_dt = GridSearchCV(dt_model, param_grid_dt, cv=5, scoring='accuracy')

# Perform GridSearchCV
grid_search_dt.fit(X_train, y_train)

# Get the best hyperparameters
best_params_dt = grid_search_dt.best_params_

# Initialize Decision Tree classifier with best hyperparameters
best_dt_model = DecisionTreeClassifier(random_state=42, **best_params_dt)

# Train the Decision Tree classifier with best hyperparameters
best_dt_model.fit(X_train, y_train)

## VALIDATION SET
# Make predictions on the validation set
y_val_pred_dt_tuned = best_dt_model.predict(X_val)

# Calculate evaluation metrics for the validation set
val_accuracy_dt_tuned = accuracy_score(y_val, y_val_pred_dt_tuned)
val_precision_dt_tuned = precision_score(y_val, y_val_pred_dt_tuned)
val_recall_dt_tuned = recall_score(y_val, y_val_pred_dt_tuned)
val_f1_dt_tuned = f1_score(y_val, y_val_pred_dt_tuned)

# Print the evaluation metrics for tuned Decision Tree and compare with the original metrics
print("Decision Tree Validation Set Default Parameters vs Hyperparameter Tuning (CD-HIT) \n")
df_metrics_dt_val_tuned = pd.DataFrame({
    "Parameters": ["Default", "Tuned"],
    "Accuracy": [val_accuracy_dt, val_accuracy_dt_tuned], 
    "Precision": [val_precision_dt, val_precision_dt_tuned], 
    "Recall": [val_recall_dt, val_recall_dt_tuned], 
    "F1-Score": [val_f1_dt, val_f1_dt_tuned]
}).round(2)

print(df_metrics_dt_val_tuned, "\n\n")

## TEST SET
# Make predictions on the test set
y_test_pred_dt_tuned = best_dt_model.predict(X_test)

# Calculate evaluation metrics for the test set
test_accuracy_dt_tuned = accuracy_score(y_test, y_test_pred_dt_tuned)
test_precision_dt_tuned = precision_score(y_test, y_test_pred_dt_tuned)
test_recall_dt_tuned = recall_score(y_test, y_test_pred_dt_tuned)
test_f1_dt_tuned = f1_score(y_test, y_test_pred_dt_tuned)

# Print the evaluation metrics for tuned Decision Tree and compare with the original metrics
print("Decision Tree Test Set Default Parameters vs Hyperparameter Tuning (CD-HIT) \n")
df_metrics_dt_test_tuned = pd.DataFrame({
    "Parameters": ["Default", "Tuned"],
    "Accuracy": [test_accuracy_dt, test_accuracy_dt_tuned], 
    "Precision": [test_precision_dt, test_precision_dt_tuned], 
    "Recall": [test_recall_dt, test_recall_dt_tuned], 
    "F1-Score": [test_f1_dt, test_f1_dt_tuned]
}).round(2)

print(df_metrics_dt_test_tuned)

### Hyperparameter Tuning for RandomForest:

In [None]:
# Define the hyperparameters grid for RandomForest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV for RandomForest
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=5, scoring='accuracy')

# Perform GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Get the best hyperparameters
best_params_rf = grid_search_rf.best_params_

# Initialize RandomForest classifier with best hyperparameters
best_rf_model = RandomForestClassifier(random_state=42, **best_params_rf)

# Train the RandomForest classifier with best hyperparameters
best_rf_model.fit(X_train, y_train)

## VALIDATION SET
# Make predictions on the validation set
y_val_pred_rf_tuned = best_rf_model.predict(X_val)

# Calculate evaluation metrics for the validation set
val_accuracy_rf_tuned = accuracy_score(y_val, y_val_pred_rf_tuned)
val_precision_rf_tuned = precision_score(y_val, y_val_pred_rf_tuned)
val_recall_rf_tuned = recall_score(y_val, y_val_pred_rf_tuned)
val_f1_rf_tuned = f1_score(y_val, y_val_pred_rf_tuned)

# Print the evaluation metrics for tuned Random Forest and compare with the original metrics
print("RandomForest Validation Set Default Parameters vs Hyperparameter Tuning (CD-HIT) \n")
df_metrics_rf_val_tuned = pd.DataFrame({
    "Parameters": ["Default", "Tuned"],
    "Accuracy": [val_accuracy_rf, val_accuracy_rf_tuned], 
    "Precision": [val_precision_rf, val_precision_rf_tuned], 
    "Recall": [val_recall_rf, val_recall_rf_tuned], 
    "F1-Score": [val_f1_rf, val_f1_rf_tuned]
}).round(2)

print(df_metrics_rf_val_tuned, "\n\n")

## TEST SET
# Make predictions on the test set
y_test_pred_rf_tuned = best_rf_model.predict(X_test)

# Calculate evaluation metrics for the test set
test_accuracy_rf_tuned = accuracy_score(y_test, y_test_pred_rf_tuned)
test_precision_rf_tuned = precision_score(y_test, y_test_pred_rf_tuned)
test_recall_rf_tuned = recall_score(y_test, y_test_pred_rf_tuned)
test_f1_rf_tuned = f1_score(y_test, y_test_pred_rf_tuned)

# Print the evaluation metrics for tuned Random Forest and compare with the original metrics
print("RandomForest Test Set Default Parameters vs Hyperparameter Tuning (CD-HIT) \n")
df_metrics_rf_test_tuned = pd.DataFrame({
    "Parameters": ["Default", "Tuned"],
    "Accuracy": [test_accuracy_rf, test_accuracy_rf_tuned], 
    "Precision": [test_precision_rf, test_precision_rf_tuned], 
    "Recall": [test_recall_rf, test_recall_rf_tuned], 
    "F1-Score": [test_f1_rf, test_f1_rf_tuned]
}).round(2)

print(df_metrics_rf_test_tuned)

RandomForest Validation Set Default Parameters vs Hyperparameter Tuning (CD-HIT) 

  Parameters  Accuracy  Precision  Recall  F1-Score
0    Default      0.89       0.80    0.98      0.88
1      Tuned      0.91       0.81    0.99      0.89 


RandomForest Test Set Default Parameters vs Hyperparameter Tuning (CD-HIT) 

  Parameters  Accuracy  Precision  Recall  F1-Score
0    Default      0.94       0.90    0.97      0.94
1      Tuned      0.93       0.89    0.97      0.93


### Hyperparameter Tuning for SVM:

In [None]:
# Define the hyperparameters grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10, 100, 1000],                   
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],         
    'kernel': ['rbf']                            # Other Kernels: 'linear', 'rbf', 'poly' and 'sigmoid'; to note: only 'rbf' provides actual results for this dataset
}

# Initialize GridSearchCV for SVM
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, refit = True, verbose = 3) 

# Perform GridSearchCV
grid_search_svm.fit(X_train, y_train)

# Get the best hyperparameters
best_params_svm = grid_search_svm.best_params_
print(best_params_svm)

# Initialize SVM with best hyperparameters
best_svm_model = grid_search_svm.best_estimator_
print(best_svm_model)

# Train SVM with best hyperparameters
best_svm_model.fit(X_train, y_train)

## VALIDATION SET
# Make predictions on the validation set
y_val_pred_svm_tuned = best_svm_model.predict(X_val)

# Calculate evaluation metrics for the validation set
val_accuracy_svm_tuned = accuracy_score(y_val, y_val_pred_svm_tuned)
val_precision_svm_tuned = precision_score(y_val, y_val_pred_svm_tuned)
val_recall_svm_tuned = recall_score(y_val, y_val_pred_svm_tuned)
val_f1_svm_tuned = f1_score(y_val, y_val_pred_svm_tuned)

# Print the evaluation metrics for tuned Random Forest and compare with the original metrics
print("SVM Validation Set Default Parameters vs Hyperparameter Tuning (CD-HIT) \n")
df_metrics_svm_val_tuned = pd.DataFrame({
    "Parameters": ["Default", "Tuned"],
    "Accuracy": [val_accuracy_svm, val_accuracy_svm_tuned], 
    "Precision": [val_precision_svm, val_precision_svm_tuned], 
    "Recall": [val_recall_svm, val_recall_svm_tuned], 
    "F1-Score": [val_f1_svm, val_f1_svm_tuned]
}).round(2)

print(df_metrics_svm_val_tuned, "\n\n")

## TEST SET
# Make predictions on the test set
y_test_pred_svm_tuned = best_svm_model.predict(X_test)

# Calculate evaluation metrics for the test set
test_accuracy_svm_tuned = accuracy_score(y_test, y_test_pred_svm_tuned)
test_precision_svm_tuned = precision_score(y_test, y_test_pred_svm_tuned)
test_recall_svm_tuned = recall_score(y_test, y_test_pred_svm_tuned)
test_f1_svm_tuned = f1_score(y_test, y_test_pred_svm_tuned)

# Print the evaluation metrics for tuned Random Forest and compare with the original metrics
print("SVM Test Set Default Parameters vs Hyperparameter Tuning (CD-HIT) \n")
df_metrics_svm_test_tuned = pd.DataFrame({
    "Parameters": ["Default", "Tuned"],
    "Accuracy": [test_accuracy_svm, test_accuracy_svm_tuned], 
    "Precision": [test_precision_svm, test_precision_svm_tuned], 
    "Recall": [test_recall_svm, test_recall_svm_tuned], 
    "F1-Score": [test_f1_svm, test_f1_svm_tuned]
}).round(2)

print(df_metrics_svm_test_tuned)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.516 total time=   0.1s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.516 total time=   0.1s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.516 total time=   0.1s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.513 total time=   0.1s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.513 total time=   0.1s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.516 total time=   0.1s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.516 total time=   0.1s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.516 total time=   0.1s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.513 total time=   0.1s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.513 total time=   0.1s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.516 total time=   0.1s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

### Hyperparameter Tuning for XGBoost:

In [None]:
# Define the hyperparameters grid for XGBoost
param_grid_xgb = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

# Initialize GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss'), param_grid_xgb, cv=5, scoring='accuracy')

# Perform GridSearchCV
grid_search_xgb.fit(X_train, y_train)

# Get the best hyperparameters
best_params_xgb = grid_search_xgb.best_params_

# Initialize XGBoost classifier with best hyperparameters
best_xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', **best_params_xgb)

# Train the XGBoost classifier with best hyperparameters
best_xgb_model.fit(X_train, y_train)

## VALIDATION SET
# Make predictions on the validation set
y_val_pred_xgb_tuned = best_xgb_model.predict(X_val)

# Calculate evaluation metrics for the validation set
val_accuracy_xgb_tuned = accuracy_score(y_val, y_val_pred_xgb_tuned)
val_precision_xgb_tuned = precision_score(y_val, y_val_pred_xgb_tuned)
val_recall_xgb_tuned = recall_score(y_val, y_val_pred_xgb_tuned)
val_f1_xgb_tuned = f1_score(y_val, y_val_pred_xgb_tuned)

# Print the evaluation metrics for tuned XGBoost and compare with the original metrics
print("XGBoost Validation Set Default Parameters vs Hyperparameter Tuning (CD-HIT) \n")
df_metrics_xgb_val_tuned = pd.DataFrame({
    "Parameters": ["Default", "Tuned"],
    "Accuracy": [val_accuracy_xgb, val_accuracy_xgb_tuned], 
    "Precision": [val_precision_xgb, val_precision_xgb_tuned], 
    "Recall": [val_recall_xgb, val_recall_xgb_tuned], 
    "F1-Score": [val_f1_xgb, val_f1_xgb_tuned]
}).round(2)

print(df_metrics_xgb_val_tuned, "\n\n")

## TEST SET
# Make predictions on the test set
y_test_pred_xgb_tuned = best_xgb_model.predict(X_test)

# Calculate evaluation metrics for the test set
test_accuracy_xgb_tuned = accuracy_score(y_test, y_test_pred_xgb_tuned)
test_precision_xgb_tuned = precision_score(y_test, y_test_pred_xgb_tuned)
test_recall_xgb_tuned = recall_score(y_test, y_test_pred_xgb_tuned)
test_f1_xgb_tuned = f1_score(y_test, y_test_pred_xgb_tuned)

# Print the evaluation metrics for tuned XGBoost and compare with the original metrics
print("XGBoost Test Set Default Parameters vs Hyperparameter Tuning (CD-HIT) \n")
df_metrics_xgb_test_tuned = pd.DataFrame({
    "Parameters": ["Default", "Tuned"],
    "Accuracy": [test_accuracy_xgb, test_accuracy_xgb_tuned], 
    "Precision": [test_precision_xgb, test_precision_xgb_tuned], 
    "Recall": [test_recall_xgb, test_recall_xgb_tuned], 
    "F1-Score": [test_f1_xgb, test_f1_xgb_tuned]
}).round(2)

print(df_metrics_xgb_test_tuned)

XGBoost Validation Set Default Parameters vs Hyperparameter Tuning (CD-HIT) 

  Parameters  Accuracy  Precision  Recall  F1-Score
0    Default      0.90       0.82    0.97      0.89
1      Tuned      0.91       0.83    0.97      0.89 


XGBoost Test Set Default Parameters vs Hyperparameter Tuning (CD-HIT) 

  Parameters  Accuracy  Precision  Recall  F1-Score
0    Default      0.92       0.90    0.95      0.92
1      Tuned      0.92       0.89    0.96      0.92
