# This notebook contains code for training and testing a simple baseline LHP predictor

It is based on the following paper:
Juami Hermine Mariama van Gils, Dea Gogishvili, Jan van Eck, Robbin Bouwmeester, Erik van Dijk, Sanne Abeln, How sticky are our proteins? Quantifying hydrophobicity of the human proteome, Bioinformatics Advances, Volume 2, Issue 1, 2022, vbac002, https://doi.org/10.1093/bioadv/vbac002

In [4]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
import os 

In [2]:
# Data folder: the 'data' directory that sits next to the current working
# directory (i.e. inside the parent of the cwd). os.path.join is used instead
# of gluing on '/data' so no doubled separator appears when the parent is the
# filesystem root.
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')

In [3]:
# Read the training table and the three benchmark tables from the patches
# folder, in the same order as the names on the left-hand side.
train, CASP12_LHP, CB513_LHP, TS115_LHP = [
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        '/patches/Train_LHP.csv',
        '/patches/CASP12_LHP.csv',
        '/patches/CB513_LHP.csv',
        '/patches/TS115_LHP.csv',
    )
]

# Row counts before filtering.
print('all entries', *(len(frame) for frame in (train, CASP12_LHP, CB513_LHP, TS115_LHP)))

# Keep only the rows whose 'LHP' value is strictly positive; these are the
# rows reported as 'with patches' below.
train, CASP12_LHP, CB513_LHP, TS115_LHP = [
    frame[frame['LHP'].gt(0)]
    for frame in (train, CASP12_LHP, CB513_LHP, TS115_LHP)
]

# Row counts after filtering.
print('with patches', *(len(frame) for frame in (train, CASP12_LHP, CB513_LHP, TS115_LHP)))

all entries 10848 21 513 115
with patches 6433 9 319 67


In [9]:
# Baseline model: a random forest that regresses the 'LHP' column on the
# 'THSA' and 'RHSA' columns of the filtered training frame.

# Extract features and labels (work on a copy so `train` itself is untouched)
df = train.copy()
X = df[['THSA', 'RHSA']]
y = df['LHP']

# Split the data into training and test sets: 20% of the rows are held out,
# with a fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Random Forest Regressor model.
# The forest is seeded too: bootstrap sampling inside the forest is random, so
# without random_state the CV scores, the selected depth and the test R^2
# would change from one "Restart & Run All" to the next.
rf_model = RandomForestRegressor(random_state=42)

# Define the parameter grid for grid search (only the tree depth is tuned)
param_grid = {'max_depth': [2, 3, 4]}

# Perform GridSearchCV with 5-fold cross-validation on the training portion,
# ranking candidates by R^2.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Get the best model from grid search (with GridSearchCV's default refit
# behaviour this estimator has been refit on all of X_train).
best_rf_model = grid_search.best_estimator_

# Print the best parameters found during the grid search
print("Best Parameters:", grid_search.best_params_)

# Evaluate the model on the held-out test set
y_pred = best_rf_model.predict(X_test)
test_r2_score = r2_score(y_test, y_pred)

print("Test R^2 Score:", test_r2_score)

Best Parameters: {'max_depth': 4}
Test R^2 Score: 0.5986436924187686


In [10]:
# Apply the tuned forest to the three benchmark frames, using the same two
# feature columns it was trained on (1 = CASP12, 2 = CB513, 3 = TS115).
pred_independent_1, pred_independent_2, pred_independent_3 = (
    best_rf_model.predict(benchmark[['THSA', 'RHSA']])
    for benchmark in (CASP12_LHP, CB513_LHP, TS115_LHP)
)

In [11]:
# Mean absolute error on each benchmark: every frame's 'LHP' column is
# compared with the predictions made for that same frame
# (1 = CASP12, 2 = CB513, 3 = TS115).
mae_independent_1, mae_independent_2, mae_independent_3 = (
    mean_absolute_error(benchmark['LHP'], predicted)
    for benchmark, predicted in (
        (CASP12_LHP, pred_independent_1),
        (CB513_LHP, pred_independent_2),
        (TS115_LHP, pred_independent_3),
    )
)

# Report the three errors in dataset order
print("MAE for Independent Dataset 1:", mae_independent_1)
print("MAE for Independent Dataset 2:", mae_independent_2)
print("MAE for Independent Dataset 3:", mae_independent_3)

MAE for Independent Dataset 1: 180.13070679986419
MAE for Independent Dataset 2: 363.3577643331509
MAE for Independent Dataset 3: 332.7504139851342
