In [2]:
# import necessary libraries
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from xgboost import XGBClassifier
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
data_dir = '/home/darshana/Projects/druggable_proteins/processed_dataset'

# load the training dataset
train_data = pd.read_csv(f'{data_dir}/TR_CTD.csv')
test_data = pd.read_csv(f'{data_dir}/TS_CTD.csv')

# separate features and target
X_train = train_data.drop(columns=['label', 'id'], axis=1)
y_train = train_data['label']

X_test = test_data.drop(columns=['label', 'id'], axis=1)
y_test = test_data['label']

train_data.head()

Unnamed: 0,_PolarizabilityC1,_PolarizabilityC2,_PolarizabilityC3,_SolventAccessibilityC1,_SolventAccessibilityC2,_SolventAccessibilityC3,_SecondaryStrC1,_SecondaryStrC2,_SecondaryStrC3,_ChargeC1,...,_HydrophobicityD2050,_HydrophobicityD2075,_HydrophobicityD2100,_HydrophobicityD3001,_HydrophobicityD3025,_HydrophobicityD3050,_HydrophobicityD3075,_HydrophobicityD3100,id,label
0,0.311,0.438,0.251,0.44,0.329,0.231,0.435,0.291,0.274,0.11,...,49.771,73.602,99.908,0.092,25.94,51.237,77.085,99.817,Positive_798,1
1,0.326,0.461,0.214,0.438,0.303,0.26,0.467,0.266,0.267,0.107,...,57.125,79.771,99.491,0.127,21.374,45.038,65.267,100.0,Negative_170,0
2,0.254,0.449,0.297,0.39,0.407,0.203,0.559,0.178,0.263,0.203,...,42.373,77.966,100.0,0.847,23.729,46.61,72.881,99.153,Negative_203,0
3,0.303,0.402,0.295,0.41,0.303,0.287,0.434,0.238,0.328,0.139,...,54.918,72.951,99.18,0.82,19.672,42.623,68.852,100.0,Negative_318,0
4,0.381,0.435,0.185,0.435,0.149,0.417,0.315,0.274,0.411,0.03,...,47.024,70.238,98.81,0.595,34.524,52.976,82.738,100.0,Negative_336,0


In [3]:
# define the model
model = LogisticRegression()

# define the cross validation procedure
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# evaluate model
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

# fit the model on the training set
model.fit(X_train, y_train)

# predict the test set results
y_pred = model.predict(X_test)

# print classification report
print(classification_report(y_test, y_pred))

# print confusion matrix
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.874 (0.022)
              precision    recall  f1-score   support

           0       0.88      0.93      0.91       237
           1       0.92      0.87      0.89       224

    accuracy                           0.90       461
   macro avg       0.90      0.90      0.90       461
weighted avg       0.90      0.90      0.90       461

[[220  17]
 [ 29 195]]


In [8]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

def objective(trial):
    # hyperparameters space
    c = trial.suggest_loguniform('svc_c', 1e-2, 1e2)
    gamma = trial.suggest_loguniform('svc_gamma', 1e-2, 1e2)
    
    # define the model with hyperparameters
    model = SVC(C=c, gamma=gamma)
    
    # perform cross validation and compute mean accuracy
    accuracy = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=-1, scoring='accuracy').mean()
    
    # return accuracy
    return accuracy

# create a study object and optimize the objective
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# print best parameters
print(study.best_trial.params)

# fit the model on the training set using best parameters
best_model = SVC(C=study.best_trial.params['svc_c'], gamma=study.best_trial.params['svc_gamma'])
best_model.fit(X_train, y_train)

# predict the test set results
y_pred = best_model.predict(X_test)

# print classification report
print(classification_report(y_test, y_pred))

# print confusion matrix
print(confusion_matrix(y_test, y_pred))


[32m[I 2023-05-12 20:28:49,028][0m A new study created in memory with name: no-name-cd20b073-f499-43f7-9559-b91e6e6f9741[0m
  c = trial.suggest_loguniform('svc_c', 1e-2, 1e2)
  gamma = trial.suggest_loguniform('svc_gamma', 1e-2, 1e2)
[32m[I 2023-05-12 20:28:58,361][0m Trial 0 finished with value: 0.5196680561988575 and parameters: {'svc_c': 1.804656059106834, 'svc_gamma': 0.4646505347408807}. Best is trial 0 with value: 0.5196680561988575.[0m
  c = trial.suggest_loguniform('svc_c', 1e-2, 1e2)
  gamma = trial.suggest_loguniform('svc_gamma', 1e-2, 1e2)
[32m[I 2023-05-12 20:29:05,334][0m Trial 1 finished with value: 0.5180947969739076 and parameters: {'svc_c': 6.490328866117436, 'svc_gamma': 0.9631610335368251}. Best is trial 0 with value: 0.5196680561988575.[0m
  c = trial.suggest_loguniform('svc_c', 1e-2, 1e2)
  gamma = trial.suggest_loguniform('svc_gamma', 1e-2, 1e2)
[32m[I 2023-05-12 20:29:12,257][0m Trial 2 finished with value: 0.5180947969739076 and parameters: {'svc_c': 

{'svc_c': 37.98586065187875, 'svc_gamma': 0.01006461522228624}
              precision    recall  f1-score   support

           0       0.57      0.95      0.71       237
           1       0.82      0.26      0.39       224

    accuracy                           0.61       461
   macro avg       0.70      0.60      0.55       461
weighted avg       0.69      0.61      0.56       461

[[224  13]
 [166  58]]


In [6]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

def objective(trial):
    # hyperparameters space
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-2, 0.3)
    max_depth = trial.suggest_int("max_depth", 2, 6)
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    
    # define the model with hyperparameters
    model = XGBClassifier(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators)
    
    # perform cross validation and compute mean accuracy
    accuracy = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=-1, scoring='accuracy').mean()
    
    # return accuracy
    return accuracy

# create a study object and optimize the objective
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# print best parameters
print(study.best_trial.params)

# fit the model on the training set using best parameters
best_model = XGBClassifier(
    learning_rate=study.best_trial.params['learning_rate'],
    max_depth=study.best_trial.params['max_depth'],
    n_estimators=study.best_trial.params['n_estimators']
)
best_model.fit(X_train, y_train)

# predict the test set results
y_pred = best_model.predict(X_test)

# print classification report
print(classification_report(y_test, y_pred))

# print confusion matrix
print(confusion_matrix(y_test, y_pred))


[32m[I 2023-05-12 19:07:09,108][0m A new study created in memory with name: no-name-f7316d9f-657b-4767-8e13-f374dddc31c4[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-2, 0.3)
[32m[I 2023-05-12 19:07:26,521][0m Trial 0 finished with value: 0.8845316519229562 and parameters: {'learning_rate': 0.011353650605796904, 'max_depth': 4, 'n_estimators': 504}. Best is trial 0 with value: 0.8845316519229562.[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-2, 0.3)
[32m[I 2023-05-12 19:07:37,368][0m Trial 1 finished with value: 0.8738487358052577 and parameters: {'learning_rate': 0.013980154346535427, 'max_depth': 2, 'n_estimators': 655}. Best is trial 0 with value: 0.8845316519229562.[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-2, 0.3)
[32m[I 2023-05-12 19:08:08,258][0m Trial 2 finished with value: 0.8888826149695715 and parameters: {'learning_rate': 0.01784345573577409, 'max_depth': 5, 'n_estimators': 747}. Best is trial 2 with

{'learning_rate': 0.021803782895725415, 'max_depth': 3, 'n_estimators': 765}
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       237
           1       0.92      0.83      0.87       224

    accuracy                           0.88       461
   macro avg       0.88      0.88      0.88       461
weighted avg       0.88      0.88      0.88       461

[[220  17]
 [ 38 186]]
