In [1]:

# imports
import os
import sys
import types
import json

# figure geometry (inches), output format and resolution
fig_width, fig_height = 5.5, 3.5
fig_format, fig_dpi = 'pdf', 300

# push the figure settings into matplotlib; this is best effort, so any
# failure (for example matplotlib or IPython not being installed) is ignored
try:
  import matplotlib.pyplot as plt
  plt.rcParams.update({
    'figure.figsize': (fig_width, fig_height),
    'figure.dpi': fig_dpi,
    'savefig.dpi': fig_dpi,
  })
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# make plotly use the "notebook_connected" renderer when plotly is present
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# switch on the pandas LaTeX repr, but only for pdf output
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# Print, as JSON, the source file and last-modified time of every module
# that is currently loaded in the kernel.
kernel_deps = {}
for module in list(sys.modules.values()):
  # sys.modules can contain objects that are not ordinary modules (the
  # standard library's email/__init__.py is one example) and getattr on
  # those may fail in odd ways, so only real module objects are inspected.
  if isinstance(module, types.ModuleType):
    path = getattr(module, "__file__", None)
    if path:
      # drop the trailing "c"/"o" so a bytecode path points at its .py file
      if path.endswith((".pyc", ".pyo")):
        path = path[:-1]
      # only files that actually exist on disk are recorded
      if os.path.exists(path):
        kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# Change the working directory to the configured run path. An empty string
# would skip the chdir; the literal below is non-empty, so it always runs.
run_path = r'/Users/alvaroromangomez/Documents/Documentos personales/Formación/Master Bioinformática y Estadística/TFM/tfm_alvaro/Codigo/notebooks'
if run_path:
  os.chdir(run_path)

# reset state
# `%reset` is an IPython line magic (not Python syntax) that clears the
# interactive namespace, so the helper names created above in this cell
# (kernel_deps, fig_width, ...) are presumably gone once it has run.
# NOTE(review): without `-f` IPython may ask for confirmation when run
# interactively - confirm this is intended.
%reset

def ojs_define(**kwargs):
  """Publish Python values as an ``ojs-define`` script element.

  Each keyword argument becomes one ``{"name": ..., "value": ...}`` entry in
  a JSON document that is wrapped in ``<script type="ojs-define">`` and sent
  to the notebook front end through IPython's ``display`` (tagged with the
  metadata ``{"ojs_define": True}``).

  When pandas is installed, a ``Series`` or ``DataFrame`` value (including
  subclasses) is converted to a plain ``{column: [values, ...]}`` dict first;
  every other value is passed to ``json.dumps`` unchanged.

  Parameters:
    **kwargs: name/value pairs to publish.

  Raises:
    ImportError: if IPython's display helpers cannot be imported.
    TypeError: if a value is not JSON serialisable (raised by ``json.dumps``).
  """
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except ImportError:
    # Older IPython only; catching ImportError (not a bare except) keeps
    # KeyboardInterrupt/SystemExit from being swallowed here.
    from IPython.core.display import display, HTML

  def convert(value):
    """Turn pandas objects into JSON-friendly dicts; return others as-is."""
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return value
    if isinstance(value, pd.Series):
      value = pd.DataFrame(value)
    if isinstance(value, pd.DataFrame):
      # After transposing, the "split" orientation lists the original column
      # labels under "index" and one list of values per column under "data".
      split = json.loads(value.T.to_json(orient='split'))
      return dict(zip(split["index"], split["data"]))
    return value

  payload = dict(contents=[dict(name=key, value=convert(value)) for (key, value) in kwargs.items()])
  display(HTML('<script type="ojs-define">' + json.dumps(payload) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *

# IMPORT CUSTOM MODULES
import sys

sys.path.append("../src")
import utils.stats as st

In [3]:
# Directories (relative paths, resolved against the current working directory)
input_path, train_path, test_path = (
    "../data/processed/",
    "../data/processed/train_data/",
    "../data/processed/test_data/",
)

# File names: one (training, test) pair per feature representation
# Molecular descriptors
molecular_descriptors_training_file, molecular_descriptors_test_file = (
    "molecular_descriptors_training.csv",
    "molecular_descriptors_test.csv",
)
# MACCS keys
maccs_keys_training_file, maccs_keys_test_file = (
    "maccs_keys_training.csv",
    "maccs_keys_test.csv",
)
# ECFP4 fingerprints
ecfp4_fingerprints_training_file, ecfp4_fingerprints_test_file = (
    "ecfp4_fingerprints_training.csv",
    "ecfp4_fingerprints_test.csv",
)

In [4]:
# LOAD DATA
def _load_features_and_target(csv_path, target="activity"):
    """Read one processed CSV and separate the predictors from the label.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file to read.
    target : str, default "activity"
        Name of the label column; it is removed from the predictor frame.

    Returns
    -------
    tuple
        ``(table, X, y)``: the full frame as read, the frame without
        ``target``, and the ``target`` column on its own.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the file (raised by pandas).
    """
    table = pd.read_csv(csv_path)
    return table, table.drop(columns=[target]), table[target]


# MOLECULAR DESCRIPTORS
## TRAINING
(
    molecular_descriptors_training,
    X_training_molecular_descriptors,
    Y_training_molecular_descriptors,
) = _load_features_and_target(train_path + molecular_descriptors_training_file)
## TEST
(
    molecular_descriptors_test,
    X_test_molecular_descriptors,
    Y_test_molecular_descriptors,
) = _load_features_and_target(test_path + molecular_descriptors_test_file)

# MACCS KEYS
# The full-table names are spelled `macc_keys_*` (no "s"); the spelling is
# kept because the MACCS setup() cell further down refers to these names.
## TRAINING
(
    macc_keys_training,
    X_training_maccs_keys,
    Y_training_maccs_keys,
) = _load_features_and_target(train_path + maccs_keys_training_file)
## TEST
(
    macc_keys_test,
    X_test_maccs_keys,
    Y_test_maccs_keys,
) = _load_features_and_target(test_path + maccs_keys_test_file)

# ECFP4 FINGERPRINTS
## TRAINING
(
    ecfp4_fingerprints_training,
    X_training_ecfp4_fingerprints,
    Y_training_ecfp4_fingerprints,
) = _load_features_and_target(train_path + ecfp4_fingerprints_training_file)
## TEST
(
    ecfp4_fingerprints_test,
    X_test_ecfp4_fingerprints,
    Y_test_ecfp4_fingerprints,
) = _load_features_and_target(test_path + ecfp4_fingerprints_test_file)

In [5]:
# PYCARET EXPERIMENT: MOLECULAR DESCRIPTORS
# The label column is "activity"; the pre-split test table is handed over
# explicitly as test_data. session_id is kept identical across the three
# feature sets (it is PyCaret's seed, per its documentation).
molecular_descriptors_models = setup(
    session_id=123,
    target="activity",
    data=molecular_descriptors_training,
    test_data=molecular_descriptors_test,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,activity
2,Target type,Binary
3,Original data shape,"(299, 87)"
4,Transformed data shape,"(419, 87)"
5,Transformed train set shape,"(299, 87)"
6,Transformed test set shape,"(120, 87)"
7,Numeric features,86
8,Preprocess,True
9,Imputation type,simple


In [6]:
# COMPARE MODELS
# Benchmark the candidate classifiers on the experiment configured by the
# preceding setup() call (the comparison table is shown as the cell output)
# and keep compare_models' return value in `best_model` - presumably the
# top-ranked estimator; verify against the PyCaret documentation.
best_model = compare_models(verbose=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.6518,0.651,0.6305,0.6707,0.6419,0.3038,0.3118,0.05
ada,Ada Boost Classifier,0.6386,0.6769,0.6519,0.6401,0.6398,0.2783,0.2839,0.073
xgboost,Extreme Gradient Boosting,0.6323,0.6988,0.6248,0.6371,0.6226,0.2647,0.2719,0.049
gbc,Gradient Boosting Classifier,0.6322,0.7008,0.6043,0.6383,0.6102,0.2643,0.2711,0.074
rf,Random Forest Classifier,0.6321,0.6875,0.6314,0.6321,0.629,0.2646,0.267,0.086
lightgbm,Light Gradient Boosting Machine,0.622,0.6915,0.611,0.6225,0.6129,0.2442,0.248,0.054
et,Extra Trees Classifier,0.6153,0.6901,0.61,0.6145,0.6091,0.2301,0.2318,0.096
lr,Logistic Regression,0.6053,0.6471,0.5776,0.6101,0.5885,0.2109,0.2144,0.053
ridge,Ridge Classifier,0.5986,0.0,0.5581,0.6024,0.5736,0.1979,0.2007,0.04
qda,Quadratic Discriminant Analysis,0.589,0.6274,0.4448,0.6556,0.5001,0.178,0.2,0.047


In [7]:
# TUNE MODELS
# tuned_molecular_descriptors_models = tune_model(best_model, optimize="AUC", n_iter=100)

In [8]:
# PYCARET EXPERIMENT: MACCS KEYS
# The label column is "activity"; the pre-split test table is handed over
# explicitly as test_data. session_id is kept identical across the three
# feature sets (it is PyCaret's seed, per its documentation).
maccs_keys_models = setup(
    session_id=123,
    target="activity",
    data=macc_keys_training,
    test_data=macc_keys_test,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,activity
2,Target type,Binary
3,Original data shape,"(299, 168)"
4,Transformed data shape,"(419, 168)"
5,Transformed train set shape,"(299, 168)"
6,Transformed test set shape,"(120, 168)"
7,Numeric features,167
8,Preprocess,True
9,Imputation type,simple


In [9]:
# COMPARE MODELS
# Benchmark the candidate classifiers on the MACCS-keys experiment and keep
# the returned model under its own name so it can be reused later, in the
# same way the molecular-descriptor comparison keeps its result in
# `best_model`.
best_maccs_keys_model = compare_models(verbose=True)
# A bare trailing expression keeps the cell displaying the returned model.
best_maccs_keys_model

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.6857,0.709,0.7119,0.6816,0.691,0.3717,0.3782,0.048
lightgbm,Light Gradient Boosting Machine,0.6622,0.699,0.6705,0.6674,0.6637,0.324,0.3291,0.052
gbc,Gradient Boosting Classifier,0.6521,0.7202,0.6438,0.6591,0.644,0.3039,0.311,0.065
xgboost,Extreme Gradient Boosting,0.6487,0.6951,0.671,0.6493,0.655,0.2976,0.3015,0.049
lr,Logistic Regression,0.6454,0.6798,0.6176,0.6564,0.6293,0.2909,0.297,0.052
rf,Random Forest Classifier,0.6454,0.6764,0.6843,0.6366,0.6564,0.2909,0.2936,0.085
ridge,Ridge Classifier,0.6421,0.0,0.6448,0.6424,0.6343,0.2846,0.291,0.042
dt,Decision Tree Classifier,0.6355,0.6474,0.6514,0.6316,0.6371,0.2713,0.2755,0.052
qda,Quadratic Discriminant Analysis,0.6286,0.6992,0.6243,0.6427,0.6207,0.2575,0.2657,0.048
ada,Ada Boost Classifier,0.622,0.6836,0.631,0.6275,0.6234,0.2442,0.2488,0.063


In [10]:
# PYCARET EXPERIMENT: ECFP4 FINGERPRINTS
# The label column is "activity"; the pre-split test table is handed over
# explicitly as test_data. session_id is kept identical across the three
# feature sets (it is PyCaret's seed, per its documentation).
ecfp4_fingerprints_models = setup(
    session_id=123,
    target="activity",
    data=ecfp4_fingerprints_training,
    test_data=ecfp4_fingerprints_test,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,activity
2,Target type,Binary
3,Original data shape,"(299, 1025)"
4,Transformed data shape,"(419, 1025)"
5,Transformed train set shape,"(299, 1025)"
6,Transformed test set shape,"(120, 1025)"
7,Numeric features,1024
8,Preprocess,True
9,Imputation type,simple


In [11]:
# COMPARE MODELS
# Benchmark the candidate classifiers on the ECFP4-fingerprint experiment and
# keep the returned model under its own name so it can be reused later, in
# the same way the molecular-descriptor comparison keeps its result in
# `best_model`.
best_ecfp4_fingerprints_model = compare_models(verbose=True)
# A bare trailing expression keeps the cell displaying the returned model.
best_ecfp4_fingerprints_model

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7023,0.0,0.7048,0.7106,0.7044,0.4047,0.4083,0.063
lr,Logistic Regression,0.6921,0.7363,0.6505,0.7137,0.6761,0.3839,0.389,0.074
xgboost,Extreme Gradient Boosting,0.6787,0.7341,0.6643,0.6777,0.6669,0.3576,0.3615,0.086
rf,Random Forest Classifier,0.6655,0.723,0.6919,0.6746,0.6727,0.3316,0.3416,0.109
gbc,Gradient Boosting Classifier,0.6653,0.7298,0.6376,0.6837,0.6561,0.3309,0.3351,0.093
ada,Ada Boost Classifier,0.6585,0.7308,0.63,0.6767,0.646,0.3167,0.3232,0.092
et,Extra Trees Classifier,0.6522,0.7255,0.7052,0.6411,0.6671,0.3049,0.31,0.115
svm,SVM - Linear Kernel,0.6421,0.0,0.6038,0.6791,0.6222,0.2839,0.2952,0.064
lightgbm,Light Gradient Boosting Machine,0.632,0.6829,0.6376,0.6384,0.6315,0.2642,0.2681,0.087
dt,Decision Tree Classifier,0.6122,0.6126,0.6719,0.6011,0.6326,0.2249,0.2284,0.077
