In [1]:
# basic packages
import os, math, csv, scipy, time, random
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
import seaborn as sns
import pandas as pd
import numpy as np

# Model Preprocessing
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    MaxAbsScaler,
    QuantileTransformer,
    PowerTransformer,
    Normalizer,
    Binarizer
)

# Preprocessing
from sklearn import preprocessing
from sklearn import utils
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Model selection/evaluation
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import (
    train_test_split,
    cross_val_predict,
    cross_val_score,
    cross_val_predict,
    GridSearchCV,
    RandomizedSearchCV,
    KFold,
    cross_validate,
    learning_curve,
    LeaveOneOut,
)
import scikitplot as skplt
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
#Machine Learning
#Regressor functions

# import xgboost as xgb

import sklearn.linear_model as lm
import sklearn.ensemble as ens
# Candidate models: SGD regressor, Lasso, ElasticNet, Ridge regression, SVR(kernel='linear'), SVR(kernel='rbf'), ensemble regressors

In [3]:
training_data = (
    "/Users/Training_Data_3.xlsx"
)

# Descriptor columns that are excluded from modelling.
unused_columns = [
    "Isoelectric charge @5.5 pH",
    "Lipophilicity (Log D @ 5.5 pH)",
    "Vapor Pressure (mm Hg at 25 °C)",
    "logkp (cm/s)",
    "mg/mL @ pH 5.5",
]

# Drop the unused columns *before* dropna(): otherwise a molecule is thrown
# away just because a column we never use has a missing value.
# Built as one chain (no inplace mutation) so re-running the cell is idempotent.
df = (
    pd.read_excel(training_data, index_col="Molecule")
    .drop(columns=unused_columns)
    .dropna()
)

# Split data into input and predicted variables
y = df["JSS"]
X = df.drop(columns=["JSS"])

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# look at z-scored distribution of data

# df_z = df.apply(stats.zscore)
# df_z.plot(kind="kde", subplots=True, figsize=(5, 20), sharex=True)


In [None]:
# Correlation of every column in df with the JSS target column
df.corrwith(df.loc[:, "JSS"])

In [4]:
# Default-configured candidate regressors: ensemble models first, then linear models.
regressors = [
    make_regressor()
    for make_regressor in (
        ens.GradientBoostingRegressor,
        ens.ExtraTreesRegressor,
        ens.RandomForestRegressor,
        ens.AdaBoostRegressor,
        ens.BaggingRegressor,
        lm.LinearRegression,
        lm.Lasso,
        lm.Ridge,
        lm.ElasticNet,
        lm.SGDRegressor,
    )
]

In [5]:
#Dictionary of models with parameters for GridSearchCV
# Keys must be "<pipeline step name>__<parameter>" (double underscore) so that
# GridSearchCV routes them to the 'Model_step' step of the pipeline; the bare
# 'Model_step' key selects which estimator occupies that step.
gbr_params = {
        'Model_step': [ens.GradientBoostingRegressor()],
        # Loss names as of scikit-learn >= 1.0; the legacy aliases 'ls'/'lad'
        # have been removed from current releases.
        'Model_step__loss': ('squared_error', 'absolute_error', 'huber', 'quantile'),
        # 'Model_step__learning_rate': (0.1, 0.05, 0.02, 0.01),
        'Model_step__n_estimators': (100, 200, 300, 400, 500),
        'Model_step__max_depth': (1,3,5,7,9),
        # float = fraction of samples, int = absolute sample count
        'Model_step__min_samples_leaf': (0.01,1,100)
    }
# GridSearchCV grid for ExtraTreesRegressor. Keys use "<step>__<param>" so they
# reach the pipeline's 'Model_step'; the bare 'Model_step' key installs the estimator.
etr_params = {
        'Model_step': [ens.ExtraTreesRegressor()],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        'Model_step__n_estimators': (10, 30),
        'Model_step__max_depth': (1,2,3,4,5,6,7,8,9,10),
        # ints must be >= 2 (an int of 1 is rejected); floats are fractions of the samples
        'Model_step__min_samples_split': (0.001,0.01,0.1,2,10,100),
        'Model_step__min_samples_leaf': (0.001,0.01,0.1,1,10,100),
        # valid range is [0, 0.5], so the values 1, 10 and 100 were dropped
        'Model_step__min_weight_fraction_leaf': (0.001,0.01,0.1),
        # None means "use all features" (what 'auto' meant for regressors before it was removed)
        'Model_step__max_features': (None, 'sqrt', 'log2'),
        'Model_step__max_leaf_nodes': (None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
        'Model_step__min_impurity_decrease': (0.001,0.01,0.1,1,10,100)}

# GridSearchCV grid for RandomForestRegressor. Keys use "<step>__<param>" so they
# reach the pipeline's 'Model_step'; every value must be a sequence, including
# the estimator itself (a bare "(estimator)" is not a tuple).
rfr_params = {
        'Model_step': [ens.RandomForestRegressor()],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        'Model_step__n_estimators':[100, 200, 300, 400, 500],
        'Model_step__max_depth': (1,2,3,4,5,6,7,8,9,10),
        # ints must be >= 2 (an int of 1 is rejected); floats are fractions of the samples
        'Model_step__min_samples_split': (0.001,0.01,0.1,2,10,100),
        'Model_step__min_samples_leaf': (0.001,0.01,0.1,1,10,100),
        # valid range is [0, 0.5], so the values 1, 10 and 100 were dropped
        'Model_step__min_weight_fraction_leaf': (0.001,0.01,0.1),
        # None means "use all features" (what 'auto' meant for regressors before it was removed)
        'Model_step__max_features': (None, 'sqrt', 'log2'),
        'Model_step__max_leaf_nodes': (None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
        'Model_step__min_impurity_decrease': (0.001,0.01,0.1,1,10,100)}
# GridSearchCV grid for AdaBoostRegressor. Keys use "<step>__<param>" so they
# reach the pipeline's 'Model_step'; the bare 'Model_step' key installs the estimator.
ada_params = {
        'Model_step': [ens.AdaBoostRegressor()],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        # 'estimator' is the scikit-learn >= 1.2 name of the former 'base_estimator'
        'Model_step__estimator': (lm.LinearRegression(), lm.Lasso(), lm.Ridge(), lm.ElasticNet(), lm.SGDRegressor(), ens.BaggingRegressor()),
        'Model_step__n_estimators': [100, 200, 300, 400, 500],
        'Model_step__learning_rate': (0.1, 0.01, 0.001),
        'Model_step__loss': ('linear', 'square', 'exponential')}
# GridSearchCV grid for BaggingRegressor. Keys use "<step>__<param>" so they
# reach the pipeline's 'Model_step'; the bare 'Model_step' key installs the estimator.
# oob_score, warm_start and n_jobs are intentionally not searched: they do not
# change the fitted predictions, and oob_score=True is rejected when combined
# with bootstrap=False or warm_start=True.
bag_params = {
        'Model_step': [ens.BaggingRegressor()],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        # 'estimator' is the scikit-learn >= 1.2 name of the former 'base_estimator'
        'Model_step__estimator': (lm.LinearRegression(), lm.Lasso(), lm.Ridge(), lm.ElasticNet(), lm.SGDRegressor(), ens.BaggingRegressor()),
        'Model_step__n_estimators': [100, 200, 300, 400, 500],
        # floats are fractions, ints are absolute counts (1 = a single sample/feature).
        # NOTE(review): 10 and 100 must not exceed the training-set size / feature count — confirm against the data.
        'Model_step__max_samples': (0.001,0.01,0.1,1,10,100),
        'Model_step__max_features': (0.001,0.01,0.1,1,10,100),
        'Model_step__bootstrap': (True, False),
        'Model_step__bootstrap_features': (True, False)}
# GridSearchCV grid for LinearRegression. Keys use "<step>__<param>" so they
# reach the pipeline's 'Model_step'; the bare 'Model_step' key installs the estimator.
# 'normalize' is gone from current scikit-learn (the pipeline's scaler step does
# the scaling), and copy_X / n_jobs do not change the fitted model, so none of
# them are searched.
lr_params = {
        'Model_step': [lm.LinearRegression()],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        'Model_step__fit_intercept': (True, False),
        }
# GridSearchCV grid for Lasso. Keys use "<step>__<param>" so they reach the
# pipeline's 'Model_step'; the bare 'Model_step' key installs the estimator.
# 'normalize' is gone from current scikit-learn (the pipeline's scaler step does
# the scaling); copy_X and warm_start do not change a freshly fitted model, so
# they are not searched.
lasso_params = {
        'Model_step': [lm.Lasso()],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        'Model_step__alpha': (0.1, 0.01, 0.001),
        'Model_step__fit_intercept': (True, False),
        'Model_step__precompute': (True, False),
        'Model_step__max_iter': (100, 200, 300, 400, 500),
        'Model_step__tol': (0.001, 0.0001, 0.00001),
        'Model_step__positive': (True, False),
        'Model_step__selection': ('cyclic', 'random'),
    }
# GridSearchCV grid for Ridge. Keys use "<step>__<param>" so they reach the
# pipeline's 'Model_step'; the bare 'Model_step' key installs the estimator.
# 'normalize' is gone from current scikit-learn (the pipeline's scaler step does
# the scaling) and copy_X does not change the fitted model, so neither is searched.
ridge_params = {
        'Model_step': [lm.Ridge()],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        'Model_step__alpha': (0.1, 0.01, 0.001),
        'Model_step__fit_intercept': (True, False),
        'Model_step__max_iter': (100, 200, 300, 400, 500),
        'Model_step__tol': (0.001, 0.0001, 0.00001),
        'Model_step__solver': ('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'),
    }
# GridSearchCV grid for ElasticNet. Keys use "<step>__<param>" so they reach the
# pipeline's 'Model_step'; the bare 'Model_step' key installs the estimator.
# 'normalize' is gone from current scikit-learn (the pipeline's scaler step does
# the scaling); copy_X and warm_start do not change a freshly fitted model, so
# they are not searched.
enet_params = {
        'Model_step': [lm.ElasticNet()],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        'Model_step__alpha': (0.1, 0.01, 0.001),
        'Model_step__l1_ratio': (0.1, 0.01, 0.001),
        'Model_step__fit_intercept': (True, False),
        'Model_step__precompute': (True, False),
        'Model_step__max_iter': (100, 200, 300, 400, 500),
        'Model_step__tol': (0.001, 0.0001, 0.00001),
        'Model_step__positive': (True, False),
        'Model_step__selection': ('cyclic', 'random'),
    }
# GridSearchCV grid for SGDRegressor. Keys use "<step>__<param>" so they reach
# the pipeline's 'Model_step'; every value must be a sequence, so the estimator
# is wrapped in a list.
# verbose and warm_start are not searched (they do not change a freshly fitted
# model); the seed is fixed on the estimator instead of being tuned.
# NOTE(review): the full product of this grid is still tens of millions of
# candidates — consider RandomizedSearchCV or trimming the grid before running it.
sgd_params = {
        'Model_step': [lm.SGDRegressor(random_state=42)],
        # TODO: also search over scalers via 'Transform_Step': [<scaler instances>]
        # 'squared_error' is the scikit-learn >= 1.0 name of the former 'squared_loss'
        'Model_step__loss': ('squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'),
        'Model_step__penalty': ('l2', 'l1', 'elasticnet'),
        'Model_step__alpha': (0.1, 0.01, 0.001),
        'Model_step__l1_ratio': (0.1, 0.01, 0.001),
        'Model_step__fit_intercept': (True, False),
        'Model_step__max_iter': (100, 200, 300, 400, 500),
        'Model_step__tol': (0.001, 0.0001, 0.00001),
        'Model_step__shuffle': (True, False),
        'Model_step__epsilon': (0.1, 0.01, 0.001),
        'Model_step__learning_rate': ('constant', 'optimal', 'invscaling', 'adaptive'),
        'Model_step__eta0': (0.1, 0.01, 0.001),
        'Model_step__power_t': (0.1, 0.01, 0.001),
        'Model_step__early_stopping': (True, False),
        'Model_step__validation_fraction': (0.1, 0.01, 0.001),
        'Model_step__n_iter_no_change': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
        'Model_step__average': (True, False)
        }

In [6]:
#Building the pipeline

# Every Pipeline step must be a single transformer/estimator; a Python list of
# estimators is rejected when the pipeline is fitted. A gradient-boosting model
# is used as the default final step; GridSearchCV can replace it by putting a
# 'Model_step' entry (a list of estimators) in the parameter grid.
pipeline = Pipeline([
    ('Transform_Step', RobustScaler()),
    ('Model_step', ens.GradientBoostingRegressor())
])

In [7]:
# Exhaustive search over the gradient-boosting grid, scored by R^2 with 2-fold CV
# on the training split, using all available cores.
gs_gb_reg = GridSearchCV(
    estimator=pipeline,
    param_grid=gbr_params,
    scoring='r2',
    cv=2,
    n_jobs=-1,
)
gs_gb_reg.fit(X_train, y_train)

In [None]:
# add iterative imputer to fill missing values
# add PCA to reduce dimensionality?

In [None]:
#Search the entire pipeline

# g_search = GridSearchCV(pipeline, cv = 10, scoring = 'r2')
# g_search.fit(X_train, y_train)
# g_search.best_params_

In [None]:
#Cross Validate the Entire Pipeline

# Mean R^2 across 10 folds of the training split
np.mean(cross_val_score(pipeline, X_train, y_train, scoring='r2', cv=10))

In [None]:
#Fitting a standalone ExtraTrees model (not the pipeline) and predicting on the test split
# random_state pins the randomized tree construction so re-running the notebook
# reproduces the same predictions and score.
model = ens.ExtraTreesRegressor(n_estimators=50, random_state=42)
model.fit(X_train, y_train)
# Mean of the predicted JSS values over the test split
model.predict(X_test).mean()

In [None]:
#Scoring the fitted model: R^2 on the held-out test split
r2_score(y_test, model.predict(X_test))

In [None]:
# Function that computes the JSS value from input values. This is the final goal: enter a molecule's parameters and get its predicted JSS value.

# # Printing coefficients and intercept to create function later
# print('Model_step'.coef_)
# print(.intercept_)


# Coefficients come from a plain LinearRegression fitted on the *unscaled*
# training features, so they can be applied directly to raw input values
# (a Pipeline object has no coef_/intercept_, and coefficients learned behind
# a scaler would not apply to raw inputs).
linear_model = lm.LinearRegression().fit(X_train, y_train)
coefficients = linear_model.coef_
intercept = linear_model.intercept_
def calculate_JSS(MW_in, MP_in, Sol_in, Iso_in, Lipo_in, Vapo_in, logkp_in, coefs=None, bias=None):
    """Evaluate the linear JSS formula for one set of input values.

    Computes ``MW_in*c[0] + MP_in*c[1] + Sol_in*c[2] + Iso_in*c[3]
    + Lipo_in*c[4] + Vapo_in*c[5] + logkp_in*c[6] + bias``.

    Parameters
    ----------
    MW_in, MP_in, Sol_in, Iso_in, Lipo_in, Vapo_in, logkp_in
        Input values, paired in this order with coefficients 0..6.
    coefs : sequence of 7 numbers, optional
        Coefficients to use. Defaults to the module-level ``coefficients``.
    bias : number, optional
        Intercept term. Defaults to the module-level ``intercept``.

    Raises
    ------
    ValueError
        If the number of coefficients differs from the number of inputs, e.g.
        when the model was fitted on a different set of feature columns.
    """
    # Fall back to the notebook-level fitted values when none are passed in.
    if coefs is None:
        coefs = coefficients
    if bias is None:
        bias = intercept

    inputs = (MW_in, MP_in, Sol_in, Iso_in, Lipo_in, Vapo_in, logkp_in)
    # Guard against silently pairing inputs with the wrong coefficients.
    if len(coefs) != len(inputs):
        raise ValueError(
            f"calculate_JSS expects {len(inputs)} coefficients, got {len(coefs)}"
        )

    # Same left-to-right accumulation as the written-out formula, intercept last.
    return sum(value * weight for value, weight in zip(inputs, coefs)) + bias