## Classification

In [None]:
from sklearn.model_selection import train_test_split

def dataset_split(X, y, random_state=None):
    """Split features and targets into train, validation and test subsets.

    A first split holds out 20% of the samples; that hold-out is then split
    in half, so the result is approximately 80% train, 10% validation and
    10% test.

    Parameters
    ----------
    X : array-like
        Feature matrix, indexable along its first axis.
    y : array-like
        Targets with the same number of samples as ``X``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to both ``train_test_split`` calls. Pass an int to make the
        split reproducible across runs; ``None`` keeps the shuffling unseeded.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    # Stage 1: carve off the 20% that will become validation + test.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=.2, random_state=random_state
    )
    # Stage 2: halve the hold-out into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=.5, random_state=random_state
    )
    return X_train, y_train, X_val, y_val, X_test, y_test

#print('Train set size: ', len(X_train), len(y_train))
#print('Validation set size: ', len(X_val), len(y_val))
#print('Test set size: ', len(X_test), len(y_test))

## Regression

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# define evaluation function
def evaluation(model, X_test, y_test):
    """Score a fitted regressor on held-out data and plot its predictions.

    Prints MAE, MSE, RMSE and R2 (rounded to 4 decimals) and shows two
    figures: the first 300 predictions overlaid on the first 300
    observations, and a parity scatter plot of predictions against
    observations with the ``y = x`` reference line.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Features passed to ``model.predict``.
    y_test : array-like
        Observed target values aligned with ``X_test``.

    Returns
    -------
    None
    """
    # Convert to plain arrays: a pandas Series would be plotted against its
    # own index rather than by position, which misaligns it with the
    # positional predictions once the data has been shuffled by a split.
    observed = np.asarray(y_test)
    prediction = np.asarray(model.predict(X_test))

    mae = mean_absolute_error(observed, prediction)
    mse = mean_squared_error(observed, prediction)
    rmse = np.sqrt(mse)
    r2 = r2_score(observed, prediction)

    # Figure 1: predictions vs. observations for the first 300 samples.
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.plot(prediction[:300], color="red", label="Predictions", linewidth=1.0)
    ax.plot(observed[:300], color="green", label="Observations", linewidth=1.0)
    ax.legend()
    plt.show()

    # Figure 2: parity plot. The dashed line is the identity y = x (where a
    # perfect model would put every point), not a fitted regression line.
    diagonal = np.linspace(observed.min(), observed.max(), 100)
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(observed, prediction, alpha=0.5, label='logP predictions')
    ax.plot(diagonal, diagonal, '--', color='black', label='y = x (perfect prediction)')
    ax.legend()
    ax.set_xlabel('Observations')
    ax.set_ylabel('Predictions')
    ax.set_title("Parity plot of the observed and predicted target values")
    plt.show()

    print('MAE score:', round(mae, 4))
    print('MSE score:', round(mse, 4))
    print('RMSE score:', round(rmse, 4))
    print('R2 score:', round(r2, 4))

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import xgboost
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# Linear Regression
lin_reg = LinearRegression().fit(X_train, y_train)
#evaluation(lin_reg, X_val, y_val)

# SVR with an RBF kernel
svr_rbf = SVR(kernel='rbf').fit(X_train, y_train)
#evaluation(svr_rbf, X_val, y_val)

# XGBoost
# The loss is selected with `objective`; 'reg:squarederror' is squared-error
# regression. `learning_rate` is the scikit-learn-API name for XGBoost's `eta`.
xgb = XGBRegressor(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.3,
    subsample=0.7,
    colsample_bytree=0.8,
    objective='reg:squarederror',
).fit(X_train, y_train)
#evaluation(xgb, X_val, y_val)

# Ensemble
# NOTE: VotingRegressor does not reuse the fits above -- its own fit() clones
# and refits each estimator. Until the fit line below is enabled, `ereg` is
# unfitted and calling ereg.predict() will raise.
ereg = VotingRegressor(estimators=[('lr', lin_reg), ('svr', svr_rbf), ('xgb', xgb)])
#ereg = ereg.fit(X_train, y_train)
#evaluation(ereg, X_val, y_val)

Evaluating the final models on a held-out test set that was never used for training or model selection is essential: it gives an unbiased estimate of how well each model generalizes to unseen data.

In [None]:
# final models
#evaluation(lin_reg, X_test, y_test)
#evaluation(svr_rbf, X_test, y_test)
#evaluation(xgb, X_test, y_test)
#evaluation(ereg, X_test, y_test)