In [197]:
import pandas as pd
from sklearn import cross_decomposition
from sklearn import ensemble
from sklearn import isotonic
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

In [198]:
# Execute the companion notebook so that the names it defines become available here.
# NOTE(review): the regression_* helpers, train_set_new_ready and train_set_labels
# used in the cells below are not defined in this notebook, so they presumably
# come from ml_functions.ipynb - confirm.
%run ml_functions.ipynb

Our plan is to quickly test as many models as possible with their default settings, then choose a few of the most promising ones for hyperparameter tuning.

For each model, let's calculate the RMSE on the training data and the mean RMSE over the cross-validation folds.

<h3> Linear Models </h3>

In [199]:
%%capture --no-display
result_lm = regression_linear_models(train_set_new_ready, train_set_labels, cv=4)
result_lm = pd.DataFrame.from_dict(result_lm)
result_lm

Unnamed: 0,LinearRegression,Ridge,Lasso,ElasticNet,Lars,LassoLars,OrthogonalMatchingPursuit,BayesianRidge,ARDRegression,LogisticRegression,SGDRegressor,PassiveAggressiveRegressor,HuberRegressor,TweedieRegressor,TheilSenRegressor,RANSACRegressor
rmse_training,0.939249,1.391042,1.800469,1.79777,151.759964,1.800469,1.569185,1.715081,1.506267,1.532774,1.513862,1.73499,1.252213,1.637558,1.21073,2.780928
rmse_cv,7.029047,2.005982,1.84829,1.856344,2732.99697,1.84829,1.881482,1.84222,1.952275,2.337529,1.952035,2.20816,2.604354,1.849276,7.669986,10.609576


<h3> KernelRidge </h3>

In [200]:
# Score the KernelRidge helper and tabulate the returned scores in one step;
# cv=4 is forwarded unchanged to the helper.
result_kr = pd.DataFrame.from_dict(
    regression_kernelridge(train_set_new_ready, train_set_labels, cv=4)
)
result_kr

Unnamed: 0,KernelRidge
rmse_cv,2.013866
rmse_training,1.391596


<h3> SVM </h3>

In [201]:
%%capture --no-display
result_svm = regression_svm(train_set_new_ready, train_set_labels, cv=4)
result_svm = pd.DataFrame.from_dict(result_svm)
result_svm

Unnamed: 0,SVR,NuSVR,LinearSVR
rmse_training,1.368489,1.357156,1.480323
rmse_cv,1.689241,1.717408,2.119062


<h3> Gaussian Process </h3>

In [202]:
%%capture --no-display
result_gpr = regression_gaussianprocess(train_set_new_ready, train_set_labels, cv=4)
result_gpr = pd.DataFrame.from_dict(result_gpr)
result_gpr

Unnamed: 0,GaussianProcessRegressor
rmse_cv,4.060732
rmse_training,4.07592e-10


<h3> Cross Decomposition </h3>

In [203]:
%%capture --no-display
result_cd = regression_crossdecomposition(train_set_new_ready, train_set_labels, cv=4)
result_cd = pd.DataFrame.from_dict(result_cd)
result_cd

Unnamed: 0,PLSRegression,PLSCanonical,CCA
rmse_training,1.600774,4.551446,1.782026
rmse_cv,1.862992,4.881097,1.846478


<h3> Decision Tree </h3>

In [204]:
# Score the decision-tree helper and tabulate the returned scores in one step;
# cv=4 is forwarded unchanged to the helper.
result_tree = pd.DataFrame.from_dict(
    regression_decisiontree(train_set_new_ready, train_set_labels, cv=4)
)
result_tree

Unnamed: 0,DecisionTreeRegressor
rmse_cv,2.214911
rmse_training,0.0


<h3> Ensemble Methods </h3>

In [205]:
%%capture --no-display
from sklearn import ensemble
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor

result_ens = regression_ensemble(train_set_new_ready, train_set_labels, cv=4)
result_ens = pd.DataFrame.from_dict(result_ens)
result_ens

Unnamed: 0,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor,IsolationForest,RandomForestRegressor,VotingRegressor,StackingRegressor
rmse_training,0.73212,0.789464,0.0,0.136624,3.551684,0.62685,1.587453,1.88709
rmse_cv,1.741218,1.779797,1.692288,1.799325,3.646744,1.693908,1.80856,1.838392


<h3> Neural Network </h3>

In [206]:
%%capture --no-display
regression_neuralnetwork
result_nn = regression_neuralnetwork(train_set_new_ready, train_set_labels, cv=4)
result_nn = pd.DataFrame.from_dict(result_nn)
result_nn

Unnamed: 0,neural_network
rmse_cv,2.104494
rmse_training,0.898979


<h3> Summary </h3>

In [211]:
# Lift pandas' row/column display limits so the wide summary table is rendered in full.
# These options are global and stay in effect for every later cell.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Place the per-family score tables side by side: one column per model.
result = pd.concat(
    [
        result_lm,
        result_kr,
        result_svm,
        result_gpr,
        result_cd,
        result_tree,
        result_ens,
        result_nn,
    ],
    axis='columns',
)
result

Unnamed: 0,LinearRegression,Ridge,Lasso,ElasticNet,Lars,LassoLars,OrthogonalMatchingPursuit,BayesianRidge,ARDRegression,LogisticRegression,SGDRegressor,PassiveAggressiveRegressor,HuberRegressor,TweedieRegressor,TheilSenRegressor,RANSACRegressor,KernelRidge,SVR,NuSVR,LinearSVR,GaussianProcessRegressor,PLSRegression,PLSCanonical,CCA,DecisionTreeRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor,IsolationForest,RandomForestRegressor,VotingRegressor,StackingRegressor,neural_network
rmse_training,0.939249,1.391042,1.800469,1.79777,151.759964,1.800469,1.569185,1.715081,1.506267,1.532774,1.513862,1.73499,1.252213,1.637558,1.21073,2.780928,1.391596,1.368489,1.357156,1.480323,4.07592e-10,1.600774,4.551446,1.782026,0.0,0.73212,0.789464,0.0,0.136624,3.551684,0.62685,1.587453,1.88709,0.898979
rmse_cv,7.029047,2.005982,1.84829,1.856344,2732.99697,1.84829,1.881482,1.84222,1.952275,2.337529,1.952035,2.20816,2.604354,1.849276,7.669986,10.609576,2.013866,1.689241,1.717408,2.119062,4.060732,1.862992,4.881097,1.846478,2.214911,1.741218,1.779797,1.692288,1.799325,3.646744,1.693908,1.80856,1.838392,2.104494


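We have to narrow the field down to a few models to keep working with. I have decided to rank the models by the average of their two scores (training RMSE and cross-validation RMSE), i.e. the column-wise mean of the DataFrame above. Keep in mind that this average rewards models that fit the training set perfectly (a training RMSE of 0 halves the score), so `rmse_cv` on its own is worth checking as well.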

In [225]:
# Column-wise mean of the score rows gives one averaged score per model;
# show the seven models with the smallest average.
# NOTE(review): the average includes rmse_training, which favours models that
# memorise the training set (DecisionTreeRegressor has a training RMSE of 0.0),
# while SVR, the model with the lowest rmse_cv, does not make this top 7.
# Consider ranking by result.loc['rmse_cv'] instead.
result.mean(axis=0).sort_values(ascending=True).head(7)

ExtraTreesRegressor          0.846144
GradientBoostingRegressor    0.967975
DecisionTreeRegressor        1.107456
RandomForestRegressor        1.160379
AdaBoostRegressor            1.236669
BaggingRegressor             1.284630
neural_network               1.501737
dtype: float64

As we can see, mainly ensemble methods look promising (5 of the top 7), assuming our approach is correct.