In [1]:
%load_ext gprof2dot_magic
import warnings

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import datasets, linear_model, neighbors, svm, ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeRegressor
from pyearth import Earth

from base import SuperLearner, BMA, try_super_learners
warnings.filterwarnings("ignore", category=FutureWarning) # warnings from py-earth

# Seed handed to the simulation functions as their default `seed` argument.
seed = 123

# Fold count shared by every cross-validated (*CV) estimator below.
v_folds = 5

# --- linear models ---------------------------------------------------------
ols = linear_model.LinearRegression()
elnet = linear_model.ElasticNetCV(
    l1_ratio=0.5,
    cv=v_folds,
    normalize=True,
)
ridge = linear_model.RidgeCV(cv=v_folds)
lars = linear_model.LarsCV(cv=v_folds, normalize=True)
lasso = linear_model.LassoCV(cv=v_folds, normalize=True)

# --- neighbours and support-vector regressors --------------------------------
nn = neighbors.KNeighborsRegressor(weights='distance')
svm1 = svm.SVR(kernel='linear', C=10, gamma='auto')
svm2 = svm.SVR(kernel='poly', C=10, gamma='auto')

# --- trees, ensembles and splines --------------------------------------------
rf = ensemble.RandomForestRegressor(
    n_estimators=100,
    min_samples_split=5,
)
gbm = ensemble.GradientBoostingRegressor()
rtree = DecisionTreeRegressor(max_depth=3, min_samples_split=5)
# earth is sort of like D/S/A?
# py-earth comes from https://github.com/scikit-learn-contrib/py-earth
earth = Earth(
    max_terms=50,
    max_degree=3,
    use_fast=True,
    verbose=0,
)

# Default candidate learners and meta-learners used by `helper`.
# elnet, nn, svm1 and svm2 are defined above but are in neither list.
cands = [ols, lars, earth, rf]
metas = [ols, lasso, ridge, earth, rf, rtree, gbm]
def helper(X1, y1, X2, y2, cands_bma=cands, cands_sl=cands, metas=metas):
    """Display the super-learner and BMA reports for one pair of data sets.

    Callers in this notebook pass the training data as ``(X1, y1)`` and the
    held-out data as ``(X2, y2)``; all four are forwarded positionally.

    Parameters
    ----------
    X1, y1, X2, y2 :
        Forwarded unchanged to ``try_super_learners`` and ``BMA.debug``.
    cands_bma :
        Candidate learners given to ``BMA`` (``cand_learners=``).
    cands_sl :
        Candidate learners given to ``try_super_learners``.
    metas :
        Meta-learners given to ``try_super_learners``.

    Returns
    -------
    None. Both reports are rendered with ``display``.
    """
    super_learner_report = try_super_learners(cands_sl, metas, X1, y1, X2, y2)
    display(super_learner_report)

    bma_model = BMA(cand_learners=cands_bma)
    bma_report = bma_model.debug(X1, y1, X2, y2)
    display(bma_report)

The gprof2dot_magic module is not an IPython extension.


In [3]:
# first simulation study
def sim1(n, seed=seed):
    """Simulate ``n`` rows of ten binary covariates with an interaction-only signal.

    Seeds NumPy's global RNG with ``seed`` (a side effect that later
    ``np.random`` users will see), then draws ten Bernoulli(0.4) covariates
    and standard-normal noise.

    Parameters
    ----------
    n : number of observations to draw.
    seed : value passed to ``np.random.seed``.

    Returns
    -------
    (X, y) where ``X`` has shape ``(n, 10)`` and ``y`` has length ``n``.
    """
    np.random.seed(seed)
    w = np.random.binomial(1, .4, size=(10, n))  # one row per covariate
    noise = np.random.normal(0, 1, size=n)

    # Every term is integer-valued, so the sum is exact whatever the order.
    # Row 2 of `w` never appears: that covariate does not influence y.
    interaction_terms = [
        2 * w[0] * w[9],
        4 * w[1] * w[6],
        3 * w[3] * w[4],
        -5 * w[5] * w[9],
        3 * w[7] * w[8],
        w[0] * w[1] * w[3],
        -2 * w[6] * (1 - w[5]) * w[1] * w[8],
        -4 * (1 - w[9]) * w[0] * (1 - w[3]),
    ]
    signal = sum(interaction_terms)

    return w.T, signal + noise

# 500 training rows and 10,000 test rows; both calls use sim1's default seed.
train1 = sim1(500)
test1 = sim1(10000)
helper(*train1, *test1)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,6.204112,6.527278,6.454701
1,LarsCV,6.204112,6.527278,6.454701
2,Earth,1.023733,1.435625,1.196916
3,RandomForestRegressor,0.79264,2.167724,1.950301
4,Meta (LinearRegression),0.847284,1.333064,1.207656
5,Meta (LassoCV),0.856406,1.341736,1.225273
6,Meta (RidgeCV),0.849448,1.336416,1.219834
7,Meta (Earth),0.906899,1.275531,1.23318
8,Meta (RandomForestRegressor),1.035328,0.416248,1.351995
9,Meta (DecisionTreeRegressor),1.161339,1.207503,1.359726


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,7.6334,5.392775,2.08937e-221
1,LarsCV,7.6334,5.392775,2.08937e-221
2,Earth,1.259578,1.0,8.775692999999999e-26
3,RandomForestRegressor,1.0,1.658271,1.0
4,BMA,1.0,1.658271,


In [24]:
# Second simulation (low noise linear)
def sim2(n, noise_ratio=0.1, seed=seed):
    """Simulate ``n`` rows from a linear model with partly hidden product terms.

    Seeds NumPy's global RNG with ``seed`` (a side effect visible to later
    ``np.random`` users) and builds a 12-column design:

    * columns 0-1: Poisson(1) draws,
    * columns 2-5: Uniform(0, 1) draws,
    * columns 6-7: the Poisson columns times the first two uniform columns,
    * columns 8-9: products of neighbouring uniform columns (0*1 and 1*2),
    * columns 10-11: Binomial(2, 0.5) draws, which carry zero weight.

    The noiseless response is the weighted row sum of all 12 columns. Only
    columns 0-5 and 10-11 are returned as features, so the product columns
    are not handed to the learners directly.

    Parameters
    ----------
    n : number of observations to draw.
    noise_ratio : multiplier applied to the variance of the noiseless response
        to obtain the ``scale`` argument of the Gaussian noise.
    seed : value passed to ``np.random.seed``.

    Returns
    -------
    (X, y): ``X`` is a DataFrame with 8 columns, ``y`` a Series of length ``n``.
    """
    np.random.seed(seed)

    # Weight of each of the 12 design columns.
    coefs = np.array([0.9] * 6 + [0.4] * 2 + [0.2] * 2 + [0.0] * 2)

    # The draw order below fixes the random stream; do not reorder.
    counts = np.random.poisson(1, size=(2, n))
    unif = np.random.uniform(0, 1, size=(4, n))
    count_by_unif = counts * unif[0:2]
    unif_by_unif = unif[0:2] * unif[1:3]
    zero_weight = np.random.binomial(2, 0.5, size=(2, n))

    stacked = np.concatenate(
        [counts, unif, count_by_unif, unif_by_unif, zero_weight], axis=0)
    design = pd.DataFrame(stacked.T)

    observed_cols = [0, 1, 2, 3, 4, 5, 10, 11]
    X = design.iloc[:, observed_cols]

    signal = design.multiply(coefs, axis=1).sum(axis=1)
    signal_var = np.var(signal)
    # NOTE(review): `scale` of np.random.normal is a standard deviation, yet it
    # is set proportional to the *variance* of the signal - confirm intended.
    noise = np.random.normal(0, noise_ratio * signal_var, size=n)

    return X, signal + noise
        
# 1,000 training rows and 10,000 test rows at noise_ratio=0.1.
train2 = sim2(1000, 0.1)
test2 = sim2(10000, 0.1)
helper(*train2, *test2)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,0.110115,0.111258,0.115705
1,LarsCV,0.110115,0.111255,0.115705
2,Earth,0.080892,0.085774,0.088976
3,RandomForestRegressor,0.048264,0.259168,0.238276
4,Meta (LinearRegression),0.081148,0.085573,0.088744
5,Meta (LassoCV),0.08105,0.085614,0.088709
6,Meta (RidgeCV),0.080947,0.085602,0.088712
7,Meta (Earth),0.080896,0.08577,0.088992
8,Meta (RandomForestRegressor),0.070026,0.021767,0.112294
9,Meta (DecisionTreeRegressor),0.160558,0.157442,0.198458


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,2.301886,1.300416,8.32303e-182
1,LarsCV,2.301886,1.300416,8.32303e-182
2,Earth,1.690992,1.0,7.788325e-115
3,RandomForestRegressor,1.0,2.713798,1.0
4,BMA,1.0,2.713798,


In [25]:
# third simulation (linear high noise)
# Same generator as the low-noise run, with noise_ratio raised to 0.35.
train3 = sim2(1000, 0.35)
test3 = sim2(10000, 0.35)
helper(*train3, *test3)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,1.02571,1.040051,1.09657
1,LarsCV,1.02571,1.039825,1.09657
2,Earth,0.948359,1.066175,1.280629
3,RandomForestRegressor,0.269981,1.311564,1.328867
4,Meta (LinearRegression),0.979251,1.032309,1.100846
5,Meta (LassoCV),0.979654,1.032512,1.100314
6,Meta (RidgeCV),0.96542,1.032411,1.099933
7,Meta (Earth),1.048677,1.012187,391.93811
8,Meta (RandomForestRegressor),1.042154,0.252806,1.237427
9,Meta (DecisionTreeRegressor),1.033505,1.042673,1.252626


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,3.825432,1.0,4.119201e-292
1,LarsCV,3.825432,1.0,4.119201e-292
2,Earth,3.536948,1.16785,4.372829e-275
3,RandomForestRegressor,1.0,1.210472,1.0
4,BMA,1.0,1.210472,


In [26]:
# non-linear simulation (low noise)
def sim3(n, noise_ratio=0.2, seed=seed):
    """Simulate ``n`` rows from a non-linear model with thresholds and interactions.

    Seeds NumPy's global RNG with ``seed`` (a side effect visible to later
    ``np.random`` users) and draws twelve covariates: four Bernoulli(0.4),
    four Binomial(8, 0.2) and four Normal(loc=2, scale=2).

    Parameters
    ----------
    n : number of observations to draw.
    noise_ratio : multiplier applied to the variance of the noiseless response
        to obtain the ``scale`` argument of the Gaussian noise.
    seed : value passed to ``np.random.seed``.

    Returns
    -------
    (X, y): ``X`` is a DataFrame of shape ``(n, 12)``, ``y`` an array of length ``n``.
    """
    np.random.seed(seed)

    # The draw order below fixes the random stream; do not reorder.
    binary = np.random.binomial(1, .4, size=(4, n))
    counts = np.random.binomial(8, 0.2, size=(4, n))
    gauss = np.random.normal(2, 2, size=(4, n))

    X = pd.DataFrame(np.concatenate([binary, counts, gauss])).T

    # Individual pieces of the noiseless response. They are combined below in
    # the same left-to-right order so floating-point results are unchanged.
    threshold_term = 0.4 * (counts[1] > 3) * (counts[2] < 3)
    binary_pair_term = binary[1] * binary[0] * (4 - counts[2])
    count_gauss_term = counts[1] * 0.1 * gauss[0]
    gated_gauss_term = 0.5 * gauss[3] * ((gauss[2] > 0) * (gauss[1] > 6))
    count_binary_term = counts[1] * (binary[1])
    gated_count_term = 0.5 * gauss[1] * (counts[3] > 2) * counts[3]
    complement_term = (1 - binary[0]) * (1 + counts[2])

    signal = (threshold_term + binary_pair_term - count_gauss_term
              + gated_gauss_term + count_binary_term
              + gated_count_term + complement_term)

    signal_var = np.var(signal)
    # NOTE(review): `scale` of np.random.normal is a standard deviation, yet it
    # is set proportional to the *variance* of the signal - confirm intended.
    noise = np.random.normal(0, noise_ratio * signal_var, size=n)

    return X, signal + noise

# 1,000 training rows and 10,000 test rows at noise_ratio=0.1.
train4 = sim3(1000, 0.1)
test4 = sim3(10000, 0.1)
helper(*train4, *test4)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,4.831023,5.142132,4.855559
1,LarsCV,4.843592,5.133655,4.819154
2,Earth,1.230263,1.524343,1.384693
3,RandomForestRegressor,0.388449,1.9205,1.679454
4,Meta (LinearRegression),0.609768,1.229939,1.023716
5,Meta (LassoCV),0.610833,1.230085,1.02365
6,Meta (RidgeCV),0.608854,1.229987,1.023424
7,Meta (Earth),0.85354,1.121986,1.07873
8,Meta (RandomForestRegressor),0.830539,0.307317,1.164912
9,Meta (DecisionTreeRegressor),1.324753,1.521423,1.668919


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,12.779081,3.506595,0.0
1,LarsCV,12.812328,3.480305,0.0
2,Earth,3.254307,1.0,5.2962539999999996e-257
3,RandomForestRegressor,1.0,1.201807,1.0
4,BMA,1.0,1.201807,


In [27]:
# non-linear simulation (high noise)
# Same generator as the low-noise run, with noise_ratio raised to 0.35.
train5 = sim3(1000, 0.35)
test5 = sim3(10000, 0.35)
helper(*train5, *test5)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,13.78522,14.51149,13.193668
1,LarsCV,13.844752,14.689122,13.10704
2,Earth,10.483606,12.20444,9.743058
3,RandomForestRegressor,2.44693,12.488183,10.905928
4,Meta (LinearRegression),5.935179,11.427065,9.669894
5,Meta (LassoCV),5.952914,11.429547,9.656074
6,Meta (RidgeCV),5.936539,11.427114,9.667656
7,Meta (Earth),6.080971,11.425965,9.558016
8,Meta (RandomForestRegressor),7.944156,2.726498,10.74364
9,Meta (DecisionTreeRegressor),8.569067,11.256014,10.539307


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,5.709865,1.354161,0.0
1,LarsCV,5.734523,1.34527,0.0
2,Earth,4.34233,1.0,1.36164e-319
3,RandomForestRegressor,1.0,1.125834,1.0
4,BMA,1.0,1.125834,


In [28]:
# Diabetes benchmark: hold out 20% of the rows as a test set.
diabetes = datasets.load_diabetes()

# Seed the global NumPy RNG, as the simulation cells do through sim1/sim2/sim3,
# so learners built without a random_state (rf, gbm, ...) give repeatable fits.
np.random.seed(seed)

# Pin the split as well; without random_state every run produced a different
# train/test partition and therefore a different results table.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=seed)

helper(X_train, y_train, X_test, y_test)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,2874.532531,3116.573137,2814.65899
1,LarsCV,2941.750346,3116.877092,2844.875331
2,Earth,2372.992223,3429.876733,3284.805861
3,RandomForestRegressor,650.796897,3366.631173,3499.841575
4,Meta (LinearRegression),1951.060212,3038.283966,2869.731732
5,Meta (LassoCV),1967.743375,3038.569851,2870.299308
6,Meta (RidgeCV),1951.086081,3038.283968,2869.727188
7,Meta (Earth),2292.857991,2870.269977,3132.044476
8,Meta (RandomForestRegressor),2567.647473,724.550605,3137.100189
9,Meta (DecisionTreeRegressor),2304.542116,2674.758575,3103.13619


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,4.389313,1.0,4.140036e-114
1,LarsCV,4.491952,1.010735,7.001544e-116
2,Earth,3.623478,1.167035,2.06215e-99
3,RandomForestRegressor,1.0,1.281277,1.0
4,BMA,1.0,1.281277,


In [29]:
# Load the CASP table; every column except the target `RMSD` is a feature.
pr = pd.read_csv("datasets/CASP.csv")
feature_cols = pr.columns[pr.columns != 'RMSD']

# Standardise the feature columns in place with sklearn.preprocessing.scale
# (imported with the other imports at the top of the notebook).
pr.loc[:, feature_cols] = scale(pr.loc[:, feature_cols])

  after removing the cwd from sys.path.


In [30]:
# Work on a 1,000-row subsample. Both the subsample and the train/test split
# are pinned to `seed`; previously each run drew different rows.
pr_subsample = pr.sample(1000, random_state=seed)
prtrain, prtest = train_test_split(pr_subsample, random_state=seed)

In [None]:
# Features are every column in feature_cols; the target is the RMSD column.
helper(
    prtrain.loc[:, feature_cols],
    prtrain.RMSD,
    prtest.loc[:, feature_cols],
    prtest.RMSD,
)