In [11]:
%load_ext gprof2dot_magic
from sklearn import datasets, linear_model, neighbors, svm, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from base import SuperLearner, BMA, try_super_learners
import pandas as pd
import numpy as np
from scipy import stats
from pyearth import Earth
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) # warnings from py-earth

# Number of folds used by every cross-validated (*CV) learner below.
v_folds = 5
# Single seed shared by the seeded learners and by the sim* generators' defaults.
seed = 123

# NOTE(review): `normalize=` on the scikit-learn linear models was deprecated in
# 1.0 and removed in 1.2, so this cell needs an older scikit-learn release.
ols = linear_model.LinearRegression()
elnet = linear_model.ElasticNetCV(l1_ratio=0.5, cv=v_folds, normalize=True)
ridge = linear_model.RidgeCV(cv=v_folds)
lars = linear_model.LarsCV(cv=v_folds, normalize=True)
lasso = linear_model.LassoCV(cv=v_folds, normalize=True)
nn = neighbors.KNeighborsRegressor(weights='distance')
svm1 = svm.SVR(kernel='linear', C=10, gamma='auto')
svm2 = svm.SVR(kernel='poly', C=10, gamma='auto')
rf = ensemble.RandomForestRegressor(n_estimators=100,min_samples_split=5, random_state=seed)
# Seeded like `rf`: with random_state=None these estimators draw from the global
# NumPy RNG, which makes their fits depend on whatever ran before them.
gbm = ensemble.GradientBoostingRegressor(random_state=seed)
# earth is sort of like D/S/A?
earth=Earth(max_terms=50,max_degree=3,use_fast=True,verbose=0) # get this from https://github.com/scikit-learn-contrib/py-earth
rtree=DecisionTreeRegressor(max_depth=3,min_samples_split=5, random_state=seed)

# Default candidate learners and meta learners used by helper().
cands=[ols,lars,earth,rf]
metas=[ols,lasso,ridge,earth,rf,rtree,gbm]
def helper(X1,y1,X2,y2,cands_bma=cands,cands_sl=cands,metas=metas,relative=True):
    """Display the super-learner table and then the BMA table for one data set.

    Parameters
    ----------
    X1, y1 : first feature/target pair; callers in this notebook pass the
        training split.
    X2, y2 : second feature/target pair; callers in this notebook pass the
        test split.
    cands_bma : candidate learners handed to ``BMA(cand_learners=...)``.
    cands_sl : candidate learners handed to ``try_super_learners``.
    metas : meta learners handed to ``try_super_learners``.
    relative : forwarded unchanged to both calls. Its meaning is defined in
        ``base`` (presumably "report errors relative to the best row" --
        TODO confirm).

    Returns
    -------
    None. Both tables are shown through ``display``.
    """
    sl_table = try_super_learners(cands_sl, metas, X1, y1, X2, y2, relative=relative)
    display(sl_table)

    bma_table = BMA(cand_learners=cands_bma).debug(X1, y1, X2, y2, relative=relative)
    display(bma_table)


The gprof2dot_magic module is not an IPython extension.


In [12]:
# first simulation study
def sim1(n, seed=seed):
    """Simulate ``n`` rows of ten binary covariates with an interaction-only outcome.

    Reseeds the global NumPy RNG with ``seed``, draws a (10, n) matrix of
    Bernoulli(0.4) covariates and ``n`` standard-normal errors, and builds the
    outcome from products of covariates (row 2 is drawn but never enters the
    outcome).

    Returns
    -------
    (X, y) : X is the covariate matrix transposed to shape (n, 10); y has
        length n.
    """
    np.random.seed(seed)
    covariates = np.random.binomial(1, .4, size=(10, n))
    noise = np.random.normal(0, 1, size=n)

    # One name per covariate row; row 2 is intentionally unused.
    w0, w1, _, w3, w4, w5, w6, w7, w8, w9 = covariates

    # Integer-valued signal; the noise is added last, as in the original formula.
    signal = (
        2*w0*w9 + 4*w1*w6 + 3*w3*w4
        - 5*w5*w9 + 3*w7*w8 + w0*w1*w3
        - 2*w6*(1 - w5)*w1*w8
        - 4*(1 - w9)*w0*(1 - w3)
    )
    return np.transpose(covariates), signal + noise

# Fit on 500 simulated rows, evaluate on 10,000.
# NOTE(review): both draws use the default seed, so they start from the same
# RNG state -- confirm the test set is meant to share it with the training set.
train1 = sim1(500)
test1 = sim1(10000)
helper(*train1, *test1)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,7.629,16.498,5.3928
1,LarsCV,7.629,16.498,5.3928
2,Earth,1.2589,3.6286,1.0
3,RandomForestRegressor,1.0,5.254,1.6745
4,Meta (LinearRegression),1.0354,3.313,1.0348
5,Meta (LassoCV),1.0391,3.3362,1.0434
6,Meta (RidgeCV),1.0328,3.3165,1.0394
7,Meta (Earth),1.14,3.1322,1.0798
8,Meta (RandomForestRegressor),1.2275,1.0,1.1761
9,Meta (DecisionTreeRegressor),1.4773,2.933,1.2108


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,7.629,5.3928,0.0
1,LarsCV,7.629,5.3928,0.0
2,Earth,1.2589,1.0,0.0
3,RandomForestRegressor,1.0,1.6745,1.0
4,BMA,1.0,1.6745,1.0
5,Min Error,0.8132,1.1969,


In [13]:
# Second simulation (low noise linear)
def sim2(n, noise_ratio=0.1, seed=seed):
    """Simulate ``n`` rows from a linear model with hidden interaction columns.

    Reseeds the global NumPy RNG with ``seed`` and builds a 12-column design:
    columns 0-1 Poisson(1), 2-5 Uniform(0, 1), 6-7 Poisson x uniform products,
    8-9 products of neighbouring uniforms, 10-11 Binomial(2, 0.5). The mean
    outcome weights columns 0-5 by 0.9, 6-7 by 0.4, 8-9 by 0.2 and 10-11 by 0.

    Returns
    -------
    (X, y) : X is a DataFrame holding only columns 0-5 and 10-11 (the product
        columns are withheld); y is the mean outcome plus Gaussian noise.
    """
    np.random.seed(seed)
    coefs = np.repeat([0.9, 0.4, 0.2, 0.0], [6, 2, 2, 2])

    counts = np.random.poisson(1, size=(2, n))
    unif = np.random.uniform(0, 1, size=(4, n))
    count_x_unif = counts * unif[0:2]
    unif_x_unif = unif[0:2] * unif[1:3]
    trials = np.random.binomial(2, 0.5, size=(2, n))

    stacked = np.concatenate([counts, unif, count_x_unif, unif_x_unif, trials], axis=0)
    full_design = pd.DataFrame(np.transpose(stacked))

    # Only the raw covariates are exposed to the learners.
    observed = [0, 1, 2, 3, 4, 5, 10, 11]
    X = full_design[full_design.columns[observed]]

    Ey = full_design.multiply(coefs, axis=1).sum(axis=1)
    # NOTE(review): np.random.normal takes a standard deviation, and it is given
    # noise_ratio * Var(Ey) here -- confirm this scaling is the intended one.
    noise_scale = noise_ratio * np.var(Ey)
    y = Ey + np.random.normal(0, noise_scale, size=n)
    return X, y
        
# Low-noise linear case: 1,000 training rows, 10,000 test rows, noise_ratio 0.1.
train2 = sim2(1000, 0.1)
test2 = sim2(10000, 0.1)
helper(*train2, *test2)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,2.2207,5.0405,1.3043
1,LarsCV,2.2207,5.0404,1.3043
2,Earth,1.6313,3.886,1.003
3,RandomForestRegressor,1.0,12.1837,2.7467
4,Meta (LinearRegression),1.6549,3.876,1.0012
5,Meta (LassoCV),1.6345,3.8787,1.0
6,Meta (RidgeCV),1.6503,3.8775,1.0008
7,Meta (Earth),1.6314,3.8858,1.0032
8,Meta (RandomForestRegressor),1.433,1.0,1.26
9,Meta (DecisionTreeRegressor),3.238,7.1329,2.2372


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,2.2207,1.3004,0.0
1,LarsCV,2.2207,1.3004,0.0
2,Earth,1.6313,1.0,0.0
3,RandomForestRegressor,1.0,2.7385,1.0
4,BMA,1.0,2.7385,1.0
5,Min Error,0.0496,0.089,


In [14]:
# Third simulation: same linear generator as sim2, noise_ratio raised to 0.35.
train3 = sim2(1000, 0.35)
test3 = sim2(10000, 0.35)
helper(*train3, *test3)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,3.8414,4.0956,1.0
1,LarsCV,3.8414,4.0947,1.0
2,Earth,3.5517,4.1985,1.1678
3,RandomForestRegressor,1.0,5.2006,1.2305
4,Meta (LinearRegression),3.7989,4.0644,1.0066
5,Meta (LassoCV),3.6883,4.0659,1.0038
6,Meta (RidgeCV),3.7357,4.065,1.0052
7,Meta (Earth),3.91,3.982,386.6173
8,Meta (RandomForestRegressor),4.2893,1.0,1.1248
9,Meta (DecisionTreeRegressor),3.8706,4.1059,1.1423


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,3.8414,1.0,0.0
1,LarsCV,3.8414,1.0,0.0
2,Earth,3.5517,1.1678,0.0
3,RandomForestRegressor,1.0,1.2305,1.0
4,BMA,1.0,1.2305,1.0
5,Min Error,0.267,1.0966,


In [15]:
# non-linear simulation (low noise)
def sim3(n, noise_ratio=0.2, seed=seed):
    """Simulate ``n`` rows from a non-linear model with thresholds and interactions.

    Reseeds the global NumPy RNG with ``seed`` and draws three groups of four
    covariates: Bernoulli(0.4), Binomial(8, 0.2) and Normal(2, 2). The mean
    outcome combines indicator thresholds and products across the groups.

    Returns
    -------
    (X, y) : X is a DataFrame of shape (n, 12) with the three groups in the
        order above; y is the mean outcome plus Gaussian noise.
    """
    np.random.seed(seed)
    flags = np.random.binomial(1, .4, size=(4, n))      # X columns 0-3
    counts = np.random.binomial(8, 0.2, size=(4, n))    # X columns 4-7
    gauss = np.random.normal(2, 2, size=(4, n))         # X columns 8-11

    X = np.transpose(pd.DataFrame(np.concatenate([flags, counts, gauss])))

    # Each term is named and then combined in the original left-to-right order,
    # so the floating-point result is unchanged.
    threshold_term = 0.4*(counts[1] > 3)*(counts[2] < 3)
    flag_pair_term = flags[1]*flags[0]*(4 - counts[2])
    count_gauss_term = counts[1]*0.1*gauss[0]
    gated_gauss_term = 0.5*gauss[3]*((gauss[2] > 0)*(gauss[1] > 6))
    count_flag_term = counts[1]*(flags[1])
    large_count_term = 0.5*gauss[1]*(counts[3] > 2)*counts[3]
    off_flag_term = (1 - flags[0])*(1 + counts[2])
    Ey = threshold_term + flag_pair_term - count_gauss_term + gated_gauss_term \
        + count_flag_term + large_count_term + off_flag_term

    # NOTE(review): np.random.normal takes a standard deviation, and it is given
    # noise_ratio * Var(Ey) here -- confirm this scaling is the intended one.
    noise_scale = noise_ratio * np.var(Ey)
    y = Ey + np.random.normal(0, noise_scale, size=n)
    return X, y

# Low-noise non-linear case: 1,000 training rows, 10,000 test rows, noise_ratio 0.1.
train4 = sim3(1000, 0.1)
test4 = sim3(10000, 0.1)
helper(*train4, *test4)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,13.0967,16.9356,4.764
1,LarsCV,13.1308,16.9077,4.7282
2,Earth,3.3352,5.0204,1.3586
3,RandomForestRegressor,1.0,6.4142,1.6327
4,Meta (LinearRegression),1.6592,4.0732,1.0002
5,Meta (LassoCV),1.6633,4.0734,1.0003
6,Meta (RidgeCV),1.6581,4.0732,1.0
7,Meta (Earth),2.251,3.7995,1.0459
8,Meta (RandomForestRegressor),2.1779,1.0,1.1535
9,Meta (DecisionTreeRegressor),3.8705,5.1842,1.6414


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,13.0967,3.5066,0.0
1,LarsCV,13.1308,3.4803,0.0
2,Earth,3.3352,1.0,0.0
3,RandomForestRegressor,1.0,1.2018,1.0
4,BMA,1.0,1.2018,1.0
5,Min Error,0.3689,1.3847,


In [16]:
# Non-linear simulation again, with noise_ratio raised to 0.35 (high noise).
train5 = sim3(1000, 0.35)
test5 = sim3(10000, 0.35)
helper(*train5, *test5)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,5.7777,5.4275,1.373
1,LarsCV,5.8026,5.494,1.364
2,Earth,4.3939,4.5646,1.0139
3,RandomForestRegressor,1.0,4.6388,1.1359
4,Meta (LinearRegression),2.4336,4.2739,1.006
5,Meta (LassoCV),2.4425,4.2749,1.0045
6,Meta (RidgeCV),2.4344,4.2739,1.0058
7,Meta (Earth),2.7687,4.2231,1.0
8,Meta (RandomForestRegressor),3.2864,1.0,1.1205
9,Meta (DecisionTreeRegressor),4.0349,4.183,1.0774


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,5.7777,1.3542,0.0
1,LarsCV,5.8026,1.3453,0.0
2,Earth,4.3939,1.0,0.0
3,RandomForestRegressor,1.0,1.1204,1.0
4,BMA,1.0,1.1204,1.0
5,Min Error,2.3859,9.7431,


In [17]:
# Real-data benchmark: scikit-learn's bundled diabetes regression data, 80/20 split.
diabetes = datasets.load_diabetes()

# Pin the split to the notebook seed; without random_state the partition comes
# from the global NumPy RNG and changes with whatever ran before this cell.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=seed)

helper(X_train, y_train, X_test, y_test)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,4.3508,4.097,1.0
1,LarsCV,4.4027,4.0485,1.023
2,Earth,3.8815,4.8658,1.416
3,RandomForestRegressor,1.0,4.3341,1.2362
4,Meta (LinearRegression),3.0547,3.9728,1.0579
5,Meta (LassoCV),3.0695,3.973,1.0576
6,Meta (RidgeCV),3.0546,3.9728,1.0579
7,Meta (Earth),4.316,3.6873,1.1293
8,Meta (RandomForestRegressor),3.5535,1.0,1.209
9,Meta (DecisionTreeRegressor),3.4926,3.5541,1.1044


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,4.3508,1.0,0.0
1,LarsCV,4.4027,1.023,0.0
2,Earth,3.8815,1.416,0.0
3,RandomForestRegressor,1.0,1.2362,1.0
4,BMA,1.0,1.2362,1.0
5,Min Error,662.791,2809.6439,


In [18]:
# Load the CASP table; every column except the RMSD target is treated as a feature.
pr = pd.read_csv("datasets/CASP.csv")
is_feature = pr.columns != 'RMSD'
feature_cols = pr.columns[is_feature]
from sklearn.preprocessing import scale
# Standardise the feature columns in place (the RMSD target is left as read).
# NOTE(review): scaling happens before the train/test split below, so test rows
# contribute to the scaling statistics -- confirm that is acceptable here.
scaled_features = scale(pr.loc[:, feature_cols])
pr.loc[:, feature_cols] = scaled_features

  after removing the cwd from sys.path.


In [19]:
# Work on a 1,000-row random subsample of CASP, then split it with
# train_test_split's default proportions. Both random draws are pinned to the
# notebook seed so the same rows are selected on every run.
prtrain, prtest = train_test_split(
    pr.sample(1000, random_state=seed), random_state=seed)

In [20]:
# CASP benchmark: RMSD is the target, every other column is a feature.
helper(
    prtrain.loc[:, feature_cols], prtrain.RMSD,
    prtest.loc[:, feature_cols], prtest.RMSD,
)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,5.5653,4.7139,1.1424
1,LarsCV,5.6071,4.829,1.1692
2,Earth,4.3222,23.8017,1.1741
3,RandomForestRegressor,1.0,4.4962,1.0681
4,Meta (LinearRegression),1.9524,4.1967,1.0
5,Meta (LassoCV),1.961,4.1968,1.0011
6,Meta (RidgeCV),1.9524,4.1967,1.0
7,Meta (Earth),2.2805,4.1734,1.0204
8,Meta (RandomForestRegressor),2.8689,1.0,1.1055
9,Meta (DecisionTreeRegressor),3.1963,4.0707,1.1122


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,5.5653,1.0696,0.0
1,LarsCV,5.6071,1.0947,0.0
2,Earth,4.3222,1.0993,0.0
3,RandomForestRegressor,1.0,1.0,1.0
4,BMA,1.0,1.0,1.0
5,Min Error,4.6678,24.1225,
