In [1]:
%load_ext gprof2dot_magic
from sklearn import datasets, linear_model, neighbors, svm, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from base import SuperLearner, BMA, try_super_learners
import pandas as pd
import numpy as np
from scipy import stats
from pyearth import Earth
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) # warnings from py-earth

# Fold count handed to every cross-validated (*CV) linear learner below.
v_folds = 5

# Linear candidates.
# NOTE(review): `normalize=` is, to my knowledge, deprecated in scikit-learn 1.0
# and removed in 1.2 -- confirm the scikit-learn version this notebook is pinned to.
ols = linear_model.LinearRegression()
elnet = linear_model.ElasticNetCV(cv=v_folds, normalize=True, l1_ratio=0.5)
ridge = linear_model.RidgeCV(cv=v_folds)
lars = linear_model.LarsCV(normalize=True, cv=v_folds)
lasso = linear_model.LassoCV(normalize=True, cv=v_folds)

# Distance-weighted nearest-neighbour regressor.
nn = neighbors.KNeighborsRegressor(weights='distance')

# Two support-vector regressors sharing C and gamma; only the kernel differs.
svm1 = svm.SVR(C=10, gamma='auto', kernel='linear')
svm2 = svm.SVR(C=10, gamma='auto', kernel='poly')

# Tree ensembles.
rf = ensemble.RandomForestRegressor(min_samples_split=5, n_estimators=100)
gbm = ensemble.GradientBoostingRegressor()

# earth is sort of like D/S/A?
# py-earth is available from https://github.com/scikit-learn-contrib/py-earth
earth = Earth(
    max_terms=50,
    max_degree=3,
    use_fast=True,
    verbose=0,
)

# Shallow single regression tree.
rtree = DecisionTreeRegressor(min_samples_split=5, max_depth=3)

# Default seed picked up by the simulation functions' `seed` parameter.
seed = 123

# Default candidate learners and meta-learners for `helper`.
cands = [ols, lars, earth, rf]
metas = [ols, lasso, ridge, earth, rf, rtree, gbm]
def helper(X1,y1,X2,y2,cands_bma=cands,cands_sl=cands,metas=metas):
    """Display the super-learner comparison followed by the BMA comparison.

    Parameters
    ----------
    X1, y1 : features and target of the first data set; the cells in this
        notebook pass the training split here.
    X2, y2 : features and target of the second data set; the cells in this
        notebook pass the test split here.
    cands_bma : candidate learners given to ``BMA(cand_learners=...)``.
    cands_sl : candidate learners given to ``try_super_learners``.
    metas : meta-learners given to ``try_super_learners``.

    Returns
    -------
    None. Both results are rendered with ``display``.
    """
    # Super-learner report first ...
    sl_report = try_super_learners(cands_sl, metas, X1, y1, X2, y2)
    display(sl_report)

    # ... then the report produced by BMA's own `debug` method.
    bma_report = BMA(cand_learners=cands_bma).debug(X1, y1, X2, y2)
    display(bma_report)

The gprof2dot_magic module is not an IPython extension.


In [2]:
# first simulation study
def sim1(n, seed=seed):
    """Simulate ``n`` observations of ten binary covariates and a noisy response.

    The response is a fixed combination of products of the covariates
    (two-, three- and four-way terms) plus standard-normal noise.

    Parameters
    ----------
    n : number of observations to draw.
    seed : value passed to ``np.random.seed``; note this reseeds NumPy's
        global generator on every call.

    Returns
    -------
    (X, y) : ``X`` is the covariate matrix transposed to shape ``(n, 10)``,
        ``y`` is the length-``n`` response.
    """
    np.random.seed(seed)

    # Rows are covariates, columns are observations: Bernoulli(0.4) draws.
    covariates = np.random.binomial(1, .4, size=(10, n))
    noise = np.random.normal(0, 1, size=n)

    w = covariates
    # Noise-free signal; it is all integer arithmetic, so summing it before
    # adding the noise gives exactly the same floats as a single expression.
    signal = (
        2 * w[0] * w[9]
        + 4 * w[1] * w[6]
        + 3 * w[3] * w[4]
        - 5 * w[5] * w[9]
        + 3 * w[7] * w[8]
        + w[0] * w[1] * w[3]
        - 2 * w[6] * (1 - w[5]) * w[1] * w[8]
        - 4 * (1 - w[9]) * w[0] * (1 - w[3])
    )
    return covariates.T, signal + noise

# 500 training and 10000 test observations from the first simulation.
# NOTE(review): both calls fall back to sim1's default seed -- confirm that
# drawing the train and test sets from the same seed is intended.
train1 = sim1(500)
test1 = sim1(10000)
helper(*train1, *test1)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,7.8272,15.6812,5.3928
1,LarsCV,7.8272,15.6812,5.3928
2,Earth,1.2915,3.449,1.0
3,RandomForestRegressor,1.0,5.2078,1.6294
4,Meta (LinearRegression),1.0689,3.2026,1.009
5,Meta (LassoCV),1.0804,3.2234,1.0237
6,Meta (RidgeCV),1.0717,3.2106,1.0191
7,Meta (Earth),1.1442,3.0644,1.0303
8,Meta (RandomForestRegressor),1.3062,1.0,1.1296
9,Meta (DecisionTreeRegressor),1.4652,2.9009,1.136


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,7.6334,5.3928,0.0
1,LarsCV,7.6334,5.3928,0.0
2,Earth,1.2596,1.0,0.0
3,RandomForestRegressor,1.0,1.6583,1.0
4,BMA,1.0,1.6583,
5,Min Error,0.8128,1.1969,


In [3]:
# Second simulation (low noise linear)
def sim2(n, noise_ratio=0.1, seed=seed):
    """Simulate a linear response in twelve columns, only eight of which are returned.

    Twelve columns are built: two Poisson(1), four Uniform(0, 1), two
    Poisson-by-uniform products, two uniform-by-uniform products and two
    Binomial(2, 0.5). The expected response is a weighted row sum with
    weights 0.9 (columns 0-5), 0.4 (6-7), 0.2 (8-9) and 0 (10-11).
    The four product columns (6-9) are left out of the returned features.

    Parameters
    ----------
    n : number of observations to draw.
    noise_ratio : multiplier applied to the variance of the expected response
        to obtain the ``scale`` argument of the Gaussian noise.
    seed : value passed to ``np.random.seed``; note this reseeds NumPy's
        global generator on every call.

    Returns
    -------
    (X, y) : ``X`` is a DataFrame holding columns 0-5, 10 and 11;
        ``y`` is a Series of length ``n``.
    """
    np.random.seed(seed)

    # Per-column weights of the expected response; the last two stay at zero.
    coefs = np.zeros(12)
    coefs[:6] = 0.9
    coefs[6:8] = 0.4
    coefs[8:10] = 0.2

    # The order of the random draws below is part of the behaviour: do not reorder.
    pois = np.random.poisson(1, size=(2, n))
    unif = np.random.uniform(0, 1, size=(4, n))
    pois_x_unif = pois * unif[0:2]
    unif_x_unif = unif[0:2] * unif[1:3]
    binom = np.random.binomial(2, 0.5, size=(2, n))

    stacked = np.concatenate(
        [pois, unif, pois_x_unif, unif_x_unif, binom], axis=0)
    y_mat = pd.DataFrame(stacked.T)

    # Returned features: everything except the four product columns.
    observed = y_mat.columns[[0, 1, 2, 3, 4, 5, 10, 11]]
    X = y_mat[observed]

    Ey = y_mat.multiply(coefs, axis=1).sum(axis=1)
    var_y = np.var(Ey)

    # NOTE(review): np.random.normal's second argument is a standard deviation,
    # yet a variance-scaled value is passed -- confirm whether a square root
    # (or np.std) was intended.
    eps = np.random.normal(0, noise_ratio * var_y, size=n)
    return X, Ey + eps
        
# Low-noise linear setting: 1000 training and 10000 test observations.
# NOTE(review): both calls fall back to sim2's default seed -- confirm that
# drawing the train and test sets from the same seed is intended.
train2 = sim2(1000, 0.1)
test2 = sim2(10000, 0.1)
helper(*train2, *test2)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,2.2815,5.1113,1.3043
1,LarsCV,2.2815,5.1111,1.3043
2,Earth,1.676,3.9405,1.003
3,RandomForestRegressor,1.0,11.9064,2.686
4,Meta (LinearRegression),1.6813,3.9313,1.0004
5,Meta (LassoCV),1.6793,3.9332,1.0
6,Meta (RidgeCV),1.6771,3.9326,1.0
7,Meta (Earth),1.6761,3.9403,1.0032
8,Meta (RandomForestRegressor),1.4509,1.0,1.2659
9,Meta (DecisionTreeRegressor),3.3266,7.233,2.2372


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,2.3019,1.3004,0.0
1,LarsCV,2.3019,1.3004,0.0
2,Earth,1.691,1.0,0.0
3,RandomForestRegressor,1.0,2.7138,1.0
4,BMA,1.0,2.7138,
5,Min Error,0.0478,0.089,


In [4]:
# Third simulation: the linear generator (sim2) with noise_ratio raised to 0.35.
# NOTE(review): both calls fall back to sim2's default seed -- confirm that
# drawing the train and test sets from the same seed is intended.
train3 = sim2(1000, 0.35)
test3 = sim2(10000, 0.35)
helper(*train3, *test3)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,3.7992,4.114,1.0
1,LarsCV,3.7992,4.1131,1.0
2,Earth,3.5127,4.2174,1.1678
3,RandomForestRegressor,1.0,5.188,1.2118
4,Meta (LinearRegression),3.6271,4.0834,1.0039
5,Meta (LassoCV),3.6286,4.0842,1.0034
6,Meta (RidgeCV),3.5759,4.0838,1.0031
7,Meta (Earth),3.8843,4.0038,357.4219
8,Meta (RandomForestRegressor),3.8601,1.0,1.1285
9,Meta (DecisionTreeRegressor),3.8281,4.1244,1.1423


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,3.8254,1.0,0.0
1,LarsCV,3.8254,1.0,0.0
2,Earth,3.5369,1.1678,0.0
3,RandomForestRegressor,1.0,1.2105,1.0
4,BMA,1.0,1.2105,
5,Min Error,0.2681,1.0966,


In [5]:
# non-linear simulation (low noise)
def sim3(n, noise_ratio=0.2, seed=seed):
    """Simulate a non-linear response in twelve covariates.

    Covariates are four Binomial(1, 0.4), four Binomial(8, 0.2) and four
    Normal(2, 2) variables. The expected response combines threshold
    indicators and products of them.

    Parameters
    ----------
    n : number of observations to draw.
    noise_ratio : multiplier applied to the variance of the expected response
        to obtain the ``scale`` argument of the Gaussian noise.
    seed : value passed to ``np.random.seed``; note this reseeds NumPy's
        global generator on every call.

    Returns
    -------
    (X, y) : ``X`` is a DataFrame with one row per observation and twelve
        columns; ``y`` is a length-``n`` array.
    """
    np.random.seed(seed)

    # The order of the random draws is part of the behaviour: do not reorder.
    bern = np.random.binomial(1, .4, size=(4, n))
    counts = np.random.binomial(8, 0.2, size=(4, n))
    gauss = np.random.normal(2, 2, size=(4, n))

    X = pd.DataFrame(np.concatenate([bern, counts, gauss])).T

    # Individual terms of the expected response. Operand order inside each
    # term and the left-to-right sum are kept so the floats come out identical.
    t_threshold = 0.4*(counts[1] > 3)*(counts[2] < 3)
    t_bern_pair = bern[1]*bern[0]*(4 - counts[2])
    t_count_gauss = counts[1]*0.1*gauss[0]
    t_gated_gauss = 0.5*gauss[3]*((gauss[2] > 0)*(gauss[1] > 6))
    t_count_bern = counts[1]*(bern[1])
    t_gauss_count = 0.5*gauss[1]*(counts[3] > 2)*counts[3]
    t_complement = (1 - bern[0])*(1 + counts[2])
    Ey = (t_threshold + t_bern_pair - t_count_gauss + t_gated_gauss
          + t_count_bern + t_gauss_count + t_complement)

    var_y = np.var(Ey)

    # NOTE(review): np.random.normal's second argument is a standard deviation,
    # yet a variance-scaled value is passed -- confirm whether a square root
    # (or np.std) was intended.
    eps = np.random.normal(0, noise_ratio * var_y, size=n)
    return X, Ey + eps

# Low-noise non-linear setting: noise_ratio is passed as 0.1 (not sim3's 0.2 default).
# NOTE(review): both calls fall back to sim3's default seed -- confirm that
# drawing the train and test sets from the same seed is intended.
train4 = sim3(1000, 0.1)
test4 = sim3(10000, 0.1)
helper(*train4, *test4)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,12.4367,16.7323,4.7444
1,LarsCV,12.4691,16.7047,4.7089
2,Earth,3.1671,4.9602,1.353
3,RandomForestRegressor,1.0,6.2492,1.641
4,Meta (LinearRegression),1.5698,4.0022,1.0003
5,Meta (LassoCV),1.5725,4.0027,1.0002
6,Meta (RidgeCV),1.5674,4.0023,1.0
7,Meta (Earth),2.1973,3.6509,1.054
8,Meta (RandomForestRegressor),2.1381,1.0,1.1382
9,Meta (DecisionTreeRegressor),3.4104,4.9507,1.6307


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,12.7791,3.5066,0.0
1,LarsCV,12.8123,3.4803,0.0
2,Earth,3.2543,1.0,0.0
3,RandomForestRegressor,1.0,1.2018,1.0
4,BMA,1.0,1.2018,
5,Min Error,0.378,1.3847,


In [None]:
# Non-linear generator (sim3) again, with noise_ratio raised to 0.35.
# NOTE(review): both calls fall back to sim3's default seed -- confirm that
# drawing the train and test sets from the same seed is intended.
train5 = sim3(1000, 0.35)
test5 = sim3(10000, 0.35)
helper(*train5, *test5)

In [None]:
# Real-data check on the diabetes data set bundled with scikit-learn.
diabetes = datasets.load_diabetes()

# Hold out 20% of the rows for testing. random_state pins the shuffle to the
# notebook-wide `seed`, so the split no longer depends on whatever global RNG
# state earlier cells happened to leave behind (or on cell execution order).
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=seed)

helper(X_train, y_train, X_test, y_test)

In [None]:
from sklearn.preprocessing import scale

# Load the CASP table from the local datasets directory.
pr = pd.read_csv("datasets/CASP.csv")

# Every column except the target 'RMSD' is treated as a feature.
feature_cols = pr.columns[pr.columns != 'RMSD']

# Overwrite the feature columns in place with their `scale`d values;
# the 'RMSD' column is left untouched.
pr.loc[:, feature_cols] = scale(pr.loc[:, feature_cols])

In [None]:
# Work on a 1000-row random subsample of CASP, then split it into train/test
# with train_test_split's default proportions. Both random steps are pinned to
# the notebook-wide `seed` so the rows chosen and the split do not depend on
# leftover global RNG state or on the order in which cells were run.
pr_subsample = pr.sample(1000, random_state=seed)
prtrain, prtest = train_test_split(pr_subsample, random_state=seed)

In [None]:
# Run both comparisons on the CASP subsample: feature columns as X, RMSD as y.
helper(
    prtrain.loc[:, feature_cols],
    prtrain.RMSD,
    prtest.loc[:, feature_cols],
    prtest.RMSD,
)