In [21]:
%load_ext gprof2dot_magic
from sklearn import datasets, linear_model, neighbors, svm, ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from base import SuperLearner, BMA, try_super_learners
import pandas as pd
import numpy as np
from scipy import stats
from pyearth import Earth
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) # warnings from py-earth

# Fold count shared by every cross-validated linear learner defined below.
v_folds = 5

# --- Linear-family learners -------------------------------------------------
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2; these constructors need an older scikit-learn (or a migration) to run.
ols = linear_model.LinearRegression()
elnet = linear_model.ElasticNetCV(
    l1_ratio=0.5,
    cv=v_folds,
    normalize=True,
)
ridge = linear_model.RidgeCV(cv=v_folds)
lars = linear_model.LarsCV(cv=v_folds, normalize=True)
lasso = linear_model.LassoCV(cv=v_folds, normalize=True)

# --- Neighbour / kernel learners (defined but not placed in cands or metas) --
nn = neighbors.KNeighborsRegressor(weights='distance')
svm1 = svm.SVR(kernel='linear', C=10, gamma='auto')
svm2 = svm.SVR(kernel='poly', C=10, gamma='auto')

# --- Tree-based and spline learners -----------------------------------------
rf = ensemble.RandomForestRegressor(
    n_estimators=100,
    min_samples_split=5,
)
gbm = ensemble.GradientBoostingRegressor()
# earth is sort of like D/S/A?
# py-earth is available from https://github.com/scikit-learn-contrib/py-earth
earth = Earth(
    max_terms=50,
    max_degree=3,
    use_fast=True,
    verbose=0,
)
rtree = DecisionTreeRegressor(max_depth=3, min_samples_split=5)

# Candidate (base) learners and meta learners handed to the helper below.
# The same estimator objects (ols, earth, rf) appear in both lists.
cands = [ols, lars, earth, rf]
metas = [ols, lasso, ridge, earth, rf, rtree, gbm]
def helper(X1, y1, X2, y2):
    """Display the super-learner and BMA result tables for one data split.

    Uses the notebook-level ``cands`` (candidate learners) and ``metas``
    (meta learners) lists.

    Parameters
    ----------
    X1, y1 : training features and target.
    X2, y2 : held-out (test) features and target.

    Returns
    -------
    None. Both tables are rendered with ``display``.
    """
    # Table 1: every candidate plus one super learner per meta learner.
    super_learner_table = try_super_learners(cands, metas, X1, y1, X2, y2)
    display(super_learner_table)

    # Table 2: Bayesian model averaging over the same candidates.
    bma_table = BMA(cand_learners=cands).debug(X1, y1, X2, y2)
    display(bma_table)

The gprof2dot_magic module is not an IPython extension.


In [17]:
# first simulation study
def sim1(n, seed=None):
    """Simulate a binary-covariate regression problem with interaction effects.

    Ten independent Bernoulli(0.4) covariates are drawn for each of ``n``
    observations; the response is a fixed sum of two-, three- and four-way
    products of those covariates plus standard-normal noise.

    Parameters
    ----------
    n : int
        Number of observations to draw.
    seed : int or None, optional
        When given, draws come from a private ``np.random.RandomState(seed)``
        so the sample is reproducible. When ``None`` (default) the global
        ``np.random`` generator is used, exactly as before this parameter
        existed.

    Returns
    -------
    (X, y) : tuple
        ``X`` is an ``(n, 10)`` integer array of 0/1 covariates and ``y`` is a
        length-``n`` float array.
    """
    # The np.random module and RandomState expose the same sampling methods,
    # so either one can serve as the generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One row per covariate so each can be addressed as w[k] below.
    w = rng.binomial(1, .4, size=(10, n))
    eps = rng.normal(0, 1, size=n)

    y = (
        2 * w[0] * w[9]
        + 4 * w[1] * w[6]
        + 3 * w[3] * w[4]
        - 5 * w[5] * w[9]
        + 3 * w[7] * w[8]
        + w[0] * w[1] * w[3]
        - 2 * w[6] * (1 - w[5]) * w[1] * w[8]
        - 4 * (1 - w[9]) * w[0] * (1 - w[3])
        + eps
    )
    # Transpose to the (n_samples, n_features) layout.
    return np.transpose(w), y

# Draw a 500-row training sample, then a 10,000-row test sample, from sim1
# and compare all learners on that split.
train1 = sim1(500)
test1 = sim1(10000)
helper(*train1, *test1)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,6.25428,6.357725,6.427383
1,LarsCV,6.254786,6.35635,6.426881
2,Earth,1.155605,1.140024,1.178035
3,RandomForestRegressor,0.764694,1.471861,1.295168
4,Meta (LinearRegression),1.031271,1.124376,1.136143
5,Meta (LassoCV),1.035281,1.12513,1.137943
6,Meta (RidgeCV),1.027402,1.124398,1.135209
7,Meta (Earth),1.038735,1.115239,1.140714
8,Meta (RandomForestRegressor),1.107668,0.480576,1.264193
9,Meta (DecisionTreeRegressor),1.281951,1.233173,1.315497


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,6.25428,6.427383,0.479776
1,LarsCV,6.254786,6.426881,0.520224
2,Earth,1.155605,1.178035,0.0
3,RandomForestRegressor,0.765175,1.2959,0.0
4,BMA,6.254417,6.427003,


In [86]:
# Second simulation (low noise linear)
def sim2(n, noise_ratio=0.1, seed=None):
    """Simulate a regression problem that is linear in hidden product features.

    Twelve columns are generated: two Poisson(1) columns, four Uniform(0, 1)
    columns, four element-wise products of those, and two Binomial(2, 0.5)
    columns. The noiseless response is a weighted sum of all twelve (weights
    0.9 for the first six, 0.3 and 0.1 for the two product pairs, 0 for the
    binomial pair). Only columns 0-5, 10 and 11 are returned as features, so
    the product columns that drive part of the response are withheld.

    Parameters
    ----------
    n : int
        Number of observations to draw.
    noise_ratio : float, optional
        Multiplier applied to the variance of the noiseless response to get
        the ``scale`` argument of the Gaussian noise.
    seed : int or None, optional
        When given, draws come from a private ``np.random.RandomState(seed)``
        so the sample is reproducible. When ``None`` (default) the global
        ``np.random`` generator is used, exactly as before this parameter
        existed.

    Returns
    -------
    (X, y) : tuple
        ``X`` is an ``(n, 8)`` DataFrame whose column labels are
        0, 1, 2, 3, 4, 5, 10, 11; ``y`` is a length-``n`` Series.
    """
    # The np.random module and RandomState expose the same sampling methods,
    # so either one can serve as the generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Weights of the noiseless response; entries 10 and 11 stay at zero.
    w = np.zeros(12)
    w[0:6] = 0.9
    w[6:8] = 0.3
    w[8:10] = 0.1

    x12 = rng.poisson(1, size=(n, 2))
    x36 = rng.uniform(0, 1, size=(n, 4))
    x78 = x12 * x36[:, 0:2]              # columns 6-7: col0*col2, col1*col3
    x910 = x36[:, 0:2] * x36[:, 1:3]     # columns 8-9: col2*col3, col3*col4
    x1112 = rng.binomial(2, 0.5, size=(n, 2))

    y_mat = pd.DataFrame(np.concatenate([x12, x36, x78, x910, x1112], axis=1))
    # Expose only the raw columns; the product columns 6-9 are hidden.
    X = y_mat[y_mat.columns[[0, 1, 2, 3, 4, 5, 10, 11]]]

    Ey = y_mat.multiply(w, axis=1).sum(axis=1)
    var_y = np.var(Ey)
    # NOTE(review): the second argument of normal() is a standard deviation,
    # so the noise s.d. (not its variance) is noise_ratio * Var(Ey).
    # Kept as-is; confirm this is the intended noise level.
    eps = rng.normal(0, noise_ratio * var_y, size=n)
    y = Ey + eps
    return X, y
        
# Low-noise run: 1,000 training rows and 10,000 test rows, noise_ratio 0.1.
train2 = sim2(1000, 0.1)
test2 = sim2(10000, 0.1)
helper(*train2, *test2)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,0.076562,0.077989,0.089124
1,LarsCV,0.076562,0.077902,0.089124
2,Earth,0.068365,0.07018,0.081227
3,RandomForestRegressor,0.039125,0.21502,0.194152
4,Meta (LinearRegression),0.067063,0.069932,0.081249
5,Meta (LassoCV),0.066845,0.069958,0.081171
6,Meta (RidgeCV),0.066564,0.069997,0.081278
7,Meta (Earth),0.065356,0.069132,0.081408
8,Meta (RandomForestRegressor),0.067656,0.019166,0.096227
9,Meta (DecisionTreeRegressor),0.14071,0.132528,0.174829


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,0.076562,0.089124,0.5
1,LarsCV,0.076562,0.089124,0.5
2,Earth,0.068365,0.081227,1.293509e-25
3,RandomForestRegressor,0.038599,0.193174,8.898812000000001e-150
4,BMA,0.076562,0.089124,


In [87]:
# third simulation (linear high noise)
# Same generator as the second simulation, with noise_ratio raised to 0.4.
train3 = sim2(1000, 0.4)
test3 = sim2(10000, 0.4)
helper(*train3, *test3)

Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,1.203567,1.225951,1.186815
1,LarsCV,1.204743,1.226927,1.180698
2,Earth,1.172358,1.308546,1.201401
3,RandomForestRegressor,0.302378,1.507046,1.390431
4,Meta (LinearRegression),1.201778,1.224218,1.189204
5,Meta (LassoCV),1.20352,1.226015,1.184249
6,Meta (RidgeCV),1.175837,1.224938,1.184952
7,Meta (Earth),1.20366,1.225858,1.185596
8,Meta (RandomForestRegressor),1.237267,0.298318,1.334253
9,Meta (DecisionTreeRegressor),1.259052,1.241132,1.298583


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,1.203567,1.186815,0.3803571
1,LarsCV,1.204743,1.180698,0.6196421
2,Earth,1.172358,1.201401,7.501309e-07
3,RandomForestRegressor,0.302568,1.385643,4.798103e-301
4,BMA,1.204019,1.182728,


In [None]:
# non-linear simulation
def sim3(n, noise_ratio=0.2):
    """Placeholder for a non-linear simulation -- NOT FINISHED, do not call.

    As written the body cannot run: it concatenates ``x12``, ``x36``, ``x78``,
    ``x910`` and ``x1112``, none of which is assigned inside this function
    (they are locals of ``sim2``), so a call raises ``NameError`` unless
    same-named globals happen to exist. The two arrays it does draw, ``x14``
    and ``x48``, are never used, and ``w`` is left all-zero.

    TODO: define the feature matrix from ``x14``/``x48``, set the weights and
    the non-linear response, then remove this warning.
    """
    # not finished
    w = np.zeros(12)  # weights are never filled in, so Ey below would be all zeros
    x14 = np.random.binomial(1,.4,size=(n,4))  # drawn but unused
    x48 = np.random.binomial(8, 0.2, size=(n,4))  # drawn but unused
    
    # The names concatenated here are undefined in this scope (copied from sim2).
    X = pd.DataFrame(np.concatenate([x12, x36, x78, x910, x1112], axis=1))
    Ey = X.multiply(w, axis=1).sum(axis=1)
    var_y = np.var(Ey)
    # Same noise construction as sim2: scale = noise_ratio * Var(Ey).
    eps = np.random.normal(0,noise_ratio*var_y,size=n)
    y = Ey + eps
    return X, y

In [6]:
# Real-data check on scikit-learn's bundled diabetes regression set.
diabetes = datasets.load_diabetes()

# Hold out 20% for testing.
# NOTE(review): no random_state is passed, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.2,
)

helper(X_train, y_train, X_test, y_test)



Unnamed: 0,Learner,Train MSE,Train CV MSE,Test MSE
0,LinearRegression,2924.180414,3089.363281,2709.342336
1,LarsCV,3013.342624,3152.375809,2633.225901
2,Earth,2432.672601,3377.575004,3371.050633
3,RandomForestRegressor,656.902279,3505.297794,2909.303762
4,Meta (LinearRegression),2530.162268,3054.640262,2707.847372
5,Meta (LassoCV),2538.036979,3054.696219,2710.880546
6,Meta (RidgeCV),2530.112087,3054.640263,2707.843733
7,Meta (Earth),4233.243797,2587.447943,3228.879625
8,Meta (RandomForestRegressor),2827.377559,676.843566,3305.638772
9,Meta (DecisionTreeRegressor),3408.713926,2632.61141,2999.237021


Unnamed: 0,Learner,Train MSE,Test MSE,Coefs
0,LinearRegression,2924.180414,2709.342336,0.004960352
1,LarsCV,3013.342624,2633.225901,0.9950396
2,Earth,2432.672601,3371.050633,3.8863700000000004e-17
3,RandomForestRegressor,661.078828,2741.870309,5.030033e-117
4,BMA,3012.460266,2633.286333,


In [14]:
# Load the CASP table from the parent directory (the file must exist there).
pr = pd.read_csv("../CASP.csv")

# Every column other than the RMSD target is treated as a feature.
feature_cols = pr.columns[pr.columns != 'RMSD']

from sklearn.preprocessing import scale

# Overwrite the feature columns with their scaled values; RMSD is untouched.
pr.loc[:, feature_cols] = scale(pr.loc[:, feature_cols])

FileNotFoundError: File b'../CASP.csv' does not exist

In [5]:
# Work on a random 1,000-row subsample of the CASP frame, split with
# train_test_split's default proportions. Requires `pr` from the loading cell.
prtrain, prtest = train_test_split(
    pr.sample(1000)
)

In [13]:
# Compare all learners on the CASP subsample; RMSD is the regression target.
# Requires `prtrain`, `prtest` and `feature_cols` from the cells above.
helper(
    prtrain.loc[:, feature_cols],
    prtrain.RMSD,
    prtest.loc[:, feature_cols],
    prtest.RMSD,
)

NameError: name 'prtrain' is not defined